Example #1
    def calculateScore(self, dataTable, functionTable, performanceTable):
        """Calculate the score of this model.

        This method is called by C{calculate} to separate operations
        that are performed by all models (in C{calculate}) from
        operations that are performed by specific models (in
        C{calculateScore}).

        @type dataTable: DataTable
        @param dataTable: The input DataTable, containing the fields used by this model.
        @type functionTable: FunctionTable or None
        @param functionTable: A table of functions.
        @type performanceTable: PerformanceTable or None
        @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
        @rtype: dict
        @return: A dictionary mapping score field names (None for the primary predicted value) to DataColumns containing the scores.
        """

        performanceTable.begin("ClusteringModel")

        performanceTable.begin("set up")

        distributionBased = (self["modelClass"] == "distributionBased")
        clusteringFields = self.xpath("pmml:ClusteringField[not(@isCenterField='false')]")
        fieldWeights = [clusteringField.get("fieldWeight", defaultFromXsd=True, convertType=True) for clusteringField in clusteringFields]
        for fieldWeight in fieldWeights:
            if fieldWeight < 0.0:
                raise defs.PmmlValidationError("ClusteringField fieldWeights must all be non-negative (encountered %g)" % fieldWeight)
        clusters = self.xpath("pmml:Cluster")
        comparisonMeasure = self.childOfClass(ComparisonMeasure)
        defaultCompareFunction = comparisonMeasure.get("compareFunction", defaultFromXsd=True)
        metric = comparisonMeasure.childOfClass(PmmlClusteringMetric)
        metrictag = metric.t

        performanceTable.end("set up")

        for clusteringField in clusteringFields:
            dataType = dataTable.fields[clusteringField["field"]].fieldType.dataType
            if dataType == "string":
                raise defs.PmmlValidationError("ClusteringField \"%s\" has dataType \"%s\", which cannot be used for clustering" % (clusteringField["field"], dataType))

        missingValueWeights = self.childOfTag("MissingValueWeights")
        if missingValueWeights is None:
            adjustM = None

        else:
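            # adjustM is the missing-value adjustment factor: (sum of all MissingValueWeights)
            # divided by the per-record sum of the weights whose fields are non-missing,
            # with a neutral 1.0 wherever that per-record sum is zero.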
            performanceTable.begin("MissingValueWeights")

            missingWeights = missingValueWeights.childOfClass(PmmlArray).values(convertType=True)

            sumNMqi = NP("zeros", len(dataTable), dtype=NP.dtype(float))
            for clusteringField, missingWeight in zip(clusteringFields, missingWeights):
                clusteringField.addToAdjustM(dataTable, functionTable, performanceTable, sumNMqi, missingWeight)

            adjustM = NP(sum(missingWeights) / sumNMqi)
            adjustM[NP(sumNMqi == 0.0)] = 1.0

            performanceTable.end("MissingValueWeights")

        anyInvalid = NP("zeros", len(dataTable), dtype=NP.dtype(bool))
        for clusteringField in clusteringFields:
            mask = dataTable.fields[clusteringField["field"]].mask
            if mask is not None:
                NP("logical_or", anyInvalid, NP(mask == defs.INVALID), anyInvalid)

        bestClusterId = None
        bestClusterAffinity = None
        allClusterAffinities = {}

        for index, cluster in enumerate(clusters):
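            # Compute every record's distance to this cluster's center and keep the running
            # best (smallest) distance together with its 1-based cluster index.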
            array = cluster.childOfClass(PmmlArray)
            if array is None:
                raise defs.PmmlValidationError("Cluster must have an array to designate its center")

            centerStrings = array.values(convertType=False)
            if len(centerStrings) != len(clusteringFields):
                raise defs.PmmlValidationError("Cluster array has %d components, but there are %d ClusteringFields with isCenterField=true" % (len(centerStrings), len(clusteringFields)))

            performanceTable.begin(metrictag)

            if distributionBased:
                matrix = cluster.xpath("pmml:Covariances/pmml:Matrix")
                if len(matrix) != 1:
                    raise defs.PmmlValidationError("In distribution-based clustering, all clusters must have a Covariances/Matrix")
                try:
                    covarianceMatrix = NP("array", matrix[0].values(), dtype=NP.dtype(float))
                except ValueError:
                    raise defs.PmmlValidationError("Covariances/Matrix must contain real numbers for distribution-based clustering")

            else:
                covarianceMatrix = None

            state = self._State()
            metric.initialize(state, len(dataTable), len(clusteringFields), distributionBased)

            for clusteringField, centerString, fieldWeight in zip(clusteringFields, centerStrings, fieldWeights):
                if isinstance(metric, PmmlClusteringMetricBinary):
                    metric.accumulateBinary(state, dataTable.fields[clusteringField["field"]], centerString, distributionBased)
                else:
                    performanceTable.pause(metrictag)
                    cxy = clusteringField.compare(dataTable, functionTable, performanceTable, centerString, defaultCompareFunction, anyInvalid)
                    performanceTable.unpause(metrictag)
                    metric.accumulate(state, cxy, fieldWeight, distributionBased)

            distance = metric.finalizeDistance(state, adjustM, distributionBased, covarianceMatrix)
            del state

            performanceTable.end(metrictag)

            if index == 0:
                bestClusterId = NP("ones", len(dataTable), dtype=NP.dtype(int))   # 1-based index
                bestClusterAffinity = distance

            better = NP(distance < bestClusterAffinity)
            bestClusterId[better] = index + 1   # 1-based index
            bestClusterAffinity[better] = distance[better]

            allClusterAffinities[cluster.get("id", "%d" % (index + 1))] = distance

        if not anyInvalid.any():
            scoreMask = None
        else:
            scoreMask = NP(anyInvalid * defs.INVALID)

        performanceTable.begin("set scores")
        score = {}

        performanceTable.begin("predictedValue")
        fieldType = FakeFieldType("string", "categorical")
        clusterIdentifiers = NP("empty", len(dataTable), dtype=fieldType.dtype)
        for index, cluster in enumerate(clusters):
            value = fieldType.stringToValue(cluster.get("id", "%d" % (index + 1)))
            clusterIdentifiers[NP(bestClusterId == (index + 1))] = value
        score[None] = DataColumn(fieldType, clusterIdentifiers, scoreMask)
        performanceTable.end("predictedValue")

        if self.subFields["predictedDisplayValue"]:
            performanceTable.begin("predictedDisplayValue")
            fieldType = FakeFieldType("string", "categorical")
            clusterNames = NP("empty", len(dataTable), dtype=fieldType.dtype)
            for index, cluster in enumerate(clusters):
                value = fieldType.stringToValue(cluster.get("name", ""))
                clusterNames[NP(bestClusterId == (index + 1))] = value
            score["predictedDisplayValue"] = DataColumn(fieldType, clusterNames, scoreMask)
            performanceTable.end("predictedDisplayValue")

        if self.subFields["entity"]:
            performanceTable.begin("entity")
            fieldType = FakeFieldType("object", "any")
            entities = NP("empty", len(dataTable), dtype=fieldType.dtype)
            for index, cluster in enumerate(clusters):
                value = fieldType.stringToValue(cluster.get("name", ""))
                indexPlusOne = index + 1
                for i in xrange(len(entities)):
                    if bestClusterId[i] == indexPlusOne:
                        entities[i] = cluster
            score["entity"] = DataColumn(fieldType, entities, scoreMask)
            performanceTable.end("entity")

        if self.subFields["clusterId"]:
            performanceTable.begin("clusterId")
            fieldType = FakeFieldType("integer", "continuous")
            score["clusterId"] = DataColumn(fieldType, bestClusterId, scoreMask)
            performanceTable.end("clusterId")

        if self.subFields["entityId"]:
            performanceTable.begin("entityId")
            fieldType = FakeFieldType("integer", "continuous")
            score["entityId"] = DataColumn(fieldType, bestClusterId, scoreMask)
            performanceTable.end("entityId")

        if self.subFields["clusterAffinity"]:
            performanceTable.begin("clusterAffinity")
            fieldType = FakeFieldType("double", "continuous")
            score["clusterAffinity"] = DataColumn(fieldType, bestClusterAffinity, scoreMask)
            performanceTable.end("clusterAffinity")

        if self.subFields["affinity"]:
            performanceTable.begin("affinity")
            fieldType = FakeFieldType("double", "continuous")
            score["affinity"] = DataColumn(fieldType, bestClusterAffinity, scoreMask)
            performanceTable.end("affinity")

        if self.subFields["all"]:
            performanceTable.begin("all")
            fieldType = FakeFieldType("double", "continuous")
            for identifier, distance in allClusterAffinities.items():
                score["all.%s" % identifier] = DataColumn(fieldType, distance, scoreMask)
            performanceTable.end("all")

        performanceTable.end("set scores")
        performanceTable.end("ClusteringModel")
        return score
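A minimal usage sketch for the score dictionary returned above. The names clusteringModel, dataTable, functionTable, and performanceTable are assumptions for illustration, not objects defined in the example:

score = clusteringModel.calculateScore(dataTable, functionTable, performanceTable)
predictedValue = score[None]           # DataColumn of best-cluster identifiers (categorical strings)
clusterId = score.get("clusterId")     # DataColumn of 1-based cluster indices, present only if requested
affinity = score.get("affinity")       # DataColumn of distances to the best cluster, present only if requested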
Example #2
    def evaluate(self, dataTable, functionTable, performanceTable):
        """Evaluate the expression, using a DataTable as input.

        @type dataTable: DataTable
        @param dataTable: The input DataTable, containing any fields that might be used to evaluate this expression.
        @type functionTable: FunctionTable
        @param functionTable: The FunctionTable, containing any functions that might be called in this expression.
        @type performanceTable: PerformanceTable
        @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
        @rtype: DataColumn
        @return: The result of the calculation as a DataColumn.
        """

        performanceTable.begin("Discretize")

        dataColumn = dataTable.fields[self["field"]]
        if dataColumn.fieldType.dataType in ("object", "string", "boolean"):
            raise defs.PmmlValidationError("Discretize requires a numeric input field, but \"%s\" is" % dataColumn.fieldType.dataType)

        fieldType = FakeFieldType(self.get("dataType", "string"), self.get("optype", self._optype))
        fieldType._newValuesAllowed = True

        defaultValue = self.get("defaultValue")
        if defaultValue is not None:
            defaultValue = fieldType.stringToValue(defaultValue)

        data = NP("empty", len(dataTable), dtype=fieldType.dtype)
        mask = NP("empty", len(dataTable), dtype=defs.maskType)
        if defaultValue is None:
            mask[:] = defs.MISSING
        else:
            data[:] = defaultValue
            mask[:] = defs.VALID

        for discretizeBin in self.childrenOfTag("DiscretizeBin"):
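            # Each DiscretizeBin claims the records whose valid input value falls inside its
            # Interval; overlapping bins are applied in document order, so later bins win.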
            try:
                binValue = fieldType.stringToValue(discretizeBin["binValue"])
            except ValueError:
                raise defs.PmmlValidationError("Cannot cast DiscretizeBin binValue \"%s\" as %s %s" % (discretizeBin["binValue"], fieldType.optype, fieldType.dataType))

            fieldType.values.append(FakeFieldValue(value=binValue))

            interval = discretizeBin.childOfTag("Interval")

            closure = interval["closure"]
            leftMargin = interval.get("leftMargin")
            rightMargin = interval.get("rightMargin")
            selection = None

            if leftMargin is not None:
                try:
                    leftMargin = dataColumn.fieldType.stringToValue(leftMargin)
                except ValueError:
                    raise defs.PmmlValidationError("Improper value in Interval leftMargin specification: \"%s\"" % leftMargin)

                if closure in ("openClosed", "openOpen"):
                    if selection is None:
                        selection = NP(leftMargin < dataColumn.data)
                    else:
                        NP("logical_and", selection, NP(leftMargin < dataColumn.data), selection)

                elif closure in ("closedOpen", "closedClosed"):
                    if selection is None:
                        selection = NP(leftMargin <= dataColumn.data)
                    else:
                        NP("logical_and", selection, NP(leftMargin <= dataColumn.data), selection)

            if rightMargin is not None:
                try:
                    rightMargin = dataColumn.fieldType.stringToValue(rightMargin)
                except ValueError:
                    raise defs.PmmlValidationError("Improper value in Interval rightMargin specification: \"%s\"" % rightMargin)

                if closure in ("openOpen", "closedOpen"):
                    if selection is None:
                        selection = NP(dataColumn.data < rightMargin)
                    else:
                        NP("logical_and", selection, NP(dataColumn.data < rightMargin), selection)

                elif closure in ("openClosed", "closedClosed"):
                    if selection is None:
                        selection = NP(dataColumn.data <= rightMargin)
                    else:
                        NP("logical_and", selection, NP(dataColumn.data <= rightMargin), selection)
                
            if selection is not None:
                NP("logical_and", selection, NP(dataColumn.mask == defs.VALID), selection)
                data[selection] = binValue
                mask[selection] = defs.VALID

        mask[NP(dataColumn.mask == defs.MISSING)] = defs.MISSING
        mask[NP(dataColumn.mask == defs.INVALID)] = defs.INVALID

        data, mask = FieldCastMethods.applyMapMissingTo(fieldType, data, mask, self.get("mapMissingTo"))
        
        performanceTable.end("Discretize")
        return DataColumn(fieldType, data, mask)
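The interval test that the loop above builds incrementally can be written as a standalone sketch in plain NumPy. This is illustrative only and assumes the margins have already been converted from their PMML string form to numbers:

import numpy as np

def intervalSelection(values, closure, leftMargin=None, rightMargin=None):
    """Return a boolean mask over the NumPy array `values` marking the entries inside the Interval.

    closure is one of "openOpen", "openClosed", "closedOpen", "closedClosed";
    the first half applies to leftMargin, the second half to rightMargin.
    """
    selection = np.ones(len(values), dtype=bool)
    if leftMargin is not None:
        if closure in ("openClosed", "openOpen"):
            selection &= (values > leftMargin)     # open on the left: strictly greater
        else:
            selection &= (values >= leftMargin)    # closed on the left: greater or equal
    if rightMargin is not None:
        if closure in ("openOpen", "closedOpen"):
            selection &= (values < rightMargin)    # open on the right: strictly less
        else:
            selection &= (values <= rightMargin)   # closed on the right: less or equal
    return selection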
Example #3
    def verify(self, showSuccess=False, performanceTable=None):
        """Run the model verification tests defined by this element.

        The output is a list of results (all results or only failures,
        depending on C{showSuccess}), each of which is a dictionary of
        field names to values.  Fields are:

          - "success": was the comparison successful?
          - "expectedMissing", "observedMissing": is the
             expected/observed value missing?
          - "expectedValue", "observedValue": result as an internal
             value.
          - "expectedPythonValue", "observedPythonValue": result as a
             Python value.
          - "expectedDisplayValue", "observedDisplayValue": result as
             a string displayValue.

        Only "success", "expectedMissing", and "observedMissing" appear
        if the "is missing?" comparison was unsuccessful.

        @type showSuccess: bool
        @param showSuccess: If True, emit output even if the tests are successful.
        @type performanceTable: PerformanceTable
        @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
        @rtype: JSON-like list of dicts
        @return: As described above.
        """

        verificationFields = {}
        for verificationField in self.xpath("pmml:VerificationFields/pmml:VerificationField"):
            verificationField.column = verificationField.get("column", verificationField["field"])
            verificationField.precision = verificationField.get("precision", defaultFromXsd=True, convertType=True)
            verificationField.zeroThreshold = verificationField.get("zeroThreshold", defaultFromXsd=True, convertType=True)

            verificationField.data = []
            verificationField.mask = []
            verificationFields[verificationField.column] = verificationField

        inputData = {}
        inputMask = {}
        for index, row in enumerate(self.childOfClass(TableInterface).iterate()):
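            # Route each cell of the verification table either into the expected-output buffers
            # (columns named by a VerificationField) or into the model-input buffers, padding
            # any rows where the column is absent.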
            for columnName, columnValue in row.items():
                verificationField = verificationFields.get(columnName)

                if verificationField is not None:
                    while len(verificationField.data) < index:
                        verificationField.data.append(defs.PADDING)
                        verificationField.mask.append(True)
                    
                    verificationField.data.append(columnValue)
                    verificationField.mask.append(False)

                else:
                    inputDataField = inputData.get(columnName)
                    if inputDataField is None:
                        inputDataField = []
                        inputData[columnName] = inputDataField
                        inputMask[columnName] = []
                    inputMaskField = inputMask[columnName]

                    while len(inputDataField) < index:
                        inputDataField.append(defs.PADDING)
                        inputMaskField.append(True)

                    inputDataField.append(columnValue)
                    inputMaskField.append(False)

        for verificationField in verificationFields.values():
            while len(verificationField.data) < index:
                verificationField.data.append(defs.PADDING)
                verificationField.mask.append(True)

        for columnName in inputData:
            inputDataField = inputData[columnName]
            inputMaskField = inputMask[columnName]
            while len(inputDataField) < index:
                inputDataField.append(defs.PADDING)
                inputMaskField.append(True)

        for columnName, verificationField in verificationFields.items():
            inputData[columnName] = verificationField.data
            inputMask[columnName] = verificationField.mask

        model = self.getparent()

        if performanceTable is None:
            performanceTable = FakePerformanceTable()

        performanceTable.begin("make DataTable")
        dataTable = DataTable(model, inputData, inputMask, inputState=None)
        performanceTable.end("make DataTable")

        functionTable = FunctionTable()

        for miningField in model.xpath("pmml:MiningSchema/pmml:MiningField"):
            miningField.replaceField(dataTable, functionTable, performanceTable)

        for calculable in model.calculableTrans():
            calculable.calculate(dataTable, functionTable, performanceTable)

        score = model.calculateScore(dataTable, functionTable, performanceTable)
        dataTable.score = score[None]
        if model.name is not None:
            for key, value in score.items():
                if key is None:
                    dataTable.fields[model.name] = value
                else:
                    dataTable.fields["%s.%s" % (model.name, key)] = value

        for outputField in self.xpath("../pmml:Output/pmml:OutputField"):
            displayName = outputField.get("displayName", outputField["name"])
            outputField.format(dataTable, functionTable, performanceTable, score)

        output = []
        for verificationField in verificationFields.values():
            observedOutput = dataTable.fields.get(verificationField["field"])

            if observedOutput is None:
                raise defs.PmmlValidationError("VerificationField references field \"%s\" but it was not produced by the model")
            fieldType = observedOutput.fieldType

            if fieldType.dataType == "object":
                try:
                    newArray = [float(x) for x in observedOutput.data]
                except ValueError:
                    pass
                else:
                    fieldType = FakeFieldType("double", "continuous")
                    observedOutput._data = newArray
                        
            for index in xrange(len(dataTable)):
                record = {"field": verificationField["field"], "index": index}

                record["expectedMissing"] = verificationField.mask[index]
                record["observedMissing"] = (observedOutput.mask is not None and observedOutput.mask[index] != defs.VALID)

                if record["expectedMissing"] != record["observedMissing"]:
                    record["success"] = False
                    output.append(record)

                elif not record["expectedMissing"]:
                    record["expectedValue"] = fieldType.stringToValue(verificationField.data[index])
                    record["observedValue"] = observedOutput.data[index]
                    record["expectedPythonValue"] = fieldType.valueToPython(record["expectedValue"])
                    record["observedPythonValue"] = fieldType.valueToPython(record["observedValue"])
                    record["expectedDisplayValue"] = fieldType.valueToString(record["expectedValue"])
                    record["observedDisplayValue"] = fieldType.valueToString(record["observedValue"])

                    if fieldType.optype == "continuous":
                        if (abs(record["expectedValue"]) <= verificationField.zeroThreshold) and (abs(record["observedValue"]) <= verificationField.zeroThreshold):
                            record["success"] = True
                        else:
                            record["success"] = ((record["expectedValue"] * (1.0 - verificationField.precision)) <= record["observedValue"] <= (record["expectedValue"] * (1.0 + verificationField.precision)))

                        if not record["success"] or showSuccess:
                            output.append(record)
                            
                    else:
                        if record["expectedValue"] != record["observedValue"]:
                            record["success"] = False
                            output.append(record)
                        else:
                            record["success"] = True
                            if showSuccess:
                                output.append(record)

        return output
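A short sketch of how the returned records might be inspected, assuming modelVerification is the parsed element that carries this verify method (the variable name is an assumption):

failures = modelVerification.verify(showSuccess=False)
for record in failures:
    # Records that failed only the "is missing?" comparison carry no value fields, hence the .get calls.
    print("row %d of \"%s\": expected %s, observed %s" % (
        record["index"], record["field"],
        record.get("expectedDisplayValue", "(missing)"),
        record.get("observedDisplayValue", "(missing)")))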
Example #4
    def evaluate(self, dataTable, functionTable, performanceTable):
        """Evaluate the expression, using a DataTable as input.

        @type dataTable: DataTable
        @param dataTable: The input DataTable, containing any fields that might be used to evaluate this expression.
        @type functionTable: FunctionTable
        @param functionTable: The FunctionTable, containing any functions that might be called in this expression.
        @type performanceTable: PerformanceTable
        @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
        @rtype: DataColumn
        @return: The result of the calculation as a DataColumn.
        """

        performanceTable.begin("MapValues")
        
        fieldType = FakeFieldType(self.get("dataType", "string"), self.get("optype", self._optype))
        fieldType._newValuesAllowed = True

        defaultValue = self.get("defaultValue")
        if defaultValue is not None:
            defaultValue = fieldType.stringToValue(defaultValue)

        data = NP("empty", len(dataTable), dtype=fieldType.dtype)
        if defaultValue is not None:
            data[:] = defaultValue

        outputColumn = self["outputColumn"]
        columnNameToField = {}
        for fieldColumnPair in self.childrenOfTag("FieldColumnPair"):
            dataColumn = dataTable.fields[fieldColumnPair["field"]]
            columnNameToField[fieldColumnPair["column"]] = dataColumn

        # cache partial selections because they'll be used over and over in intersecting sets
        dataSelections = {}
        missingSelections = {}
        coverage = NP("zeros", len(dataTable), dtype=NP.dtype(bool))

        for index, row in enumerate(self.childOfClass(TableInterface).iterate()):
            outputValue = row.get(outputColumn)
            if outputValue is None:
                raise defs.PmmlValidationError("MapValues has outputColumn \"%s\" but a column with that name does not appear in row %d of the table" % (outputColumn, index))
            del row[outputColumn]
            outputValue = fieldType.stringToValue(outputValue)

            # this is an intersection of all matching columns
            selection = NP("ones", len(dataTable), dtype=NP.dtype(bool))

            for columnName, columnValueString in row.items():
                dataColumn = columnNameToField.get(columnName)
                if dataColumn is not None:
                    columnValue = dataColumn.fieldType.stringToValue(columnValueString)

                    # one cached data array per column (name, value) pair
                    if (columnName, columnValueString) not in dataSelections:
                        selectData = NP(dataColumn.data == columnValue)
                        if dataColumn.mask is not None:
                            NP("logical_and", selectData, NP(dataColumn.mask == defs.VALID), selectData)
                        dataSelections[columnName, columnValueString] = selectData
                    NP("logical_and", selection, dataSelections[columnName, columnValueString], selection)

                    # one cached mask array per column name ("missing" has only one possible value, though I consider any non-VALID "missing")
                    if columnName not in missingSelections and dataColumn.mask is not None:
                        missingSelections[columnName] = NP(dataColumn.mask != defs.VALID)
                        
            # set the intersection to the output value
            data[selection] = outputValue
            NP("logical_or", coverage, selection, coverage)
        
        missing = NP("zeros", len(dataTable), dtype=NP.dtype(bool))
        for missingSelection in missingSelections.values():
            NP("logical_or", missing, missingSelection, missing)
        coverage -= missing

        mask = missing * defs.MISSING

        data, mask = FieldCastMethods.applyMapMissingTo(fieldType, data, mask, self.get("mapMissingTo"))

        if defaultValue is None:
            NP("logical_not", coverage, coverage)
            if mask is None:
                mask = NP(coverage * defs.MISSING)
            else:
                mask[coverage] = defs.MISSING

        performanceTable.end("MapValues")
        return DataColumn(fieldType, data, mask)
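The row matching above amounts to intersecting per-column equality masks and stamping the matched records with the row's output value. A self-contained sketch of that idea in plain NumPy, with all names illustrative:

import numpy as np

def mapValues(columns, tableRows, outputColumn, defaultValue=None):
    """columns: dict of column name -> NumPy array of field values (all the same length).
    tableRows: list of dicts, each one lookup row including its outputColumn value."""
    length = len(next(iter(columns.values())))
    result = np.full(length, defaultValue, dtype=object)
    for row in tableRows:
        selection = np.ones(length, dtype=bool)    # intersection of all matching columns
        for name, value in row.items():
            if name != outputColumn:
                selection &= (columns[name] == value)
        result[selection] = row[outputColumn]      # matched records take this row's output value
    return result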