示例#1
0
    def evaluate(self, dataTable, functionTable, performanceTable):
        """Evaluate the expression, using a DataTable as input.

        The LinearNorm children are ordered by their C{orig} attribute
        and each adjacent pair is handed to C{self.transformSelection}
        together with the rows whose input value falls between the two
        C{orig} values.  Rows below the first or above the last C{orig}
        are treated according to the C{outliers} attribute:
        "asMissingValues" passes them to
        C{FieldCastMethods.outliersAsMissing}, "asExtremeValues" assigns
        the C{norm} of the nearest end point, and any other setting
        calls C{self.transformSelection} with the nearest pair.

        @type dataTable: DataTable
        @param dataTable: The input DataTable, containing any fields that might be used to evaluate this expression.
        @type functionTable: FunctionTable
        @param functionTable: The FunctionTable, containing any functions that might be called in this expression.
        @type performanceTable: PerformanceTable
        @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
        @rtype: DataColumn
        @return: The result of the calculation as a DataColumn.
        @raise PmmlValidationError: If the input field's dataType is "object", "string", or "boolean".
        """

        performanceTable.begin("NormContinuous")
        try:
            dataColumn = dataTable.fields[self["field"]]
            if dataColumn.fieldType.dataType in ("object", "string", "boolean"):
                raise defs.PmmlValidationError("NormContinuous requires a numeric input field, but \"%s\" is" % dataColumn.fieldType.dataType)

            outliers = self.get("outliers")

            linearNorms = self.childrenOfTag("LinearNorm")
            for linearNorm in linearNorms:
                linearNorm.orig = float(linearNorm["orig"])
                linearNorm.norm = float(linearNorm["norm"])

            # technically, it's invalid if not already sorted
            # (key= instead of a cmp function: same order, works on Python 2 and 3)
            linearNorms.sort(key=lambda linearNorm: linearNorm.orig)

            data = NP("empty", len(dataTable), self._fieldType.dtype)
            mask = dataColumn.mask

            # at or before the first LinearNorm
            if outliers == "asMissingValues":
                # A value exactly equal to the first orig is inside the
                # domain, not an outlier: give it the first norm and flag
                # only the strictly smaller values.
                data[NP(dataColumn.data == linearNorms[0].orig)] = linearNorms[0].norm
                selection = NP(dataColumn.data < linearNorms[0].orig)
                mask = FieldCastMethods.outliersAsMissing(mask, dataColumn.mask, selection)
            else:
                selection = NP(dataColumn.data <= linearNorms[0].orig)
                if outliers == "asExtremeValues":
                    data[selection] = linearNorms[0].norm
                else:
                    # extrapolate from the first segment
                    self.transformSelection(linearNorms[0], linearNorms[1], dataColumn.data, data, selection)

            # interior segments: orig[i] < value <= orig[i+1]
            for i in range(len(linearNorms) - 1):
                selection = NP(linearNorms[i].orig < dataColumn.data)
                # the third argument is the output array: AND in place
                NP("logical_and", selection, NP(dataColumn.data <= linearNorms[i+1].orig), selection)

                self.transformSelection(linearNorms[i], linearNorms[i+1], dataColumn.data, data, selection)

            # after the last LinearNorm
            selection = NP(linearNorms[-1].orig < dataColumn.data)
            if outliers == "asMissingValues":
                mask = FieldCastMethods.outliersAsMissing(mask, dataColumn.mask, selection)
            elif outliers == "asExtremeValues":
                data[selection] = linearNorms[-1].norm
            else:
                # extrapolate from the last segment
                self.transformSelection(linearNorms[-2], linearNorms[-1], dataColumn.data, data, selection)

            data, mask = FieldCastMethods.applyMapMissingTo(self._fieldType, data, mask, self.get("mapMissingTo"))

        finally:
            # keep begin/end paired even when validation or a lookup fails
            performanceTable.end("NormContinuous")

        return DataColumn(self._fieldType, data, mask)
示例#2
0
    def replaceField(self, dataTable, functionTable, performanceTable):
        """Replace a field in the DataTable for outlier removal,
        missing value handling, and invalid value treatment.

        Nothing happens if the DataTable has no field named
        C{self.name}.  Otherwise the column is cast when this element
        declares a different C{optype}, values below C{lowValue} or
        above C{highValue} are handled according to C{outliers}
        ("asMissingValues" passes them to
        C{FieldCastMethods.outliersAsMissing}; "asExtremeValues"
        overwrites them with the bound in a private copy of the data),
        and finally C{invalidValueTreatment} and
        C{missingValueReplacement} are applied.  The resulting column
        replaces the old one in C{dataTable.fields}.

        @type dataTable: DataTable
        @param dataTable: The pre-built DataTable.
        @type functionTable: FunctionTable
        @param functionTable: A table of functions.
        @type performanceTable: PerformanceTable
        @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
        """

        dataColumn = dataTable.fields.get(self.name)
        if dataColumn is None:
            return

        performanceTable.begin("MiningField")

        optype = self.get("optype", dataColumn.fieldType.optype)
        if optype != dataColumn.fieldType.optype:
            dataColumn = FieldCastMethods.cast(FakeFieldType(dataColumn.fieldType.dataType, optype), dataColumn)

        data = dataColumn.data
        mask = dataColumn.mask

        outliers = self.get("outliers")

        lowValue = self.get("lowValue")
        if lowValue is not None:
            lowValue = dataColumn.fieldType.stringToValue(lowValue)

            if outliers == "asMissingValues":
                selection = NP(dataColumn.data < lowValue)
                mask = FieldCastMethods.outliersAsMissing(mask, dataColumn.mask, selection)

            elif outliers == "asExtremeValues":
                selection = NP(dataColumn.data < lowValue)
                # copy on first write so the input column's array is not modified
                if data is dataColumn.data:
                    data = NP("copy", data)
                    data.setflags(write=True)
                data[selection] = lowValue

        highValue = self.get("highValue")
        if highValue is not None:
            highValue = dataColumn.fieldType.stringToValue(highValue)

            if outliers == "asMissingValues":
                selection = NP(dataColumn.data > highValue)
                mask = FieldCastMethods.outliersAsMissing(mask, dataColumn.mask, selection)

            elif outliers == "asExtremeValues":
                selection = NP(dataColumn.data > highValue)
                if data is dataColumn.data:
                    data = NP("copy", data)
                    data.setflags(write=True)
                # Must run even when the lowValue branch already made the
                # copy; otherwise the upper bound is never applied when
                # both bounds are present.
                data[selection] = highValue

        mask = FieldCastMethods.applyInvalidValueTreatment(mask, self.get("invalidValueTreatment"))
        data, mask = FieldCastMethods.applyMapMissingTo(dataColumn.fieldType, data, mask, self.get("missingValueReplacement"))

        dataTable.fields.replaceField(self.name, DataColumn(dataColumn.fieldType, data, mask))
        performanceTable.end("MiningField")
示例#3
0
    def replaceField(self, dataTable, functionTable, performanceTable):
        """Replace a field in the DataTable for outlier removal,
        missing value handling, and invalid value treatment.

        Nothing happens if the DataTable has no field named
        C{self.name}.  Otherwise the column is cast when this element
        declares a different C{optype}, values below C{lowValue} or
        above C{highValue} are handled according to C{outliers}
        ("asMissingValues" passes them to
        C{FieldCastMethods.outliersAsMissing}; "asExtremeValues"
        overwrites them with the bound in a private copy of the data),
        and finally C{invalidValueTreatment} and
        C{missingValueReplacement} are applied.  The resulting column
        replaces the old one in C{dataTable.fields}.

        @type dataTable: DataTable
        @param dataTable: The pre-built DataTable.
        @type functionTable: FunctionTable
        @param functionTable: A table of functions.
        @type performanceTable: PerformanceTable
        @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
        """

        dataColumn = dataTable.fields.get(self.name)
        if dataColumn is None:
            return

        performanceTable.begin("MiningField")

        optype = self.get("optype", dataColumn.fieldType.optype)
        if optype != dataColumn.fieldType.optype:
            dataColumn = FieldCastMethods.cast(
                FakeFieldType(dataColumn.fieldType.dataType, optype),
                dataColumn)

        data = dataColumn.data
        mask = dataColumn.mask

        outliers = self.get("outliers")

        lowValue = self.get("lowValue")
        if lowValue is not None:
            lowValue = dataColumn.fieldType.stringToValue(lowValue)

            if outliers == "asMissingValues":
                selection = NP(dataColumn.data < lowValue)
                mask = FieldCastMethods.outliersAsMissing(
                    mask, dataColumn.mask, selection)

            elif outliers == "asExtremeValues":
                selection = NP(dataColumn.data < lowValue)
                # copy on first write so the input column's array is
                # not modified
                if data is dataColumn.data:
                    data = NP("copy", data)
                    data.setflags(write=True)
                data[selection] = lowValue

        highValue = self.get("highValue")
        if highValue is not None:
            highValue = dataColumn.fieldType.stringToValue(highValue)

            if outliers == "asMissingValues":
                selection = NP(dataColumn.data > highValue)
                mask = FieldCastMethods.outliersAsMissing(
                    mask, dataColumn.mask, selection)

            elif outliers == "asExtremeValues":
                selection = NP(dataColumn.data > highValue)
                if data is dataColumn.data:
                    data = NP("copy", data)
                    data.setflags(write=True)
                # Must run even when the lowValue branch already made
                # the copy; otherwise the upper bound is never applied
                # when both bounds are present.
                data[selection] = highValue

        mask = FieldCastMethods.applyInvalidValueTreatment(
            mask, self.get("invalidValueTreatment"))
        data, mask = FieldCastMethods.applyMapMissingTo(
            dataColumn.fieldType, data, mask,
            self.get("missingValueReplacement"))

        dataTable.fields.replaceField(
            self.name, DataColumn(dataColumn.fieldType, data, mask))
        performanceTable.end("MiningField")
示例#4
0
    def evaluate(self, dataTable, functionTable, performanceTable):
        """Evaluate the expression, using a DataTable as input.

        The LinearNorm children are ordered by their C{orig} attribute
        and each adjacent pair is handed to C{self.transformSelection}
        together with the rows whose input value falls between the two
        C{orig} values.  Rows below the first or above the last C{orig}
        are treated according to the C{outliers} attribute:
        "asMissingValues" passes them to
        C{FieldCastMethods.outliersAsMissing}, "asExtremeValues" assigns
        the C{norm} of the nearest end point, and any other setting
        calls C{self.transformSelection} with the nearest pair.

        @type dataTable: DataTable
        @param dataTable: The input DataTable, containing any fields that might be used to evaluate this expression.
        @type functionTable: FunctionTable
        @param functionTable: The FunctionTable, containing any functions that might be called in this expression.
        @type performanceTable: PerformanceTable
        @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
        @rtype: DataColumn
        @return: The result of the calculation as a DataColumn.
        @raise PmmlValidationError: If the input field's dataType is "object", "string", or "boolean".
        """

        performanceTable.begin("NormContinuous")
        try:
            dataColumn = dataTable.fields[self["field"]]
            if dataColumn.fieldType.dataType in ("object", "string",
                                                 "boolean"):
                raise defs.PmmlValidationError(
                    "NormContinuous requires a numeric input field, but \"%s\" is"
                    % dataColumn.fieldType.dataType)

            outliers = self.get("outliers")

            linearNorms = self.childrenOfTag("LinearNorm")
            for linearNorm in linearNorms:
                linearNorm.orig = float(linearNorm["orig"])
                linearNorm.norm = float(linearNorm["norm"])

            # technically, it's invalid if not already sorted
            # (key= instead of a cmp function: same order, works on
            # Python 2 and 3)
            linearNorms.sort(key=lambda linearNorm: linearNorm.orig)

            data = NP("empty", len(dataTable), self._fieldType.dtype)
            mask = dataColumn.mask

            # at or before the first LinearNorm
            if outliers == "asMissingValues":
                # A value exactly equal to the first orig is inside the
                # domain, not an outlier: give it the first norm and
                # flag only the strictly smaller values.
                data[NP(dataColumn.data == linearNorms[0].orig
                        )] = linearNorms[0].norm
                selection = NP(dataColumn.data < linearNorms[0].orig)
                mask = FieldCastMethods.outliersAsMissing(
                    mask, dataColumn.mask, selection)
            else:
                selection = NP(dataColumn.data <= linearNorms[0].orig)
                if outliers == "asExtremeValues":
                    data[selection] = linearNorms[0].norm
                else:
                    # extrapolate from the first segment
                    self.transformSelection(linearNorms[0], linearNorms[1],
                                            dataColumn.data, data,
                                            selection)

            # interior segments: orig[i] < value <= orig[i + 1]
            for i in range(len(linearNorms) - 1):
                selection = NP(linearNorms[i].orig < dataColumn.data)
                # the third argument is the output array: AND in place
                NP("logical_and", selection,
                   NP(dataColumn.data <= linearNorms[i + 1].orig),
                   selection)

                self.transformSelection(linearNorms[i], linearNorms[i + 1],
                                        dataColumn.data, data, selection)

            # after the last LinearNorm
            selection = NP(linearNorms[-1].orig < dataColumn.data)
            if outliers == "asMissingValues":
                mask = FieldCastMethods.outliersAsMissing(
                    mask, dataColumn.mask, selection)
            elif outliers == "asExtremeValues":
                data[selection] = linearNorms[-1].norm
            else:
                # extrapolate from the last segment
                self.transformSelection(linearNorms[-2], linearNorms[-1],
                                        dataColumn.data, data, selection)

            data, mask = FieldCastMethods.applyMapMissingTo(
                self._fieldType, data, mask, self.get("mapMissingTo"))

        finally:
            # keep begin/end paired even when validation or a lookup
            # fails
            performanceTable.end("NormContinuous")

        return DataColumn(self._fieldType, data, mask)