示例#1
0
    def applyInvalidValueTreatment(mask, invalidValueTreatment, overwrite=False):
        """Turn INVALID mask entries into MISSING when the treatment is "asMissing".

        For any other treatment, or when there is no mask at all, the
        input is handed back untouched.  Unless C{overwrite} is True,
        the caller's mask is left alone and a modified copy is
        returned.  Example use::

            mask = dataColumn.mask
            mask = FieldCastMethods.applyInvalidValueTreatment(mask, pmml.get("invalidValueTreatment"))
            return DataColumn(dataColumn.fieldType, dataColumn.data, mask)

        It can also be used in conjunction with other FieldCastMethods.

        @type mask: 1d Numpy array of dtype defs.maskType, or None
        @param mask: The mask.
        @type invalidValueTreatment: string
        @param invalidValueTreatment: One of "returnInvalid", "asIs", "asMissing"; only "asMissing" has an effect.
        @type overwrite: bool
        @param overwrite: If True, unlock the original mask (make it writeable) and modify it in place instead of copying it.
        @rtype: 1d Numpy array of dtype defs.maskType, or None
        @return: The new mask, or the input itself when nothing had to change.
        """

        # Nothing to do without a mask, or for treatments other than "asMissing".
        if mask is None or invalidValueTreatment != "asMissing":
            return mask

        # Either reuse the caller's array or work on a private copy.
        result = mask if overwrite else NP("copy", mask)
        result.setflags(write=True)
        result[NP(result == defs.INVALID)] = defs.MISSING

        return result
示例#2
0
    def maskInvalid(self, data, mask):
        """Helper method to replace NaN and infinite values with
        INVALID after a potentially dangerous operation.

        Example::

            result = NP("log", dataColumn.data)    # log(0) = -inf, log(-x) = nan
            resultMask = self.maskInvalid(result, dataColumn.mask)
            return DataColumn(fakeFieldType, result, resultMask)

        The input C{data} and C{mask} are not modified by this
        method; a substitute mask is returned.  Only rows whose mask
        entry equals C{defs.VALID} are relabeled; rows with any other
        mask value keep it.  A resulting mask with no nonzero entries
        is collapsed to None.

        @type data: 1d Numpy array
        @param data: The dataset that may contain NaN and infinite values.
        @type mask: 1d Numpy array of C{defs.maskType}, or None
        @param mask: The original mask.
        @rtype: 1d Numpy array of C{defs.maskType}, or None
        @return: The new mask.
        """

        # True wherever the data is NaN or +/-infinity.
        bad = NP("logical_not", NP("isfinite", data))
        if bad.any():
            if mask is None:
                mask = bad * defs.INVALID
            else:
                # Narrow "bad" (a local array) to rows that are currently VALID.
                NP("logical_and", bad, NP(mask == defs.VALID), bad)
                # Always write into a private copy: a writeable input mask
                # would otherwise be changed in place, which contradicts the
                # documented promise not to modify the caller's mask.
                mask = NP("copy", mask)
                mask.setflags(write=True)
                mask[bad] = defs.INVALID
        # An all-zero mask carries no information; represent it as None.
        if mask is not None and not mask.any():
            mask = None
        return mask
示例#3
0
    def maskInvalid(self, data, mask):
        """Helper method to replace NaN and infinite values with
        INVALID after a potentially dangerous operation.

        Example::

            result = NP("log", dataColumn.data)    # log(0) = -inf, log(-x) = nan
            resultMask = self.maskInvalid(result, dataColumn.mask)
            return DataColumn(fakeFieldType, result, resultMask)

        The input C{data} and C{mask} are not modified by this
        method; a substitute mask is returned.  Only rows whose mask
        entry equals C{defs.VALID} are relabeled; rows with any other
        mask value keep it.  A resulting mask with no nonzero entries
        is collapsed to None.

        @type data: 1d Numpy array
        @param data: The dataset that may contain NaN and infinite values.
        @type mask: 1d Numpy array of C{defs.maskType}, or None
        @param mask: The original mask.
        @rtype: 1d Numpy array of C{defs.maskType}, or None
        @return: The new mask.
        """

        # True wherever the data is NaN or +/-infinity.
        bad = NP("logical_not", NP("isfinite", data))
        if bad.any():
            if mask is None:
                mask = bad * defs.INVALID
            else:
                # Narrow "bad" (a local array) to rows that are currently VALID.
                NP("logical_and", bad, NP(mask == defs.VALID), bad)
                # Always write into a private copy: a writeable input mask
                # would otherwise be changed in place, which contradicts the
                # documented promise not to modify the caller's mask.
                mask = NP("copy", mask)
                mask.setflags(write=True)
                mask[bad] = defs.INVALID
        # An all-zero mask carries no information; represent it as None.
        if mask is not None and not mask.any():
            mask = None
        return mask
示例#4
0
    def outliersAsMissing(mask, originalMask, selection, overwrite=False):
        """Mark the selected rows as MISSING in a mask.

        When C{mask} is still the very same object as
        C{originalMask}, it is copied before being changed (unless
        C{overwrite} is True); a mask that is already a substitute is
        updated directly.  Only rows that are currently VALID are
        relabeled.  Note that C{selection} itself is narrowed in place
        to those rows whenever a mask is given.  Example use::

            mask = dataColumn.mask
            mask = FieldCastMethods.outliersAsMissing(mask, dataColumn.mask, dataColumn.data < MINIMUM_CUT)
            mask = FieldCastMethods.outliersAsMissing(mask, dataColumn.mask, dataColumn.data > MAXIMUM_CUT)
            return DataColumn(dataColumn.fieldType, dataColumn.data, mask)

        It can also be used in conjunction with other FieldCastMethods.

        @type mask: 1d Numpy array of type defs.maskType, or None
        @param mask: The mask to be updated.
        @type originalMask: 1d Numpy array of type defs.maskType, or None
        @param originalMask: The original mask.
        @type selection: 1d Numpy array of bool
        @param selection: The rows to label as MISSING.
        @type overwrite: bool
        @param overwrite: If True, unlock the original mask (make it writeable) and overwrite it in place.
        @rtype: 1d Numpy array of type defs.maskType
        @return: The new mask.
        """

        # Without a mask, the selection scaled by MISSING becomes the mask.
        if mask is None:
            return selection * defs.MISSING

        # Keep only the selected rows that are still VALID (written back into selection).
        NP("logical_and", selection, NP(mask == defs.VALID), selection)

        # The caller's own mask is protected by a copy unless overwriting was requested.
        if mask is originalMask:
            if not overwrite:
                mask = NP("copy", mask)
            mask.setflags(write=True)

        mask[selection] = defs.MISSING
        return mask
示例#5
0
    def replaceField(self, dataTable, functionTable, performanceTable):
        """Replace a field in the DataTable for outlier removal,
        missing value handling, and invalid value treatment.

        If the DataTable has no field with this MiningField's name,
        nothing happens.  Otherwise the field is optionally recast to
        the requested optype, outliers beyond C{lowValue} and
        C{highValue} are either labeled MISSING ("asMissingValues")
        or clamped to the bound ("asExtremeValues"), and the invalid
        value treatment and missing value replacement are applied.
        The field is replaced in the DataTable by a new DataColumn
        built from the resulting data and mask.

        @type dataTable: DataTable
        @param dataTable: The pre-built DataTable.
        @type functionTable: FunctionTable
        @param functionTable: A table of functions.
        @type performanceTable: PerformanceTable
        @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
        """

        dataColumn = dataTable.fields.get(self.name)
        if dataColumn is None:
            return

        performanceTable.begin("MiningField")

        optype = self.get("optype", dataColumn.fieldType.optype)
        if optype != dataColumn.fieldType.optype:
            dataColumn = FieldCastMethods.cast(FakeFieldType(dataColumn.fieldType.dataType, optype), dataColumn)

        data = dataColumn.data
        mask = dataColumn.mask

        outliers = self.get("outliers")

        lowValue = self.get("lowValue")
        if lowValue is not None:
            lowValue = dataColumn.fieldType.stringToValue(lowValue)

            if outliers == "asMissingValues":
                selection = NP(dataColumn.data < lowValue)
                mask = FieldCastMethods.outliersAsMissing(mask, dataColumn.mask, selection)

            elif outliers == "asExtremeValues":
                selection = NP(dataColumn.data < lowValue)
                # Copy only on the first write, so the column's own data is never modified.
                if data is dataColumn.data:
                    data = NP("copy", data)
                    data.setflags(write=True)
                data[selection] = lowValue

        highValue = self.get("highValue")
        if highValue is not None:
            highValue = dataColumn.fieldType.stringToValue(highValue)

            if outliers == "asMissingValues":
                selection = NP(dataColumn.data > highValue)
                mask = FieldCastMethods.outliersAsMissing(mask, dataColumn.mask, selection)

            elif outliers == "asExtremeValues":
                selection = NP(dataColumn.data > highValue)
                # The copy may already have been made by the low-side clamp;
                # the clamp itself must run regardless, so it stays outside
                # the copy-on-first-write check.
                if data is dataColumn.data:
                    data = NP("copy", data)
                    data.setflags(write=True)
                data[selection] = highValue

        mask = FieldCastMethods.applyInvalidValueTreatment(mask, self.get("invalidValueTreatment"))
        data, mask = FieldCastMethods.applyMapMissingTo(dataColumn.fieldType, data, mask, self.get("missingValueReplacement"))

        dataTable.fields.replaceField(self.name, DataColumn(dataColumn.fieldType, data, mask))
        performanceTable.end("MiningField")
示例#6
0
    def replaceField(self, dataTable, functionTable, performanceTable):
        """Replace a field in the DataTable for outlier removal,
        missing value handling, and invalid value treatment.

        If the DataTable has no field with this MiningField's name,
        nothing happens.  Otherwise the field is optionally recast to
        the requested optype, outliers beyond C{lowValue} and
        C{highValue} are either labeled MISSING ("asMissingValues")
        or clamped to the bound ("asExtremeValues"), and the invalid
        value treatment and missing value replacement are applied.
        The field is replaced in the DataTable by a new DataColumn
        built from the resulting data and mask.

        @type dataTable: DataTable
        @param dataTable: The pre-built DataTable.
        @type functionTable: FunctionTable
        @param functionTable: A table of functions.
        @type performanceTable: PerformanceTable
        @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
        """

        dataColumn = dataTable.fields.get(self.name)
        if dataColumn is None:
            return

        performanceTable.begin("MiningField")

        optype = self.get("optype", dataColumn.fieldType.optype)
        if optype != dataColumn.fieldType.optype:
            dataColumn = FieldCastMethods.cast(
                FakeFieldType(dataColumn.fieldType.dataType, optype),
                dataColumn)

        data = dataColumn.data
        mask = dataColumn.mask

        outliers = self.get("outliers")

        lowValue = self.get("lowValue")
        if lowValue is not None:
            lowValue = dataColumn.fieldType.stringToValue(lowValue)

            if outliers == "asMissingValues":
                selection = NP(dataColumn.data < lowValue)
                mask = FieldCastMethods.outliersAsMissing(
                    mask, dataColumn.mask, selection)

            elif outliers == "asExtremeValues":
                selection = NP(dataColumn.data < lowValue)
                # Copy only on the first write, so the column's own
                # data is never modified.
                if data is dataColumn.data:
                    data = NP("copy", data)
                    data.setflags(write=True)
                data[selection] = lowValue

        highValue = self.get("highValue")
        if highValue is not None:
            highValue = dataColumn.fieldType.stringToValue(highValue)

            if outliers == "asMissingValues":
                selection = NP(dataColumn.data > highValue)
                mask = FieldCastMethods.outliersAsMissing(
                    mask, dataColumn.mask, selection)

            elif outliers == "asExtremeValues":
                selection = NP(dataColumn.data > highValue)
                # The copy may already have been made by the low-side
                # clamp; the clamp itself must run regardless, so it
                # stays outside the copy-on-first-write check.
                if data is dataColumn.data:
                    data = NP("copy", data)
                    data.setflags(write=True)
                data[selection] = highValue

        mask = FieldCastMethods.applyInvalidValueTreatment(
            mask, self.get("invalidValueTreatment"))
        data, mask = FieldCastMethods.applyMapMissingTo(
            dataColumn.fieldType, data, mask,
            self.get("missingValueReplacement"))

        dataTable.fields.replaceField(
            self.name, DataColumn(dataColumn.fieldType, data, mask))
        performanceTable.end("MiningField")