Example #1
 def applyWithoutMask(self, data, mask, argument):
     data, allbad = data
     NP("logical_xor", data, argument.data, data)
     if argument.mask is not None:
         NP("logical_and", allbad, NP(argument.mask != defs.VALID),
            allbad)
     return (data, allbad), mask
Example #2
    def evaluate(self, dataTable, functionTable, performanceTable):
        """Evaluate the expression, using a DataTable as input.

        @type dataTable: DataTable
        @param dataTable: The input DataTable, containing any fields that might be used to evaluate this expression.
        @type functionTable: FunctionTable
        @param functionTable: The FunctionTable, containing any functions that might be called in this expression.
        @type performanceTable: PerformanceTable
        @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
        @rtype: DataColumn
        @return: The result of the calculation as a DataColumn.
        """

        performanceTable.begin("NormDiscrete")

        dataColumn = dataTable.fields[self["field"]]
        value = dataColumn.fieldType.stringToValue(self["value"])
        data = NP("array",
                  NP(dataColumn.data == value),
                  dtype=self._fieldType.dtype)
        data, mask = FieldCastMethods.applyMapMissingTo(
            self._fieldType, data, dataColumn.mask, self.get("mapMissingTo"))

        performanceTable.end("NormDiscrete")
        return DataColumn(self._fieldType, data, mask)
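
The core of the calculation above is the equality indicator cast to the output dtype.  A minimal plain-NumPy sketch of that step, assuming the NP wrapper forwards to the corresponding numpy functions (field values are illustrative)::

    import numpy

    field = numpy.array(["red", "green", "red", "blue"], dtype=object)
    # NormDiscrete: 1.0 where the field equals the reference value, else 0.0
    indicator = numpy.array(field == "red", dtype=numpy.float64)
    # indicator -> array([1., 0., 1., 0.])
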
Example #3
    def select(self, dataTable, functionTable, performanceTable):
        """Evaluate the expression or predicate, given input data and
        a function table.

        @type dataTable: DataTable
        @param dataTable: Contains the data to plot.
        @type functionTable: FunctionTable
        @param functionTable: Defines functions that may be used to transform data for plotting.
        @type performanceTable: PerformanceTable
        @param performanceTable: Measures and records performance (time and memory consumption) of the drawing process.
        @rtype: 1d Numpy array of bool
        @return: The result of the expression or predicate as a Numpy mask.
        """

        predicate = self.childOfClass(PmmlPredicate)
        if predicate is not None:
            return predicate.evaluate(dataTable, functionTable,
                                      performanceTable)

        expression = self.childOfClass(PmmlExpression)
        dataColumn = expression.evaluate(dataTable, functionTable,
                                         performanceTable)

        if not dataColumn.fieldType.isboolean():
            raise defs.PmmlValidationError(
                "PlotSelection must evaluate to boolean, not %r" %
                dataColumn.fieldType)

        dataColumn._unlock()
        if dataColumn.mask is not None:
            NP("logical_and", dataColumn.data,
               NP(dataColumn.mask == defs.VALID), dataColumn.data)

        return dataColumn.data
Example #4
    def mapper(self, dataTable):
        dataTable = dataTable.subTable()  # ensure that the results of this calculation do not get propagated

        self.metadata["ClusteringModel"].calculate(dataTable, performanceTable=self.performanceTable)

        data = dataTable.score.data
        mask = dataTable.score.mask
        stringToValue = dataTable.score.fieldType.stringToValue
        for index, cluster in enumerate(self.clusters):
            clusterName = cluster.get("id", "%d" % (index + 1))
            value = stringToValue(clusterName)

            selection = NP(data == value)
            if mask is not None:
                NP("logical_and", selection, NP(mask == defs.VALID), selection)

            denominator = selection.sum()

            numer = dict((fieldName, 0.0) for fieldName in self.fieldNames)
            denom = dict((fieldName, 0.0) for fieldName in self.fieldNames)

            for fieldName in self.fieldNames:
                numer[fieldName] += dataTable.fields[fieldName].data[selection].sum()
                denom[fieldName] += denominator

            self.emit(clusterName, {"numer": numer, "denom": denom})
Example #5
        def evaluate(self, dataTable, functionTable, performanceTable,
                     arguments):
            arguments = [
                x.evaluate(dataTable, functionTable, performanceTable)
                for x in arguments
            ]
            performanceTable.begin("built-in \"%s\"" % self.name)

            fieldType = self.allBooleanType(arguments, atleast=2)

            data = NP("zeros", len(dataTable), dtype=fieldType.dtype)
            mask = None
            allbad = NP("ones", len(dataTable), dtype=NP.dtype(bool))

            (data, allbad), mask = self.applySkipMissing((data, allbad), mask,
                                                         arguments)

            if allbad.any():
                if mask is None:
                    mask = allbad * defs.MISSING
                else:
                    NP("logical_and", allbad, NP(mask == defs.VALID), allbad)
                    mask[allbad] = defs.MISSING

            performanceTable.end("built-in \"%s\"" % self.name)
            return DataColumn(fieldType, data, mask)
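
The (data, allbad) pair threaded through applySkipMissing is the accumulator consumed by applyWithoutMask and applyWithMask (Examples #1 and #15): data holds the running XOR, and allbad is ANDed down so that only rows left non-VALID by every masked argument are finally marked MISSING.  A minimal plain-NumPy sketch of the applyWithoutMask path, with illustrative values and the NP wrapper assumed to forward to numpy::

    import numpy

    VALID, MISSING = 0, 1                        # stand-ins for defs.VALID / defs.MISSING
    argsData = [numpy.array([True, False, True]),
                numpy.array([False, False, True])]
    argsMask = [numpy.array([VALID, MISSING, VALID]),
                numpy.array([MISSING, MISSING, VALID])]

    data = numpy.zeros(3, dtype=bool)
    allbad = numpy.ones(3, dtype=bool)           # True until a VALID value is seen
    for adata, amask in zip(argsData, argsMask):
        numpy.logical_xor(data, adata, data)
        numpy.logical_and(allbad, amask != VALID, allbad)

    mask = allbad * MISSING if allbad.any() else None
    # row 1 never saw a VALID argument, so it ends up MISSING
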
Example #6
    def applyInvalidValueTreatment(mask, invalidValueTreatment, overwrite=False):
        """Replace INVALID values with MISSING if invalidValueTreatment is "asMissing".

        This function does not modify the original data (unless
        C{overwrite} is True), but it returns a substitute.  Example
        use::

            mask = dataColumn.mask
            mask = FieldCastMethods.applyInvalidValueTreatment(mask, pmml.get("invalidValueTreatment"))
            return DataColumn(dataColumn.fieldType, dataColumn.data, mask)

        It can also be used in conjunction with other FieldCastMethods.

        @type mask: 1d Numpy array of dtype defs.maskType, or None
        @param mask: The mask.
        @type invalidValueTreatment: string
        @param invalidValueTreatment: One of "returnInvalid", "asIs", "asMissing"; only "asMissing" has an effect.
        @type overwrite: bool
        @param overwrite: If True, temporarily unlock and overwrite the original mask.
        @rtype: 1d Numpy array of dtype defs.maskType
        @return: The new mask.
        """

        if mask is None: return mask

        if invalidValueTreatment == "asMissing":
            if overwrite:
                mask.setflags(write=True)
            else:
                mask = NP("copy", mask)
                mask.setflags(write=True)
            mask[NP(mask == defs.INVALID)] = defs.MISSING

        return mask
Example #7
    def generateSamples(self, low, high):
        """Used by C{prepare} to generate an array of samples.

        @type low: number
        @param low: Minimum value to sample.
        @type high: number
        @param high: Maximum value to sample.
        @rtype: 1d Numpy array
        @return: An array of uniform, random, or adaptive samples of an interval.
        """

        numSamples = self.get("numSamples",
                              defaultFromXsd=True,
                              convertType=True)
        samplingMethod = self.get("samplingMethod", defaultFromXsd=True)

        if samplingMethod == "uniform":
            samples = NP("linspace", low, high, numSamples, endpoint=True)

        elif samplingMethod == "random":
            samples = NP(
                NP(NP(NP.random.rand(numSamples)) * (high - low)) + low)
            samples.sort()

        else:
            raise NotImplementedError("TODO: add 'adaptive'")

        return samples
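
For reference, a plain-NumPy sketch of the two implemented sampling methods, assuming the NP wrapper forwards to the corresponding numpy calls::

    import numpy

    low, high, numSamples = 0.0, 1.0, 5
    uniform = numpy.linspace(low, high, numSamples, endpoint=True)
    randomSamples = numpy.random.rand(numSamples) * (high - low) + low
    randomSamples.sort()
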
Example #8
    def _checkNumpy(self, data, mask, tryToCast=True):
        if mask is None and isinstance(data, NP.ma.MaskedArray):
            m = NP.ma.getmask(data)
            if m is not NP.ma.nomask:  # getmask returns nomask, not None, when unmasked
                mask = m

        if isinstance(data, NP.ma.MaskedArray):
            data = NP.ma.getdata(data)
        
        if isinstance(data, NP.ndarray):
            if len(data.shape) != 1:
                raise TypeError("DataColumns cannot be built from n > 1 dimensional arrays")
            if tryToCast and data.dtype != self.dtype:
                try:
                    data = NP("array", data, dtype=self.dtype)
                except (TypeError, ValueError):
                    pass

        if isinstance(mask, NP.ndarray):
            if mask.shape != data.shape:
                raise TypeError("Mask, if provided, must have the same shape as data")
            if mask.dtype != defs.maskType:
                mask = NP(NP(mask != 0) * defs.MISSING)
        
        return data, mask
Example #9
    def subDataColumn(self, selection=None):
        """Return or filter this DataColumn with C{selection}.

        If C{selection} is None, this function returns a shallow copy
        of the DataColumn.  It has a new Python C{id}, but the
        potentially large numerical array is not copied.  This
        function can therefore be used in performance-critical
        situations.

        @type selection: 1d Numpy array of dtype bool, or None
        @param selection: If None, simply return the DataColumn; otherwise, use the boolean array to filter it.
        @rtype: DataColumn
        @return: A DataColumn of the same length or shorter.
        """

        if selection is None:
            return DataColumn(self._fieldType, self._data, self._mask)

        else:
            subData = self.data[selection]
            if self.mask is None:
                subMask = None
            else:
                subMask = self.mask[selection]

            if not isinstance(subData, NP.ndarray):
                subData = NP("array", [subData])
                if subMask is not None:
                    subMask = NP("array", [subMask])

            return DataColumn(self._fieldType, subData, subMask)
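
A hedged usage sketch of the two calling modes described above, assuming dataColumn is an existing DataColumn of numbers::

    copyOfAll = dataColumn.subDataColumn()            # shallow copy; the data array is shared
    selection = NP(dataColumn.data > 0)               # 1d boolean array of the same length
    positives = dataColumn.subDataColumn(selection)   # a shorter DataColumn
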
Example #10
    def _toDataColumn_dateTime(self, data, mask):
        data, mask = self._checkNumpy(data, mask, tryToCast=False)
        data, mask = self._checkNonNumpy(data, mask)

        data2 = NP("empty", len(data), dtype=self.dtype)
        mask2 = NP("zeros", len(data), dtype=defs.maskType)

        for i, x in enumerate(data):
            if (mask is not None and mask[i]) or (isinstance(x, float) and math.isnan(x)) or (isinstance(x, basestring) and x.upper() == "NAN"):
                data2[i] = defs.PADDING
                mask2[i] = defs.MISSING
            else:
                try:
                    data2[i] = self.stringToValue(x)
                except (ValueError, TypeError):
                    data2[i] = defs.PADDING
                    mask2[i] = defs.INVALID

        if not mask2.any():
            data, mask = data2, None
        else:
            data, mask = data2, mask2

        data, mask = self._checkValues(data, mask)
        data, mask = self._checkIntervals(data, mask)
        return DataColumn(self, data, mask)
Example #11
    def _toDataColumn_dateTimeNumber(self, data, mask):
        dataColumn = self._toDataColumn_number(data, mask)
        data, mask = NP(NP(dataColumn.data * self._factor) + self._offset), dataColumn.mask

        data, mask = self._checkValues(data, mask)
        data, mask = self._checkIntervals(data, mask)
        return DataColumn(self, data, mask)
Example #12
    def functionAverageFake(self, value, howmany, fieldType):
        """Averages rows in a DataColumn when it is known that there are no matches.

        @type value: 2-tuple of numbers
        @param value: The numerator and denominator of the faked average (the code reads C{value[0]} and C{value[1]}).
        @type howmany: int
        @param howmany: Number of rows.
        @type fieldType: FieldType
        @param fieldType: The type of field to emulate.
        @rtype: DataColumn
        @return: The faked results.
        """

        fieldType = FakeFieldType("double", "continuous")
        numerator = NP("empty", howmany, dtype=fieldType.dtype)
        denominator = NP("empty", howmany, dtype=fieldType.dtype)
        numerator[:] = value[0]
        denominator[:] = value[1]
        data = NP(numerator / denominator)
        if value[1] == 0:
            mask = NP("empty", howmany, dtype=defs.maskType)
            mask[:] = defs.INVALID
        else:
            mask = None
        return DataColumn(fieldType, data, mask)
Example #13
    def maskInvalid(self, data, mask):
        """Helper method to replace NaN and infinite values with
        INVALID after a potentially dangerous operation.

        Example::

            result = NP("log", dataColumn.data)    # log(0) = -inf, log(-x) = nan
            resultMask = self.maskInvalid(result, dataColumn.mask)
            return DataColumn(fakeFieldType, result, resultMask)

        The input C{data} and C{mask} are not modified by this
        method; a substitute mask is returned.

        @type data: 1d Numpy array
        @param data: The dataset that may contain NaN and infinite values.
        @type mask: 1d Numpy array of C{defs.maskType}, or None
        @param mask: The original mask.
        @rtype: 1d Numpy array of C{defs.maskType}, or None
        @return: The new mask.
        """

        bad = NP("logical_not", NP("isfinite", data))
        if bad.any():
            if mask is None:
                mask = bad * defs.INVALID
            else:
                NP("logical_and", bad, NP(mask == defs.VALID), bad)
                if not mask.flags.writeable:
                    mask = NP("copy", mask)
                    mask.setflags(write=True)
                mask[bad] = defs.INVALID
        if mask is not None and not mask.any():
            mask = None
        return mask
Example #14
    def evaluate(self,
                 dataTable,
                 functionTable,
                 performanceTable,
                 returnUnknowns=False):
        """Evaluate the predicate, using a DataTable as input.

        @type dataTable: DataTable
        @param dataTable: The input DataTable, containing any fields that might be used to evaluate this predicate.
        @type functionTable: FunctionTable
        @param functionTable: The FunctionTable, containing any functions that might be called in this predicate.
        @type performanceTable: PerformanceTable
        @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
        @type returnUnknowns: bool
        @param returnUnknowns: If True, return a "mask" for the selection that indicates which rows are unknown, rather than True or False.
        @rtype: 1d Numpy array of bool or 3-tuple of arrays
        @return: Either a simple selection array or selection, unknowns, encounteredUnknowns
        """

        performanceTable.begin("Predicate False")

        result = NP("zeros", len(dataTable), dtype=NP.dtype(bool))
        if returnUnknowns:
            unknowns = NP("zeros", len(dataTable), dtype=NP.dtype(bool))
            result = result, unknowns, unknowns

        performanceTable.end("Predicate False")
        return result
Example #15
 def applyWithMask(self, data, mask, argument, mask2):
     data, allbad = data
     data[mask2] = NP("logical_xor", data[mask2], argument.data[mask2])
     if argument.mask is not None:
         allbad[mask2] = NP("logical_and",
                            NP(allbad[mask2] != defs.VALID),
                            argument.mask[mask2])
     return (data, allbad), mask
Example #16
    def calculate(self, dataTable, functionTable=None, performanceTable=None):
        """Perform a calculation directly, without constructing a
        DataTable first.

        This method is intended for performance-critical cases in which
        the DataTable has already been built, so the PMML does not need
        to be analyzed for field type context.

        This method modifies the input DataTable and FunctionTable.

        @type dataTable: DataTable
        @param dataTable: The pre-built DataTable.
        @type functionTable: FunctionTable or None
        @param functionTable: A table of functions.  Initially, it contains only the built-in functions, but any user functions defined in PMML would be added to it.
        @type performanceTable: PerformanceTable or None
        @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
        @rtype: DataTable
        @return: A DataTable containing the result, usually a modified version of the input.
        """

        if functionTable is None:
            functionTable = FunctionTable()
        if performanceTable is None:
            performanceTable = FakePerformanceTable()

        if not self.get("isScorable", defaultFromXsd=True, convertType=True):
            dataTable.score = DataColumn(self.scoreType,
                                         NP(NP("ones", len(dataTable), dtype=self.scoreType.dtype) * defs.PADDING),
                                         NP(NP("ones", len(dataTable), dtype=defs.maskType) * defs.INVALID))
            return dataTable

        subTable = dataTable.subTable()

        for miningField in self.xpath("pmml:MiningSchema/pmml:MiningField"):
            miningField.replaceField(subTable, functionTable, performanceTable)

        for calculable in self.calculableTrans():
            calculable.calculate(subTable, functionTable, performanceTable)

        score = self.calculateScore(subTable, functionTable, performanceTable)
        dataTable.score = score[None]
        if self.name is not None:
            for key, value in score.items():
                if key is None:
                    dataTable.fields[self.name] = value
                else:
                    dataTable.fields["%s.%s" % (self.name, key)] = value

        for outputField in self.xpath("pmml:Output/pmml:OutputField"):
            displayName = outputField.get("displayName", outputField["name"])
            dataTable.output[displayName] = outputField.format(subTable, functionTable, performanceTable, score)

        for fieldName in subTable.output:
            dataTable.output[fieldName] = subTable.output[fieldName]

        return dataTable
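
A hedged usage sketch, assuming model is a calculable PMML model element and table is a pre-built DataTable::

    functionTable = FunctionTable()          # optional; created internally if omitted
    table = model.calculate(table, functionTable)
    scoreColumn = table.score                # DataColumn holding the model's score
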
Example #17
    def finalizeDistance(self, state, adjustM, distributionBased, covarianceMatrix):
        """Third and final step in a vectorized metric calculation, called once after all fields and cluster centers.

        Only modifies the C{state} object.

        @type state: ad-hoc Python object
        @param state: State information that persists long enough to span the three steps of a metric calculation.  This is a work-around of lxml's refusal to let its Python instances maintain C{self} and it is unrelated to DataTableState.
        @type adjustM: 1d Numpy array of numbers
        @param adjustM: The "adjustM" value, intended to adjust for missing values, as defined in the PMML specification.
        @type distributionBased: bool
        @param distributionBased: If True, use a covariance matrix to scale the distance result.
        @type covarianceMatrix: Numpy matrix
        @param covarianceMatrix: The covariance matrix to scale the result if C{distributionBased}.
        @rtype: 1d Numpy array of numbers
        @return: The array of distances or similarities for center-based clustering, and number of standard deviations for distribution-based clustering.
        """

        if adjustM is None:
            result = state.sumInQuadrature
        else:
            result = NP(state.sumInQuadrature * adjustM)

        if distributionBased:
            normalizations = NP("sqrt", NP("sum", NP(state.displacements**2), axis=1))
            selection = NP(normalizations > 0.0)
            state.displacements[selection] = state.displacements[selection] / (normalizations[:, NP.newaxis])[selection]

            lengthOfSigma = NP("sum", NP(NP(state.displacements.dot(covarianceMatrix)) * state.displacements), axis=1)

            result[selection] = NP(result[selection] / lengthOfSigma[selection])

        return result
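
As a summary of the arithmetic in the code above (a reading of this implementation, not a quote from the PMML specification), with s = state.sumInQuadrature, m = adjustM (taken as 1 when absent), Delta_i = row i of state.displacements, and Sigma = covarianceMatrix::

    d_i = s_i m_i, \qquad
    u_i = \frac{\Delta_i}{\lVert \Delta_i \rVert}, \qquad
    d_i \leftarrow \frac{d_i}{u_i^{\top} \Sigma\, u_i}
    \quad \text{for rows with } \lVert \Delta_i \rVert > 0.
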
Example #18
 def _fromDataColumn_number(self, dataColumn):
     if dataColumn.mask is None:
         return NP("array", dataColumn.data, dtype=NP.dtype(object))
     else:
         output = NP("empty", len(dataColumn), dtype=NP.dtype(object))
         mask = dataColumn.mask
         for i, x in enumerate(dataColumn.data):
             if mask[i] == defs.VALID:
                 output[i] = x
             elif mask[i] == defs.MISSING:
                 output[i] = defs.NAN
             else:
                 output[i] = None
         return output
Example #19
    def applyMapMissingTo(fieldType, data, mask, mapMissingTo, overwrite=False):
        """Replace MISSING values with a given substitute.

        This function does not modify the original data (unless
        C{overwrite} is True), but it returns a substitute.  Example
        use::

            data, mask = dataColumn.data, dataColumn.mask
            data, mask = FieldCastMethods.applyMapMissingTo(dataColumn.fieldType, data, mask, "-999")
            return DataColumn(dataColumn.fieldType, data, mask)

        It can also be used in conjunction with other FieldCastMethods.

        @type fieldType: FieldType
        @param fieldType: The data fieldType (to interpret C{mapMissingTo}).
        @type data: 1d Numpy array
        @param data: The data.
        @type mask: 1d Numpy array of dtype defs.maskType, or None
        @param mask: The mask.
        @type mapMissingTo: string
        @param mapMissingTo: The replacement value, represented as a string (e.g. directly from a PMML attribute).
        @type overwrite: bool
        @param overwrite: If True, temporarily unlock and overwrite the original mask.
        @rtype: 2-tuple of 1d Numpy arrays
        @return: The new data and mask.
        """

        if mask is None: return data, mask

        if mapMissingTo is not None:
            selection = NP(mask == defs.MISSING)
            try:
                mappedValue = fieldType.stringToValue(mapMissingTo)
            except ValueError as err:
                raise defs.PmmlValidationError("mapMissingTo string \"%s\" cannot be cast as %r: %s" % (mapMissingTo, fieldType, str(err)))

            if overwrite:
                data.setflags(write=True)
                mask.setflags(write=True)
            else:
                data = NP("copy", data)
                mask = NP("copy", mask)

            data[selection] = mappedValue
            mask[selection] = defs.VALID

            if not mask.any():
                mask = None

        return data, mask
Example #20
    def logpdf(self, array):
        """Vectorized logarithm of the probability density function (PDF).

        @type array: 1d Numpy array of numbers
        @param array: The input vector.
        @rtype: 1d Numpy array of numbers
        @return: The result of ln(PDF_Gaussian(x)) for all input values x.
        """

        mean = float(self.attrib["mean"])
        twovariance = 2.0 * float(self.attrib["variance"])
        return NP(
            NP(NP("negative", NP("square", NP(array - mean))) / twovariance) -
            math.log(math.sqrt(math.pi * twovariance)))
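
In other words, with mean mu and variance sigma^2 read from the element's attributes (twovariance in the code is 2*sigma^2), the returned values are::

    \ln p(x) = -\frac{(x - \mu)^2}{2\sigma^2} - \ln\sqrt{2\pi\sigma^2}
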
Example #21
    def singleton(self, inputData, inputMask=None, inputState=None):
        """Create a single-row DataTable for event-based processes.

        This static method is similar to the DataTable constructor, but it
        creates a DataTable with only one row, and it uses the Python
        data types of the C{inputData} values to define the field types,
        rather than an explicit C{context}.

        @type inputData: dict-like mapping from strings to single values (not lists)
        @param inputData: A single data record.
        @type inputMask: dict-like mapping from strings to single C{defs.maskType} values (not lists), or None
        @param inputMask: A single mask.
        @type inputState: DataTableState or None
        @param inputState: Initial state of the DataTable.  To continue a previous calculation, use the C{dataTable.state} from the previous calculation.
        """

        dataColumns = OrderedDict()
        for fieldName in sorted(inputData.keys()):
            value = inputData[fieldName]

            if isinstance(value, basestring):
                fieldType = FakeFieldType("string", "continuous")
            elif isinstance(value, bool):
                # bool must be tested before int: bool is a subclass of int
                fieldType = FakeFieldType("boolean", "continuous")
            elif isinstance(value, float):
                fieldType = FakeFieldType("double", "continuous")
            elif isinstance(value, int):
                fieldType = FakeFieldType("integer", "continuous")

            # TODO: PMML date types (when passed a datetime.datetime object)

            else:
                fieldType = FakeFieldType("object", "any")

            data = NP("empty", 1, dtype=fieldType.dtype)
            data[0] = value

            if inputMask is None or inputMask.get(fieldName) is None:
                mask = None
            else:
                mask = NP("empty", 1, dtype=defs.maskType)
                mask[0] = inputMask.get(fieldName)

            dataColumns[fieldName] = DataColumn(fieldType, data, mask)

        dataTable = DataTable.__new__(DataTable)
        dataTable._configure(dataColumns, inputState)
        return dataTable
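
A hedged usage sketch with illustrative field names and values; the docstring calls this a static method and the body never uses self, so it is assumed to be exposed at class level (e.g., by a decorator not shown here)::

    table = DataTable.singleton({"x": 3.14, "flag": True, "label": "spam"})
    # one row: "x" becomes a double field, "flag" a boolean field, "label" a string field
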
Example #22
    def accumulate(self, state, cxy, fieldWeight, distributionBased):
        """Second step in a vectorized metric calculation, called for each field and cluster center.

        Only modifies the C{state} object.

        @type state: ad-hoc Python object
        @param state: State information that persists long enough to span the three steps of a metric calculation.  This is a work-around of lxml's refusal to let its Python instances maintain C{self} and it is unrelated to DataTableState.
        @type cxy: 1d Numpy array of numbers
        @param cxy: Comparison distance or similarity for all rows.
        @type fieldWeight: number
        @param fieldWeight: The weight of this field.
        @type distributionBased: bool
        @param distributionBased: If True, use a covariance matrix to scale the distance result.
        """

        NP("maximum", state.maximumComponent, NP(cxy * fieldWeight), state.maximumComponent)
Example #23
    def _toDataColumn_string(self, data, mask):
        dataColumn = self._toDataColumn_object(data, mask)

        data = dataColumn.data
        mask = dataColumn.mask
        data.setflags(write=True)
        if mask is not None:
            mask.setflags(write=True)

        if mask is not None:
            for i, x in enumerate(dataColumn.data):
                if (x is None or (isinstance(x, float) and math.isnan(x))) and mask[i] == defs.VALID:
                    mask[i] = defs.MISSING
                elif not isinstance(x, basestring):
                    data[i] = repr(x)

        else:
            for i, x in enumerate(dataColumn.data):
                if x is None or (isinstance(x, float) and math.isnan(x)):
                    if mask is None:
                        mask = NP("zeros", len(data), dtype=defs.maskType)
                    mask[i] = defs.MISSING
                elif not isinstance(x, basestring):
                    data[i] = repr(x)

            if mask is not None:
                dataColumn._mask = mask

        data, mask = self._checkValues(data, mask)
        data, mask = self._checkIntervals(data, mask)

        return DataColumn(self, data, mask)
Example #24
    def zmaxPush(self, zmax, fieldType, sticky=False):
        """Make the z range of the bounding box larger by (possibly)
        pushing the z maximum higher.

        "Sticky" means that the final bounding box will not be
        expanded beyond this value, if it turns out to be the most
        extreme.  This feature is used, for example, in the layout of
        a vertical histogram: the xmin and xmax of the plot window
        should align with the xmin and xmax of a histogram unless an
        overlaying graphic pushes the boundary farther.  The ymax of
        the histogram should be inflated beyond the tallest bin so
        that it can be clearly seen.

        If C{zStrictlyPositive} is True, negative C{zmax} values are
        ignored.

        @type zmax: number
        @param zmax: The new C{zmax}, if this C{zmax} is larger than the currently largest C{zmax}.
        @type fieldType: FieldType
        @param fieldType: The FieldType of z.  Only homogeneous FieldTypes are allowed.
        @type sticky: bool
        @param sticky: Label this zmax as a "sticky" zmax.
        @raise PmmlValidationError: If any z FieldTypes differ, this function will raise an error.
        """

        self._checkFieldTypeZ(fieldType)
        if NP("isfinite", zmax) and (not self.zStrictlyPositive or zmax > 0.0) and (self.zmax is None or zmax > self.zmax):
            self.zmax = zmax
            if sticky: self.zmaxSticky = zmax
Example #25
    def yminPush(self, ymin, fieldType, sticky=False):
        """Make the y range of the bounding box larger by (possibly)
        pushing the y minimum lower.

        "Sticky" means that the final bounding box will not be
        expanded beyond this value, if it turns out to be the most
        extreme.  This feature is used, for example, in the layout of
        a vertical histogram: the xmin and xmax of the plot window
        should align with the xmin and xmax of a histogram unless an
        overlaying graphic pushes the boundary farther.  The ymax of
        the histogram should be inflated beyond the tallest bin so
        that it can be clearly seen.

        If C{yStrictlyPositive} is True, negative C{ymin} values are
        ignored.

        @type ymin: number
        @param ymin: The new C{ymin}, if this C{ymin} is smaller than the currently smallest C{ymin}.
        @type fieldType: FieldType
        @param fieldType: The FieldType of y.  Only homogeneous FieldTypes are allowed.
        @type sticky: bool
        @param sticky: Label this ymin as a "sticky" ymin.
        @raise PmmlValidationError: If any y FieldTypes differ, this function will raise an error.
        """

        self._checkFieldTypeY(fieldType)
        if NP("isfinite", ymin) and (not self.yStrictlyPositive or ymin > 0.0) and (self.ymin is None or ymin < self.ymin):
            self.ymin = ymin
            if sticky: self.yminSticky = ymin
Example #26
    def endReducerKey(self, key):
        if key in self.clusterVectors:
            newPosition = NP("array",
                             [self.numer[fieldName] / self.denom[fieldName]
                              if self.denom[fieldName] > 0.0 else 0.0
                              for fieldName in self.fieldNames],
                             dtype=NP.dtype(float))

            self.emit(key, newPosition)
Example #27
    def mapReduce(self):
        """Build a MapReduce-Ready K-means producer.

        Used by C{optimize} and C{hadoopOptimize}.

        @rtype: MapReduce
        @return: An instance of MapReduce that can either be run in pure-Python mode or submitted to Hadoop.
        """
        class KMeansMapReduceApplication(MapReduceKMeans):
            metadata = {}
            allChangeThreshold = self.allChangeThreshold

        KMeansMapReduceApplication.metadata[
            "ClusteringModel"] = self.clusteringModel

        clusterVectors = {}
        for index, cluster in enumerate(
                self.clusteringModel.xpath("pmml:Cluster")):
            clusterName = cluster.get("id", "%d" % (index + 1))
            clusterVectors[clusterName] = NP(
                "array",
                cluster.childOfTag("Array").values(),
                dtype=NP.dtype(float))
        KMeansMapReduceApplication.metadata["clusterVectors"] = clusterVectors

        self.KMeansMapReduceApplication = KMeansMapReduceApplication

        return MapReduce(KMeansMapReduceApplication)
Example #28
    def determineScaleBins(numBins, low, high, array):
        """Determine the C{numBins}, C{low}, and C{high} of the
        histogram from explicitly set values where available and
        implicitly derived values where necessary.

        Explicitly set values always override implicit values derived
        from the dataset.
          - C{low}, C{high} implicit values are the extrema of the
            dataset.
          - C{numBins} implicit value is the Freedman-Diaconis
            heuristic for number of histogram bins.

        @type numBins: int or None
        @param numBins: Input number of bins.
        @type low: number or None
        @param low: Low edge.
        @type high: number or None
        @param high: High edge.
        @type array: 1d Numpy array of numbers
        @param array: Dataset to use to implicitly derive values.
        @rtype: 3-tuple
        @return: C{numBins}, C{low}, C{high}
        """

        generateLow = (low is None)
        generateHigh = (high is None)

        if generateLow: low = float(array.min())
        if generateHigh: high = float(array.max())

        if low == high:
            low, high = low - 1.0, high + 1.0
        elif high < low:
            if generateLow:
                low = high - 1.0
            elif generateHigh:
                high = low + 1.0
            else:
                raise defs.PmmlValidationError(
                    "PlotHistogram attributes low and high must be in the right order: low = %g, high = %g"
                    % (low, high))
        else:
            if generateLow and generateHigh:
                low, high = low - 0.2 * (high - low), high + 0.2 * (high - low)
            elif generateLow:
                low = low - 0.2 * (high - low)
            elif generateHigh:
                high = high + 0.2 * (high - low)

        if numBins is None:
            # the Freedman-Diaconis rule
            q1, q3 = NP("percentile", array, [25.0, 75.0])
            binWidth = 2.0 * (q3 - q1) / math.pow(len(array), 1.0 / 3.0)
            if binWidth > 0.0:
                numBins = max(10, int(math.ceil((high - low) / binWidth)))
            else:
                numBins = 10

        return numBins, low, high
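
The implicit bin count follows the Freedman-Diaconis rule used in the code above: with q_1, q_3 the 25th and 75th percentiles of the data and n its length::

    h = \frac{2\,(q_3 - q_1)}{n^{1/3}}, \qquad
    \mathit{numBins} = \max\!\left(10,\ \left\lceil \frac{\mathit{high} - \mathit{low}}{h} \right\rceil \right),

falling back to 10 bins when h = 0.
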
Example #29
 def evaluate(self, dataTable, functionTable, performanceTable,
              arguments):
     dataColumn = Between.evaluate(self, dataTable, functionTable,
                                   performanceTable, arguments)
     dataColumn._unlock()
     NP("logical_not", dataColumn.data, dataColumn.data)
     dataColumn._lock()
     return dataColumn
Example #30
    def finalizeDistance(self, state, adjustM, distributionBased, covarianceMatrix):
        """Third and final step in a vectorized metric calculation, called once after all fields and cluster centers.

        Only modifies the C{state} object.

        @type state: ad-hoc Python object
        @param state: State information that persists long enough to span the three steps of a metric calculation.  This is a work-around of lxml's refusal to let its Python instances maintain C{self} and it is unrelated to DataTableState.
        @type adjustM: 1d Numpy array of numbers
        @param adjustM: The "adjustM" value, intended to adjust for missing values, as defined in the PMML specification.
        @type distributionBased: bool
        @param distributionBased: If True, use a covariance matrix to scale the distance result.
        @type covarianceMatrix: Numpy matrix
        @param covarianceMatrix: The covariance matrix to scale the result if C{distributionBased}.
        @rtype: 1d Numpy array of numbers
        @return: The array of distances or similarities for center-based clustering, and number of standard deviations for distribution-based clustering.
        """

        return NP(NP(state.a11 + state.a00) / NP(NP(NP(state.a11 + state.a10) + state.a01) + state.a00))
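
The returned quantity has the form of the simple matching coefficient over the agreement counts accumulated in C{state}::

    s = \frac{a_{11} + a_{00}}{a_{11} + a_{10} + a_{01} + a_{00}}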