示例#1
0
    def evaluate(self, dataTable, functionTable, performanceTable, returnUnknowns=False):
        """Evaluate the always-false predicate: no row of the DataTable is selected.

        @type dataTable: DataTable
        @param dataTable: The input DataTable; only its length is used here.
        @type functionTable: FunctionTable
        @param functionTable: The FunctionTable (not used by this predicate).
        @type performanceTable: PerformanceTable
        @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
        @type returnUnknowns: bool
        @param returnUnknowns: If True, return a 3-tuple that also carries the "unknown" masks instead of the bare selection.
        @rtype: 1d Numpy array of bool or 3-tuple of arrays
        @return: Either an all-False selection array or selection, unknowns, encounteredUnknowns (all of them all-False).
        """

        timerLabel = "Predicate False"
        performanceTable.begin(timerLabel)

        selection = NP("zeros", len(dataTable), dtype=NP.dtype(bool))

        if not returnUnknowns:
            performanceTable.end(timerLabel)
            return selection

        # one all-False array is shared between "unknowns" and "encounteredUnknowns"
        noUnknowns = NP("zeros", len(dataTable), dtype=NP.dtype(bool))

        performanceTable.end(timerLabel)
        return selection, noUnknowns, noUnknowns
示例#2
0
    def cusum(self, testDistributions, fieldName, dataColumn, state, performanceTable):
        """Calculate the score of a CUSUM TestStatistic.

        The CUSUM cumulative sum is a stateful calculation: each row
        depends on the result of the previous row.  To continue
        calculations through multiple calls to C{calc} or
        C{calculate}, pass a DataTableState object and give the
        BaselineModel a C{stateId} attribute.  The C{stateId} is not
        valid in strict PMML, but it can be inserted after validation
        or used in custom-ODG models (C{from augustus.odg import *}).

        @type testDistributions: PmmlBinding
        @param testDistributions: The <TestDistributions> element.
        @type fieldName: string
        @param fieldName: The field name (for error messages).
        @type dataColumn: DataColumn
        @param dataColumn: The field.
        @type state: DataTableState
        @param state: The persistent state object, which is used to initialize the start state and save the end state of the cumulative sum.  It is only consulted when this element has a C{stateId}.
        @type performanceTable: PerformanceTable
        @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.  Its C{begin} and C{end} methods are called unconditionally, so it must not be None.
        @rtype: dict
        @return: A dictionary mapping PMML "feature" strings to DataColumns; CUSUM only defines the None key ("predictedValue").
        @raise PmmlValidationError: If the Baseline or the Alternate lacks a GaussianDistribution or PoissonDistribution.
        """

        baseline = testDistributions.xpath("pmml:Baseline/pmml:GaussianDistribution | pmml:Baseline/pmml:PoissonDistribution")
        alternate = testDistributions.xpath("pmml:Alternate/pmml:GaussianDistribution | pmml:Alternate/pmml:PoissonDistribution")

        if len(baseline) == 0 or len(alternate) == 0:
            raise defs.PmmlValidationError("BaselineModel CUSUM requires a Baseline and an Alternate that are either GaussianDistribution or PoissonDistribution")

        # per-row log-likelihood ratio of the alternate to the baseline distribution
        ratios = alternate[0].logpdf(dataColumn.data) - baseline[0].logpdf(dataColumn.data)
        if dataColumn.mask is None:
            good = NP("ones", len(dataColumn), dtype=NP.dtype(bool))
        else:
            good = NP(dataColumn.mask == defs.VALID)

        # resume from the stored running sum, if any; otherwise start from zero
        stateId = self.get("stateId")
        last = None
        if stateId is not None:
            last = state.get(stateId)
        if last is None:
            last = 0.0

        resetValue = testDistributions.get("resetValue", defaultFromXsd=True, convertType=True)

        output = NP("empty", len(dataColumn), dtype=NP.dtype(float))

        performanceTable.begin("fill CUSUM")
        # enumerate() instead of xrange() so that the loop runs on both Python 2 and Python 3
        for index, isGood in enumerate(good):
            if isGood:
                last = max(resetValue, last + ratios[index])
            # rows that are not VALID repeat the previous running value
            output[index] = last
        performanceTable.end("fill CUSUM")

        if stateId is not None:
            state[stateId] = last

        return {None: DataColumn(self.scoreType, output, None)}
示例#3
0
    def pointsToSmoothCurve(xarray, yarray, samples, smoothingScale, loop):
        """Fit a smooth line through a set of given numeric points
        with a characteristic smoothing scale.

        At every sample location a straight line is fitted to the
        points by weighted least squares, with Gaussian weights
        centered on that location; the fitted value and slope there
        become one point of the smooth curve.

        @type xarray: 1d Numpy array of numbers
        @param xarray: Array of x values.
        @type yarray: 1d Numpy array of numbers
        @param yarray: Array of y values.
        @type samples: 1d Numpy array of numbers
        @param samples: Locations at which to fit the C{xarray} and C{yarray} with best-fit positions and derivatives.
        @type smoothingScale: number
        @param smoothingScale: Standard deviation of the Gaussian kernel used to smooth the locally linear fit.
        @type loop: bool
        @param loop: If False, disconnect the end of the fitted curve from the beginning.
        @rtype: 4-tuple of 1d Numpy arrays
        @return: C{xlist}, C{ylist}, C{dxlist}, C{dylist} appropriate for C{formatPathdata}; C{xlist} is the C{samples} object itself.
        """

        gaussianNorm = math.sqrt(2.0 * math.pi)

        fittedValues = []
        fittedSlopes = []

        for sample in samples:
            # Gaussian kernel weights centered on this sample
            exponent = NP(NP(-0.5 * NP("power", NP(xarray - sample), 2)) / NP(smoothingScale * smoothingScale))
            weights = NP(NP(NP("exp", exponent) / smoothingScale) / gaussianNorm)

            # weighted moments for the least-squares normal equations
            sum1 = weights.sum()
            sumx = NP(weights * xarray).sum()
            sumxx = NP(weights * NP(xarray * xarray)).sum()
            sumy = NP(weights * yarray).sum()
            sumxy = NP(weights * NP(xarray * yarray)).sum()

            determinant = (sum1 * sumxx) - (sumx * sumx)
            intercept = ((sumxx * sumy) - (sumx * sumxy)) / determinant
            slope = ((sum1 * sumxy) - (sumx * sumy)) / determinant

            fittedValues.append(intercept + (sample * slope))
            fittedSlopes.append(slope)

        xlist = samples
        ylist = NP("array", fittedValues, dtype=NP.dtype(float))
        # half the distance between each sample's two neighbors (wrapping around at the ends)
        dxlist = NP((NP("roll", xlist, -1) - NP("roll", xlist, 1)) / 2.0)
        dylist = NP("array", fittedSlopes, dtype=NP.dtype(float)) * dxlist

        if not loop:
            # the wrapped-around differences at the two ends are meaningless for an open curve
            for derivative in (dxlist, dylist):
                derivative[0] = 0.0
                derivative[-1] = 0.0

        return xlist, ylist, dxlist, dylist
示例#4
0
    def functionMultiset(self, dataColumn, whereMask, groupSelection, getstate, setstate):
        """Derives a running multiset (value -> count) of rows in a DataColumn, possibly with an SQL where mask and groupField.

        Every output row holds a dictionary snapshot of the counts
        accumulated up to and including that row; rows that are not
        selected repeat the most recent snapshot.

        @type dataColumn: DataColumn
        @param dataColumn: The input data column.
        @type whereMask: 1d Numpy array of bool, or None
        @param whereMask: The result of the SQL where selection.
        @type groupSelection: 1d Numpy array of bool, or None.
        @param groupSelection: Rows corresponding to a particular value of the groupField.
        @type getstate: callable function, or None
        @param getstate: Retrieve starting values from the DataTableState.  A non-None result is used as the working dictionary and is updated in place.
        @type setstate: callable function, or None
        @param setstate: Store ending values to the DataTableState.
        @rtype: DataColumn of dict objects
        @return: A column of multisetted rows.
        """

        fieldType = FakeFieldType("object", "any")

        selection = NP("ones", len(dataColumn), dtype=NP.dtype(bool))
        if dataColumn.mask is not None:
            selection = NP("logical_and", selection, NP(dataColumn.mask == defs.VALID))

        # narrow the selection in place, first by the where mask, then by the group
        for extraMask in (whereMask, groupSelection):
            if extraMask is not None:
                NP("logical_and", selection, extraMask, selection)

        counts = {}
        if getstate is not None:
            restored = getstate()
            if restored is not None:
                counts = restored
        snapshot = dict(counts)

        snapshots = NP("empty", len(dataColumn), dtype=NP.dtype(object))

        toPython = dataColumn.fieldType.valueToPython
        for row, rawValue in enumerate(dataColumn.data):
            if selection[row]:
                key = toPython(rawValue)
                counts[key] = counts.get(key, 0) + 1
                # copy, so that rows stored earlier are not affected by later counts
                snapshot = dict(counts)
            snapshots[row] = snapshot

        if setstate is not None:
            setstate(counts)

        return DataColumn(fieldType, snapshots, None)
示例#5
0
    def _checkValues(self, data, mask):
        """Apply this field's Value specifications to a data array and build the resulting mask.

        Each entry of C{self.values} tags one value as "valid"
        (the default), "missing", or "invalid".  If at least one
        value is tagged valid, every row that matches no valid value
        becomes INVALID; otherwise only rows matching an invalid or
        missing value are flagged.  Any C{displayValue} found is
        recorded in C{self._displayValue}.

        @type data: 1d Numpy array
        @param data: The values to check.
        @type mask: 1d Numpy array of defs.maskType, or None
        @param mask: The mask going in; its MISSING and INVALID rows seed the result.
        @rtype: 2-tuple
        @return: C{data} and the new mask (None if no row ends up flagged); the input pair is returned untouched if there are no Value specifications.
        @raise PmmlValidationError: If a Value cannot be converted with C{self.stringToValue}.
        """

        values = self.values
        if len(values) == 0:
            return data, mask

        if mask is not None:
            missing = NP(mask == defs.MISSING)
            invalid = NP(mask == defs.INVALID)
        else:
            missing = NP("zeros", len(data), dtype=NP.dtype(bool))
            invalid = NP("zeros", len(data), dtype=NP.dtype(bool))
        valid = NP("zeros", len(data), dtype=NP.dtype(bool))

        # each known "property" accumulates matches into its own boolean array
        flagsByProperty = {"valid": valid, "missing": missing, "invalid": invalid}

        validCount = 0
        for spec in values:
            rawValue = spec.get("value")
            label = spec.get("displayValue")
            if label is not None:
                self._displayValue[rawValue] = label

            prop = spec.get("property", "valid")
            try:
                parsed = self.stringToValue(rawValue)
            except ValueError:
                raise defs.PmmlValidationError("Improper value in Value specification: \"%s\"" % rawValue)

            flags = flagsByProperty.get(prop)
            if flags is not None:
                NP("logical_or", flags, NP(data == parsed), flags)
                if prop == "valid":
                    validCount += 1

        if validCount == 0:
            # innocent until proven guilty
            NP("logical_and", invalid, NP("logical_not", missing), invalid)
            if not NP("logical_or", invalid, missing).any():
                return data, None
            newMask = NP("zeros", len(data), dtype=defs.maskType)
            newMask[missing] = defs.MISSING
            newMask[invalid] = defs.INVALID
            return data, newMask

        # guilty until proven innocent
        NP("logical_and", valid, NP("logical_not", missing), valid)
        if valid.all():
            return data, None
        newMask = NP(NP("ones", len(data), dtype=defs.maskType) * defs.INVALID)
        newMask[missing] = defs.MISSING
        newMask[valid] = defs.VALID
        return data, newMask
示例#6
0
    def functionMultiset(self, dataColumn, whereMask, groupSelection, getstate, setstate):
        """Derives a running multiset (value -> count) of rows in a DataColumn, possibly with an SQL where mask and groupField.

        The output column contains, for each row, a dictionary copy
        of the counts seen so far; a row that is not selected gets
        the same dictionary as the row before it.

        @type dataColumn: DataColumn
        @param dataColumn: The input data column.
        @type whereMask: 1d Numpy array of bool, or None
        @param whereMask: The result of the SQL where selection.
        @type groupSelection: 1d Numpy array of bool, or None.
        @param groupSelection: Rows corresponding to a particular value of the groupField.
        @type getstate: callable function, or None
        @param getstate: Retrieve starting values from the DataTableState.  A non-None result becomes the working dictionary and is modified in place.
        @type setstate: callable function, or None
        @param setstate: Store ending values to the DataTableState.
        @rtype: DataColumn of dict objects
        @return: A column of multisetted rows.
        """

        fieldType = FakeFieldType("object", "any")

        numberOfRows = len(dataColumn)
        selected = NP("ones", numberOfRows, dtype=NP.dtype(bool))
        if dataColumn.mask is not None:
            selected = NP("logical_and", selected, NP(dataColumn.mask == defs.VALID))

        # in-place narrowing: where mask first, group selection second
        for restriction in (whereMask, groupSelection):
            if restriction is not None:
                NP("logical_and", selected, restriction, selected)

        tally = {}
        if getstate is not None:
            previous = getstate()
            if previous is not None:
                tally = previous
        latest = dict(tally)

        output = NP("empty", len(dataColumn), dtype=NP.dtype(object))

        convert = dataColumn.fieldType.valueToPython
        for index, item in enumerate(dataColumn.data):
            if selected[index]:
                key = convert(item)
                tally[key] = tally.get(key, 0) + 1
                # freeze the counts as of this row
                latest = dict(tally)
            output[index] = latest

        if setstate is not None:
            setstate(tally)

        return DataColumn(fieldType, output, None)
示例#7
0
 def _fromDataColumn_number(self, dataColumn):
     """Convert a numeric DataColumn into a Numpy array of objects.

     Without a mask the data are copied wholesale.  With a mask,
     VALID rows keep their value, MISSING rows become C{defs.NAN},
     and all other rows become None.
     """
     flags = dataColumn.mask
     if flags is None:
         return NP("array", dataColumn.data, dtype=NP.dtype(object))

     converted = NP("empty", len(dataColumn), dtype=NP.dtype(object))
     for position, number in enumerate(dataColumn.data):
         flag = flags[position]
         if flag == defs.VALID:
             converted[position] = number
         elif flag == defs.MISSING:
             converted[position] = defs.NAN
         else:
             converted[position] = None
     return converted
示例#8
0
    def pointsToSmoothCurve(xarray, yarray, samples, smoothingScale, loop):
        """Fit a smooth line through a set of given numeric points
        with a characteristic smoothing scale.

        For each sample location, a line is fitted to all points by
        weighted least squares using a Gaussian kernel centered on
        the sample; the value and slope of that line at the sample
        define the curve there.

        @type xarray: 1d Numpy array of numbers
        @param xarray: Array of x values.
        @type yarray: 1d Numpy array of numbers
        @param yarray: Array of y values.
        @type samples: 1d Numpy array of numbers
        @param samples: Locations at which to fit the C{xarray} and C{yarray} with best-fit positions and derivatives.
        @type smoothingScale: number
        @param smoothingScale: Standard deviation of the Gaussian kernel used to smooth the locally linear fit.
        @type loop: bool
        @param loop: If False, disconnect the end of the fitted curve from the beginning.
        @rtype: 4-tuple of 1d Numpy arrays
        @return: C{xlist}, C{ylist}, C{dxlist}, C{dylist} appropriate for C{formatPathdata}; C{xlist} is the C{samples} object itself.
        """

        normalization = math.sqrt(2.0 * math.pi)

        positions = []
        slopes = []

        for sample in samples:
            # Gaussian weight of every input point relative to this sample
            squaredDistance = NP("power", NP(xarray - sample), 2)
            exponent = NP(NP(-0.5 * squaredDistance) / NP(smoothingScale * smoothingScale))
            weights = NP(NP(NP("exp", exponent) / smoothingScale) / normalization)

            # weighted sums entering the normal equations of the linear fit
            sum1 = weights.sum()
            sumx = NP(weights * xarray).sum()
            sumxx = NP(weights * NP(xarray * xarray)).sum()
            sumy = NP(weights * yarray).sum()
            sumxy = NP(weights * NP(xarray * yarray)).sum()

            denominator = (sum1 * sumxx) - (sumx * sumx)
            intercept = ((sumxx * sumy) - (sumx * sumxy)) / denominator
            slope = ((sum1 * sumxy) - (sumx * sumy)) / denominator

            positions.append(intercept + (sample * slope))
            slopes.append(slope)

        xlist = samples
        ylist = NP("array", positions, dtype=NP.dtype(float))
        # centered difference of neighboring samples, wrapping around at both ends
        dxlist = NP((NP("roll", xlist, -1) - NP("roll", xlist, 1)) / 2.0)
        dylist = NP("array", slopes, dtype=NP.dtype(float)) * dxlist

        if not loop:
            # an open curve has no neighbor beyond either end, so zero the end derivatives
            for tangent in (dxlist, dylist):
                tangent[0] = 0.0
                tangent[-1] = 0.0

        return xlist, ylist, dxlist, dylist
示例#9
0
    def mapReduce(self):
        """Build a MapReduce-Ready K-means producer.

        Used by C{optimize} and C{hadoopOptimize}.

        A fresh subclass of MapReduceKMeans is created on every call;
        its C{metadata} carries the clustering model and the current
        cluster center vectors, keyed by cluster id (or by 1-based
        position when a cluster has no id).  The subclass is also
        stored as C{self.KMeansMapReduceApplication}.

        @rtype: MapReduce
        @return: An instance of MapReduce that can either be run in pure-Python mode or submitted to Hadoop.
        """
        class KMeansMapReduceApplication(MapReduceKMeans):
            metadata = {}
            allChangeThreshold = self.allChangeThreshold

        metadata = KMeansMapReduceApplication.metadata
        metadata["ClusteringModel"] = self.clusteringModel

        centers = {}
        for ordinal, cluster in enumerate(self.clusteringModel.xpath("pmml:Cluster"), 1):
            name = cluster.get("id", "%d" % ordinal)
            coordinates = cluster.childOfTag("Array").values()
            centers[name] = NP("array", coordinates, dtype=NP.dtype(float))
        metadata["clusterVectors"] = centers

        self.KMeansMapReduceApplication = KMeansMapReduceApplication

        return MapReduce(KMeansMapReduceApplication)
示例#10
0
        def evaluate(self, dataTable, functionTable, performanceTable, arguments):
            """Evaluate this built-in boolean function over a DataTable.

            The arguments are evaluated first, then combined by
            C{self.applySkipMissing}.  Rows still flagged in the
            "allbad" array afterwards are marked MISSING, unless
            their mask entry is already something other than VALID.

            @type dataTable: DataTable
            @param dataTable: The input data; also determines the length of the result.
            @type functionTable: FunctionTable
            @param functionTable: Passed through to each argument's C{evaluate}.
            @type performanceTable: PerformanceTable
            @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
            @type arguments: list of expressions
            @param arguments: The function arguments; each must provide C{evaluate(dataTable, functionTable, performanceTable)}.
            @rtype: DataColumn
            @return: The result column, with the field type determined by C{self.allBooleanType}.
            """
            columns = [x.evaluate(dataTable, functionTable, performanceTable) for x in arguments]

            timerLabel = "built-in \"%s\"" % self.name
            performanceTable.begin(timerLabel)

            fieldType = self.allBooleanType(columns, atleast=2)

            data = NP("zeros", len(dataTable), dtype=fieldType.dtype)
            allbad = NP("ones", len(dataTable), dtype=NP.dtype(bool))

            (data, allbad), mask = self.applySkipMissing((data, allbad), None, columns)

            if allbad.any():
                if mask is not None:
                    # only rows that are currently VALID get downgraded to MISSING
                    NP("logical_and", allbad, NP(mask == defs.VALID), allbad)
                    mask[allbad] = defs.MISSING
                else:
                    mask = allbad * defs.MISSING

            performanceTable.end(timerLabel)
            return DataColumn(fieldType, data, mask)
示例#11
0
    def functionMax(self, dataColumn, whereMask, groupSelection, getstate, setstate):
        """Finds the running maximum of rows in a DataColumn, possibly with an SQL where mask and groupField.

        Each output row holds the largest selected value seen so far,
        starting from the value returned by C{getstate} if there is
        one.  Rows that come before any maximum is known are marked
        INVALID.

        @type dataColumn: DataColumn
        @param dataColumn: The input data column.
        @type whereMask: 1d Numpy array of bool, or None
        @param whereMask: The result of the SQL where selection.
        @type groupSelection: 1d Numpy array of bool, or None.
        @param groupSelection: Rows corresponding to a particular value of the groupField.
        @type getstate: callable function, or None
        @param getstate: Retrieve starting values from the DataTableState.
        @type setstate: callable function, or None
        @param setstate: Store ending values to the DataTableState.
        @rtype: DataColumn
        @return: A column of maximized rows.
        @raise PmmlValidationError: If the input field is neither continuous nor ordinal.
        """

        fieldType = dataColumn.fieldType

        if fieldType.optype not in ("continuous", "ordinal"):
            # the message names "max": this is the max aggregate, not min
            raise defs.PmmlValidationError("Aggregate function \"max\" requires a continuous or ordinal input field")

        if dataColumn.mask is None:
            selection = NP("ones", len(dataColumn), dtype=NP.dtype(bool))
        else:
            selection = NP(dataColumn.mask == defs.VALID)

        if whereMask is not None:
            NP("logical_and", selection, whereMask, selection)

        if groupSelection is not None:
            NP("logical_and", selection, groupSelection, selection)

        maximum = None
        if getstate is not None:
            startingState = getstate()
            if startingState is not None:
                maximum = startingState

        data = NP("empty", len(dataColumn), dtype=fieldType.dtype)
        mask = NP("zeros", len(dataColumn), dtype=defs.maskType)

        for i, x in enumerate(dataColumn.data):
            if selection[i]:
                if maximum is None or x > maximum:
                    maximum = x
            if maximum is None:
                # nothing has been selected yet, so there is no maximum to report
                mask[i] = defs.INVALID
            else:
                data[i] = maximum

        # drop the mask entirely when no row was flagged
        if not mask.any():
            mask = None

        if setstate is not None:
            setstate(maximum)

        return DataColumn(fieldType, data, mask)
示例#12
0
    def evaluate(self, dataTable, functionTable, performanceTable, returnUnknowns=False):
        """Evaluate the set-membership predicate, using a DataTable as input.

        A row is selected if its value of the "field" column is in
        this element's Array, or is not in it when the
        "booleanOperator" attribute is "isNotIn".

        @type dataTable: DataTable
        @param dataTable: The input DataTable, containing any fields that might be used to evaluate this predicate.
        @type functionTable: FunctionTable
        @param functionTable: The FunctionTable (not used by this predicate).
        @type performanceTable: PerformanceTable
        @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
        @type returnUnknowns: bool
        @param returnUnknowns: If True, return a "mask" for the selection that indicates which rows are unknown, rather than forcing non-VALID rows to False.
        @rtype: 1d Numpy array of bool or 3-tuple of arrays
        @return: Either a simple selection array or selection, unknowns, encounteredUnknowns
        """

        timerLabel = "SimpleSetPredicate"
        performanceTable.begin(timerLabel)

        dataColumn = dataTable.fields[self.get("field")]

        # the Array holds strings; convert them to the field's own representation
        parse = dataColumn.fieldType.stringToValue
        members = []
        for text in self.childOfClass(Array).values(convertType=False):
            members.append(parse(text))

        selection = NP("in1d", dataColumn.data, members)

        if self.get("booleanOperator") == "isNotIn":
            NP("logical_not", selection, selection)

        if returnUnknowns:
            if dataColumn.mask is None:
                unknowns = NP("zeros", len(dataTable), dtype=NP.dtype(bool))
            else:
                unknowns = NP(dataColumn.mask != defs.VALID)
            # the same array is reported as both "unknowns" and "encounteredUnknowns"
            outcome = (selection, unknowns, unknowns)
        else:
            if dataColumn.mask is not None:
                # rows that are not VALID can never be selected
                NP("logical_and", selection, NP(dataColumn.mask == defs.VALID), selection)
            outcome = selection

        performanceTable.end(timerLabel)
        return outcome
示例#13
0
    def evaluate(self, dataTable, functionTable, performanceTable, returnUnknowns=False):
        """Evaluate the always-false predicate: the selection is False for every row.

        @type dataTable: DataTable
        @param dataTable: The input DataTable; only its length is used here.
        @type functionTable: FunctionTable
        @param functionTable: The FunctionTable (not used by this predicate).
        @type performanceTable: PerformanceTable
        @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
        @type returnUnknowns: bool
        @param returnUnknowns: If True, return a 3-tuple that also carries the "unknown" masks instead of the bare selection.
        @rtype: 1d Numpy array of bool or 3-tuple of arrays
        @return: Either an all-False selection array or selection, unknowns, encounteredUnknowns (all of them all-False).
        """

        performanceTable.begin("Predicate False")

        selection = NP("zeros", len(dataTable), dtype=NP.dtype(bool))
        if returnUnknowns:
            # a single all-False array doubles as "unknowns" and "encounteredUnknowns"
            noUnknowns = NP("zeros", len(dataTable), dtype=NP.dtype(bool))
            outcome = (selection, noUnknowns, noUnknowns)
        else:
            outcome = selection

        performanceTable.end("Predicate False")
        return outcome
示例#14
0
    def initialize(self, state, numberOfRecords, numberOfFields, distributionBased):
        """First step in a vectorized metric calculation with missing values, called once before all fields and cluster centers.

        Only modifies the C{state} object: it always receives a
        zeroed C{sumInQuadrature} array and, for distribution-based
        calculations, an uninitialized C{displacements} matrix plus a
        C{displacementIndex} of 0.

        @type state: ad-hoc Python object
        @param state: State information that persists long enough to span the three steps of a metric calculation.  This is a work-around of lxml's refusal to let its Python instances maintain C{self} and it is unrelated to DataTableState.
        @type numberOfRecords: int
        @param numberOfRecords: The number of rows in the dataset.
        @type numberOfFields: int
        @param numberOfFields: The number of columns in the dataset.
        @type distributionBased: bool
        @param distributionBased: If True, use a covariance matrix to scale the distance result.
        """

        state.sumInQuadrature = NP("zeros", numberOfRecords, dtype=NP.dtype(float))

        if not distributionBased:
            return

        # one row per record, one column per field
        state.displacements = NP("empty", (numberOfRecords, numberOfFields), dtype=NP.dtype(float))
        state.displacementIndex = 0
示例#15
0
 def _fromDataColumn(self, dataColumn):
     """Convert a DataColumn into a Numpy array of Python objects.

     VALID rows (or all rows, if there is no mask) are converted
     with C{self.valueToPython}; MISSING rows become C{defs.NAN}
     and all other rows become None.
     """
     # filling a pre-allocated array by enumeration was measured by the original author
     # to use less memory and slightly less time than a list comprehension
     converted = NP("empty", len(dataColumn), dtype=NP.dtype(object))
     toPython = self.valueToPython

     flags = dataColumn.mask
     if flags is None:
         for position, raw in enumerate(dataColumn.data):
             converted[position] = toPython(raw)
         return converted

     for position, raw in enumerate(dataColumn.data):
         flag = flags[position]
         if flag == defs.VALID:
             converted[position] = toPython(raw)
         elif flag == defs.MISSING:
             converted[position] = defs.NAN
         else:
             converted[position] = None
     return converted
示例#16
0
    def initialize(self, state, numberOfRecords, numberOfFields, distributionBased):
        """First step in a vectorized metric calculation with missing values, called once before all fields and cluster centers.

        Only modifies the C{state} object, which receives a zeroed
        C{maximumComponent} array.

        @type state: ad-hoc Python object
        @param state: State information that persists long enough to span the three steps of a metric calculation.  This is a work-around of lxml's refusal to let its Python instances maintain C{self} and it is unrelated to DataTableState.
        @type numberOfRecords: int
        @param numberOfRecords: The number of rows in the dataset.
        @type numberOfFields: int
        @param numberOfFields: The number of columns in the dataset (not used by this metric).
        @type distributionBased: bool
        @param distributionBased: If True, use a covariance matrix to scale the distance result; not supported by this metric.
        @raise NotImplementedError: If C{distributionBased} is True (raised after C{state.maximumComponent} has been set).
        """

        state.maximumComponent = NP("zeros", numberOfRecords, dtype=NP.dtype(float))

        if not distributionBased:
            return

        raise NotImplementedError("Distribution-based clustering has not been implemented for the %s metric" % self.t)
示例#17
0
    def _checkIntervals(self, data, mask):
        """Mark values that lie outside this field's Interval specifications as INVALID.

        When several intervals are declared, the valid range is
        their union: a row is flagged only if it lies outside every
        interval.  (Flagging a row as soon as it misses any single
        interval would invalidate all data whenever two disjoint
        intervals are declared.)  With a single interval the result
        is the same either way.

        Rows for which none of the margin comparisons is True are
        never flagged.

        @type data: 1d Numpy array
        @param data: The values to check.
        @type mask: 1d Numpy array of defs.maskType, or None
        @param mask: The mask going in.  If it is not None and some row is out of range, it is modified in place.
        @rtype: 2-tuple
        @return: C{data} and the resulting mask; the input pair is returned untouched if there are no intervals or no row is out of range.
        @raise PmmlValidationError: If a leftMargin or rightMargin cannot be converted with C{self.stringToValue}.
        """

        intervals = self.intervals
        if len(intervals) == 0:
            return data, mask

        # guilty until proven innocent: a row stays invalid only if every interval rejects it
        invalid = NP("ones", len(data), dtype=NP.dtype(bool))
        for interval in intervals:
            closure = interval["closure"]
            leftMargin = interval.get("leftMargin")
            rightMargin = interval.get("rightMargin")

            # rows rejected by this particular interval
            outside = NP("zeros", len(data), dtype=NP.dtype(bool))

            if leftMargin is not None:
                try:
                    leftMargin = self.stringToValue(leftMargin)
                except ValueError:
                    raise defs.PmmlValidationError("Improper value in Interval leftMargin specification: \"%s\"" % leftMargin)

                if closure in ("openClosed", "openOpen"):
                    outside[NP(data <= leftMargin)] = True
                elif closure in ("closedOpen", "closedClosed"):
                    outside[NP(data < leftMargin)] = True

            if rightMargin is not None:
                try:
                    rightMargin = self.stringToValue(rightMargin)
                except ValueError:
                    raise defs.PmmlValidationError("Improper value in Interval rightMargin specification: \"%s\"" % rightMargin)

                if closure in ("openOpen", "closedOpen"):
                    outside[NP(data >= rightMargin)] = True
                elif closure in ("openClosed", "closedClosed"):
                    outside[NP(data > rightMargin)] = True

            # being inside this interval clears the row for good
            NP("logical_and", invalid, outside, invalid)

        if not invalid.any():
            return data, mask

        if mask is None:
            return data, NP(invalid * defs.INVALID)
        else:
            NP("logical_and", invalid, NP(mask == defs.VALID), invalid)   # only change what wasn't already marked as MISSING
            mask[invalid] = defs.INVALID
            return data, mask
示例#18
0
    def initialize(self, state, numberOfRecords, numberOfFields, distributionBased):
        """First step in a vectorized metric calculation with missing values, called once before all fields and cluster centers.

        Only modifies the C{state} object.  A zeroed
        C{sumInQuadrature} array is always created; a
        C{displacements} matrix (uninitialized) and a
        C{displacementIndex} of 0 are added for distribution-based
        calculations.

        @type state: ad-hoc Python object
        @param state: State information that persists long enough to span the three steps of a metric calculation.  This is a work-around of lxml's refusal to let its Python instances maintain C{self} and it is unrelated to DataTableState.
        @type numberOfRecords: int
        @param numberOfRecords: The number of rows in the dataset.
        @type numberOfFields: int
        @param numberOfFields: The number of columns in the dataset.
        @type distributionBased: bool
        @param distributionBased: If True, use a covariance matrix to scale the distance result.
        """

        floatType = NP.dtype(float)
        state.sumInQuadrature = NP("zeros", numberOfRecords, dtype=floatType)

        if distributionBased:
            # records along the first axis, fields along the second
            shape = (numberOfRecords, numberOfFields)
            state.displacements = NP("empty", shape, dtype=floatType)
            state.displacementIndex = 0
示例#19
0
    def evaluate(self, dataTable, functionTable, performanceTable, returnUnknowns=False):
        """Evaluate the set-membership predicate, using a DataTable as input.

        Rows are selected when the value of the "field" column
        appears in this element's Array; the sense is inverted when
        the "booleanOperator" attribute is "isNotIn".

        @type dataTable: DataTable
        @param dataTable: The input DataTable, containing any fields that might be used to evaluate this predicate.
        @type functionTable: FunctionTable
        @param functionTable: The FunctionTable (not used by this predicate).
        @type performanceTable: PerformanceTable
        @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
        @type returnUnknowns: bool
        @param returnUnknowns: If True, return a "mask" for the selection that indicates which rows are unknown, rather than forcing non-VALID rows to False.
        @rtype: 1d Numpy array of bool or 3-tuple of arrays
        @return: Either a simple selection array or selection, unknowns, encounteredUnknowns
        """

        performanceTable.begin("SimpleSetPredicate")

        column = dataTable.fields[self.get("field")]

        # convert the Array's strings into the field's internal values before matching
        fromString = column.fieldType.stringToValue
        allowed = []
        for item in self.childOfClass(Array).values(convertType=False):
            allowed.append(fromString(item))

        matches = NP("in1d", column.data, allowed)

        if self.get("booleanOperator") == "isNotIn":
            NP("logical_not", matches, matches)

        if not returnUnknowns:
            if column.mask is not None:
                # anything that is not VALID is treated as not selected
                NP("logical_and", matches, NP(column.mask == defs.VALID), matches)
            outcome = matches
        else:
            if column.mask is None:
                unknowns = NP("zeros", len(dataTable), dtype=NP.dtype(bool))
            else:
                unknowns = NP(column.mask != defs.VALID)
            # "unknowns" and "encounteredUnknowns" are the same array
            outcome = (matches, unknowns, unknowns)

        performanceTable.end("SimpleSetPredicate")
        return outcome
示例#20
0
        def evaluate(self, dataTable, functionTable, performanceTable, arguments):
            """Evaluate this built-in boolean function over a DataTable.

            Each argument is evaluated, and the resulting columns are
            combined by C{self.applySkipMissing}.  Rows that remain
            flagged in the "allbad" array are then marked MISSING,
            except where the mask already holds a non-VALID entry.

            @type dataTable: DataTable
            @param dataTable: The input data; also determines the length of the result.
            @type functionTable: FunctionTable
            @param functionTable: Passed through to each argument's C{evaluate}.
            @type performanceTable: PerformanceTable
            @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
            @type arguments: list of expressions
            @param arguments: The function arguments; each must provide C{evaluate(dataTable, functionTable, performanceTable)}.
            @rtype: DataColumn
            @return: The result column, with the field type determined by C{self.allBooleanType}.
            """
            evaluated = []
            for argument in arguments:
                evaluated.append(argument.evaluate(dataTable, functionTable, performanceTable))

            label = "built-in \"%s\"" % self.name
            performanceTable.begin(label)

            fieldType = self.allBooleanType(evaluated, atleast=2)

            data = NP("zeros", len(dataTable), dtype=fieldType.dtype)
            allbad = NP("ones", len(dataTable), dtype=NP.dtype(bool))

            (data, allbad), mask = self.applySkipMissing((data, allbad), None, evaluated)

            if allbad.any():
                if mask is None:
                    mask = allbad * defs.MISSING
                else:
                    # leave rows alone if they already carry a non-VALID mask value
                    NP("logical_and", allbad, NP(mask == defs.VALID), allbad)
                    mask[allbad] = defs.MISSING

            performanceTable.end(label)
            return DataColumn(fieldType, data, mask)
示例#21
0
    def prepare(self, state, dataTable, functionTable, performanceTable, plotRange):
        """Compute the points of a scatter plot and grow the plot range to hold them.

        This is the first of two stages: all quantities are calculated
        and the bounds of the data are determined here.  Because those
        bounds may be unioned with the bounds of other overlaid plot
        elements, drawing (which needs a finalized coordinate system)
        happens later.

        A row is kept only if it passes the optional PlotSelection and
        the x, y, and every error bar column are VALID for that row.
        The mask of the weight column is not consulted.  If the element
        has a C{stateId}, the kept points are appended to the points
        accumulated under that id in C{dataTable.state}.

        This method modifies C{plotRange}.

        @type state: ad-hoc Python object
        @param state: Receives C{x}, C{y}, C{exup}, C{exdown}, C{eyup}, C{eydown}, and C{weight} for use in the C{draw} stage.  This is a work-around of lxml's refusal to let its Python instances maintain C{self} and it is unrelated to DataTableState.
        @type dataTable: DataTable
        @param dataTable: Contains the data to plot.
        @type functionTable: FunctionTable
        @param functionTable: Defines functions that may be used to transform data for plotting.
        @type performanceTable: PerformanceTable
        @param performanceTable: Measures and records performance (time and memory consumption) of the drawing process.
        @type plotRange: PlotRange
        @param plotRange: The bounding box of plot coordinates that this function will expand.
        @raise PmmlValidationError: If there is not exactly one "x" and one "y" expression, or if the error bar roles are combined in an unsupported way.
        """

        def evaluated(expression):
            return expression.evaluate(dataTable, functionTable, performanceTable)

        def errorBars(symmetric, upper, lower, complaint):
            # Returns (up, down) columns; "down" stays None for symmetric bars.
            counts = (len(symmetric), len(upper), len(lower))
            if counts == (0, 0, 0):
                return None, None
            if counts == (1, 0, 0):
                return evaluated(symmetric[0]), None
            if counts == (0, 1, 1):
                return evaluated(upper[0]), evaluated(lower[0])
            raise defs.PmmlValidationError(complaint)

        self._saveContext(dataTable)

        self.checkRoles(["x", "y", "x-errorbar", "x-errorbar-up", "x-errorbar-down", "y-errorbar", "y-errorbar-up", "y-errorbar-down", "weight"])

        xNodes = self.xpath("pmml:PlotNumericExpression[@role='x']")
        yNodes = self.xpath("pmml:PlotNumericExpression[@role='y']")

        cutNodes = self.xpath("pmml:PlotSelection")

        exNodes = self.xpath("pmml:PlotNumericExpression[@role='x-errorbar']")
        exupNodes = self.xpath("pmml:PlotNumericExpression[@role='x-errorbar-up']")
        exdownNodes = self.xpath("pmml:PlotNumericExpression[@role='x-errorbar-down']")

        eyNodes = self.xpath("pmml:PlotNumericExpression[@role='y-errorbar']")
        eyupNodes = self.xpath("pmml:PlotNumericExpression[@role='y-errorbar-up']")
        eydownNodes = self.xpath("pmml:PlotNumericExpression[@role='y-errorbar-down']")

        weightNodes = self.xpath("pmml:PlotNumericExpression[@role='weight']")

        if not (len(xNodes) == 1 and len(yNodes) == 1):
            raise defs.PmmlValidationError("PlotScatter requires two PlotNumericExpressions, one with role \"x\", the other with role \"y\"")

        xValues = evaluated(xNodes[0])
        yValues = evaluated(yNodes[0])

        if len(cutNodes) == 1:
            selection = cutNodes[0].select(dataTable, functionTable, performanceTable)
        else:
            selection = NP("ones", len(dataTable), NP.dtype(bool))

        exup, exdown = errorBars(exNodes, exupNodes, exdownNodes, "Use \"x-errorbar\" for symmetric error bars or \"x-errorbar-up\" and \"x-errorbar-down\" for asymmetric errorbars, but no other combinations")
        eyup, eydown = errorBars(eyNodes, eyupNodes, eydownNodes, "Use \"y-errorbar\" for symmetric error bars or \"y-errorbar-up\" and \"y-errorbar-down\" for asymmetric errorbars, but no other combinations")

        weight = evaluated(weightNodes[0]) if len(weightNodes) == 1 else None

        performanceTable.begin("PlotScatter prepare")

        # Narrow the selection (in place) to rows where every coordinate
        # and error bar is VALID.
        for column in (xValues, yValues, exup, exdown, eyup, eydown):
            if column is None or column.mask is None:
                continue
            NP("logical_and", selection, NP(column.mask == defs.VALID), selection)

        optionalColumns = [("exup", exup), ("exdown", exdown), ("eyup", eyup), ("eydown", eydown), ("weight", weight)]

        state.x = xValues.data[selection]
        state.y = yValues.data[selection]
        for name, column in optionalColumns:
            setattr(state, name, None if column is None else column.data[selection])

        stateId = self.get("stateId")
        if stateId is not None:
            tracked = ["x", "y"] + [name for name, column in optionalColumns if column is not None]

            persistentState = dataTable.state.get(stateId)
            if persistentState is not None:
                # Prepend everything accumulated by earlier calls.
                for name in tracked:
                    setattr(state, name, NP("concatenate", (persistentState[name], getattr(state, name))))
            else:
                persistentState = {}
                dataTable.state[stateId] = persistentState

            for name in tracked:
                persistentState[name] = getattr(state, name)

        plotRange.expand(state.x, state.y, xValues.fieldType, yValues.fieldType)
        performanceTable.end("PlotScatter prepare")
示例#22
0
    def prepare(self, state, dataTable, functionTable, performanceTable,
                plotRange):
        """Prepare a heat map for drawing.

        This stage consists of calculating all quantities and
        determining the bounds of the data.  These bounds may be
        unioned with bounds from other plot elements that overlay this
        plot element, so the drawing (which requires a finalized
        coordinate system) cannot begin yet.

        Two kinds of heat map are supported:

          - B{formula}: a single C{z(x,y)} PlotFormula, sampled on an
            C{xbins} by C{ybins} grid between the explicit limits.  All
            six of C{xbins}, C{xlow}, C{xhigh}, C{ybins}, C{ylow},
            C{yhigh} are required.  Samples that are not finite, not
            VALID, or (on a strictly positive z axis) not positive are
            flagged INVALID in C{state.zmask}.
          - B{data}: C{x} and C{y} PlotNumericExpressions, histogrammed
            in two dimensions.  Optionally a C{zweight} expression
            weights the entries, or a C{zmean} expression is averaged
            within each bin.  Missing limits are taken from the data
            and missing bin counts are chosen automatically.  With a
            C{stateId}, histograms accumulate across calls in
            C{dataTable.state}.

        This method modifies C{plotRange}.

        @type state: ad-hoc Python object
        @param state: Receives C{zdata}, C{zmask}, C{xbins}, C{xlow}, C{xhigh}, C{ybins}, C{ylow}, and C{yhigh} for use in the C{draw} stage.  This is a work-around of lxml's refusal to let its Python instances maintain C{self} and it is unrelated to DataTableState.
        @type dataTable: DataTable
        @param dataTable: Contains the data to plot.
        @type functionTable: FunctionTable
        @param functionTable: Defines functions that may be used to transform data for plotting.
        @type performanceTable: PerformanceTable
        @param performanceTable: Measures and records performance (time and memory consumption) of the drawing process.
        @type plotRange: PlotRange
        @param plotRange: The bounding box of plot coordinates that this function will expand.
        @raise PmmlValidationError: If the combination of roles is not supported, required attributes are missing, the limits are not increasing, the formula is not numeric, or the x or y axis is strictly positive (logarithmic).
        """

        self.checkRoles(["z(x,y)", "x", "y", "zmean", "zweight"])

        performanceTable.begin("PlotHeatMap prepare")
        self._saveContext(dataTable)

        zofxy = self.xpath("pmml:PlotFormula[@role='z(x,y)']")
        xexpr = self.xpath("pmml:PlotNumericExpression[@role='x']")
        yexpr = self.xpath("pmml:PlotNumericExpression[@role='y']")
        zmean = self.xpath("pmml:PlotNumericExpression[@role='zmean']")
        zweight = self.xpath("pmml:PlotNumericExpression[@role='zweight']")
        cutExpression = self.xpath("pmml:PlotSelection")

        if len(zofxy) == 1 and len(xexpr) == 0 and len(yexpr) == 0 and len(
                zmean) == 0 and len(zweight) == 0:
            # ---- formula mode: sample z(x,y) on a regular grid ----
            xbins = self.get("xbins", convertType=True)
            xlow = self.get("xlow", convertType=True)
            xhigh = self.get("xhigh", convertType=True)
            ybins = self.get("ybins", convertType=True)
            ylow = self.get("ylow", convertType=True)
            yhigh = self.get("yhigh", convertType=True)

            if xbins is None or xlow is None or xhigh is None or ybins is None or ylow is None or yhigh is None:
                raise defs.PmmlValidationError(
                    "xbins, xlow, xhigh, ybins, ylow, and yhigh are required for HeatMaps of a mathematical formula"
                )

            if xlow >= xhigh or ylow >= yhigh:
                raise defs.PmmlValidationError(
                    "xlow must be less than xhigh and ylow must be less than yhigh"
                )

            if plotRange.xStrictlyPositive or plotRange.yStrictlyPositive:
                raise defs.PmmlValidationError(
                    "PlotHeatMap can only be properly displayed in linear x, y coordinates"
                )

            # NOTE(review): the bin widths are not referenced again in this
            # method.  They are left in place because removing them would
            # also remove the division-by-zero failure for a zero bin count
            # when the limits are plain Python numbers.
            xbinWidth = (xhigh - xlow) / float(xbins)
            ybinWidth = (yhigh - ylow) / float(ybins)

            # x varies fastest: one row of x samples is repeated for each
            # y sample, matching the flattened (ybins, xbins) layout used
            # by the histogram mode below.
            xarray = NP("tile",
                        NP("linspace", xlow, xhigh, xbins, endpoint=True),
                        ybins)
            yarray = NP("repeat",
                        NP("linspace", ylow, yhigh, ybins, endpoint=True),
                        xbins)

            sampleTable = DataTable({
                "x": "double",
                "y": "double"
            }, {
                "x": xarray,
                "y": yarray
            })
            parsed = Formula.parse(zofxy[0].text)

            performanceTable.pause("PlotHeatMap prepare")
            zdataColumn = parsed.evaluate(sampleTable, functionTable,
                                          performanceTable)
            performanceTable.unpause("PlotHeatMap prepare")
            if not zdataColumn.fieldType.isnumeric():
                raise defs.PmmlValidationError(
                    "PlotFormula z(x,y) must return a numeric expression, not %r"
                    % zdataColumn.fieldType)

            selection = NP("isfinite", zdataColumn.data)
            if zdataColumn.mask is not None:
                NP("logical_and", selection,
                   NP(zdataColumn.mask == defs.VALID), selection)
            if plotRange.zStrictlyPositive:
                NP("logical_and", selection, NP(zdataColumn.data > 0.0),
                   selection)

            # Only push z limits when at least one sample survived;
            # min()/max() of an empty array would raise ValueError.  The
            # zmean branch below applies the same guard.
            if NP("count_nonzero", selection) > 0:
                gooddata = zdataColumn.data[selection]
                plotRange.zminPush(gooddata.min(),
                                   zdataColumn.fieldType,
                                   sticky=False)
                plotRange.zmaxPush(gooddata.max(),
                                   zdataColumn.fieldType,
                                   sticky=False)

            state.zdata = zdataColumn.data
            state.zmask = NP("logical_not", selection) * defs.INVALID

        elif len(zofxy) == 0 and len(xexpr) == 1 and len(yexpr) == 1:
            # ---- data mode: two-dimensional histogram of (x, y) ----
            performanceTable.pause("PlotHeatMap prepare")
            xdataColumn = xexpr[0].evaluate(dataTable, functionTable,
                                            performanceTable)
            ydataColumn = yexpr[0].evaluate(dataTable, functionTable,
                                            performanceTable)
            performanceTable.unpause("PlotHeatMap prepare")

            xbins = self.get("xbins", convertType=True)
            xlow = self.get("xlow", convertType=True)
            xhigh = self.get("xhigh", convertType=True)
            ybins = self.get("ybins", convertType=True)
            ylow = self.get("ylow", convertType=True)
            yhigh = self.get("yhigh", convertType=True)

            # Limits that were not given explicitly come from the data, or
            # default to the unit square when there is no data at all.
            if len(xdataColumn) > 0:
                if xlow is None: xlow = NP("nanmin", xdataColumn.data)
                if xhigh is None: xhigh = NP("nanmax", xdataColumn.data)
                if ylow is None: ylow = NP("nanmin", ydataColumn.data)
                if yhigh is None: yhigh = NP("nanmax", ydataColumn.data)
            else:
                if xlow is None: xlow = 0.0
                if xhigh is None: xhigh = 1.0
                if ylow is None: ylow = 0.0
                if yhigh is None: yhigh = 1.0

            # Automatic bin counts use a bin width of 2*IQR/n^(1/3)
            # (Freedman-Diaconis), with a floor of 10 bins.  An empty column
            # falls straight back to 10 bins: n^(1/3) would be zero and the
            # percentiles of an empty array are undefined.
            if xbins is None:
                xbins = 10
                if len(xdataColumn.data) > 0:
                    q1, q3 = NP("percentile", xdataColumn.data, [25.0, 75.0])
                    binWidth = 2.0 * (q3 - q1) / math.pow(
                        len(xdataColumn.data), 1.0 / 3.0)
                    if binWidth > 0.0:
                        xbins = max(
                            10, int(math.ceil((xhigh - xlow) / binWidth)))

            if ybins is None:
                ybins = 10
                if len(ydataColumn.data) > 0:
                    q1, q3 = NP("percentile", ydataColumn.data, [25.0, 75.0])
                    binWidth = 2.0 * (q3 - q1) / math.pow(
                        len(ydataColumn.data), 1.0 / 3.0)
                    if binWidth > 0.0:
                        ybins = max(
                            10, int(math.ceil((yhigh - ylow) / binWidth)))

            if xlow >= xhigh or ylow >= yhigh:
                raise defs.PmmlValidationError(
                    "xlow must be less than xhigh and ylow must be less than yhigh"
                )

            if plotRange.xStrictlyPositive or plotRange.yStrictlyPositive:
                raise defs.PmmlValidationError(
                    "PlotHeatMap can only be properly displayed in linear x, y coordinates"
                )

            # Without a stateId this dictionary is private to the call; with
            # one it is shared through dataTable.state and accumulates.
            persistentState = {}
            stateId = self.get("stateId")
            if stateId is not None:
                if stateId in dataTable.state:
                    persistentState = dataTable.state[stateId]
                else:
                    dataTable.state[stateId] = persistentState

            if len(zmean) == 0:
                # Binning remembered from an earlier call takes precedence so
                # that accumulated histograms keep the same shape.
                if "xbins" in persistentState: xbins = persistentState["xbins"]
                if "xlow" in persistentState: xlow = persistentState["xlow"]
                if "xhigh" in persistentState: xhigh = persistentState["xhigh"]
                if "ybins" in persistentState: ybins = persistentState["ybins"]
                if "ylow" in persistentState: ylow = persistentState["ylow"]
                if "yhigh" in persistentState: yhigh = persistentState["yhigh"]

                persistentState["xbins"] = xbins
                persistentState["xlow"] = xlow
                persistentState["xhigh"] = xhigh
                persistentState["ybins"] = ybins
                persistentState["ylow"] = ylow
                persistentState["yhigh"] = yhigh

            # NOTE(review): unused below, see the note in the formula branch.
            xbinWidth = (xhigh - xlow) / float(xbins)
            ybinWidth = (yhigh - ylow) / float(ybins)

            # "mask" is a per-row weight: 1.0 for rows that count, 0.0 for
            # rows that are not VALID or that fail the PlotSelection.
            mask = NP("ones", len(dataTable), dtype=NP.dtype(float))
            if xdataColumn.mask is not None:
                NP("multiply", mask, (xdataColumn.mask == defs.VALID), mask)
            if ydataColumn.mask is not None:
                NP("multiply", mask, (ydataColumn.mask == defs.VALID), mask)

            if len(cutExpression) == 1:
                performanceTable.pause("PlotHeatMap prepare")
                NP(
                    "multiply", mask,
                    cutExpression[0].select(dataTable, functionTable,
                                            performanceTable), mask)
                performanceTable.unpause("PlotHeatMap prepare")

            if len(zmean) == 0 and len(zweight) == 0:
                # Plain counts.  y is passed first so that the histogram has
                # shape (ybins, xbins).
                histogram, xedges, yedges = NP("histogram2d",
                                               ydataColumn.data,
                                               xdataColumn.data,
                                               bins=(ybins, xbins),
                                               range=[[ylow, yhigh],
                                                      [xlow, xhigh]],
                                               weights=mask)
                if len(dataTable) == 0:
                    # work around Numpy <= 1.6.1 bug
                    histogram = NP("zeros", (ybins, xbins),
                                   dtype=NP.dtype(float))

                if "histogram" in persistentState:
                    persistentState["histogram"] = NP(
                        persistentState["histogram"] + histogram)
                else:
                    persistentState["histogram"] = histogram

                histogram = persistentState["histogram"]

                if plotRange.zStrictlyPositive:
                    zmin = 0.1
                else:
                    zmin = 0.0
                zmax = NP("nanmax", histogram)

                plotRange.zminPush(zmin, self.zfieldType, sticky=True)
                if zmax > zmin:
                    plotRange.zmaxPush(zmax, self.zfieldType, sticky=False)

            elif len(zmean) == 0 and len(zweight) == 1:
                # Weighted counts.
                performanceTable.pause("PlotHeatMap prepare")
                weightsDataColumn = zweight[0].evaluate(
                    dataTable, functionTable, performanceTable)
                performanceTable.unpause("PlotHeatMap prepare")

                if weightsDataColumn.mask is not None:
                    NP("multiply", mask,
                       (weightsDataColumn.mask == defs.VALID), mask)
                weights = NP(weightsDataColumn.data * mask)

                histogram, xedges, yedges = NP("histogram2d",
                                               ydataColumn.data,
                                               xdataColumn.data,
                                               bins=(ybins, xbins),
                                               range=[[ylow, yhigh],
                                                      [xlow, xhigh]],
                                               weights=weights)

                if "histogram" in persistentState:
                    persistentState["histogram"] = NP(
                        persistentState["histogram"] + histogram)
                else:
                    persistentState["histogram"] = histogram

                histogram = persistentState["histogram"]

                if plotRange.zStrictlyPositive:
                    # A tenth of the smallest positive weight keeps a bin
                    # holding a single entry above the bottom of a log axis.
                    w = weights[NP(weights > 0.0)]
                    if len(w) > 0:
                        zmin = 0.1 * NP("nanmin", w)
                    else:
                        zmin = 0.1
                else:
                    zmin = 0.0
                zmax = NP("nanmax", histogram)

                plotRange.zminPush(zmin, self.zfieldType, sticky=True)
                if zmax > zmin:
                    plotRange.zmaxPush(zmax, self.zfieldType, sticky=False)

            elif len(zmean) == 1 and len(zweight) == 0:
                # Mean of z per bin: sum of z divided by number of entries.
                performanceTable.pause("PlotHeatMap prepare")
                zdataColumn = zmean[0].evaluate(dataTable, functionTable,
                                                performanceTable)
                performanceTable.unpause("PlotHeatMap prepare")

                if zdataColumn.mask is not None:
                    NP("multiply", mask, (zdataColumn.mask == defs.VALID),
                       mask)
                weights = NP(zdataColumn.data * mask)

                numer, xedges, yedges = NP("histogram2d",
                                           ydataColumn.data,
                                           xdataColumn.data,
                                           bins=(ybins, xbins),
                                           range=[[ylow, yhigh], [xlow,
                                                                  xhigh]],
                                           weights=weights)
                denom, xedges, yedges = NP("histogram2d",
                                           ydataColumn.data,
                                           xdataColumn.data,
                                           bins=(ybins, xbins),
                                           range=[[ylow, yhigh], [xlow,
                                                                  xhigh]],
                                           weights=mask)

                if "numer" in persistentState:
                    persistentState["numer"] = NP(persistentState["numer"] +
                                                  numer)
                    persistentState["denom"] = NP(persistentState["denom"] +
                                                  denom)
                else:
                    persistentState["numer"] = numer
                    persistentState["denom"] = denom

                numer = persistentState["numer"]
                denom = persistentState["denom"]
                # Empty bins divide by zero; the resulting non-finite values
                # are excluded from the z range just below.
                histogram = numer / denom

                selection = NP("isfinite", histogram)
                if plotRange.zStrictlyPositive:
                    NP("logical_and", selection, NP(histogram > 0.0),
                       selection)

                if NP("count_nonzero", selection) > 0:
                    gooddata = histogram[selection]
                    plotRange.zminPush(gooddata.min(),
                                       self.zfieldType,
                                       sticky=False)
                    plotRange.zmaxPush(gooddata.max(),
                                       self.zfieldType,
                                       sticky=False)

            else:
                raise defs.PmmlValidationError(
                    "The only allowed combinations of PlotFormula/PlotNumericExpressions are: \"z(x,y)\" (function), \"x y\" (histogram), \"x y zmean\" (mean of z in x y bins), \"x y zweight\" (weighted x y histogram)"
                )

            state.zdata = NP("reshape", histogram, xbins * ybins)
            state.zmask = None

        else:
            raise defs.PmmlValidationError(
                "The only allowed combinations of PlotFormula/PlotNumericExpressions are: \"z(x,y)\" (function), \"x y\" (histogram), \"x y zmean\" (mean of z in x y bins), \"x y zweight\" (weighted x y histogram)"
            )

        plotRange.xminPush(xlow, self.xyfieldType, sticky=True)
        plotRange.yminPush(ylow, self.xyfieldType, sticky=True)
        plotRange.xmaxPush(xhigh, self.xyfieldType, sticky=True)
        plotRange.ymaxPush(yhigh, self.xyfieldType, sticky=True)

        state.xbins = xbins
        state.xlow = xlow
        state.xhigh = xhigh
        state.ybins = ybins
        state.ylow = ylow
        state.yhigh = yhigh

        performanceTable.end("PlotHeatMap prepare")
示例#23
0
    def evaluate(self, dataTable, functionTable, performanceTable, returnUnknowns=False):
        """Evaluate the predicate, using a DataTable as input.

        The field named by the C{field} attribute is compared with the
        C{value} attribute using the C{operator} attribute, or tested
        for missingness when the operator is C{isMissing} or
        C{isNotMissing}.

        When C{returnUnknowns} is False, rows whose field is not VALID
        are never selected, with one exception: for C{isMissing} the
        MISSING rows are exactly the rows that are selected.

        @type dataTable: DataTable
        @param dataTable: The input DataTable, containing any fields that might be used to evaluate this predicate.
        @type functionTable: FunctionTable
        @param functionTable: The FunctionTable, containing any functions that might be called in this predicate.
        @type performanceTable: PerformanceTable
        @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
        @type returnUnknowns: bool
        @param returnUnknowns: If True, return a "mask" for the selection that indicates which rows are unknown, rather than True or False.
        @rtype: 1d Numpy array of bool or 3-tuple of arrays
        @return: Either a simple selection array or selection, unknowns, encounteredUnknowns
        @raise PmmlValidationError: If C{value} cannot be converted to the field's type or the operator is not recognized.
        @raise TypeError: If an ordering operator is applied to a categorical field.
        """

        performanceTable.begin("SimplePredicate")

        fieldName = self.get("field")

        dataColumn = dataTable.fields[fieldName]

        # Whether non-VALID rows must be removed from a plain selection.
        # isMissing is the one operator whose answer IS the set of MISSING
        # rows; intersecting it with the VALID rows would always be empty.
        dropNonValidRows = True

        operator = self.get("operator")
        if operator == "isMissing":
            dropNonValidRows = False
            if dataColumn.mask is None:
                selection = NP("zeros", len(dataTable), dtype=NP.dtype(bool))
            else:
                selection = NP(dataColumn.mask == defs.MISSING)

        elif operator == "isNotMissing":
            if dataColumn.mask is None:
                selection = NP("ones", len(dataTable), dtype=NP.dtype(bool))
            else:
                selection = NP(dataColumn.mask != defs.MISSING)

        else:
            try:
                value = dataColumn.fieldType.stringToValue(self.get("value"))
            except ValueError as err:
                raise defs.PmmlValidationError("SimplePredicate.value \"%s\" cannot be cast as %r: %s" % (self.get("value"), dataColumn.fieldType, str(err)))

            if operator == "equal":
                selection = NP(dataColumn.data == value)

            elif operator == "notEqual":
                selection = NP(dataColumn.data != value)

            elif operator in ("lessThan", "lessOrEqual", "greaterThan", "greaterOrEqual"):
                # Ordering comparisons are refused for categorical fields.
                if dataColumn.fieldType.optype == "categorical":
                    raise TypeError("Categorical field \"%s\" cannot be compared using %s" % (fieldName, operator))

                if operator == "lessThan":
                    selection = NP(dataColumn.data < value)
                elif operator == "lessOrEqual":
                    selection = NP(dataColumn.data <= value)
                elif operator == "greaterThan":
                    selection = NP(dataColumn.data > value)
                else:
                    selection = NP(dataColumn.data >= value)

            else:
                # Without this branch "selection" would be unbound below.
                raise defs.PmmlValidationError("SimplePredicate.operator \"%s\" is not recognized" % operator)

        if returnUnknowns:
            # NOTE(review): rows that are not VALID are reported as unknown
            # for every operator, including isMissing/isNotMissing -- confirm
            # that callers expect this for the missing-value tests.
            if dataColumn.mask is None:
                unknowns = NP("zeros", len(dataTable), dtype=NP.dtype(bool))
            else:
                unknowns = NP(dataColumn.mask != defs.VALID)

            performanceTable.end("SimplePredicate")
            return selection, unknowns, unknowns

        else:
            if dropNonValidRows and dataColumn.mask is not None:
                NP("logical_and", selection, NP(dataColumn.mask == defs.VALID), selection)

            performanceTable.end("SimplePredicate")
            return selection
示例#24
0
    def prepare(self, state, dataTable, functionTable, performanceTable,
                plotRange):
        """Prepare a plot element for drawing.

        This stage consists of calculating all quantities and
        determing the bounds of the data.  These bounds may be unioned
        with bounds from other plot elements that overlay this plot
        element, so the drawing (which requires a finalized coordinate
        system) cannot begin yet.

        This method modifies C{plotRange}.

        @type state: ad-hoc Python object
        @param state: State information that persists long enough to use quantities computed in C{prepare} in the C{draw} stage.  This is a work-around of lxml's refusal to let its Python instances maintain C{self} and it is unrelated to DataTableState.
        @type dataTable: DataTable
        @param dataTable: Contains the data to plot.
        @type functionTable: FunctionTable
        @param functionTable: Defines functions that may be used to transform data for plotting.
        @type performanceTable: PerformanceTable
        @param performanceTable: Measures and records performance (time and memory consumption) of the drawing process.
        @type plotRange: PlotRange
        @param plotRange: The bounding box of plot coordinates that this function will expand.
        """

        self.checkRoles([
            "y(x)", "dy/dx", "x(t)", "y(t)", "dx/dt", "dy/dt", "x", "y", "dx",
            "dy"
        ])

        performanceTable.begin("PlotCurve prepare")
        self._saveContext(dataTable)

        yofx = self.xpath("pmml:PlotFormula[@role='y(x)']")
        dydx = self.xpath("pmml:PlotFormula[@role='dy/dx']")

        xoft = self.xpath("pmml:PlotFormula[@role='x(t)']")
        yoft = self.xpath("pmml:PlotFormula[@role='y(t)']")
        dxdt = self.xpath("pmml:PlotFormula[@role='dx/dt']")
        dydt = self.xpath("pmml:PlotFormula[@role='dy/dt']")

        nx = self.xpath("pmml:PlotNumericExpression[@role='x']")
        ny = self.xpath("pmml:PlotNumericExpression[@role='y']")
        ndx = self.xpath("pmml:PlotNumericExpression[@role='dx']")
        ndy = self.xpath("pmml:PlotNumericExpression[@role='dy']")
        cutExpression = self.xpath("pmml:PlotSelection")

        if len(yofx) + len(dydx) + len(xoft) + len(yoft) + len(dxdt) + len(
                dydt) > 0:
            if len(yofx) == 1 and len(dydx) == 0 and len(xoft) == 0 and len(
                    yoft) == 0 and len(dxdt) == 0 and len(dydt) == 0:
                expression = (yofx[0].text, )
                derivative = (None, )

            elif len(yofx) == 1 and len(dydx) == 1 and len(xoft) == 0 and len(
                    yoft) == 0 and len(dxdt) == 0 and len(dydt) == 0:
                expression = (yofx[0].text, )
                derivative = (dydx[0].text, )

            elif len(yofx) == 0 and len(dydx) == 0 and len(xoft) == 1 and len(
                    yoft) == 1 and len(dxdt) == 0 and len(dydt) == 0:
                expression = xoft[0].text, yoft[0].text
                derivative = None, None

            elif len(yofx) == 0 and len(dydx) == 0 and len(xoft) == 1 and len(
                    yoft) == 1 and len(dxdt) == 1 and len(dydt) == 1:
                expression = xoft[0].text, yoft[0].text
                derivative = dxdt[0].text, dydt[0].text

            else:
                raise defs.PmmlValidationError(
                    "The only allowed combinations of PlotFormulae are: \"y(x)\", \"y(x) dy/dx\", \"x(t) y(t)\", and \"x(t) y(t) dx/dt dy/dt\""
                )

            low = self.get("low", convertType=True)
            high = self.get("high", convertType=True)
            if low is None or high is None:
                raise defs.PmmlValidationError(
                    "The \"low\" and \"high\" attributes are required for PlotCurves defined by formulae"
                )

            samples = self.generateSamples(low, high)

            loop = self.get("loop", defaultFromXsd=True, convertType=True)
            state.x, state.y, state.dx, state.dy, xfieldType, yfieldType = self.expressionsToPoints(
                expression, derivative, samples, loop, functionTable,
                performanceTable)

        else:
            performanceTable.pause("PlotCurve prepare")
            if len(ndx) == 1:
                dxdataColumn = ndx[0].evaluate(dataTable, functionTable,
                                               performanceTable)
            else:
                dxdataColumn = None
            if len(ndy) == 1:
                dydataColumn = ndy[0].evaluate(dataTable, functionTable,
                                               performanceTable)
            else:
                dydataColumn = None
            performanceTable.unpause("PlotCurve prepare")

            if len(nx) == 0 and len(ny) == 1:
                performanceTable.pause("PlotCurve prepare")
                ydataColumn = ny[0].evaluate(dataTable, functionTable,
                                             performanceTable)
                performanceTable.unpause("PlotCurve prepare")

                if len(cutExpression) == 1:
                    performanceTable.pause("PlotCurve prepare")
                    selection = cutExpression[0].select(
                        dataTable, functionTable, performanceTable)
                    performanceTable.unpause("PlotCurve prepare")
                else:
                    selection = NP("ones", len(ydataColumn.data),
                                   NP.dtype(bool))

                if ydataColumn.mask is not None:
                    selection = NP("logical_and", selection,
                                   NP(ydataColumn.mask == defs.VALID),
                                   selection)
                if dxdataColumn is not None and dxdataColumn.mask is not None:
                    selection = NP("logical_and", selection,
                                   NP(dxdataColumn.mask == defs.VALID),
                                   selection)
                if dydataColumn is not None and dydataColumn.mask is not None:
                    selection = NP("logical_and", selection,
                                   NP(dydataColumn.mask == defs.VALID),
                                   selection)

                yarray = ydataColumn.data[selection]

                xarray = NP("ones", len(yarray), dtype=NP.dtype(float))
                xarray[0] = 0.0
                xarray = NP("cumsum", xarray)

                dxarray, dyarray = None, None
                if dxdataColumn is not None:
                    dxarray = dxdataColumn.data[selection]
                if dydataColumn is not None:
                    dyarray = dydataColumn.data[selection]

                xfieldType = self.xfieldType
                yfieldType = ydataColumn.fieldType

            elif len(nx) == 1 and len(ny) == 1:
                performanceTable.pause("PlotCurve prepare")
                xdataColumn = nx[0].evaluate(dataTable, functionTable,
                                             performanceTable)
                ydataColumn = ny[0].evaluate(dataTable, functionTable,
                                             performanceTable)
                performanceTable.unpause("PlotCurve prepare")

                if len(cutExpression) == 1:
                    performanceTable.pause("PlotCurve prepare")
                    selection = cutExpression[0].select(
                        dataTable, functionTable, performanceTable)
                    performanceTable.unpause("PlotCurve prepare")
                else:
                    selection = NP("ones", len(ydataColumn.data),
                                   NP.dtype(bool))

                if xdataColumn.mask is not None:
                    selection = NP("logical_and", selection,
                                   NP(xdataColumn.mask == defs.VALID),
                                   selection)
                if ydataColumn.mask is not None:
                    selection = NP("logical_and", selection,
                                   NP(ydataColumn.mask == defs.VALID),
                                   selection)
                if dxdataColumn is not None and dxdataColumn.mask is not None:
                    selection = NP("logical_and", selection,
                                   NP(dxdataColumn.mask == defs.VALID),
                                   selection)
                if dydataColumn is not None and dydataColumn.mask is not None:
                    selection = NP("logical_and", selection,
                                   NP(dydataColumn.mask == defs.VALID),
                                   selection)

                xarray = xdataColumn.data[selection]
                yarray = ydataColumn.data[selection]

                dxarray, dyarray = None, None
                if dxdataColumn is not None:
                    dxarray = dxdataColumn.data[selection]
                if dydataColumn is not None:
                    dyarray = dydataColumn.data[selection]

                xfieldType = xdataColumn.fieldType
                yfieldType = ydataColumn.fieldType

            else:
                raise defs.PmmlValidationError(
                    "The only allowed combinations of PlotNumericExpressions are: \"y(x)\" and \"x(t) y(t)\""
                )

            persistentState = {}
            stateId = self.get("stateId")
            if stateId is not None:
                if stateId in dataTable.state:
                    persistentState = dataTable.state[stateId]
                    xarray = NP("concatenate", [xarray, persistentState["x"]])
                    yarray = NP("concatenate", [yarray, persistentState["y"]])
                    if dxarray is not None:
                        dxarray = NP("concatenate",
                                     [dxarray, persistentState["dx"]])
                    if dyarray is not None:
                        dyarray = NP("concatenate",
                                     [dyarray, persistentState["dy"]])
                else:
                    dataTable.state[stateId] = persistentState

            persistentState["x"] = xarray
            persistentState["y"] = yarray
            if dxarray is not None:
                persistentState["dx"] = dxarray
            if dyarray is not None:
                persistentState["dy"] = dyarray

            smooth = self.get("smooth", defaultFromXsd=True, convertType=True)
            if not smooth:
                if dyarray is not None and dxarray is None:
                    dxarray = NP(
                        (NP("roll", xarray, -1) - NP("roll", xarray, 1)) / 2.0)
                    dyarray = dyarray * dxarray

                loop = self.get("loop", defaultFromXsd=True, convertType=True)
                if dxarray is not None and not loop:
                    dxarray[0] = 0.0
                    dxarray[-1] = 0.0
                if dyarray is not None and not loop:
                    dyarray[0] = 0.0
                    dyarray[-1] = 0.0

                state.x = xarray
                state.y = yarray
                state.dx = dxarray
                state.dy = dyarray

            else:
                smoothingScale = self.get("smoothingScale",
                                          defaultFromXsd=True,
                                          convertType=True)
                loop = self.get("loop", defaultFromXsd=True, convertType=True)

                samples = self.generateSamples(xarray.min(), xarray.max())
                state.x, state.y, state.dx, state.dy = self.pointsToSmoothCurve(
                    xarray, yarray, samples, smoothingScale, loop)

        if plotRange is not None:
            plotRange.expand(state.x, state.y, xfieldType, yfieldType)

        performanceTable.end("PlotCurve prepare")
示例#25
0
    def format(self, subTable, functionTable, performanceTable, score):
        """Extract or post-process output for the output field of a DataTable.

        The C{feature} attribute selects the source of the output:

          - no C{feature}: the existing field named by C{name} in C{subTable};
          - C{"predictedValue"}: the C{score[None]} column;
          - C{"predictedDisplayValue"}: C{score[None]} converted element-wise to strings;
          - C{"transformedValue"}: the result of evaluating the child expression;
          - C{"decision"}: the child expression is evaluated and each value is replaced by the matching C{Decision} element;
          - any other key present in C{score}: that column.

        Unless the feature is C{"predictedDisplayValue"} or C{"decision"},
        the result is cast if the C{dataType} or C{optype} attributes
        differ from those of the computed column.

        When a C{feature} is given, the result is also stored in
        C{subTable.fields} under C{displayName} (falling back to C{name}).

        @type subTable: DataTable
        @param subTable: The DataTable associated with this local lexical scope.
        @type functionTable: FunctionTable or None
        @param functionTable: A table of functions; only passed through to expression evaluation.
        @type performanceTable: PerformanceTable
        @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.  It is used unconditionally, so it must not be None.
        @type score: dict
        @param score: Dictionary mapping PMML score "feature" strings to DataColumns.  This dictionary always contains a None key, which is the basic feature ("predictedValue").
        @rtype: DataColumn
        @return: The output that would go into an output field of a DataTable.
        @raise PmmlValidationError: If a required child (expression or Decisions block) is missing, or if the requested feature is not available in C{score}.
        """

        performanceTable.begin("OutputField")

        feature = self.get("feature")
        if feature is None:
            dataColumn = subTable.fields[self["name"]]

        elif feature == "predictedValue":
            dataColumn = score[None]

        elif feature == "predictedDisplayValue":
            original = score[None]
            toString = original.fieldType.valueToString
            data = NP("empty", len(subTable), dtype=NP.dtype(object))
            for i, x in enumerate(original.data):
                data[i] = toString(x)
            # NOTE(review): the new column is built without a mask, so any mask on
            # the original column is not carried over -- confirm this is intended.
            dataColumn = DataColumn(FakeFieldType("string", "continuous"), data, None)

        elif feature == "transformedValue":
            expression = self.childOfClass(PmmlExpression)
            if expression is None:
                raise defs.PmmlValidationError("OutputField with feature \"transformedValue\" requires an EXPRESSION")

            # The expression is evaluated with this timer paused.
            performanceTable.pause("OutputField")
            dataColumn = expression.evaluate(subTable, functionTable, performanceTable)
            performanceTable.unpause("OutputField")

        elif feature == "decision":
            decisions = self.childOfTag("Decisions")
            if decisions is None:
                raise defs.PmmlValidationError("OutputField with feature \"decision\" requires a Decisions block")

            # Same check as the "transformedValue" branch: report a missing
            # expression as a validation error rather than failing on None.
            expression = self.childOfClass(PmmlExpression)
            if expression is None:
                raise defs.PmmlValidationError("OutputField with feature \"decision\" requires an EXPRESSION")

            performanceTable.pause("OutputField")
            dataColumn = expression.evaluate(subTable, functionTable, performanceTable)
            performanceTable.unpause("OutputField")

            if dataColumn.mask is None:
                valid = None
            else:
                valid = NP(dataColumn.mask == defs.VALID)

            # Start with every row marked MISSING; rows matched by a Decision
            # are switched to VALID below.
            fieldType = FakeFieldType("object", "any")
            data = NP("empty", len(subTable), dtype=fieldType.dtype)
            mask = NP(NP("ones", len(subTable), dtype=defs.maskType) * defs.MISSING)

            for decision in decisions.childrenOfTag("Decision"):
                value = dataColumn.fieldType.stringToValue(decision["value"])

                selection = NP(dataColumn.data == value)
                if valid is not None:
                    # In-place: restrict the match to rows that are valid in the input.
                    NP("logical_and", selection, valid, selection)

                # Element-by-element assignment of the Decision object itself.
                for i in xrange(len(data)):
                    if selection[i]:
                        data[i] = decision

                mask[selection] = defs.VALID

            # A mask with no nonzero entries is replaced by None
            # (presumably defs.VALID is zero, so this means "all rows valid" -- TODO confirm).
            if not mask.any():
                mask = None

            dataColumn = DataColumn(fieldType, data, mask)

        elif feature in score:
            dataColumn = score[feature]

        else:
            # Build a description of the enclosing model (the grandparent
            # element, if there is one) for the error message.
            model = self.getparent()
            if model is not None: model = model.getparent()

            if model is None:
                model = "(orphaned OutputField; no parent model)"
            else:
                model = model.t

            raise defs.PmmlValidationError("Models of type %s do not produce \"%s\" features (or at least, it is not yet implemented by Augustus)" % (model, feature))

        # The dataType/optype attributes default to those of the computed column;
        # a cast is applied only when they differ and the feature is castable.
        dataType = self.get("dataType", dataColumn.fieldType.dataType)
        optype = self.get("optype", dataColumn.fieldType.optype)
        if (dataType != dataColumn.fieldType.dataType or optype != dataColumn.fieldType.optype) and feature not in ("predictedDisplayValue", "decision"):
            dataColumn = FieldCastMethods.cast(FakeFieldType(dataType, optype), dataColumn)

        if feature is not None:
            subTable.fields[self.get("displayName", self["name"])] = dataColumn

        performanceTable.end("OutputField")
        return dataColumn
示例#26
0
    def draw(self, state, plotCoordinates, plotDefinitions, performanceTable):
        """Draw the plot element.

        Builds an SVG group containing, for every slice that has
        computed levels, a closed box outline, a mid line, two whisker
        stems and two whisker caps, using the quantities that
        C{prepare} stored on C{state}.

        If the sliced field is not numeric, C{state.ranges} and
        C{state.edges} are overwritten here so that they follow the
        order of the category strings of the coordinate system.

        @type state: ad-hoc Python object
        @param state: State information that persists long enough to use quantities computed in C{prepare} in the C{draw} stage.  This is a work-around of lxml's refusal to let its Python instances maintain C{self} and it is unrelated to DataTableState.
        @type plotCoordinates: PlotCoordinates
        @param plotCoordinates: The coordinate system in which this plot element will be placed.
        @type plotDefinitions: PlotDefinitions
        @param plotDefinitions: The dictionary of key-value pairs that forms the <defs> section of the SVG document.  Not referenced by this method.
        @type performanceTable: PerformanceTable
        @param performanceTable: Measures and records performance (time and memory consumption) of the drawing process.
        @rtype: SvgBinding
        @return: An SVG fragment representing the fully drawn plot element.
        """

        svg = SvgBinding.elementMaker
        performanceTable.begin("PlotBoxAndWhisker draw")

        vertical = self.get("vertical", defaultFromXsd=True, convertType=True)
        gap = self.get("gap", defaultFromXsd=True, convertType=True)

        if state.slicedFieldType is not self.fieldTypeNumeric:
            # Categorical slices: line the ranges up with the category strings
            # of the axis that carries the slices.
            if vertical:
                categories = plotCoordinates.xstrings
            else:
                categories = plotCoordinates.ystrings

            def rangeFor(category):
                try:
                    where = state.edges.index(category)
                except ValueError:
                    return None
                return state.ranges[where]

            state.ranges = [rangeFor(category) for category in categories]
            # Each category occupies a unit-wide bin centered on its position.
            state.edges = [(position - 0.5, position + 0.5) for position in xrange(len(categories))]

        # Open-ended bins (None) are represented by infinities.
        lowEdge = NP("array", [float("-inf") if low is None else low for low, high in state.edges], dtype=NP.dtype(float))
        highEdge = NP("array", [float("inf") if high is None else high for low, high in state.edges], dtype=NP.dtype(float))

        # Keep only the bins for which levels were computed.
        selection = NP("array", [levels is not None for levels in state.ranges], dtype=NP.dtype(bool))
        lowEdge = lowEdge[selection]
        highEdge = highEdge[selection]

        populated = [levels for levels in state.ranges if levels is not None]
        profiledDtype = state.profiledFieldType.dtype
        lowWhisker, lowBox, midLine, highBox, highWhisker = [
            NP("array", [levels[k] for levels in populated], dtype=profiledDtype) for k in xrange(5)]

        output = svg.g()
        if len(lowEdge) > 0:
            middle = NP(NP(lowEdge + highEdge) / 2.0)

            # Coordinates of the ten glyph points A..J, first along the sliced
            # axis and then along the profiled axis.
            alongSliced = [lowEdge, lowEdge, lowEdge, highEdge, highEdge, highEdge, middle, middle, middle, middle]
            alongProfiled = [lowBox, midLine, highBox, lowBox, midLine, highBox, lowWhisker, lowBox, highBox, highWhisker]

            if vertical:
                xs, ys = alongSliced, alongProfiled
            else:
                xs, ys = alongProfiled, alongSliced

            (AX, AY), (BX, BY), (CX, CY), (DX, DY), (EX, EY), (FX, FY), (GX, GY), (HX, HY), (IX, IY), (JX, JY) = \
                [plotCoordinates(x, y) for x, y in zip(xs, ys)]

            if gap > 0.0:
                halfGap = gap/2.0
                # Shrink each box by half a gap on both sides, but only if
                # every box would still have positive extent afterwards.
                if vertical:
                    if NP(NP(DX - halfGap) - NP(AX + halfGap)).min() > 0.0:
                        AX += halfGap
                        BX += halfGap
                        CX += halfGap
                        DX -= halfGap
                        EX -= halfGap
                        FX -= halfGap
                else:
                    if NP(NP(DY - halfGap) - NP(AY + halfGap)).min() > 0.0:
                        AY += halfGap
                        BY += halfGap
                        CY += halfGap
                        DY -= halfGap
                        EY -= halfGap
                        FY -= halfGap

            # The box uses the full style; every other stroke uses only the
            # stroke-related properties with the fill switched off.
            fullStyle = self.getStyleState()
            strokeOnly = {}
            for key in fullStyle:
                if key.startswith("stroke"):
                    strokeOnly[key] = fullStyle[key]
            strokeOnly["fill"] = "none"
            boxStyleText = PlotStyle.toString(fullStyle)
            strokeStyleText = PlotStyle.toString(strokeOnly)

            segment = "M %r %r L %r %r"
            outlineStart = (HX, HY)
            outlineRest = ((AX, AY), (BX, BY), (CX, CY), (IX, IY), (FX, FY), (EX, EY), (DX, DY), (HX, HY))
            strokes = ((BX, BY, EX, EY), (HX, HY, GX, GY), (IX, IY, JX, JY))

            for i in xrange(len(lowEdge)):
                commands = ["M %r %r" % (outlineStart[0][i], outlineStart[1][i])]
                for px, py in outlineRest:
                    commands.append("L %r %r" % (px[i], py[i]))
                commands.append("Z")
                output.append(svg.path(d=" ".join(commands), style=boxStyleText))

                # Mid line, then the two whisker stems.
                for x1, y1, x2, y2 in strokes:
                    output.append(svg.path(d=segment % (x1[i], y1[i], x2[i], y2[i]), style=strokeStyleText))

                # Whisker caps: a quarter of the box extent on either side of the stem.
                if vertical:
                    width = (DX[i] - AX[i]) / 4.0
                    caps = [(GX[i] - width, GY[i], GX[i] + width, GY[i]),
                            (JX[i] - width, JY[i], JX[i] + width, JY[i])]
                else:
                    width = (DY[i] - AY[i]) / 4.0
                    caps = [(GX[i], GY[i] - width, GX[i], GY[i] + width),
                            (JX[i], JY[i] - width, JX[i], JY[i] + width)]

                for cap in caps:
                    output.append(svg.path(d=segment % cap, style=strokeStyleText))

        performanceTable.end("PlotBoxAndWhisker draw")

        svgId = self.get("svgId")
        if svgId is not None:
            output["id"] = svgId

        return output
示例#27
0
    def prepare(self, state, dataTable, functionTable, performanceTable, plotRange):
        """Prepare a plot element for drawing.

        This stage consists of calculating all quantities and
        determining the bounds of the data.  These bounds may be unioned
        with bounds from other plot elements that overlay this plot
        element, so the drawing (which requires a finalized coordinate
        system) cannot begin yet.

        The "sliced" expression assigns each row to a bin and the
        "profiled" expression supplies the values whose distribution
        is summarized in each bin.  The results are stored on C{state}
        as C{edges}, C{ranges}, C{slicedFieldType} and
        C{profiledFieldType}.  If the element has a C{stateId}, the
        per-bin distributions are accumulated in C{dataTable.state}
        across calls.

        This method modifies C{plotRange}.

        @type state: ad-hoc Python object
        @param state: State information that persists long enough to use quantities computed in C{prepare} in the C{draw} stage.  This is a work-around of lxml's refusal to let its Python instances maintain C{self} and it is unrelated to DataTableState.
        @type dataTable: DataTable
        @param dataTable: Contains the data to plot.
        @type functionTable: FunctionTable
        @param functionTable: Defines functions that may be used to transform data for plotting.
        @type performanceTable: PerformanceTable
        @param performanceTable: Measures and records performance (time and memory consumption) of the drawing process.
        @type plotRange: PlotRange
        @param plotRange: The bounding box of plot coordinates that this function will expand.
        @raise PmmlValidationError: If the "sliced" or "profiled" expression is missing or duplicated, or if the C{levels} attribute has a value this method does not handle.
        """

        self.checkRoles(["sliced", "profiled"])

        slicedExpression = self.xpath("pmml:PlotExpression[@role='sliced']")
        profiledExpression = self.xpath("pmml:PlotNumericExpression[@role='profiled']")
        cutExpression = self.xpath("pmml:PlotSelection")
        if len(slicedExpression) != 1:
            raise defs.PmmlValidationError("PlotBoxAndWhisker requires a PlotExpression with role \"sliced\"")
        if len(profiledExpression) != 1:
            raise defs.PmmlValidationError("PlotBoxAndWhisker requires a PlotNumericExpression with role \"profiled\"")

        # The expressions and the selection are evaluated before this method's
        # own timer is started.
        slicedDataColumn = slicedExpression[0].evaluate(dataTable, functionTable, performanceTable)
        profiledDataColumn = profiledExpression[0].evaluate(dataTable, functionTable, performanceTable)

        if len(cutExpression) == 1:
            selection = cutExpression[0].select(dataTable, functionTable, performanceTable)
        else:
            selection = NP("ones", len(dataTable), NP.dtype(bool))

        performanceTable.begin("PlotBoxAndWhisker prepare")
        self._saveContext(dataTable)

        # In-place: keep only rows that are valid in both columns.
        if slicedDataColumn.mask is not None:
            NP("logical_and", selection, NP(slicedDataColumn.mask == defs.VALID), selection)
        if profiledDataColumn.mask is not None:
            NP("logical_and", selection, NP(profiledDataColumn.mask == defs.VALID), selection)

        slicedArray = slicedDataColumn.data[selection]
        profiledArray = profiledDataColumn.data[selection]
        
        # With a stateId, the dictionary is shared through dataTable.state;
        # without one, it is a fresh dictionary for this call only.
        persistentState = {}
        stateId = self.get("stateId")
        if stateId is not None:
            if stateId in dataTable.state:
                persistentState = dataTable.state[stateId]
            else:
                dataTable.state[stateId] = persistentState

        intervals = self.xpath("pmml:Interval")
        values = self.xpath("pmml:Value")

        # First call for this state: decide the binning scheme and create
        # empty per-bin distributions to accumulate into.
        if "binType" not in persistentState:
            performanceTable.begin("establish binType")

            binType = PlotHistogram.establishBinType(slicedDataColumn.fieldType, intervals, values)
            persistentState["binType"] = binType

            if binType == "nonuniform":
                persistentState["distributions"] = [NP("empty", 0, dtype=profiledDataColumn.fieldType.dtype) for x in xrange(len(intervals))]

            elif binType == "explicit":
                persistentState["distributions"] = [NP("empty", 0, dtype=profiledDataColumn.fieldType.dtype) for x in xrange(len(values))]

            elif binType == "unique":
                persistentState["distributions"] = {}

            elif binType == "scale":
                numBins = self.get("numBins", convertType=True)
                low = self.get("low", convertType=True)
                high = self.get("high", convertType=True)

                numBins, low, high = PlotHistogram.determineScaleBins(numBins, low, high, slicedArray)

                # The bin layout is stored so that later calls reuse it.
                persistentState["low"] = low
                persistentState["high"] = high
                persistentState["numBins"] = numBins
                persistentState["distributions"] = [NP("empty", 0, dtype=profiledDataColumn.fieldType.dtype) for x in xrange(numBins)]

            performanceTable.end("establish binType")

        if persistentState["binType"] == "nonuniform":
            # One bin per Interval child element.
            performanceTable.begin("binType nonuniform")

            distributions = [None] * len(intervals)
            state.edges = []
            lastLimitPoint = None
            lastClosed = None
            lastInterval = None

            for index, interval in enumerate(intervals):
                # state.edges is passed to selectInterval (presumably it appends this
                # interval's edges to it -- TODO confirm against PlotHistogram).
                selection, lastLimitPoint, lastClosed, lastInterval = PlotHistogram.selectInterval(slicedDataColumn.fieldType, slicedArray, index, len(intervals) - 1, interval, state.edges, lastLimitPoint, lastClosed, lastInterval)

                # A selection of None means the whole profiled array is used for this bin.
                if selection is None:
                    distributions[index] = profiledArray
                else:
                    distributions[index] = profiledArray[selection]

            persistentState["distributions"] = [NP("concatenate", [x, y]) for x, y in itertools.izip(persistentState["distributions"], distributions)]
            distributions = persistentState["distributions"]
            lowEdge = min(low for low, high in state.edges if low is not None)
            highEdge = max(high for low, high in state.edges if high is not None)
            state.slicedFieldType = self.fieldTypeNumeric

            performanceTable.end("binType nonuniform")

        elif persistentState["binType"] == "explicit":
            # One bin per Value child element; the bins are labeled by display strings.
            performanceTable.begin("binType explicit")

            distributions = [None] * len(values)
            displayValues = []

            for index, value in enumerate(values):
                internalValue = slicedDataColumn.fieldType.stringToValue(value["value"])
                displayValues.append(value.get("displayValue", slicedDataColumn.fieldType.valueToString(internalValue, displayValue=True)))

                selection = NP(slicedArray == internalValue)
                distributions[index] = profiledArray[selection]
                
            persistentState["distributions"] = [NP("concatenate", [x, y]) for x, y in itertools.izip(persistentState["distributions"], distributions)]
            distributions = persistentState["distributions"]
            state.edges = displayValues
            state.slicedFieldType = slicedDataColumn.fieldType

            performanceTable.end("binType explicit")

        elif persistentState["binType"] == "unique":
            # One bin per distinct sliced value, keyed by its string form.
            performanceTable.begin("binType unique")

            uniques, inverse = NP("unique", slicedArray, return_inverse=True)

            persistentDistributions = persistentState["distributions"]
            for i, u in enumerate(uniques):
                string = slicedDataColumn.fieldType.valueToString(u, displayValue=False)
                selection = NP(inverse == i)

                if string in persistentDistributions:
                    persistentDistributions[string] = NP("concatenate", [persistentDistributions[string], profiledArray[selection]])
                else:
                    persistentDistributions[string] = profiledArray[selection]

            # Order the bins by decreasing number of entries and keep at most numBins of them.
            tosort = [(len(distribution), string) for string, distribution in persistentDistributions.items()]
            tosort.sort(reverse=True)

            numBins = self.get("numBins", convertType=True)
            if numBins is not None:
                tosort = tosort[:numBins]

            distributions = [persistentDistributions[string] for count, string in tosort]
            state.edges = [slicedDataColumn.fieldType.valueToString(slicedDataColumn.fieldType.stringToValue(string), displayValue=True) for count, string in tosort]
            state.slicedFieldType = slicedDataColumn.fieldType
            
            performanceTable.end("binType unique")

        elif persistentState["binType"] == "scale":
            # numBins equal-width bins between the stored low and high.
            performanceTable.begin("binType scale")

            numBins = persistentState["numBins"]
            low = persistentState["low"]
            high = persistentState["high"]
            binWidth = (high - low) / float(numBins)

            binAssignments = NP("array", NP("floor", NP(NP(slicedArray - low)/binWidth)), dtype=NP.dtype(int))
            distributions = [None] * numBins

            # Rows whose bin index falls outside 0..numBins-1 match no bin and are dropped.
            for index in xrange(numBins):
                selection = NP(binAssignments == index)
                distributions[index] = profiledArray[selection]
                
            persistentState["distributions"] = [NP("concatenate", [x, y]) for x, y in itertools.izip(persistentState["distributions"], distributions)]
            distributions = persistentState["distributions"]
            state.edges = [(low + i*binWidth, low + (i + 1)*binWidth) for i in xrange(numBins)]
            lowEdge = low
            highEdge = high
            state.slicedFieldType = self.fieldTypeNumeric
        
            performanceTable.end("binType scale")

        levels = self.get("levels", defaultFromXsd=True)
        lowWhisker = self.get("lowWhisker", defaultFromXsd=True, convertType=True)
        lowBox = self.get("lowBox", defaultFromXsd=True, convertType=True)
        midLine = self.get("midLine", defaultFromXsd=True, convertType=True)
        highBox = self.get("highBox", defaultFromXsd=True, convertType=True)
        highWhisker = self.get("highWhisker", defaultFromXsd=True, convertType=True)

        # One entry per bin: five levels (low whisker, low box, mid line,
        # high box, high whisker), or None if they could not be computed.
        state.ranges = []
        minProfiled = None
        maxProfiled = None
        for distribution in distributions:
            if levels == "percentage":
                if len(distribution) > 0:
                    state.ranges.append(NP("percentile", distribution, [lowWhisker, lowBox, midLine, highBox, highWhisker]))
                else:
                    state.ranges.append(None)

            elif levels == "standardDeviation":
                mu = NP("mean", distribution)
                sigma = NP("std", distribution, ddof=1)

                # NOTE(review): this stores (level - mean)/std for each level attribute;
                # confirm that this, rather than mean + level*std, is the intended quantity.
                if NP("isfinite", sigma) and sigma > 0.0:
                    state.ranges.append([(lowWhisker - mu)/sigma, (lowBox - mu)/sigma, (midLine - mu)/sigma, (highBox - mu)/sigma, (highWhisker - mu)/sigma])
                else:
                    state.ranges.append(None)

            else:
                # Without this, nothing would be appended for this bin and the
                # lookup below would fail or silently reuse the previous bin.
                raise defs.PmmlValidationError("Unrecognized PlotBoxAndWhisker levels: %r" % (levels,))

            # Track the overall extent of the computed levels.
            if state.ranges[-1] is not None:
                if minProfiled is None:
                    minProfiled = min(state.ranges[-1])
                    maxProfiled = max(state.ranges[-1])
                else:
                    minProfiled = min(minProfiled, min(state.ranges[-1]))
                    maxProfiled = max(maxProfiled, max(state.ranges[-1]))

        state.profiledFieldType = profiledDataColumn.fieldType

        if self.get("vertical", defaultFromXsd=True, convertType=True):
            # Vertical: slices along x, profiled values along y.
            if state.slicedFieldType is self.fieldTypeNumeric:
                plotRange.xminPush(lowEdge, state.slicedFieldType, sticky=False)
                plotRange.xmaxPush(highEdge, state.slicedFieldType, sticky=False)
                if minProfiled is not None:
                    plotRange.yminPush(minProfiled, state.profiledFieldType, sticky=False)
                    plotRange.ymaxPush(maxProfiled, state.profiledFieldType, sticky=False)

            else:
                strings = NP("array", state.edges, dtype=NP.dtype(object))
                if minProfiled is not None:
                    # All entries are the maximum except the first, which is the minimum
                    # (presumably so that expand sees both extremes).
                    values = NP("ones", len(state.edges), dtype=state.profiledFieldType.dtype) * maxProfiled
                    values[0] = minProfiled
                else:
                    values = NP("zeros", len(state.edges), dtype=state.profiledFieldType.dtype)

                plotRange.expand(strings, values, state.slicedFieldType, state.profiledFieldType)

        else:
            # Horizontal: the same as above with the x and y roles exchanged.
            if state.slicedFieldType is self.fieldTypeNumeric:
                plotRange.yminPush(lowEdge, state.slicedFieldType, sticky=False)
                plotRange.ymaxPush(highEdge, state.slicedFieldType, sticky=False)
                if minProfiled is not None:
                    plotRange.xminPush(minProfiled, state.profiledFieldType, sticky=False)
                    plotRange.xmaxPush(maxProfiled, state.profiledFieldType, sticky=False)

            else:
                strings = NP("array", state.edges, dtype=NP.dtype(object))
                if minProfiled is not None:
                    values = NP("ones", len(state.edges), dtype=state.profiledFieldType.dtype) * maxProfiled
                    values[0] = minProfiled
                else:
                    values = NP("zeros", len(state.edges), dtype=state.profiledFieldType.dtype)
                
                plotRange.expand(values, strings, state.profiledFieldType, state.slicedFieldType)

        performanceTable.end("PlotBoxAndWhisker prepare")
示例#28
0
    def mapReduce(self):
        """Assemble the MapReduce job that performs K-means optimization.

        Used by C{optimize} and C{hadoopOptimize}.  The application
        class created here is also kept in
        C{self.KMeansMapReduceApplication}.

        @rtype: MapReduce
        @return: An instance of MapReduce that can either be run in pure-Python mode or submitted to Hadoop.
        """

        model = self.clusteringModel

        # Collect each cluster's starting vector, keyed by its "id"
        # attribute; the 1-based position is passed as the fallback label.
        seedVectors = {}
        for position, clusterElement in enumerate(model.xpath("pmml:Cluster"), 1):
            label = clusterElement.get("id", "%d" % position)
            coordinates = clusterElement.childOfTag("Array").values()
            seedVectors[label] = NP("array", coordinates, dtype=NP.dtype(float))

        class KMeansMapReduceApplication(MapReduceKMeans):
            metadata = {}
            allChangeThreshold = self.allChangeThreshold

        KMeansMapReduceApplication.metadata["ClusteringModel"] = model
        KMeansMapReduceApplication.metadata["clusterVectors"] = seedVectors
        self.KMeansMapReduceApplication = KMeansMapReduceApplication

        return MapReduce(KMeansMapReduceApplication)
示例#29
0
    def smallTrials(self, dataTable, numberOfTrials=5, recordsPerTrial=100, performanceTable=None):
        """Improve the initial seed with a few small trials on random subsets of the data.

        Each trial reseeds the clusters with C{randomSeeds}, runs the
        MapReduce job on a random sample of rows, and scores the result
        by the mean squared "smallTrials.affinity" over the valid rows of
        the sample.  The cluster vectors of the trial with the smallest
        score are installed with C{explicitSeeds}.

        Modifies C{self.clusteringModel}.

        @type dataTable: DataTable
        @param dataTable: The input data.
        @type numberOfTrials: int
        @param numberOfTrials: The number of independent trials with the same number of C{recordsPerTrial}.  The trial with the smallest sum of in-cluster variances wins.
        @type recordsPerTrial: int
        @param recordsPerTrial: The number of rows to randomly select from the DataTable in each trial.  If the DataTable has fewer rows than this, every row is used.
        @type performanceTable: PerformanceTable or None
        @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
        """

        if performanceTable is None:
            performanceTable = FakePerformanceTable()

        performanceTable.begin("smallTrials")

        mapReduce = self.mapReduce()

        # Swap the application's ClusteringModel entry for a deep copy of itself.
        self.KMeansMapReduceApplication.metadata["ClusteringModel"] = copy.deepcopy(self.KMeansMapReduceApplication.metadata["ClusteringModel"])

        # random.sample raises ValueError when asked for more items than the
        # population contains, so never request more rows than the table has.
        numberOfRows = len(dataTable)
        sampleSize = min(recordsPerTrial, numberOfRows)

        bestVariance = None
        bestSeed = None
        for trialNumber in xrange(numberOfTrials):
            indexes = random.sample(xrange(numberOfRows), sampleSize)
            subTable = dataTable.subTable(NP("array", indexes, dtype=NP.dtype(int)))

            self.randomSeeds(dataTable)
            mapReduce.metadata["ClusteringModel"] = self.clusteringModel

            outputRecords, outputKeyValues, numberOfIterations = mapReduce.run([subTable], parallel=False, frozenClass=False, numberOfMappers=1, numberOfReducers=1, iterationLimit=self.iterationLimit)

            # Accumulate the iteration count in the model's bookkeeping Extension(s).
            for extension in self.clusteringModel.xpath("pmml:Extension[@name='iterations.smallTrials']"):
                extension["value"] = repr(int(extension["value"]) + numberOfIterations)

            # Score the sample with the trial's model, requesting the "affinity" sub-field.
            mapReduce.metadata["ClusteringModel"]["modelName"] = "smallTrials"
            mapReduce.metadata["ClusteringModel"].subFields = dict(mapReduce.metadata["ClusteringModel"].subFields)
            mapReduce.metadata["ClusteringModel"].subFields.update({"affinity": True})
            mapReduce.metadata["ClusteringModel"].calculate(subTable)

            data = subTable.fields["smallTrials.affinity"].data
            mask = subTable.fields["smallTrials.affinity"].mask
            if mask is None:
                variance = NP(data**2).sum() / float(len(subTable))
            else:
                selection = NP(mask == defs.VALID)
                denom = NP("count_nonzero", selection)
                if denom > 0:
                    variance = NP(data[selection]**2).sum() / float(denom)
                else:
                    # No valid rows: this trial cannot be ranked.
                    variance = None
            if variance is not None and (bestVariance is None or variance < bestVariance):
                bestVariance = variance
                bestSeed = mapReduce.metadata["clusterVectors"]

        if bestSeed is not None:
            self.explicitSeeds(bestSeed)

        performanceTable.end("smallTrials")
示例#30
0
    def prepare(self, state, dataTable, functionTable, performanceTable, plotRange):
        """Prepare a plot element for drawing.

        This stage consists of calculating all quantities and
        determining the bounds of the data.  These bounds may be
        unioned with bounds from other plot elements that overlay this
        plot element, so the drawing (which requires a finalized
        coordinate system) cannot begin yet.

        Rows are kept only if they pass the optional PlotSelection and
        have valid x, y, and error bar values.  When the element has a
        C{stateId}, the kept points are appended to the points stored
        under that id in C{dataTable.state}.

        This method modifies C{plotRange}.

        @type state: ad-hoc Python object
        @param state: State information that persists long enough to use quantities computed in C{prepare} in the C{draw} stage.  This is a work-around of lxml's refusal to let its Python instances maintain C{self} and it is unrelated to DataTableState.
        @type dataTable: DataTable
        @param dataTable: Contains the data to plot.
        @type functionTable: FunctionTable
        @param functionTable: Defines functions that may be used to transform data for plotting.
        @type performanceTable: PerformanceTable
        @param performanceTable: Measures and records performance (time and memory consumption) of the drawing process.
        @type plotRange: PlotRange
        @param plotRange: The bounding box of plot coordinates that this function will expand.
        @raise PmmlValidationError: If the "x"/"y" expressions are not each present exactly once, or if the error bar roles are combined in an unsupported way.
        """

        def evaluateErrorBars(symmetric, upward, downward, complaint):
            # Returns (up, down): both None without error bars, (value, None)
            # for a symmetric bar, and a pair for asymmetric bars.
            counts = (len(symmetric), len(upward), len(downward))
            if counts == (0, 0, 0):
                return None, None
            if counts == (1, 0, 0):
                return symmetric[0].evaluate(dataTable, functionTable, performanceTable), None
            if counts == (0, 1, 1):
                up = upward[0].evaluate(dataTable, functionTable, performanceTable)
                down = downward[0].evaluate(dataTable, functionTable, performanceTable)
                return up, down
            raise defs.PmmlValidationError(complaint)

        self._saveContext(dataTable)

        self.checkRoles(["x", "y", "x-errorbar", "x-errorbar-up", "x-errorbar-down", "y-errorbar", "y-errorbar-up", "y-errorbar-down", "weight"])

        xNodes = self.xpath("pmml:PlotNumericExpression[@role='x']")
        yNodes = self.xpath("pmml:PlotNumericExpression[@role='y']")

        cutNodes = self.xpath("pmml:PlotSelection")

        exNodes = self.xpath("pmml:PlotNumericExpression[@role='x-errorbar']")
        exupNodes = self.xpath("pmml:PlotNumericExpression[@role='x-errorbar-up']")
        exdownNodes = self.xpath("pmml:PlotNumericExpression[@role='x-errorbar-down']")

        eyNodes = self.xpath("pmml:PlotNumericExpression[@role='y-errorbar']")
        eyupNodes = self.xpath("pmml:PlotNumericExpression[@role='y-errorbar-up']")
        eydownNodes = self.xpath("pmml:PlotNumericExpression[@role='y-errorbar-down']")

        weightNodes = self.xpath("pmml:PlotNumericExpression[@role='weight']")

        if not (len(xNodes) == 1 and len(yNodes) == 1):
            raise defs.PmmlValidationError("PlotScatter requires two PlotNumericExpressions, one with role \"x\", the other with role \"y\"")

        xValues = xNodes[0].evaluate(dataTable, functionTable, performanceTable)
        yValues = yNodes[0].evaluate(dataTable, functionTable, performanceTable)

        # Without a PlotSelection, every row starts out selected.
        if len(cutNodes) == 1:
            selection = cutNodes[0].select(dataTable, functionTable, performanceTable)
        else:
            selection = NP("ones", len(dataTable), NP.dtype(bool))

        exup, exdown = evaluateErrorBars(exNodes, exupNodes, exdownNodes, "Use \"x-errorbar\" for symmetric error bars or \"x-errorbar-up\" and \"x-errorbar-down\" for asymmetric errorbars, but no other combinations")
        eyup, eydown = evaluateErrorBars(eyNodes, eyupNodes, eydownNodes, "Use \"y-errorbar\" for symmetric error bars or \"y-errorbar-up\" and \"y-errorbar-down\" for asymmetric errorbars, but no other combinations")

        weight = None
        if len(weightNodes) == 1:
            weight = weightNodes[0].evaluate(dataTable, functionTable, performanceTable)

        performanceTable.begin("PlotScatter prepare")

        # Narrow the selection (in place) to rows where every coordinate and
        # error bar is valid.  The weight column's mask is not consulted.
        for column in (xValues, yValues, exup, exdown, eyup, eydown):
            if column is not None and column.mask is not None:
                NP("logical_and", selection, NP(column.mask == defs.VALID), selection)

        state.x = xValues.data[selection]
        state.y = yValues.data[selection]
        state.exup = exup.data[selection] if exup is not None else None
        state.exdown = exdown.data[selection] if exdown is not None else None
        state.eyup = eyup.data[selection] if eyup is not None else None
        state.eydown = eydown.data[selection] if eydown is not None else None
        state.weight = weight.data[selection] if weight is not None else None

        stateId = self.get("stateId")
        if stateId is not None:
            # Names of the arrays that are accumulated across calls: x and y
            # always, the optional quantities only when present.
            accumulated = ["x", "y"]
            for optionalName, optionalColumn in (("exup", exup), ("exdown", exdown), ("eyup", eyup), ("eydown", eydown), ("weight", weight)):
                if optionalColumn is not None:
                    accumulated.append(optionalName)

            persistentState = dataTable.state.get(stateId)
            if persistentState is None:
                persistentState = {}
                dataTable.state[stateId] = persistentState
            else:
                # Earlier points come first, followed by this call's points.
                for arrayName in accumulated:
                    setattr(state, arrayName, NP("concatenate", (persistentState[arrayName], getattr(state, arrayName))))

            for arrayName in accumulated:
                persistentState[arrayName] = getattr(state, arrayName)

        plotRange.expand(state.x, state.y, xValues.fieldType, yValues.fieldType)
        performanceTable.end("PlotScatter prepare")
示例#31
0
    def cusum(self, testDistributions, fieldName, dataColumn, state,
              performanceTable):
        """Calculate the score of a CUSUM TestStatistic.

        For each valid row, the running value becomes the larger of
        C{resetValue} and the previous value plus the row's
        log-likelihood ratio (alternate minus baseline); rows that are
        not valid repeat the previous value.

        The CUSUM cumulative sum is a stateful calculation: each row
        depends on the result of the previous row.  To continue
        calculations through multiple calls to C{calc} or
        C{calculate}, pass a DataTableState object and give the
        BaselineModel a C{stateId} attribute.  The C{stateId} is not
        valid in strict PMML, but it can be inserted after validation
        or used in custom-ODG models (C{from augustus.odg import *}).

        @type testDistributions: PmmlBinding
        @param testDistributions: The <TestDistributions> element.
        @type fieldName: string
        @param fieldName: The field name (for error messages).
        @type dataColumn: DataColumn
        @param dataColumn: The field.
        @type state: DataTableState
        @param state: The persistent state object, which is used to initialize the start state and save the end state of the cumulative sum.
        @type performanceTable: PerformanceTable
        @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
        @rtype: dict
        @return: A dictionary mapping PMML "feature" strings to DataColumns; CUSUM only defines the None key ("predictedValue").
        @raise PmmlValidationError: If the Baseline or the Alternate lacks a GaussianDistribution or PoissonDistribution.
        """

        baselineDistributions = testDistributions.xpath(
            "pmml:Baseline/pmml:GaussianDistribution | pmml:Baseline/pmml:PoissonDistribution"
        )
        alternateDistributions = testDistributions.xpath(
            "pmml:Alternate/pmml:GaussianDistribution | pmml:Alternate/pmml:PoissonDistribution"
        )

        if len(baselineDistributions) == 0 or len(alternateDistributions) == 0:
            raise defs.PmmlValidationError(
                "BaselineModel CUSUM requires a Baseline and an Alternate that are either GaussianDistribution or PoissonDistribution"
            )

        # Per-row log-likelihood ratio of the alternate over the baseline.
        alternateLogPdf = alternateDistributions[0].logpdf(dataColumn.data)
        baselineLogPdf = baselineDistributions[0].logpdf(dataColumn.data)
        ratios = alternateLogPdf - baselineLogPdf

        numberOfRows = len(dataColumn)
        if dataColumn.mask is not None:
            good = NP(dataColumn.mask == defs.VALID)
        else:
            good = NP("ones", numberOfRows, dtype=NP.dtype(bool))

        # Resume from the saved running value, if there is one.
        stateId = self.get("stateId")
        last = state.get(stateId) if stateId is not None else None
        if last is None:
            last = 0.0

        resetValue = testDistributions.get("resetValue",
                                           defaultFromXsd=True,
                                           convertType=True)

        output = NP("empty", numberOfRows, dtype=NP.dtype(float))

        performanceTable.begin("fill CUSUM")
        for index in xrange(numberOfRows):
            if good[index]:
                last = max(resetValue, last + ratios[index])
            output[index] = last
        performanceTable.end("fill CUSUM")

        if stateId is not None:
            state[stateId] = last

        return {None: DataColumn(self.scoreType, output, None)}
示例#32
0
    def endReducerKey(self, key):
        """Emit a new position for the cluster named C{key}, if there is one.

        The position has one coordinate per entry of C{self.fieldNames}:
        C{numer / denom} for that field, or 0.0 when the field's
        C{denom} is not positive.  Nothing is emitted when C{key} does
        not match any name in C{self.clusterVectors}.

        @param key: The reducer key to compare against the cluster names.
        """

        for candidate in self.clusterVectors.keys():
            if not (candidate == key):
                continue

            coordinates = []
            for fieldName in self.fieldNames:
                if self.denom[fieldName] > 0.0:
                    coordinates.append(self.numer[fieldName] / self.denom[fieldName])
                else:
                    coordinates.append(0.0)

            self.emit(candidate, NP("array", coordinates, dtype=NP.dtype(float)))
            break
示例#33
0
    def _selectFirst(self, dataTable, functionTable, performanceTable,
                     segmentation):
        """Score each row with the first Segment whose predicate selects it.

        Used by C{calculateScore}.

        Segments are visited in document order.  Each one scores only
        the rows that its predicate selects and that no earlier segment
        has claimed; the loop stops early once every row is claimed.
        Output fields produced by the sub-models are merged into
        C{dataTable.output}.

        @type dataTable: DataTable
        @param dataTable: The input data; its C{output} is modified.
        @type functionTable: FunctionTable
        @param functionTable: Passed through to predicates and sub-models.
        @type performanceTable: PerformanceTable
        @param performanceTable: Measures the efficiency of the calculation.
        @type segmentation: PmmlBinding
        @param segmentation: The element whose Segment children are evaluated.
        @rtype: dict
        @return: A dictionary with the scores under the None key and, if C{self.name} is not None, the winning segment ids under "segment".
        """

        performanceTable.begin("Segmentation selectFirst")

        scoresData = NP("empty", len(dataTable), dtype=NP.dtype(object))
        # NOTE(review): rows that no segment selects keep a zero mask and an
        # unset score, whereas the output columns below start as defs.MISSING;
        # confirm that this difference is intended.
        scoresMask = NP("zeros", len(dataTable), dtype=defs.maskType)
        unfilled = NP("ones", len(dataTable), dtype=NP.dtype(bool))
        segments = NP("empty", len(dataTable), dtype=NP.dtype(object))

        # Output columns created by this call; they stay unlocked until the end.
        newOutputData = []
        for segment in segmentation.childrenOfTag("Segment", iterator=True):
            performanceTable.pause("Segmentation selectFirst")
            selection = segment.childOfClass(PmmlPredicate).evaluate(
                dataTable, functionTable, performanceTable)
            performanceTable.unpause("Segmentation selectFirst")

            # Keep only the rows that no earlier segment has claimed (in place).
            NP("logical_and", selection, unfilled, selection)
            if not selection.any():
                continue

            subTable = dataTable.subTable(selection)
            subModel = segment.childOfClass(PmmlModel)
            performanceTable.pause("Segmentation selectFirst")

            subModel.calculate(subTable, functionTable, performanceTable)
            performanceTable.unpause("Segmentation selectFirst")

            scoresData[selection] = subTable.score.data
            if subTable.score.mask is not None:
                scoresMask[selection] = subTable.score.mask
            else:
                scoresMask[selection] = defs.VALID

            segmentName = segment.get("id")
            if segmentName is not None:
                segments[selection] = segmentName

            for fieldName, dataColumn in subTable.output.items():
                if fieldName not in dataTable.output:
                    # First appearance of this field: build a full-length
                    # column that is MISSING everywhere except the selection.
                    data = NP("empty",
                              len(dataTable),
                              dtype=dataColumn.fieldType.dtype)
                    data[selection] = dataColumn.data

                    mask = NP(
                        NP("ones", len(dataTable), dtype=defs.maskType) *
                        defs.MISSING)
                    if dataColumn.mask is None:
                        mask[selection] = defs.VALID
                    else:
                        mask[selection] = dataColumn.mask

                    newDataColumn = DataColumn(dataColumn.fieldType, data,
                                               mask)
                    newDataColumn._unlock()
                    dataTable.output[fieldName] = newDataColumn
                    newOutputData.append(newDataColumn)

                else:
                    newDataColumn = dataTable.output[fieldName]

                    newDataColumn.data[selection] = dataColumn.data
                    if dataColumn.mask is None:
                        newDataColumn.mask[selection] = defs.VALID
                    else:
                        newDataColumn.mask[selection] = dataColumn.mask

            # Mark the selected rows as claimed.  NumPy rejects "-" between
            # boolean arrays, and selection is a subset of unfilled here, so
            # clearing the selected entries gives the intended result.
            unfilled[selection] = False
            if not unfilled.any():
                break

        for newDataColumn in newOutputData:
            # A mask with no nonzero entries is dropped.
            if not newDataColumn.mask.any():
                newDataColumn._mask = None
            newDataColumn._lock()

        if not scoresMask.any():
            scoresMask = None

        scores = DataColumn(self.scoreType, scoresData, scoresMask)

        performanceTable.end("Segmentation selectFirst")

        if self.name is None:
            return {None: scores}
        return {
            None: scores,
            "segment": DataColumn(self.scoreTypeSegment, segments, None)
        }
示例#34
0
    def smallTrials(self,
                    dataTable,
                    numberOfTrials=5,
                    recordsPerTrial=100,
                    performanceTable=None):
        """Improve the initial seed with a few small trials on random subsets of the data.

        Each trial reseeds the clusters with C{randomSeeds}, runs the
        MapReduce job on a random sample of rows, and scores the result
        by the mean squared "smallTrials.affinity" over the valid rows of
        the sample.  The cluster vectors of the trial with the smallest
        score are installed with C{explicitSeeds}.

        Modifies C{self.clusteringModel}.

        @type dataTable: DataTable
        @param dataTable: The input data.
        @type numberOfTrials: int
        @param numberOfTrials: The number of independent trials with the same number of C{recordsPerTrial}.  The trial with the smallest sum of in-cluster variances wins.
        @type recordsPerTrial: int
        @param recordsPerTrial: The number of rows to randomly select from the DataTable in each trial.  If the DataTable has fewer rows than this, every row is used.
        @type performanceTable: PerformanceTable or None
        @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
        """

        if performanceTable is None:
            performanceTable = FakePerformanceTable()

        performanceTable.begin("smallTrials")

        mapReduce = self.mapReduce()

        # Swap the application's ClusteringModel entry for a deep copy of
        # itself.
        self.KMeansMapReduceApplication.metadata[
            "ClusteringModel"] = copy.deepcopy(
                self.KMeansMapReduceApplication.metadata["ClusteringModel"])

        # random.sample raises ValueError when asked for more items than the
        # population contains, so never request more rows than the table has.
        numberOfRows = len(dataTable)
        sampleSize = min(recordsPerTrial, numberOfRows)

        bestVariance = None
        bestSeed = None
        for trialNumber in xrange(numberOfTrials):
            indexes = random.sample(xrange(numberOfRows), sampleSize)
            subTable = dataTable.subTable(
                NP("array", indexes, dtype=NP.dtype(int)))

            self.randomSeeds(dataTable)
            mapReduce.metadata["ClusteringModel"] = self.clusteringModel

            outputRecords, outputKeyValues, numberOfIterations = mapReduce.run(
                [subTable],
                parallel=False,
                frozenClass=False,
                numberOfMappers=1,
                numberOfReducers=1,
                iterationLimit=self.iterationLimit)

            # Accumulate the iteration count in the model's bookkeeping
            # Extension(s).
            for extension in self.clusteringModel.xpath(
                    "pmml:Extension[@name='iterations.smallTrials']"):
                extension["value"] = repr(
                    int(extension["value"]) + numberOfIterations)

            # Score the sample with the trial's model, requesting the
            # "affinity" sub-field.
            mapReduce.metadata["ClusteringModel"]["modelName"] = "smallTrials"
            mapReduce.metadata["ClusteringModel"].subFields = dict(
                mapReduce.metadata["ClusteringModel"].subFields)
            mapReduce.metadata["ClusteringModel"].subFields.update(
                {"affinity": True})
            mapReduce.metadata["ClusteringModel"].calculate(subTable)

            data = subTable.fields["smallTrials.affinity"].data
            mask = subTable.fields["smallTrials.affinity"].mask
            if mask is None:
                variance = NP(data**2).sum() / float(len(subTable))
            else:
                selection = NP(mask == defs.VALID)
                denom = NP("count_nonzero", selection)
                if denom > 0:
                    variance = NP(data[selection]**2).sum() / float(denom)
                else:
                    # No valid rows: this trial cannot be ranked.
                    variance = None
            if variance is not None and (bestVariance is None
                                         or variance < bestVariance):
                bestVariance = variance
                bestSeed = mapReduce.metadata["clusterVectors"]

        if bestSeed is not None:
            self.explicitSeeds(bestSeed)

        performanceTable.end("smallTrials")
示例#35
0
    def _selectAllMedianMajority(self, dataTable, functionTable,
                                 performanceTable, segmentation, which):
        """Combine the scores of every Segment that selects a row.

        Used by C{calculateScore}.

        Every Segment whose predicate selects at least one row scores
        those rows with its sub-model.  The valid scores of each row are
        then combined according to C{which}:

          - C{SELECT_ALL}: a tuple of all scores (sub-model output fields are collected into tuples as well);
          - C{MEDIAN}: the median of the scores;
          - C{MAJORITY_VOTE}: the value with the largest number of votes;
          - C{WEIGHTED_MAJORITY_VOTE}: the same, with each vote counted as the Segment's "weight" attribute.

        @type dataTable: DataTable
        @param dataTable: The input data; its C{output} is modified for C{SELECT_ALL}.
        @type functionTable: FunctionTable
        @param functionTable: Passed through to predicates and sub-models.
        @type performanceTable: PerformanceTable
        @param performanceTable: Measures the efficiency of the calculation.
        @type segmentation: PmmlBinding
        @param segmentation: The element whose Segment children are evaluated.
        @param which: One of C{self.SELECT_ALL}, C{self.MEDIAN}, C{self.MAJORITY_VOTE}, C{self.WEIGHTED_MAJORITY_VOTE}.
        @rtype: dict
        @return: A dictionary with the combined scores under the None key.  If C{self.name} is not None, C{SELECT_ALL} adds "segment" and the voting methods add "cardinality".
        @raise ValueError: If C{which} is not one of the four selectors above.
        @raise PmmlValidationError: If C{MEDIAN} is applied to a sub-model whose score dataType is "string", "boolean", or "object".
        """

        if which is self.SELECT_ALL:
            performanceLabel = "Segmentation selectAll"
        elif which is self.MEDIAN:
            performanceLabel = "Segmentation median"
        elif which is self.MAJORITY_VOTE:
            performanceLabel = "Segmentation majorityVote"
        elif which is self.WEIGHTED_MAJORITY_VOTE:
            performanceLabel = "Segmentation weightedMajorityVote"
        else:
            # Fail clearly instead of hitting an unbound performanceLabel below.
            raise ValueError(
                "_selectAllMedianMajority cannot handle the multiple-model method %r"
                % (which,))
        performanceTable.begin(performanceLabel)

        # Per-row accumulators.  For the voting methods each entry is a list
        # of [value, accumulatedWeight] pairs; otherwise a list of scores.
        scores = [[] for x in xrange(len(dataTable))]
        if which is self.SELECT_ALL:
            segments = [[] for x in xrange(len(dataTable))]

        newOutputData = {}
        for segment in segmentation.childrenOfTag("Segment", iterator=True):
            performanceTable.pause(performanceLabel)
            selection = segment.childOfClass(PmmlPredicate).evaluate(
                dataTable, functionTable, performanceTable)
            performanceTable.unpause(performanceLabel)
            if not selection.any():
                continue

            segmentName = segment.get("id")
            # Row numbers in dataTable of the selected rows; enumerated below
            # in step with the rows of subTable.
            indexes = NP("nonzero", selection)[0]

            subTable = dataTable.subTable(selection)
            subModel = segment.childOfClass(PmmlModel)

            performanceTable.pause(performanceLabel)
            subModel.calculate(subTable, functionTable, performanceTable)
            performanceTable.unpause(performanceLabel)

            if which is self.MEDIAN and subTable.score.fieldType.dataType in (
                    "string", "boolean", "object"):
                raise defs.PmmlValidationError(
                    "Segmentation with multipleModelMethod=\"median\" cannot be applied to models that produce dataType \"%s\""
                    % subTable.score.fieldType.dataType)

            scoreData = subTable.score.data
            scoreMask = subTable.score.mask
            if which is self.SELECT_ALL:
                for subIndex, index in enumerate(indexes):
                    if scoreMask is None or scoreMask[subIndex] == defs.VALID:
                        scores[index].append(scoreData[subIndex])
                        segments[index].append(segmentName)

            elif which is self.MEDIAN:
                for subIndex, index in enumerate(indexes):
                    if scoreMask is None or scoreMask[subIndex] == defs.VALID:
                        scores[index].append(scoreData[subIndex])

            elif which in (self.MAJORITY_VOTE, self.WEIGHTED_MAJORITY_VOTE):
                if which is self.MAJORITY_VOTE:
                    weight = 1.0
                else:
                    weight = float(segment.get("weight", 1.0))
                for subIndex, index in enumerate(indexes):
                    if scoreMask is None or scoreMask[subIndex] == defs.VALID:
                        newValue = scoreData[subIndex]
                        score = scores[index]
                        # Add the vote to an existing tally for this value,
                        # or start a new tally.
                        found = False
                        for pair in score:
                            if pair[0] == newValue:
                                pair[1] += weight
                                found = True
                                break
                        if not found:
                            score.append([newValue, weight])

            if which is self.SELECT_ALL:
                for fieldName, dataColumn in subTable.output.items():
                    newData = newOutputData.get(fieldName)
                    if newData is None:
                        newData = [[] for x in xrange(len(dataTable))]
                        newOutputData[fieldName] = newData

                    dataColumnData = dataColumn.data
                    dataColumnMask = dataColumn.mask
                    for subIndex, index in enumerate(indexes):
                        # Only rows with a valid score contribute; an output
                        # value that is not valid is recorded as None.
                        if scoreMask is None or scoreMask[
                                subIndex] == defs.VALID:
                            if dataColumnMask is None or dataColumnMask[
                                    subIndex] == defs.VALID:
                                newData[index].append(dataColumnData[subIndex])
                            else:
                                newData[index].append(None)

        if which is self.SELECT_ALL:
            for fieldName, newData in newOutputData.items():
                finalNewData = NP("empty",
                                  len(dataTable),
                                  dtype=NP.dtype(object))
                for index, newDatum in enumerate(newData):
                    finalNewData[index] = tuple(newDatum)
                dataTable.output[fieldName] = DataColumn(
                    self.scoreType, finalNewData, None)

            finalScoresData = NP("empty",
                                 len(dataTable),
                                 dtype=NP.dtype(object))
            for index, score in enumerate(scores):
                finalScoresData[index] = tuple(score)
            finalScores = DataColumn(self.scoreType, finalScoresData, None)

            if self.name is None:
                performanceTable.end(performanceLabel)
                return {None: finalScores}
            else:
                finalSegmentsData = NP("empty",
                                       len(dataTable),
                                       dtype=NP.dtype(object))
                for index, segment in enumerate(segments):
                    finalSegmentsData[index] = tuple(segment)

                performanceTable.end(performanceLabel)
                return {
                    None:
                    finalScores,
                    "segment":
                    DataColumn(self.scoreTypeSegment, finalSegmentsData, None)
                }

        elif which is self.MEDIAN:
            finalScoresData = NP("empty",
                                 len(dataTable),
                                 dtype=NP.dtype(object))
            finalScoresMask = NP("empty", len(dataTable), dtype=defs.maskType)
            for index, score in enumerate(scores):
                # Rows without any valid score are flagged INVALID.
                if len(score) > 0:
                    finalScoresData[index] = NP("median", score)
                    finalScoresMask[index] = defs.VALID
                else:
                    finalScoresMask[index] = defs.INVALID

            if not finalScoresMask.any():
                finalScoresMask = None
            finalScores = DataColumn(self.scoreType, finalScoresData,
                                     finalScoresMask)

            performanceTable.end(performanceLabel)
            return {None: finalScores}

        elif which in (self.MAJORITY_VOTE, self.WEIGHTED_MAJORITY_VOTE):
            finalScoresData = NP("empty",
                                 len(dataTable),
                                 dtype=NP.dtype(object))
            finalScoresMask = NP("empty", len(dataTable), dtype=defs.maskType)
            cardinality = NP("empty",
                             len(dataTable),
                             dtype=self.scoreTypeCardinality.dtype)

            for index, score in enumerate(scores):
                # Pick the value with the largest tally; the strict ">" means
                # that a tie keeps the value that was tallied first.
                bestN, bestValue = None, None
                for value, N in score:
                    if bestN is None or N > bestN:
                        bestN = N
                        bestValue = value
                if bestN is not None:
                    finalScoresData[index] = bestValue
                    finalScoresMask[index] = defs.VALID
                    cardinality[index] = bestN
                else:
                    finalScoresMask[index] = defs.INVALID
                    cardinality[index] = 0

            if not finalScoresMask.any():
                finalScoresMask = None
            finalScores = DataColumn(self.scoreType, finalScoresData,
                                     finalScoresMask)

            if self.name is None:
                performanceTable.end(performanceLabel)
                return {None: finalScores}
            else:
                finalCardinality = DataColumn(self.scoreTypeCardinality,
                                              cardinality, None)

                performanceTable.end(performanceLabel)
                return {None: finalScores, "cardinality": finalCardinality}
示例#36
0
    def draw(self, state, plotCoordinates, plotDefinitions, performanceTable):
        """Draw the plot element.

        Builds an SVG group holding one path per guide line described
        by the C{PlotVerticalLines}, C{PlotHorizontalLines}, and
        C{PlotLine} children of this element.

        @type state: ad-hoc Python object
        @param state: State information that persists from the C{prepare} stage to the C{draw} stage.  It is not read by this method.
        @type plotCoordinates: PlotCoordinates
        @param plotCoordinates: The coordinate system in which this plot element will be placed.
        @type plotDefinitions: PlotDefinitions
        @param plotDefinitions: The dictionary of key-value pairs that forms the <defs> section of the SVG document.  It is not read by this method.
        @type performanceTable: PerformanceTable
        @param performanceTable: Measures and records performance (time and memory consumption) of the drawing process.
        @rtype: SvgBinding
        @return: An SVG fragment representing the fully drawn plot element.
        @raise PmmlValidationError: If a coordinate attribute is rejected (with a C{ValueError}) by the field type's C{stringToValue}.
        """

        performanceTable.begin("PlotGuideLines draw")

        make = SvgBinding.elementMaker
        group = make.g()

        for guide in self.xpath("pmml:PlotVerticalLines | pmml:PlotHorizontalLines | pmml:PlotLine"):
            # Start from the defaults, let the directive override them, and never fill.
            styleDict = dict(self.styleDefaults)
            override = guide.get("style")
            if override is not None:
                styleDict.update(PlotStyle.toDict(override))
            styleDict["fill"] = "none"
            styleText = PlotStyle.toString(styleDict)

            # Each entry is (x1, y1, x2, y2) in data coordinates.
            endpoints = []

            if guide.hasTag("PlotVerticalLines"):
                try:
                    origin = plotCoordinates.xfieldType.stringToValue(guide["x0"])
                except ValueError:
                    raise defs.PmmlValidationError("Invalid x0: %r" % guide["x0"])

                step = float(guide["spacing"])
                lower = plotCoordinates.innerX1
                upper = plotCoordinates.innerX2

                # Walk upward from x0, then downward starting one step below it.
                positions = list(NP("arange", origin, upper, step, dtype=NP.dtype(float)))
                positions += list(NP("arange", origin - step, lower, -step, dtype=NP.dtype(float)))

                endpoints = [(x, float("-inf"), x, float("inf")) for x in positions]

            elif guide.hasTag("PlotHorizontalLines"):
                # NOTE(review): y0 is converted with xfieldType, not a y-axis field type -- confirm this is intended.
                try:
                    origin = plotCoordinates.xfieldType.stringToValue(guide["y0"])
                except ValueError:
                    raise defs.PmmlValidationError("Invalid y0: %r" % guide["y0"])

                step = float(guide["spacing"])
                lower = plotCoordinates.innerY1
                upper = plotCoordinates.innerY2

                # Walk upward from y0, then downward starting one step below it.
                positions = list(NP("arange", origin, upper, step, dtype=NP.dtype(float)))
                positions += list(NP("arange", origin - step, lower, -step, dtype=NP.dtype(float)))

                endpoints = [(float("-inf"), y, float("inf"), y) for y in positions]

            elif guide.hasTag("PlotLine"):
                # NOTE(review): y1 and y2 are also converted with xfieldType -- confirm this is intended.
                try:
                    startX = plotCoordinates.xfieldType.stringToValue(guide["x1"])
                    startY = plotCoordinates.xfieldType.stringToValue(guide["y1"])
                    endX = plotCoordinates.xfieldType.stringToValue(guide["x2"])
                    endY = plotCoordinates.xfieldType.stringToValue(guide["y2"])
                except ValueError:
                    raise defs.PmmlValidationError("Invalid x1, y1, x2, or y2: %r %r %r %r" % (guide["x1"], guide["y1"], guide["x2"], guide["y2"]))

                endpoints = [(startX, startY, endX, endY)]

            # Transform each segment to global coordinates and emit it as a path.
            for x1, y1, x2, y2 in endpoints:
                X1, Y1 = plotCoordinates(x1, y1)
                X2, Y2 = plotCoordinates(x2, y2)
                group.append(make.path(d="M %r %r L %r %r" % (X1, Y1, X2, Y2), style=styleText))

        svgId = self.get("svgId")
        if svgId is not None:
            group["id"] = svgId

        performanceTable.end("PlotGuideLines draw")

        return group
示例#37
0
    def _sumAverageWeighted(self, dataTable, functionTable, performanceTable,
                            segmentation, which):
        """Combine the scores of all matching segments by sum, average, or weighted average.

        Used by C{calculateScore}.

        @type dataTable: DataTable
        @param dataTable: The input rows; each Segment's predicate is evaluated against it.
        @type functionTable: FunctionTable
        @param functionTable: Passed through to the predicates and sub-models.
        @type performanceTable: PerformanceTable
        @param performanceTable: Timing is paused around predicate evaluation and sub-model calculation.
        @type segmentation: PmmlBinding
        @param segmentation: The element whose C{Segment} children are combined.
        @param which: One of C{self.SUM}, C{self.AVERAGE}, or C{self.WEIGHTED_AVERAGE} (compared by identity).
        @rtype: dict
        @return: A dictionary mapping None to the combined score DataColumn.
        @raise PmmlValidationError: If a sub-model produces a string, boolean, or object score.
        """

        summing = which is self.SUM
        averaging = which is self.AVERAGE
        weighting = which is self.WEIGHTED_AVERAGE

        if summing:
            label = "Segmentation sum"
        elif averaging:
            label = "Segmentation average"
        elif weighting:
            label = "Segmentation weightedAverage"
        performanceTable.begin(label)

        nRows = len(dataTable)
        totals = NP("zeros", nRows, dtype=NP.dtype(object))
        if not summing:
            denominator = NP("zeros", nRows, dtype=NP.dtype(float))
        invalid = NP("zeros", nRows, dtype=NP.dtype(bool))

        for segment in segmentation.childrenOfTag("Segment", iterator=True):
            performanceTable.pause(label)
            matched = segment.childOfClass(PmmlPredicate).evaluate(
                dataTable, functionTable, performanceTable)
            performanceTable.unpause(label)
            if not matched.any():
                continue

            subTable = dataTable.subTable(matched)
            subModel = segment.childOfClass(PmmlModel)
            performanceTable.pause(label)
            subModel.calculate(subTable, functionTable, performanceTable)
            performanceTable.unpause(label)

            score = subTable.score
            producedType = score.fieldType.dataType
            if producedType in ("string", "boolean", "object"):
                raise defs.PmmlValidationError(
                    "Segmentation with multipleModelMethod=\"%s\" cannot be applied to models that produce dataType \"%s\""
                    % (self.childOfTag("Segmentation").get(
                        "multipleModelMethod"), producedType))

            masked = score.mask is not None

            # ignore invalid in matches (like the built-in "+" and "avg" Apply functions)
            # NOTE(review): this narrows the full-table selection in place using the
            # sub-table's mask -- confirm the two arrays have compatible lengths.
            if masked:
                NP("logical_and", matched, NP(score.mask == defs.VALID),
                   matched)

            if summing:
                totals[matched] += score.data
            if averaging:
                totals[matched] += score.data
                denominator[matched] += 1.0
            elif weighting:
                weight = float(segment.get("weight", 1.0))
                totals[matched] += (score.data * weight)
                denominator[matched] += weight

            if masked:
                invalid[matched] = NP("logical_or", invalid[matched],
                                      NP(score.mask != defs.VALID))

        if not summing:
            # Rows that no segment contributed to cannot be averaged.
            NP("logical_or", invalid, NP(denominator == 0.0), invalid)
            valid = NP("logical_not", invalid)
            totals[valid] /= denominator[valid]

        scoresMask = None
        if invalid.any():
            scoresMask = NP(
                NP("array", invalid, dtype=defs.maskType) * defs.INVALID)

        combined = DataColumn(self.scoreType, totals, scoresMask)

        performanceTable.end(label)
        return {None: combined}
示例#38
0
    def _setup(self):
        """Bind the conversion methods and Numpy dtype that match this field's dataType and optype.

        Sets C{toDataColumn}, C{fromDataColumn}, C{dtype},
        C{stringToValue}, C{valueToString}, and C{valueToPython} on
        the instance, fills C{_displayValue} (and the categorical or
        ordinal lookup tables for string fields), sets C{_offset} and
        C{_factor} for the numeric date/time types, and stores
        C{_hash}.

        @raise PmmlValidationError: If a non-continuous field has Intervals, or if the dataType or optype is not recognized.
        """

        if self.optype != "continuous" and len(self.intervals) > 0:
            raise defs.PmmlValidationError("Non-continuous fields cannot have Intervals")

        self._displayValue = {}

        def bind(toDataColumn, fromDataColumn, dtype, stringToValue, valueToString, valueToPython):
            # Install the six type-specific hooks in one place.
            self.toDataColumn = toDataColumn
            self.fromDataColumn = fromDataColumn
            self.dtype = dtype
            self.stringToValue = stringToValue
            self.valueToString = valueToString
            self.valueToPython = valueToPython

        def registerValues(addValue):
            # TODO: merge categorical and ordinal <Value> handling into _checkValues(data, mask)
            self._stringToValue = {}
            self._valueToString = {}
            self._newValuesAllowed = True
            for element in self.values:
                text = element.get("value")
                shown = element.get("displayValue")
                if shown is not None:
                    self._displayValue[text] = shown
                if element.get("property", "valid") == "valid":
                    addValue(text)

        # dataType -> (seconds from 1/1/1970 to the reference date, True when the unit is a day).
        # The year-0 offset is the number of seconds between 1/1/1 B.C. and 1/1/1970, using the
        # astronomical convention that 1 B.C. is "year zero" (which does not exist, even in the
        # proleptic Gregorian calendar) and that this fictitious year would have been a leap
        # year (366 full days): http://en.wikipedia.org/wiki/Year_zero#Astronomers
        # The 1960 and 1980 offsets account for leap years/leap seconds.
        sinceReference = {
            "dateDaysSince[0]": (-62167219200, True),
            "dateDaysSince[1960]": (-315619200, True),
            "dateDaysSince[1970]": (0, True),
            "dateDaysSince[1980]": (315532800, True),
            "dateTimeSecondsSince[0]": (-62167219200, False),
            "dateTimeSecondsSince[1960]": (-315619200, False),
            "dateTimeSecondsSince[1970]": (0, False),
            "dateTimeSecondsSince[1980]": (315532800, False),
        }

        dataType = self.dataType

        if dataType == "object":   # for scoring results that don't fit the PMML pattern
            bind(self._toDataColumn_object, self._fromDataColumn_object, NP.dtype(object),
                 self._stringToValue_object, self._valueToString_object, self._valueToPython)

        elif dataType == "string":
            optype = self.optype

            if optype == "categorical":
                registerValues(self._addCategorical)
                # New categories stay allowed only when no valid <Value> was declared.
                if len(self._stringToValue) > 0:
                    self._newValuesAllowed = False
                bind(self._toDataColumn_internal, self._fromDataColumn, NP.int64,
                     self._stringToValue_categorical, self._valueToString_categorical, self._valueToString_categorical)

            elif optype == "ordinal":
                registerValues(self._addOrdinal)
                self._newValuesAllowed = False
                bind(self._toDataColumn_internal, self._fromDataColumn, NP.dtype(int),
                     self._stringToValue_ordinal, self._valueToString_ordinal, self._valueToString_ordinal)

            elif optype == "continuous":
                bind(self._toDataColumn_string, self._fromDataColumn_object, NP.dtype(object),
                     self._stringToValue_string, self._valueToString_string, self._valueToString_string)

            else:
                raise defs.PmmlValidationError("Unrecognized optype: %s" % self.optype)

        elif dataType == "integer":
            bind(self._toDataColumn_number, self._fromDataColumn_number, NP.dtype(int),
                 self._stringToValue_integer, self._valueToString_integer, self._valueToPython)

        elif dataType == "float":
            bind(self._toDataColumn_number, self._fromDataColumn_number, NP.float32,
                 self._stringToValue_float, self._valueToString_float, self._valueToPython)

        elif dataType == "double":
            bind(self._toDataColumn_number, self._fromDataColumn_number, NP.dtype(float),
                 self._stringToValue_double, self._valueToString_double, self._valueToPython)

        elif dataType == "boolean":
            bind(self._toDataColumn_number, self._fromDataColumn_number, NP.dtype(bool),
                 self._stringToValue_boolean, self._valueToString_boolean, self._valueToPython)

        elif dataType == "date":
            bind(self._toDataColumn_dateTime, self._fromDataColumn, NP.int64,
                 self._stringToValue_date, self._valueToString_date, self._valueToPython_date)

        elif dataType == "time":
            bind(self._toDataColumn_dateTime, self._fromDataColumn, NP.int64,
                 self._stringToValue_time, self._valueToString_time, self._valueToPython_time)

        elif dataType == "dateTime":
            bind(self._toDataColumn_dateTime, self._fromDataColumn, NP.int64,
                 self._stringToValue_dateTime, self._valueToString_dateTime, self._valueToPython_dateTime)

        elif dataType == "timeSeconds":
            self._offset = 0
            self._factor = self._dateTimeResolution            # one second, scaled by the resolution
            # fromDataColumn, valueToString, and valueToPython report modulo 1 day
            bind(self._toDataColumn_dateTimeNumber, self._fromDataColumn_timeSeconds, NP.int64,
                 self._stringToValue_dateTimeNumber, self._valueToString_timeSeconds, self._valueToPython_timeSeconds)

        elif dataType in sinceReference:
            referenceSeconds, countedInDays = sinceReference[dataType]

            # The 1970 types keep a literal zero offset rather than a scaled one.
            if referenceSeconds == 0:
                self._offset = 0
            else:
                self._offset = referenceSeconds * self._dateTimeResolution

            # One day or one second, scaled by the resolution.
            if countedInDays:
                self._factor = 86400 * self._dateTimeResolution
            else:
                self._factor = self._dateTimeResolution

            bind(self._toDataColumn_dateTimeNumber, self._fromDataColumn_dateTimeNumber, NP.int64,
                 self._stringToValue_dateTimeNumber, self._valueToString_dateTimeNumber, self._valueToPython_dateTimeNumber)

        else:
            raise defs.PmmlValidationError("Unrecognized dataType: %s" % self.dataType)

        self._hash = hash((self.dataType, self.optype, tuple(self.values), tuple(self.intervals), self.isCyclic))
示例#39
0
    def evaluate(self, dataTable, functionTable, performanceTable):
        """Evaluate the expression, using a DataTable as input.

        Without a C{groupField}, the selected aggregation function is
        applied to the whole column and its result is returned.  With a
        C{groupField}, the function is applied once per distinct valid
        group value and the result is an object column whose rows are
        dictionaries mapping group value to that group's aggregate.

        If a C{stateId} is given, starting values are read from and
        ending values are written to C{dataTable.state}.  Grouping
        without a C{stateId} uses a throw-away record.

        @type dataTable: DataTable
        @param dataTable: The input DataTable, containing any fields that might be used to evaluate this expression.
        @type functionTable: FunctionTable
        @param functionTable: The FunctionTable, containing any functions that might be called in this expression.
        @type performanceTable: PerformanceTable
        @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
        @rtype: DataColumn
        @return: The result of the calculation as a DataColumn.
        """

        function = self["function"]
        groupField = self.get("groupField")

        if groupField is None:
            performanceLabel = "Aggregate %s" % function
        else:
            performanceLabel = "Aggregate %s groupField" % function
        performanceTable.begin(performanceLabel)

        dataColumn = dataTable.fields[self["field"]]
        whereMask = self.where(dataTable, functionTable, performanceTable)
        stateId = self.get("stateId")

        # Method names (not bound methods) so that only the one needed is looked up.
        # An unrecognized function name maps to None and is skipped below.
        aggregatorName = {
            "count": "functionCount",
            "sum": "functionSum",
            "average": "functionAverage",
            "min": "functionMin",
            "max": "functionMax",
            "multiset": "functionMultiset",
        }.get(function)

        if groupField is None:
            if stateId is None:
                getstate = None
                setstate = None
            else:
                def getstate():
                    return dataTable.state.get(stateId)

                def setstate(value):
                    dataTable.state[stateId] = value

            # With an unrecognized function the input column is returned unchanged.
            if aggregatorName is not None:
                dataColumn = getattr(self, aggregatorName)(dataColumn, whereMask, None, getstate, setstate)

            performanceTable.end(performanceLabel)
            return dataColumn

        groupColumn = dataTable.fields[groupField]
        if groupColumn.mask is None:
            validGroup = groupColumn.data
        else:
            validGroup = groupColumn.data[NP(groupColumn.mask == defs.VALID)]

        # Per-group running values, keyed by the group's string form.  Always
        # bound, so grouping also works when no stateId was given.
        record = {}
        if stateId is not None:
            state = dataTable.state.get(stateId)
            if state is not None:
                record = state

        valuesSeen = dict((stringValue, False) for stringValue in record)

        groupTables = {}
        # Bound before the loop: it is still needed below when this table
        # contains no valid group values but the stored record does.
        groupColumnFieldType = groupColumn.fieldType
        for groupValue in NP("unique", validGroup):
            groupSelection = NP(groupColumn.data == groupValue)
            if groupColumn.mask is not None:
                NP("logical_and", groupSelection, NP(groupColumn.mask == defs.VALID), groupSelection)

            stringValue = groupColumnFieldType.valueToString(groupValue)

            # These closures are handed to the aggregator within this same
            # iteration, while stringValue still names the current group.
            if stringValue in record:
                def getstate():
                    return record[stringValue]
            else:
                getstate = None

            def setstate(value):
                record[stringValue] = value

            valuesSeen[stringValue] = True
            value = groupColumnFieldType.valueToPython(groupValue)

            if aggregatorName is not None:
                groupTables[value] = getattr(self, aggregatorName)(dataColumn, whereMask, groupSelection, getstate, setstate)

        if stateId is not None:
            dataTable.state[stateId] = record

        # Groups known from the stored record but absent from this table still
        # get a column, built from their stored value alone.
        fakeName = {
            "count": "functionCountFake",
            "sum": "functionSumFake",
            "average": "functionAverageFake",
            "min": "functionMinMaxFake",
            "max": "functionMinMaxFake",
            "multiset": "functionMultisetFake",
        }.get(function)

        for stringValue in valuesSeen:
            if valuesSeen[stringValue]:
                continue
            value = groupColumnFieldType.valueToPython(groupColumnFieldType.stringToValue(stringValue))
            if fakeName is not None:
                groupTables[value] = getattr(self, fakeName)(record[stringValue], len(dataTable), dataColumn.fieldType)

        collectLabel = "Aggregate %s groupField collect" % function
        performanceTable.begin(collectLabel)

        fieldType = FakeFieldType("object", "any")
        data = NP("empty", len(dataTable), dtype=NP.dtype(object))

        if function in ("min", "max"):
            # Give every table a mask so the row filter can index it uniformly.
            for table in groupTables.values():
                if table.mask is None:
                    table._mask = NP("zeros", len(table), dtype=defs.maskType)

        # Decides, per function, whether a group appears in a given row's dictionary.
        keepEntry = {
            "count": lambda table, i: table.data[i] != 0,
            "sum": lambda table, i: table.data[i] != 0.0,
            # true for every number except NaN
            "average": lambda table, i: table.data[i] > 0.0 or table.data[i] <= 0.0,
            "min": lambda table, i: table.mask[i] == defs.VALID,
            "max": lambda table, i: table.mask[i] == defs.VALID,
            "multiset": lambda table, i: len(table.data[i]) > 0,
        }.get(function)

        if keepEntry is not None:
            for i in range(len(dataTable)):
                data[i] = dict((value, table.data[i]) for value, table in groupTables.items() if keepEntry(table, i))

        performanceTable.end(collectLabel)
        performanceTable.end(performanceLabel)
        return DataColumn(fieldType, data, None)
示例#40
0
    def functionMax(self, dataColumn, whereMask, groupSelection, getstate,
                    setstate):
        """Finds the maximum of rows in a DataColumn, possibly with an SQL where mask and groupField.

        The result is a running maximum: each output row holds the
        largest selected value seen up to and including that row,
        seeded by the stored state when there is one.  Rows that come
        before any selected value (with no stored state) are marked
        INVALID.

        @type dataColumn: DataColumn
        @param dataColumn: The input data column.
        @type whereMask: 1d Numpy array of bool, or None
        @param whereMask: The result of the SQL where selection.
        @type groupSelection: 1d Numpy array of bool, or None.
        @param groupSelection: Rows corresponding to a particular value of the groupField.
        @type getstate: callable function, or None
        @param getstate: Retrieve starting values from the DataTableState.
        @type setstate: callable function, or None
        @param setstate: Store ending values to the DataTableState.
        @rtype: DataColumn
        @return: A column of maximized rows.
        @raise PmmlValidationError: If the input field is neither continuous nor ordinal.
        """

        fieldType = dataColumn.fieldType

        if fieldType.optype not in ("continuous", "ordinal"):
            raise defs.PmmlValidationError(
                "Aggregate function \"max\" requires a continuous or ordinal input field"
            )

        # Only rows that are valid, pass the where clause, and belong to the group take part.
        if dataColumn.mask is None:
            selection = NP("ones", len(dataColumn), dtype=NP.dtype(bool))
        else:
            selection = NP(dataColumn.mask == defs.VALID)

        if whereMask is not None:
            NP("logical_and", selection, whereMask, selection)

        if groupSelection is not None:
            NP("logical_and", selection, groupSelection, selection)

        # Seed the running maximum from a previous call, if one was stored.
        maximum = None
        if getstate is not None:
            startingState = getstate()
            if startingState is not None:
                maximum = startingState

        data = NP("empty", len(dataColumn), dtype=fieldType.dtype)
        mask = NP("zeros", len(dataColumn), dtype=defs.maskType)

        for i, x in enumerate(dataColumn.data):
            if selection[i]:
                if maximum is None or x > maximum:
                    maximum = x
            if maximum is None:
                # nothing selected yet, so there is no maximum to report
                mask[i] = defs.INVALID
            else:
                data[i] = maximum

        if not mask.any():
            mask = None

        if setstate is not None:
            setstate(maximum)

        return DataColumn(fieldType, data, mask)
示例#41
0
    def _selectMax(self, dataTable, functionTable, performanceTable, segmentation):
        """Used by C{calculateScore}: keep the largest score among matching segments.

        Every C{Segment} whose predicate selects at least one row has
        its sub-model calculated on the selected sub-table.  A row
        seen for the first time takes that segment's score; a row that
        already holds a score keeps the larger of the two.  Output
        fields produced by the sub-models are merged into
        C{dataTable.output} with the same rule.

        @type dataTable: DataTable
        @param dataTable: The input data; its C{output} dictionary is updated in place.
        @type functionTable: FunctionTable
        @param functionTable: Passed through to the predicates and sub-models.
        @type performanceTable: PerformanceTable
        @param performanceTable: Timed under the label "Segmentation max".
        @param segmentation: The element whose "Segment" children are evaluated.
        @rtype: dict
        @return: C{{None: scores}}; rows that were never filled are masked as MISSING.
        @raise PmmlValidationError: If a sub-model's score has dataType "string", "boolean", or "object".
        """

        performanceTable.begin("Segmentation max")

        scoresData = NP("empty", len(dataTable), dtype=NP.dtype(object))
        # complementary row masks: which rows already hold a score, and which do not
        filled = NP("zeros", len(dataTable), dtype=NP.dtype(bool))
        unfilled = NP("ones", len(dataTable), dtype=NP.dtype(bool))

        newOutputData = []
        for segment in segmentation.childrenOfTag("Segment", iterator=True):
            # this timer is paused while the predicate and the sub-model run
            performanceTable.pause("Segmentation max")
            selection = segment.childOfClass(PmmlPredicate).evaluate(dataTable, functionTable, performanceTable)
            performanceTable.unpause("Segmentation max")
            if not selection.any():
                continue

            subTable = dataTable.subTable(selection)
            subModel = segment.childOfClass(PmmlModel)
            performanceTable.pause("Segmentation max")
            subModel.calculate(subTable, functionTable, performanceTable)
            performanceTable.unpause("Segmentation max")

            if subTable.score.fieldType.dataType in ("string", "boolean", "object"):
                raise defs.PmmlValidationError("Segmentation with multipleModelMethod=\"max\" cannot be applied to models that produce dataType \"%s\"" % subTable.score.fieldType.dataType)

            # ignore invalid in matches (like the built-in "min" Apply function)
            # NOTE(review): selection has len(dataTable) entries while the score mask
            # presumably has len(subTable) entries; confirm these shapes always agree.
            if subTable.score.mask is not None:
                NP("logical_and", selection, NP(subTable.score.mask == defs.VALID), selection)

            # full-length masks (index dataTable-sized arrays) ...
            selectionFilled = NP("logical_and", selection, filled)
            selectionUnfilled = NP("logical_and", selection, unfilled)
            # ... and the same split expressed over the selected rows only
            # (index subTable-sized arrays)
            filled_selection = filled[selection]
            unfilled_selection = unfilled[selection]

            left, right = subTable.score.data[filled_selection], scoresData[selectionFilled]
            condition = NP(left > right)
            scoresData[selectionFilled] = NP("where", condition, left, right)
            scoresData[selectionUnfilled] = subTable.score.data[unfilled_selection]

            for fieldName, dataColumn in subTable.output.items():
                if fieldName not in dataTable.output:
                    data = NP("empty", len(dataTable), dtype=dataColumn.fieldType.dtype)
                    # pick only the sub-table rows that are still unfilled, so the
                    # right-hand side has as many entries as selectionUnfilled selects
                    data[selectionUnfilled] = dataColumn.data[unfilled_selection]

                    mask = NP(NP("ones", len(dataTable), dtype=defs.maskType) * defs.MISSING)
                    if dataColumn.mask is None:
                        mask[selectionUnfilled] = defs.VALID
                    else:
                        mask[selectionUnfilled] = dataColumn.mask[unfilled_selection]

                    newDataColumn = DataColumn(dataColumn.fieldType, data, mask)
                    # stays unlocked until every segment has been merged (locked below)
                    newDataColumn._unlock()
                    dataTable.output[fieldName] = newDataColumn
                    newOutputData.append(newDataColumn)

                else:
                    newDataColumn = dataTable.output[fieldName]

                    # rows that already had a score take this segment's output only
                    # where this segment's score is the larger one
                    newDataColumn.data[selectionFilled] = NP("where", condition, dataColumn.data[filled_selection], newDataColumn.data[selectionFilled])
                    newDataColumn.data[selectionUnfilled] = dataColumn.data[unfilled_selection]

                    if dataColumn.mask is None:
                        newDataColumn.mask[selectionUnfilled] = defs.VALID
                    else:
                        newDataColumn.mask[selectionUnfilled] = dataColumn.mask[unfilled_selection]

            # Move the newly scored rows from "unfilled" to "filled".  Logical
            # ufuncs are used instead of += / -= because NumPy rejects the
            # subtraction of boolean arrays.
            NP("logical_or", filled, selectionUnfilled, filled)
            NP("logical_and", unfilled, NP("logical_not", selectionUnfilled), unfilled)

        for newDataColumn in newOutputData:
            # an all-zero mask is replaced by None
            if not newDataColumn.mask.any():
                newDataColumn._mask = None
            newDataColumn._lock()

        if filled.all():
            scoresMask = None
        else:
            scoresMask = NP(NP("logical_not", filled) * defs.MISSING)

        scores = DataColumn(self.scoreType, scoresData, scoresMask)

        performanceTable.end("Segmentation max")
        return {None: scores}
示例#42
0
    def evaluate(self, dataTable, functionTable, performanceTable):
        """Evaluate the expression, using a DataTable as input.

        Without a C{groupField}, the requested aggregate function is
        applied to the whole field and the resulting DataColumn is
        returned directly.  With a C{groupField}, the function is
        applied once per distinct valid group value and the result is
        a column of dictionaries mapping group value to aggregate.  If
        a C{stateId} is present, the running state is read from and
        written back to C{dataTable.state}.

        @type dataTable: DataTable
        @param dataTable: The input DataTable, containing any fields that might be used to evaluate this expression.
        @type functionTable: FunctionTable
        @param functionTable: The FunctionTable, containing any functions that might be called in this expression.
        @type performanceTable: PerformanceTable
        @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
        @rtype: DataColumn
        @return: The result of the calculation as a DataColumn.
        """

        function = self["function"]
        groupField = self.get("groupField")

        if groupField is None:
            performanceLabel = "Aggregate %s" % function
        else:
            performanceLabel = "Aggregate %s groupField" % function
        performanceTable.begin(performanceLabel)

        dataColumn = dataTable.fields[self["field"]]
        whereMask = self.where(dataTable, functionTable, performanceTable)
        stateId = self.get("stateId")

        # Implementations are looked up by name with getattr, so only the
        # method for the requested function is ever resolved.
        aggregatorNames = {
            "count": "functionCount",
            "sum": "functionSum",
            "average": "functionAverage",
            "min": "functionMin",
            "max": "functionMax",
            "multiset": "functionMultiset",
        }
        aggregatorName = aggregatorNames.get(function)

        if groupField is None:
            if stateId is None:
                getstate = None
                setstate = None
            else:

                def getstate():
                    return dataTable.state.get(stateId)

                def setstate(value):
                    dataTable.state[stateId] = value

            # an unrecognized function leaves the input column untouched
            if aggregatorName is not None:
                dataColumn = getattr(self, aggregatorName)(
                    dataColumn, whereMask, None, getstate, setstate)

            performanceTable.end(performanceLabel)
            return dataColumn

        groupColumn = dataTable.fields[groupField]
        if groupColumn.mask is None:
            validGroup = groupColumn.data
        else:
            validGroup = groupColumn.data[NP(groupColumn.mask == defs.VALID)]

        # Per-group state, keyed by the string form of the group value.  It
        # starts empty unless a stateId points at previously saved state, so
        # it is always defined even when no stateId is given.
        record = {}
        if stateId is not None:
            state = dataTable.state.get(stateId)
            if state is not None:
                record = state

        # groups known from saved state; flipped to True when seen in this data
        valuesSeen = dict((stringValue, False) for stringValue in record)

        groupTables = {}
        # bound before the loop so it is available below even when the data
        # contains no valid group value
        groupColumnFieldType = groupColumn.fieldType
        for groupValue in NP("unique", validGroup):
            groupSelection = NP(groupColumn.data == groupValue)
            if groupColumn.mask is not None:
                NP("logical_and", groupSelection,
                   NP(groupColumn.mask == defs.VALID), groupSelection)

            stringValue = groupColumnFieldType.valueToString(groupValue)

            # the key is bound as a default argument so each closure keeps the
            # group it was created for, independent of later loop iterations
            if stringValue in record:

                def getstate(key=stringValue):
                    return record[key]
            else:
                getstate = None

            def setstate(value, key=stringValue):
                record[key] = value

            valuesSeen[stringValue] = True
            value = groupColumnFieldType.valueToPython(groupValue)

            if aggregatorName is not None:
                groupTables[value] = getattr(self, aggregatorName)(
                    dataColumn, whereMask, groupSelection, getstate, setstate)

        if stateId is not None:
            dataTable.state[stateId] = record

        # Groups that exist only in saved state still get a column, built by
        # the matching "Fake" helper from the stored state.
        fakeNames = {
            "count": "functionCountFake",
            "sum": "functionSumFake",
            "average": "functionAverageFake",
            "min": "functionMinMaxFake",
            "max": "functionMinMaxFake",
            "multiset": "functionMultisetFake",
        }
        fakeName = fakeNames.get(function)
        for stringValue, seen in valuesSeen.items():
            if seen:
                continue
            value = groupColumnFieldType.valueToPython(
                groupColumnFieldType.stringToValue(stringValue))
            if fakeName is not None:
                groupTables[value] = getattr(self, fakeName)(
                    record[stringValue], len(dataTable), dataColumn.fieldType)

        performanceTable.begin("Aggregate %s groupField collect" % function)

        fieldType = FakeFieldType("object", "any")
        data = NP("empty", len(dataTable), dtype=NP.dtype(object))

        # keep(table, i) decides whether a group's entry appears in row i
        if function == "count":
            keep = lambda table, i: table.data[i] != 0

        elif function == "sum":
            keep = lambda table, i: table.data[i] != 0.0

        elif function == "average":
            # a NaN entry fails both comparisons and is therefore left out
            keep = lambda table, i: table.data[i] > 0.0 or table.data[i] <= 0.0

        elif function in ("min", "max"):
            # give every table an explicit mask so rows can be tested uniformly
            for table in groupTables.values():
                if table.mask is None:
                    table._mask = NP("zeros", len(table), dtype=defs.maskType)
            keep = lambda table, i: table.mask[i] == defs.VALID

        elif function == "multiset":
            keep = lambda table, i: len(table.data[i]) > 0

        else:
            keep = None

        if keep is not None:
            for i in xrange(len(dataTable)):
                data[i] = dict((value, table.data[i])
                               for value, table in groupTables.items()
                               if keep(table, i))

        performanceTable.end("Aggregate %s groupField collect" % function)
        performanceTable.end(performanceLabel)
        return DataColumn(fieldType, data, None)
示例#43
0
    def _toDataColumn_number(self, data, mask):
        """Build a numeric DataColumn of this field type from C{data} and C{mask}.

        A Numpy array that already has this field's dtype takes a fast
        path in which only NaN entries are added to the mask.  Anything
        else is converted to an array of this dtype; if the bulk
        conversion fails, values are converted one at a time, with
        NaN-like values masked as MISSING and values that cannot be
        converted replaced by C{defs.PADDING} and masked as INVALID.

        @param data: The values to convert.
        @param mask: An optional mask for C{data}, or None.
        @rtype: DataColumn
        @return: The column, after C{_checkValues} and C{_checkIntervals} have been applied.
        """

        def isNanLike(candidate):
            # a float NaN, or a string spelling "NaN" in any letter case
            if isinstance(candidate, float) and math.isnan(candidate):
                return True
            return isinstance(candidate, basestring) and candidate.upper() == "NAN"

        data, mask = self._checkNumpy(data, mask)

        if isinstance(data, NP.ndarray) and (mask is None or isinstance(mask, NP.ndarray)) and data.dtype == self.dtype:
            # fast path: the array is used as-is and only NaNs are flagged
            nanFlags = NP("isnan", data)
            if mask is not None:
                mask[nanFlags] = defs.MISSING
            else:
                mask = NP("array", nanFlags, defs.maskType) * defs.MISSING

        else:
            data, mask = self._checkNonNumpy(data, mask)
            try:
                data = NP("array", data, dtype=self.dtype)
                # mask is handled in the else statement after the except block

            except (ValueError, TypeError):
                # bulk conversion failed: fall back to element-by-element conversion
                size = len(data)
                converted = NP("empty", size, dtype=self.dtype)
                if mask is not None:
                    flags = NP("fromiter", ((defs.MISSING if m else defs.VALID) for m in mask), dtype=defs.maskType, count=len(mask))
                else:
                    flags = NP("zeros", size, dtype=defs.maskType)

                for index, item in enumerate(data):
                    try:
                        converted[index] = item
                        if flags[index] == defs.VALID and isNanLike(item):
                            flags[index] = defs.MISSING
                        # None is sent through the failure path below
                        if item is None:
                            raise TypeError
                    except (ValueError, TypeError):
                        converted[index] = defs.PADDING
                        if flags[index] == defs.VALID:
                            flags[index] = defs.MISSING if isNanLike(item) else defs.INVALID

                if not flags.any():
                    flags = None

                data, mask = converted, flags

            else:
                nanFlags = NP("isnan", data)
                if mask is not None:
                    givenFlags = NP("fromiter", (m != 0 for m in mask), dtype=NP.dtype(bool), count=len(mask))
                    mask = NP(NP("array", NP("logical_or", nanFlags, givenFlags), defs.maskType) * defs.MISSING)
                else:
                    # NOTE(review): unlike the other branches, this one is not
                    # multiplied by defs.MISSING; the two are equivalent only if
                    # defs.MISSING == 1 -- confirm.
                    mask = NP("array", nanFlags, defs.maskType)
                if not mask.any():
                    mask = None

        data, mask = self._checkValues(data, mask)
        data, mask = self._checkIntervals(data, mask)
        return DataColumn(self, data, mask)
示例#44
0
    def compare(self, dataTable, functionTable, performanceTable, centerString, defaultCompareFunction, anyInvalid):
        """Compare input data with a cluster center along the
        direction of this field.

        Cluster distances are computed in two steps: this C{compare}
        function, which determines the distance in the direction of a
        field, and the metric, which combines results from each field.

        @type dataTable: DataTable
        @param dataTable: The input data.
        @type functionTable: FunctionTable
        @param functionTable: A table of functions.
        @type performanceTable: PerformanceTable
        @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
        @type centerString: string
        @param centerString: The center of the cluster in this field, represented as a string.
        @type defaultCompareFunction: string
        @param defaultCompareFunction: The C{compareFunction} defined at the model level, which may be overruled on a per-field basis.
        @type anyInvalid: 1d Numpy array of bool
        @param anyInvalid: Mask for invalid data, accumulated with each C{compare} call.  This method modifies it.
        @rtype: 1d Numpy array of numbers
        @return: The distances or similarities between the input data and the cluster center, along the direction of this field.
        @raise PmmlValidationError: If the C{compareFunction} is not recognized, if "gaussSim" lacks a C{similarityScale}, or if "table" is used with non-integer data, without exactly one Comparisons/Matrix, or with a center that is not a valid row index.
        """

        performanceTable.begin("ClusteringField")

        dataColumn = dataTable.fields[self["field"]]
        # converted once up front; every branch below compares against this value
        centerValue = dataColumn.fieldType.stringToValue(centerString)

        if dataColumn.mask is not None:
            # even though DataColumns are immutable, we're allowed to change the invalid values
            # because they're not defined; set them so that x - y = 0, and hence they'll be
            # effectively skipped in summations without any extra work
            dataColumn._unlock()
            try:
                dataColumn.data[NP(dataColumn.mask != defs.VALID)] = centerValue
            finally:
                # never leave the column unlocked, even if the assignment fails
                dataColumn._lock()

        compareFunction = self.get("compareFunction", defaultCompareFunction)

        if compareFunction == "absDiff":
            result = NP("absolute", NP(dataColumn.data - centerValue))

        elif compareFunction == "gaussSim":
            similarityScale = self.get("similarityScale")
            if similarityScale is None:
                raise defs.PmmlValidationError("If compareFunction is \"gaussSim\", a similarityScale must be provided")
            s = float(similarityScale)
            z = NP(dataColumn.data - centerValue)

            result = NP("exp", NP((-self.LOG2/s**2) * NP(z**2)))

        elif compareFunction == "delta":
            result = NP(dataColumn.data != centerValue)

        elif compareFunction == "equal":
            result = NP(dataColumn.data == centerValue)

        elif compareFunction == "table":
            if dataColumn.fieldType.dataType != "integer":
                raise defs.PmmlValidationError("If compareFunction is \"table\", the data must be integers")

            matrix = self.xpath("pmml:Comparisons/pmml:Matrix")
            if len(matrix) != 1:
                raise defs.PmmlValidationError("If compareFunction is \"table\", ClusteringFields needs a Comparisons/Matrix")
            values = matrix[0].values(convertType=False)

            # A negative index would wrap around to a row counted from the end
            # instead of raising IndexError, so it is rejected explicitly.
            if centerValue < 0:
                raise defs.PmmlValidationError("Cluster center component is %s, but this is an invalid row index for the Comparisons/Matrix (0-indexed)" % centerString)
            try:
                row = values[centerValue]
            except IndexError:
                raise defs.PmmlValidationError("Cluster center component is %s, but this is an invalid row index for the Comparisons/Matrix (0-indexed)" % centerString)

            result = NP("empty", len(dataTable), dtype=NP.dtype(float))
            valid = NP("zeros", len(dataTable), dtype=NP.dtype(bool))
            # data values act as column indexes into the chosen matrix row
            for j, value in enumerate(row):
                selection = NP(dataColumn.data == j)
                result[selection] = dataColumn.fieldType.stringToValue(value)
                NP("logical_or", valid, selection, valid)
            # rows whose value matched no column keep an undefined result; flag them
            NP("logical_or", anyInvalid, NP("logical_not", valid), anyInvalid)

        else:
            # without this branch "result" would be unbound at the return below
            raise defs.PmmlValidationError("Unrecognized compareFunction \"%s\"" % compareFunction)

        performanceTable.end("ClusteringField")
        return result
示例#45
0
    def _sumAverageWeighted(self, dataTable, functionTable, performanceTable, segmentation, which):
        """Used by C{calculateScore}: combine segment scores by sum, average, or weighted average.

        Each C{Segment} whose predicate selects at least one row has
        its sub-model calculated on the selected sub-table, and the
        sub-model's score is accumulated into the selected rows.  For
        the two averaging modes a per-row denominator is accumulated
        as well (1.0 per segment, or the segment's C{weight}
        attribute, defaulting to 1.0), and the total is divided by it
        at the end.

        @type dataTable: DataTable
        @param dataTable: The input data.
        @type functionTable: FunctionTable
        @param functionTable: Passed through to the predicates and sub-models.
        @type performanceTable: PerformanceTable
        @param performanceTable: Timed under a label that depends on C{which}.
        @param segmentation: The element whose "Segment" children are evaluated.
        @param which: One of C{self.SUM}, C{self.AVERAGE}, C{self.WEIGHTED_AVERAGE} (compared by identity).
        @rtype: dict
        @return: C{{None: scores}}; rows flagged invalid are masked as INVALID.
        @raise PmmlValidationError: If a sub-model's score has dataType "string", "boolean", or "object".
        """

        summing = which is self.SUM

        if summing:
            label = "Segmentation sum"
        elif which is self.AVERAGE:
            label = "Segmentation average"
        elif which is self.WEIGHTED_AVERAGE:
            label = "Segmentation weightedAverage"
        performanceTable.begin(label)

        rowCount = len(dataTable)
        scoresData = NP("zeros", rowCount, dtype=NP.dtype(object))
        if not summing:
            # only the averaging modes need something to divide by
            denominator = NP("zeros", rowCount, dtype=NP.dtype(float))
        invalid = NP("zeros", rowCount, dtype=NP.dtype(bool))

        for segment in segmentation.childrenOfTag("Segment", iterator=True):
            # this timer is paused while the predicate and the sub-model run
            performanceTable.pause(label)
            selection = segment.childOfClass(PmmlPredicate).evaluate(dataTable, functionTable, performanceTable)
            performanceTable.unpause(label)
            if not selection.any():
                continue

            subTable = dataTable.subTable(selection)
            subModel = segment.childOfClass(PmmlModel)
            performanceTable.pause(label)
            subModel.calculate(subTable, functionTable, performanceTable)
            performanceTable.unpause(label)

            score = subTable.score
            scoreDataType = score.fieldType.dataType
            if scoreDataType in ("string", "boolean", "object"):
                raise defs.PmmlValidationError("Segmentation with multipleModelMethod=\"%s\" cannot be applied to models that produce dataType \"%s\"" % (self.childOfTag("Segmentation").get("multipleModelMethod"), scoreDataType))

            # ignore invalid in matches (like the built-in "+" and "avg" Apply functions)
            if score.mask is not None:
                NP("logical_and", selection, NP(score.mask == defs.VALID), selection)

            if summing:
                scoresData[selection] += score.data
            if which is self.AVERAGE:
                scoresData[selection] += score.data
                denominator[selection] += 1.0
            elif which is self.WEIGHTED_AVERAGE:
                weight = float(segment.get("weight", 1.0))
                scoresData[selection] += (score.data * weight)
                denominator[selection] += weight

            if score.mask is not None:
                invalid[selection] = NP("logical_or", invalid[selection], NP(score.mask != defs.VALID))

        if not summing:
            # rows that received no contribution cannot be averaged
            NP("logical_or", invalid, NP(denominator == 0.0), invalid)
            valid = NP("logical_not", invalid)
            scoresData[valid] /= denominator[valid]

        scoresMask = None
        if invalid.any():
            scoresMask = NP(NP("array", invalid, dtype=defs.maskType) * defs.INVALID)

        scores = DataColumn(self.scoreType, scoresData, scoresMask)

        performanceTable.end(label)
        return {None: scores}
示例#46
0
    def format(self, subTable, functionTable, performanceTable, score):
        """Extract or post-process output for the output field of a DataTable.

        The C{feature} attribute selects what is produced: no feature
        copies the field named by C{name}; "predictedValue" returns
        the basic score; "predictedDisplayValue" returns the score
        rendered as strings; "transformedValue" evaluates the child
        expression; "decision" evaluates the child expression and maps
        each value onto the matching C{Decision} element; any other
        feature is looked up in C{score}.  Unless the feature is
        "predictedDisplayValue" or "decision", the result is cast if
        this element declares a different C{dataType} or C{optype}.
        When a feature is given, the result is also stored in
        C{subTable.fields} under C{displayName} (or C{name}).

        @type subTable: DataTable
        @param subTable: The DataTable associated with this local lexical scope.
        @type functionTable: FunctionTable or None
        @param functionTable: A table of functions.
        @type performanceTable: PerformanceTable or None
        @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
        @type score: dict
        @param score: Dictionary mapping PMML score "feature" strings to DataColumns.  This dictionary always contains a None key, which is the basic feature ("predictedValue").
        @rtype: DataColumn
        @return: The output that would go into an output field of a DataTable.
        @raise PmmlValidationError: If "transformedValue" or "decision" lacks its expression, if "decision" lacks a Decisions block, or if the feature is not available in C{score}.
        """

        performanceTable.begin("OutputField")

        feature = self.get("feature")
        if feature is None:
            dataColumn = subTable.fields[self["name"]]

        elif feature == "predictedValue":
            dataColumn = score[None]

        elif feature == "predictedDisplayValue":
            original = score[None]
            toString = original.fieldType.valueToString
            data = NP("empty", len(subTable), dtype=NP.dtype(object))
            for i, x in enumerate(original.data):
                data[i] = toString(x)
            dataColumn = DataColumn(FakeFieldType("string", "continuous"),
                                    data, None)

        elif feature == "transformedValue":
            expression = self.childOfClass(PmmlExpression)
            if expression is None:
                raise defs.PmmlValidationError(
                    "OutputField with feature \"transformedValue\" requires an EXPRESSION"
                )

            performanceTable.pause("OutputField")
            dataColumn = expression.evaluate(subTable, functionTable,
                                             performanceTable)
            performanceTable.unpause("OutputField")

        elif feature == "decision":
            decisions = self.childOfTag("Decisions")
            if decisions is None:
                raise defs.PmmlValidationError(
                    "OutputField with feature \"decision\" requires a Decisions block"
                )

            # same guard as for "transformedValue": report a missing expression
            # as a validation error instead of failing on None.evaluate
            expression = self.childOfClass(PmmlExpression)
            if expression is None:
                raise defs.PmmlValidationError(
                    "OutputField with feature \"decision\" requires an EXPRESSION"
                )

            performanceTable.pause("OutputField")
            dataColumn = expression.evaluate(subTable, functionTable,
                                             performanceTable)
            performanceTable.unpause("OutputField")

            if dataColumn.mask is None:
                valid = None
            else:
                valid = NP(dataColumn.mask == defs.VALID)

            fieldType = FakeFieldType("object", "any")
            data = NP("empty", len(subTable), dtype=fieldType.dtype)
            # every row starts out MISSING and becomes VALID once a Decision matches
            mask = NP(
                NP("ones", len(subTable), dtype=defs.maskType) * defs.MISSING)

            for decision in decisions.childrenOfTag("Decision"):
                value = dataColumn.fieldType.stringToValue(decision["value"])

                selection = NP(dataColumn.data == value)
                if valid is not None:
                    NP("logical_and", selection, valid, selection)

                # the Decision element itself is stored in each matching row
                for i in xrange(len(data)):
                    if selection[i]:
                        data[i] = decision

                mask[selection] = defs.VALID

            if not mask.any():
                mask = None

            dataColumn = DataColumn(fieldType, data, mask)

        elif feature in score:
            dataColumn = score[feature]

        else:
            # name the enclosing model (the parent's parent) in the error message
            model = self.getparent()
            if model is not None: model = model.getparent()

            if model is None:
                model = "(orphaned OutputField; no parent model)"
            else:
                model = model.t

            raise defs.PmmlValidationError(
                "Models of type %s do not produce \"%s\" features (or at least, it is not yet implemented by Augustus)"
                % (model, feature))

        dataType = self.get("dataType", dataColumn.fieldType.dataType)
        optype = self.get("optype", dataColumn.fieldType.optype)
        if (dataType != dataColumn.fieldType.dataType
                or optype != dataColumn.fieldType.optype) and feature not in (
                    "predictedDisplayValue", "decision"):
            dataColumn = FieldCastMethods.cast(FakeFieldType(dataType, optype),
                                               dataColumn)

        if feature is not None:
            subTable.fields[self.get("displayName", self["name"])] = dataColumn

        performanceTable.end("OutputField")
        return dataColumn
示例#47
0
    def _selectFirst(self, dataTable, functionTable, performanceTable, segmentation):
        """Used by C{calculateScore}: each row takes the score of the first segment that matches it.

        Segments are visited in order.  A segment's predicate is
        restricted to rows that no earlier segment has claimed; its
        sub-model is calculated on those rows and its score, mask,
        segment C{id}, and output fields are copied into them.  The
        loop stops early once every row has been claimed.

        @type dataTable: DataTable
        @param dataTable: The input data; its C{output} dictionary is updated in place.
        @type functionTable: FunctionTable
        @param functionTable: Passed through to the predicates and sub-models.
        @type performanceTable: PerformanceTable
        @param performanceTable: Timed under the label "Segmentation selectFirst".
        @param segmentation: The element whose "Segment" children are evaluated.
        @rtype: dict
        @return: C{{None: scores}}, plus a "segment" column of segment ids when C{self.name} is not None.
        """

        performanceTable.begin("Segmentation selectFirst")

        scoresData = NP("empty", len(dataTable), dtype=NP.dtype(object))
        scoresMask = NP("zeros", len(dataTable), dtype=defs.maskType)
        # rows that no segment has claimed yet
        unfilled = NP("ones", len(dataTable), dtype=NP.dtype(bool))
        segments = NP("empty", len(dataTable), dtype=NP.dtype(object))

        newOutputData = []
        for segment in segmentation.childrenOfTag("Segment", iterator=True):
            # this timer is paused while the predicate and the sub-model run
            performanceTable.pause("Segmentation selectFirst")
            selection = segment.childOfClass(PmmlPredicate).evaluate(dataTable, functionTable, performanceTable)
            performanceTable.unpause("Segmentation selectFirst")
            # only rows that are still unclaimed can be taken by this segment
            NP("logical_and", selection, unfilled, selection)
            if not selection.any():
                continue

            subTable = dataTable.subTable(selection)
            subModel = segment.childOfClass(PmmlModel)
            performanceTable.pause("Segmentation selectFirst")

            subModel.calculate(subTable, functionTable, performanceTable)
            performanceTable.unpause("Segmentation selectFirst")

            scoresData[selection] = subTable.score.data
            if subTable.score.mask is not None:
                scoresMask[selection] = subTable.score.mask
            else:
                scoresMask[selection] = defs.VALID

            segmentName = segment.get("id")
            if segmentName is not None:
                segments[selection] = segmentName

            for fieldName, dataColumn in subTable.output.items():
                if fieldName not in dataTable.output:
                    data = NP("empty", len(dataTable), dtype=dataColumn.fieldType.dtype)
                    data[selection] = dataColumn.data

                    mask = NP(NP("ones", len(dataTable), dtype=defs.maskType) * defs.MISSING)
                    if dataColumn.mask is None:
                        mask[selection] = defs.VALID
                    else:
                        mask[selection] = dataColumn.mask

                    newDataColumn = DataColumn(dataColumn.fieldType, data, mask)
                    # stays unlocked until every segment has been merged (locked below)
                    newDataColumn._unlock()
                    dataTable.output[fieldName] = newDataColumn
                    newOutputData.append(newDataColumn)

                else:
                    newDataColumn = dataTable.output[fieldName]

                    newDataColumn.data[selection] = dataColumn.data
                    if dataColumn.mask is None:
                        newDataColumn.mask[selection] = defs.VALID
                    else:
                        newDataColumn.mask[selection] = dataColumn.mask

            # Remove the claimed rows from "unfilled".  A logical ufunc is used
            # instead of -= because NumPy rejects the subtraction of boolean arrays.
            NP("logical_and", unfilled, NP("logical_not", selection), unfilled)
            if not unfilled.any():
                break

        for newDataColumn in newOutputData:
            # an all-zero mask is replaced by None
            if not newDataColumn.mask.any():
                newDataColumn._mask = None
            newDataColumn._lock()

        if not scoresMask.any():
            scoresMask = None

        scores = DataColumn(self.scoreType, scoresData, scoresMask)

        performanceTable.end("Segmentation selectFirst")
        if self.name is None:
            return {None: scores}
        return {None: scores, "segment": DataColumn(self.scoreTypeSegment, segments, None)}
示例#48
0
    def calculateScore(self, dataTable, functionTable, performanceTable):
        """Calculate the score of this model.

        This method is called by C{calculate} to separate operations
        that are performed by all models (in C{calculate}) from
        operations that are performed by specific models (in
        C{calculateScore}).

        Every row is compared with the center of each C{Cluster}; the
        row is assigned to the cluster with the smallest distance, and
        ties keep the earliest cluster.  Rows in which any clustering
        field is INVALID are masked as INVALID in every output column.

        @type dataTable: DataTable
        @param dataTable: The DataTable representing this model's lexical scope.
        @type functionTable: FunctionTable or None
        @param functionTable: A table of functions.
        @type performanceTable: PerformanceTable or None
        @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
        @rtype: dict of DataColumns
        @return: A dictionary in which key None maps to the predicted cluster identifier; the keys "predictedDisplayValue", "entity", "clusterId", "entityId", "clusterAffinity", "affinity", and "all.<id>" (one per cluster) are added when the corresponding entry of C{self.subFields} is set.
        @raise PmmlValidationError: If a fieldWeight is negative, a clustering field has dataType "string", a Cluster has no center array or one of the wrong length, or (for distribution-based models) a Covariances/Matrix is missing or non-numeric.
        """

        performanceTable.begin("ClusteringModel")

        performanceTable.begin("set up")

        distributionBased = (self["modelClass"] == "distributionBased")
        clusteringFields = self.xpath(
            "pmml:ClusteringField[not(@isCenterField='false')]")
        fieldWeights = [
            clusteringField.get("fieldWeight",
                                defaultFromXsd=True,
                                convertType=True)
            for clusteringField in clusteringFields
        ]
        for fieldWeight in fieldWeights:
            if fieldWeight < 0.0:
                raise defs.PmmlValidationError(
                    "ClusteringField fieldWeights must all be non-negative (encountered %g)"
                    % fieldWeight)
        clusters = self.xpath("pmml:Cluster")
        comparisonMeasure = self.childOfClass(ComparisonMeasure)
        defaultCompareFunction = comparisonMeasure.get("compareFunction",
                                                       defaultFromXsd=True)
        metric = comparisonMeasure.childOfClass(PmmlClusteringMetric)
        metrictag = metric.t

        performanceTable.end("set up")

        for clusteringField in clusteringFields:
            dataType = dataTable.fields[
                clusteringField["field"]].fieldType.dataType
            if dataType == "string":
                raise defs.PmmlValidationError(
                    "ClusteringField \"%s\" has dataType \"%s\", which cannot be used for clustering"
                    % (clusteringField["field"], dataType))

        missingValueWeights = self.childOfTag("MissingValueWeights")
        if missingValueWeights is None:
            adjustM = None

        else:
            performanceTable.begin("MissingValueWeights")

            missingWeights = missingValueWeights.childOfClass(
                PmmlArray).values(convertType=True)

            # each ClusteringField adds its own contribution to sumNMqi
            sumNMqi = NP("zeros", len(dataTable), dtype=NP.dtype(float))
            for clusteringField, missingWeight in zip(clusteringFields,
                                                      missingWeights):
                clusteringField.addToAdjustM(dataTable, functionTable,
                                             performanceTable, sumNMqi,
                                             missingWeight)

            # rows whose denominator is zero get a neutral factor of 1.0
            adjustM = NP(sum(missingWeights) / sumNMqi)
            adjustM[NP(sumNMqi == 0.0)] = 1.0

            performanceTable.end("MissingValueWeights")

        # a row is invalid if any of its clustering fields is INVALID
        anyInvalid = NP("zeros", len(dataTable), dtype=NP.dtype(bool))
        for clusteringField in clusteringFields:
            mask = dataTable.fields[clusteringField["field"]].mask
            if mask is not None:
                NP("logical_or", anyInvalid, NP(mask == defs.INVALID),
                   anyInvalid)

        bestClusterId = None
        bestClusterAffinity = None
        allClusterAffinities = {}

        for index, cluster in enumerate(clusters):
            array = cluster.childOfClass(PmmlArray)
            if array is None:
                raise defs.PmmlValidationError(
                    "Cluster must have an array to designate its center")

            centerStrings = array.values(convertType=False)
            if len(centerStrings) != len(clusteringFields):
                raise defs.PmmlValidationError(
                    "Cluster array has %d components, but there are %d ClusteringFields with isCenterField=true"
                    % (len(centerStrings), len(clusteringFields)))

            performanceTable.begin(metrictag)

            if distributionBased:
                matrix = cluster.xpath("pmml:Covariances/pmml:Matrix")
                if len(matrix) != 1:
                    raise defs.PmmlValidationError(
                        "In distribution-based clustering, all clusters must have a Covariances/Matrix"
                    )
                try:
                    covarianceMatrix = NP("array",
                                          matrix[0].values(),
                                          dtype=NP.dtype(float))
                except ValueError:
                    raise defs.PmmlValidationError(
                        "Covariances/Matrix must contain real numbers for distribution-based clustering"
                    )

            else:
                covarianceMatrix = None

            state = self._State()
            metric.initialize(state, len(dataTable), len(clusteringFields),
                              distributionBased)

            for clusteringField, centerString, fieldWeight in zip(
                    clusteringFields, centerStrings, fieldWeights):
                if isinstance(metric, PmmlClusteringMetricBinary):
                    metric.accumulateBinary(
                        state, dataTable.fields[clusteringField["field"]],
                        centerString, distributionBased)
                else:
                    performanceTable.pause(metrictag)
                    cxy = clusteringField.compare(dataTable, functionTable,
                                                  performanceTable,
                                                  centerString,
                                                  defaultCompareFunction,
                                                  anyInvalid)
                    performanceTable.unpause(metrictag)
                    metric.accumulate(state, cxy, fieldWeight,
                                      distributionBased)

            distance = metric.finalizeDistance(state, adjustM,
                                               distributionBased,
                                               covarianceMatrix)
            del state

            performanceTable.end(metrictag)

            if index == 0:
                bestClusterId = NP("ones", len(dataTable),
                                   dtype=NP.dtype(int))  # 1-based index
                # Copy: bestClusterAffinity is updated in place below, and
                # this cluster's own distance array is also kept in
                # allClusterAffinities; sharing one array would overwrite
                # the first cluster's "all.<id>" output.
                bestClusterAffinity = distance.copy()

            better = NP(distance < bestClusterAffinity)
            bestClusterId[better] = index + 1  # 1-based index
            bestClusterAffinity[better] = distance[better]

            allClusterAffinities[cluster.get("id",
                                             "%d" % (index + 1))] = distance

        if not anyInvalid.any():
            scoreMask = None
        else:
            scoreMask = NP(anyInvalid * defs.INVALID)

        performanceTable.begin("set scores")
        score = {}

        performanceTable.begin("predictedValue")
        fieldType = FakeFieldType("string", "categorical")
        clusterIdentifiers = NP("empty", len(dataTable), dtype=fieldType.dtype)
        for index, cluster in enumerate(clusters):
            # a Cluster without an "id" is identified by its 1-based position
            value = fieldType.stringToValue(
                cluster.get("id", "%d" % (index + 1)))
            clusterIdentifiers[NP(bestClusterId == (index + 1))] = value
        score[None] = DataColumn(fieldType, clusterIdentifiers, scoreMask)
        performanceTable.end("predictedValue")

        if self.subFields["predictedDisplayValue"]:
            performanceTable.begin("predictedDisplayValue")
            fieldType = FakeFieldType("string", "categorical")
            clusterNames = NP("empty", len(dataTable), dtype=fieldType.dtype)
            for index, cluster in enumerate(clusters):
                value = fieldType.stringToValue(cluster.get("name", ""))
                clusterNames[NP(bestClusterId == (index + 1))] = value
            score["predictedDisplayValue"] = DataColumn(
                fieldType, clusterNames, scoreMask)
            performanceTable.end("predictedDisplayValue")

        if self.subFields["entity"]:
            performanceTable.begin("entity")
            fieldType = FakeFieldType("object", "any")
            entities = NP("empty", len(dataTable), dtype=fieldType.dtype)
            for index, cluster in enumerate(clusters):
                # NOTE(review): "value" is never used below (the Cluster
                # object itself is stored); the call is kept in case
                # stringToValue has side effects -- confirm and remove.
                value = fieldType.stringToValue(cluster.get("name", ""))
                indexPlusOne = index + 1
                # Element-by-element assignment; presumably a masked
                # assignment is avoided because the Cluster object could be
                # treated as a sequence -- TODO confirm.
                for i in xrange(len(entities)):
                    if bestClusterId[i] == indexPlusOne:
                        entities[i] = cluster
            score["entity"] = DataColumn(fieldType, entities, scoreMask)
            performanceTable.end("entity")

        if self.subFields["clusterId"]:
            performanceTable.begin("clusterId")
            fieldType = FakeFieldType("integer", "continuous")
            score["clusterId"] = DataColumn(fieldType, bestClusterId,
                                            scoreMask)
            performanceTable.end("clusterId")

        if self.subFields["entityId"]:
            performanceTable.begin("entityId")
            fieldType = FakeFieldType("integer", "continuous")
            score["entityId"] = DataColumn(fieldType, bestClusterId, scoreMask)
            performanceTable.end("entityId")

        if self.subFields["clusterAffinity"]:
            performanceTable.begin("clusterAffinity")
            fieldType = FakeFieldType("double", "continuous")
            score["clusterAffinity"] = DataColumn(fieldType,
                                                  bestClusterAffinity,
                                                  scoreMask)
            performanceTable.end("clusterAffinity")

        if self.subFields["affinity"]:
            performanceTable.begin("affinity")
            fieldType = FakeFieldType("double", "continuous")
            score["affinity"] = DataColumn(fieldType, bestClusterAffinity,
                                           scoreMask)
            performanceTable.end("affinity")

        if self.subFields["all"]:
            performanceTable.begin("all")
            fieldType = FakeFieldType("double", "continuous")
            for identifier, distance in allClusterAffinities.items():
                score["all.%s" % identifier] = DataColumn(
                    fieldType, distance, scoreMask)
            performanceTable.end("all")

        performanceTable.end("set scores")
        performanceTable.end("ClusteringModel")
        return score
示例#49
0
    def calculateScore(self, dataTable, functionTable, performanceTable):
        """Calculate the score of this model.

        This method is called by C{calculate} to separate operations
        that are performed by all models (in C{calculate}) from
        operations that are performed by specific models (in
        C{calculateScore}).

        For each row, the distance to every C{Cluster} center is
        computed and the cluster with the smallest distance wins (the
        earliest cluster wins ties).  A row is masked as INVALID in all
        outputs if any of its clustering fields is INVALID.

        @type dataTable: DataTable
        @param dataTable: The DataTable representing this model's lexical scope.
        @type functionTable: FunctionTable or None
        @param functionTable: A table of functions.
        @type performanceTable: PerformanceTable or None
        @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
        @rtype: dict of DataColumns
        @return: A dictionary mapping None to the predicted cluster identifier and, for each enabled entry of C{self.subFields}, "predictedDisplayValue", "entity", "clusterId", "entityId", "clusterAffinity", "affinity", and one "all.<id>" per cluster.
        @raise PmmlValidationError: If a fieldWeight is negative, a clustering field has dataType "string", a Cluster lacks a center array or has one of the wrong length, or (distribution-based only) a Covariances/Matrix is missing or non-numeric.
        """

        performanceTable.begin("ClusteringModel")

        performanceTable.begin("set up")

        distributionBased = (self["modelClass"] == "distributionBased")
        clusteringFields = self.xpath("pmml:ClusteringField[not(@isCenterField='false')]")
        fieldWeights = [clusteringField.get("fieldWeight", defaultFromXsd=True, convertType=True) for clusteringField in clusteringFields]
        for fieldWeight in fieldWeights:
            if fieldWeight < 0.0:
                raise defs.PmmlValidationError("ClusteringField fieldWeights must all be non-negative (encountered %g)" % fieldWeight)
        clusters = self.xpath("pmml:Cluster")
        comparisonMeasure = self.childOfClass(ComparisonMeasure)
        defaultCompareFunction = comparisonMeasure.get("compareFunction", defaultFromXsd=True)
        metric = comparisonMeasure.childOfClass(PmmlClusteringMetric)
        metrictag = metric.t

        performanceTable.end("set up")

        for clusteringField in clusteringFields:
            dataType = dataTable.fields[clusteringField["field"]].fieldType.dataType
            if dataType == "string":
                raise defs.PmmlValidationError("ClusteringField \"%s\" has dataType \"%s\", which cannot be used for clustering" % (clusteringField["field"], dataType))

        missingValueWeights = self.childOfTag("MissingValueWeights")
        if missingValueWeights is None:
            adjustM = None

        else:
            performanceTable.begin("MissingValueWeights")

            missingWeights = missingValueWeights.childOfClass(PmmlArray).values(convertType=True)

            # sumNMqi is accumulated in place by each ClusteringField
            sumNMqi = NP("zeros", len(dataTable), dtype=NP.dtype(float))
            for clusteringField, missingWeight in zip(clusteringFields, missingWeights):
                clusteringField.addToAdjustM(dataTable, functionTable, performanceTable, sumNMqi, missingWeight)

            # a zero denominator is replaced by a neutral factor of 1.0
            adjustM = NP(sum(missingWeights) / sumNMqi)
            adjustM[NP(sumNMqi == 0.0)] = 1.0

            performanceTable.end("MissingValueWeights")

        # flag rows in which any clustering field is INVALID
        anyInvalid = NP("zeros", len(dataTable), dtype=NP.dtype(bool))
        for clusteringField in clusteringFields:
            mask = dataTable.fields[clusteringField["field"]].mask
            if mask is not None:
                NP("logical_or", anyInvalid, NP(mask == defs.INVALID), anyInvalid)

        bestClusterId = None
        bestClusterAffinity = None
        allClusterAffinities = {}

        for index, cluster in enumerate(clusters):
            array = cluster.childOfClass(PmmlArray)
            if array is None:
                raise defs.PmmlValidationError("Cluster must have an array to designate its center")

            centerStrings = array.values(convertType=False)
            if len(centerStrings) != len(clusteringFields):
                raise defs.PmmlValidationError("Cluster array has %d components, but there are %d ClusteringFields with isCenterField=true" % (len(centerStrings), len(clusteringFields)))

            performanceTable.begin(metrictag)

            if distributionBased:
                matrix = cluster.xpath("pmml:Covariances/pmml:Matrix")
                if len(matrix) != 1:
                    raise defs.PmmlValidationError("In distribution-based clustering, all clusters must have a Covariances/Matrix")
                try:
                    covarianceMatrix = NP("array", matrix[0].values(), dtype=NP.dtype(float))
                except ValueError:
                    raise defs.PmmlValidationError("Covariances/Matrix must contain real numbers for distribution-based clustering")

            else:
                covarianceMatrix = None

            state = self._State()
            metric.initialize(state, len(dataTable), len(clusteringFields), distributionBased)

            for clusteringField, centerString, fieldWeight in zip(clusteringFields, centerStrings, fieldWeights):
                if isinstance(metric, PmmlClusteringMetricBinary):
                    metric.accumulateBinary(state, dataTable.fields[clusteringField["field"]], centerString, distributionBased)
                else:
                    performanceTable.pause(metrictag)
                    cxy = clusteringField.compare(dataTable, functionTable, performanceTable, centerString, defaultCompareFunction, anyInvalid)
                    performanceTable.unpause(metrictag)
                    metric.accumulate(state, cxy, fieldWeight, distributionBased)

            distance = metric.finalizeDistance(state, adjustM, distributionBased, covarianceMatrix)
            del state

            performanceTable.end(metrictag)

            if index == 0:
                bestClusterId = NP("ones", len(dataTable), dtype=NP.dtype(int))   # 1-based index
                # Must be a copy: the running best is modified in place below,
                # while this same distance array is stored in
                # allClusterAffinities for the first cluster.  Without the
                # copy, the first cluster's "all.<id>" output is clobbered.
                bestClusterAffinity = distance.copy()

            better = NP(distance < bestClusterAffinity)
            bestClusterId[better] = index + 1   # 1-based index
            bestClusterAffinity[better] = distance[better]

            allClusterAffinities[cluster.get("id", "%d" % (index + 1))] = distance

        if not anyInvalid.any():
            scoreMask = None
        else:
            scoreMask = NP(anyInvalid * defs.INVALID)

        performanceTable.begin("set scores")
        score = {}

        performanceTable.begin("predictedValue")
        fieldType = FakeFieldType("string", "categorical")
        clusterIdentifiers = NP("empty", len(dataTable), dtype=fieldType.dtype)
        for index, cluster in enumerate(clusters):
            # clusters without an "id" attribute are named by 1-based position
            value = fieldType.stringToValue(cluster.get("id", "%d" % (index + 1)))
            clusterIdentifiers[NP(bestClusterId == (index + 1))] = value
        score[None] = DataColumn(fieldType, clusterIdentifiers, scoreMask)
        performanceTable.end("predictedValue")

        if self.subFields["predictedDisplayValue"]:
            performanceTable.begin("predictedDisplayValue")
            fieldType = FakeFieldType("string", "categorical")
            clusterNames = NP("empty", len(dataTable), dtype=fieldType.dtype)
            for index, cluster in enumerate(clusters):
                value = fieldType.stringToValue(cluster.get("name", ""))
                clusterNames[NP(bestClusterId == (index + 1))] = value
            score["predictedDisplayValue"] = DataColumn(fieldType, clusterNames, scoreMask)
            performanceTable.end("predictedDisplayValue")

        if self.subFields["entity"]:
            performanceTable.begin("entity")
            fieldType = FakeFieldType("object", "any")
            entities = NP("empty", len(dataTable), dtype=fieldType.dtype)
            for index, cluster in enumerate(clusters):
                # NOTE(review): "value" is unused (the Cluster object itself is stored below);
                # the call is retained in case stringToValue has side effects -- confirm and remove.
                value = fieldType.stringToValue(cluster.get("name", ""))
                indexPlusOne = index + 1
                # Assigned one element at a time; presumably a masked assignment is avoided
                # because the Cluster object could be treated as a sequence -- TODO confirm.
                for i in xrange(len(entities)):
                    if bestClusterId[i] == indexPlusOne:
                        entities[i] = cluster
            score["entity"] = DataColumn(fieldType, entities, scoreMask)
            performanceTable.end("entity")

        if self.subFields["clusterId"]:
            performanceTable.begin("clusterId")
            fieldType = FakeFieldType("integer", "continuous")
            score["clusterId"] = DataColumn(fieldType, bestClusterId, scoreMask)
            performanceTable.end("clusterId")

        if self.subFields["entityId"]:
            performanceTable.begin("entityId")
            fieldType = FakeFieldType("integer", "continuous")
            score["entityId"] = DataColumn(fieldType, bestClusterId, scoreMask)
            performanceTable.end("entityId")

        if self.subFields["clusterAffinity"]:
            performanceTable.begin("clusterAffinity")
            fieldType = FakeFieldType("double", "continuous")
            score["clusterAffinity"] = DataColumn(fieldType, bestClusterAffinity, scoreMask)
            performanceTable.end("clusterAffinity")

        if self.subFields["affinity"]:
            performanceTable.begin("affinity")
            fieldType = FakeFieldType("double", "continuous")
            score["affinity"] = DataColumn(fieldType, bestClusterAffinity, scoreMask)
            performanceTable.end("affinity")

        if self.subFields["all"]:
            performanceTable.begin("all")
            fieldType = FakeFieldType("double", "continuous")
            for identifier, distance in allClusterAffinities.items():
                score["all.%s" % identifier] = DataColumn(fieldType, distance, scoreMask)
            performanceTable.end("all")

        performanceTable.end("set scores")
        performanceTable.end("ClusteringModel")
        return score
示例#50
0
    def _selectMax(self, dataTable, functionTable, performanceTable,
                   segmentation):
        """Used by C{calculateScore}.

        Evaluates every C{Segment} whose predicate selects at least one
        row, scores the selected rows with the segment's model, and
        keeps, row by row, the largest valid score seen so far.  Output
        fields produced by the sub-models are merged into
        C{dataTable.output} as a side effect.

        @type dataTable: DataTable
        @param dataTable: The DataTable to score; its C{output} is modified in place.
        @type functionTable: FunctionTable
        @param functionTable: A table of functions, passed to the predicates and sub-models.
        @type performanceTable: PerformanceTable
        @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
        @param segmentation: The element whose "Segment" children are evaluated.
        @rtype: dict
        @return: A dictionary whose single key, None, maps to the DataColumn of maximum scores; rows for which no segment produced a valid score are masked as MISSING.
        @raise PmmlValidationError: If a sub-model produces a score of dataType "string", "boolean", or "object".
        """

        performanceTable.begin("Segmentation max")

        scoresData = NP("empty", len(dataTable), dtype=NP.dtype(object))
        # filled/unfilled are complementary full-length flags: has this row
        # already received a score from an earlier segment?
        filled = NP("zeros", len(dataTable), dtype=NP.dtype(bool))
        unfilled = NP("ones", len(dataTable), dtype=NP.dtype(bool))

        newOutputData = []
        for segment in segmentation.childrenOfTag("Segment", iterator=True):
            performanceTable.pause("Segmentation max")
            selection = segment.childOfClass(PmmlPredicate).evaluate(
                dataTable, functionTable, performanceTable)
            performanceTable.unpause("Segmentation max")
            if not selection.any():
                continue

            subTable = dataTable.subTable(selection)
            subModel = segment.childOfClass(PmmlModel)
            performanceTable.pause("Segmentation max")
            subModel.calculate(subTable, functionTable, performanceTable)
            performanceTable.unpause("Segmentation max")

            if subTable.score.fieldType.dataType in ("string", "boolean",
                                                     "object"):
                raise defs.PmmlValidationError(
                    "Segmentation with multipleModelMethod=\"max\" cannot be applied to models that produce dataType \"%s\""
                    % subTable.score.fieldType.dataType)

            # Indexes into the sub-table's columns (one entry per selected
            # row), as opposed to the full-length selection* arrays below.
            filled_selection = filled[selection]
            unfilled_selection = unfilled[selection]

            # ignore invalid in matches (like the built-in "min" Apply function)
            if subTable.score.mask is not None:
                # The score mask has one entry per selected row, so it
                # narrows the sub-table indexes directly and the full-length
                # selection through the rows that were selected.
                validScores = NP(subTable.score.mask == defs.VALID)
                filled_selection = NP("logical_and", filled_selection,
                                      validScores)
                unfilled_selection = NP("logical_and", unfilled_selection,
                                        validScores)
                segmentRows = selection
                selection = segmentRows.copy()
                selection[segmentRows] = validScores

            selectionFilled = NP("logical_and", selection, filled)
            selectionUnfilled = NP("logical_and", selection, unfilled)

            # previously scored rows keep the larger score; new rows are
            # simply copied in
            left, right = subTable.score.data[filled_selection], scoresData[
                selectionFilled]
            condition = NP(left > right)
            scoresData[selectionFilled] = NP("where", condition, left, right)
            scoresData[selectionUnfilled] = subTable.score.data[
                unfilled_selection]

            for fieldName, dataColumn in subTable.output.items():
                if fieldName not in dataTable.output:
                    # First time this output field appears: build a
                    # full-length column, MISSING except for the newly
                    # scored rows.
                    data = NP("empty",
                              len(dataTable),
                              dtype=dataColumn.fieldType.dtype)
                    data[selectionUnfilled] = dataColumn.data[
                        unfilled_selection]

                    mask = NP(
                        NP("ones", len(dataTable), dtype=defs.maskType) *
                        defs.MISSING)
                    if dataColumn.mask is None:
                        mask[selectionUnfilled] = defs.VALID
                    else:
                        mask[selectionUnfilled] = dataColumn.mask[
                            unfilled_selection]

                    newDataColumn = DataColumn(dataColumn.fieldType, data,
                                               mask)
                    # kept unlocked so later segments can write into it;
                    # locked again after the loop
                    newDataColumn._unlock()
                    dataTable.output[fieldName] = newDataColumn
                    newOutputData.append(newDataColumn)

                else:
                    newDataColumn = dataTable.output[fieldName]

                    newDataColumn.data[selectionFilled] = NP(
                        "where", condition, dataColumn.data[filled_selection],
                        newDataColumn.data[selectionFilled])
                    newDataColumn.data[selectionUnfilled] = dataColumn.data[
                        unfilled_selection]

                    # NOTE(review): the mask of previously filled rows is
                    # not updated when a later segment wins -- confirm
                    # whether that is intended.
                    if dataColumn.mask is None:
                        newDataColumn.mask[selectionUnfilled] = defs.VALID
                    else:
                        newDataColumn.mask[
                            selectionUnfilled] = dataColumn.mask[
                                unfilled_selection]

            # Move the newly scored rows from "unfilled" to "filled", in
            # place.  Logical ufuncs are used because NumPy rejects "-" on
            # boolean arrays.
            NP("logical_or", filled, selectionUnfilled, filled)
            NP("logical_and", unfilled, NP("logical_not", selectionUnfilled),
               unfilled)

        for newDataColumn in newOutputData:
            # a mask with no nonzero entry is dropped altogether
            if not newDataColumn.mask.any():
                newDataColumn._mask = None
            newDataColumn._lock()

        if filled.all():
            scoresMask = None
        else:
            scoresMask = NP(NP("logical_not", filled) * defs.MISSING)

        scores = DataColumn(self.scoreType, scoresData, scoresMask)

        performanceTable.end("Segmentation max")
        return {None: scores}