def evaluate(self, dataTable, functionTable, performanceTable, returnUnknowns=False):
    """Evaluate the predicate, using a DataTable as input.

    @type dataTable: DataTable
    @param dataTable: The input DataTable, containing any fields that might be used to evaluate this predicate.
    @type functionTable: FunctionTable
    @param functionTable: The FunctionTable, containing any functions that might be called in this predicate.
    @type performanceTable: PerformanceTable
    @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
    @type returnUnknowns: bool
    @param returnUnknowns: If True, return a "mask" for the selection that indicates which rows are unknown, rather than True or False.
    @rtype: 1d Numpy array of bool or 3-tuple of arrays
    @return: Either a simple selection array or (selection, unknowns, encounteredUnknowns).
    """

    performanceTable.begin("Predicate False")

    result = NP("zeros", len(dataTable), dtype=NP.dtype(bool))
    if returnUnknowns:
        unknowns = NP("zeros", len(dataTable), dtype=NP.dtype(bool))
        result = result, unknowns, unknowns

    performanceTable.end("Predicate False")
    return result
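# A minimal sketch of the return convention shared by predicate evaluate()
# methods above, with plain NumPy standing in for Augustus's NP wrapper and a
# bare row count instead of a real DataTable.  Illustrative only.
import numpy as np

def false_predicate(num_rows, return_unknowns=False):
    selection = np.zeros(num_rows, dtype=bool)      # no row ever matches
    if return_unknowns:
        unknowns = np.zeros(num_rows, dtype=bool)   # nothing is unknown either
        return selection, unknowns, unknowns
    return selection

print(false_predicate(5))           # [False False False False False]
print(false_predicate(5, True)[1])  # the unknowns mask, also all False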
def cusum(self, testDistributions, fieldName, dataColumn, state, performanceTable):
    """Calculate the score of a CUSUM TestStatistic.

    The CUSUM (cumulative sum) is a stateful calculation: each row depends on
    the result of the previous row.  To continue calculations through multiple
    calls to C{calc} or C{calculate}, pass a DataTableState object and give the
    BaselineModel a C{stateId} attribute.  The C{stateId} is not valid in strict
    PMML, but it can be inserted after validation or used in custom-ODG models
    (C{from augustus.odg import *}).

    @type testDistributions: PmmlBinding
    @param testDistributions: The <TestDistributions> element.
    @type fieldName: string
    @param fieldName: The field name (for error messages).
    @type dataColumn: DataColumn
    @param dataColumn: The field.
    @type state: DataTableState
    @param state: The persistent state object, which is used to initialize the start state and save the end state of the cumulative sum.
    @type performanceTable: PerformanceTable or None
    @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
    @rtype: dict
    @return: A dictionary mapping PMML "feature" strings to DataColumns; CUSUM only defines the None key ("predictedValue").
    """

    baseline = testDistributions.xpath("pmml:Baseline/pmml:GaussianDistribution | pmml:Baseline/pmml:PoissonDistribution")
    alternate = testDistributions.xpath("pmml:Alternate/pmml:GaussianDistribution | pmml:Alternate/pmml:PoissonDistribution")

    if len(baseline) == 0 or len(alternate) == 0:
        raise defs.PmmlValidationError("BaselineModel CUSUM requires a Baseline and an Alternate that are either GaussianDistribution or PoissonDistribution")

    # log-likelihood ratio of the alternate hypothesis over the baseline
    ratios = alternate[0].logpdf(dataColumn.data) - baseline[0].logpdf(dataColumn.data)

    if dataColumn.mask is None:
        good = NP("ones", len(dataColumn), dtype=NP.dtype(bool))
    else:
        good = NP(dataColumn.mask == defs.VALID)

    stateId = self.get("stateId")
    last = None
    if stateId is not None:
        last = state.get(stateId)
    if last is None:
        last = 0.0

    resetValue = testDistributions.get("resetValue", defaultFromXsd=True, convertType=True)

    output = NP("empty", len(dataColumn), dtype=NP.dtype(float))
    performanceTable.begin("fill CUSUM")

    # stateful loop: each cumulative sum depends on the previous row and never
    # falls below resetValue
    for index in xrange(len(dataColumn)):
        if good[index]:
            last = max(resetValue, last + ratios[index])
        output[index] = last

    performanceTable.end("fill CUSUM")

    if stateId is not None:
        state[stateId] = last

    return {None: DataColumn(self.scoreType, output, None)}
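# A minimal sketch of the CUSUM recurrence above, assuming plain NumPy/SciPy in
# place of Augustus's NP wrapper and distribution bindings.  The parameter
# names (baseline_mean, alternate_mean, reset_value) are illustrative, not PMML.
import numpy as np
from scipy.stats import norm

def cusum_scores(values, baseline_mean, alternate_mean, sigma, reset_value=0.0, last=0.0):
    # log-likelihood ratio of the alternate hypothesis over the baseline
    ratios = norm.logpdf(values, alternate_mean, sigma) - norm.logpdf(values, baseline_mean, sigma)
    output = np.empty(len(values))
    for index, ratio in enumerate(ratios):
        # each row depends on the previous row; never fall below reset_value
        last = max(reset_value, last + ratio)
        output[index] = last
    return output, last   # `last` can be persisted to continue a later batch

scores, carry = cusum_scores(np.array([0.1, 0.3, 2.5, 2.7, 0.2]), 0.0, 2.0, 1.0)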
def pointsToSmoothCurve(xarray, yarray, samples, smoothingScale, loop):
    """Fit a smooth line through a set of given numeric points with a characteristic smoothing scale.

    This is a non-parametric locally linear fit, used to plot data as a smooth line.

    @type xarray: 1d Numpy array of numbers
    @param xarray: Array of x values.
    @type yarray: 1d Numpy array of numbers
    @param yarray: Array of y values.
    @type samples: 1d Numpy array of numbers
    @param samples: Locations at which to fit the C{xarray} and C{yarray} with best-fit positions and derivatives.
    @type smoothingScale: number
    @param smoothingScale: Standard deviation of the Gaussian kernel used to smooth the locally linear fit.
    @type loop: bool
    @param loop: If False, disconnect the end of the fitted curve from the beginning.
    @rtype: 4-tuple of 1d Numpy arrays
    @return: C{xlist}, C{ylist}, C{dxlist}, C{dylist} appropriate for C{formatPathdata}.
    """

    ylist = []
    dylist = []

    for sample in samples:
        # Gaussian kernel weights centered on this sample point
        weights = NP(NP(NP("exp", NP(NP(-0.5 * NP("power", NP(xarray - sample), 2)) / NP(smoothingScale * smoothingScale))) / smoothingScale) / (math.sqrt(2.0 * math.pi)))

        sum1 = weights.sum()
        sumx = NP(weights * xarray).sum()
        sumxx = NP(weights * NP(xarray * xarray)).sum()
        sumy = NP(weights * yarray).sum()
        sumxy = NP(weights * NP(xarray * yarray)).sum()

        # closed-form weighted least-squares solution for a local straight line
        delta = (sum1 * sumxx) - (sumx * sumx)
        intercept = ((sumxx * sumy) - (sumx * sumxy)) / delta
        slope = ((sum1 * sumxy) - (sumx * sumy)) / delta

        ylist.append(intercept + (sample * slope))
        dylist.append(slope)

    xlist = samples
    ylist = NP("array", ylist, dtype=NP.dtype(float))
    dxlist = NP((NP("roll", xlist, -1) - NP("roll", xlist, 1)) / 2.0)
    dylist = NP("array", dylist, dtype=NP.dtype(float)) * dxlist

    if not loop:
        dxlist[0] = 0.0
        dxlist[-1] = 0.0
        dylist[0] = 0.0
        dylist[-1] = 0.0

    return xlist, ylist, dxlist, dylist
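# A short plain-NumPy sketch of the Gaussian-weighted locally linear fit used
# above: at each sample point the data are fit with a straight line whose
# weights fall off with distance, returning the fitted value and slope there.
# This is an illustration under those assumptions, not the Augustus code itself.
import numpy as np

def local_linear_fit(xarray, yarray, sample, smoothing_scale):
    weights = np.exp(-0.5 * (xarray - sample) ** 2 / smoothing_scale ** 2)
    weights /= smoothing_scale * np.sqrt(2.0 * np.pi)
    s1, sx = weights.sum(), (weights * xarray).sum()
    sxx, sy = (weights * xarray * xarray).sum(), (weights * yarray).sum()
    sxy = (weights * xarray * yarray).sum()
    delta = s1 * sxx - sx * sx                    # weighted least-squares determinant
    intercept = (sxx * sy - sx * sxy) / delta
    slope = (s1 * sxy - sx * sy) / delta
    return intercept + sample * slope, slope      # fitted value and derivative

x = np.linspace(0.0, 10.0, 101)
y = np.sin(x) + 0.1 * np.random.randn(101)
value, derivative = local_linear_fit(x, y, 5.0, smoothing_scale=0.5)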
def functionMultiset(self, dataColumn, whereMask, groupSelection, getstate, setstate):
    """Derives a multiset of rows in a DataColumn, possibly with an SQL where mask and groupField.

    @type dataColumn: DataColumn
    @param dataColumn: The input data column.
    @type whereMask: 1d Numpy array of bool, or None
    @param whereMask: The result of the SQL where selection.
    @type groupSelection: 1d Numpy array of bool, or None.
    @param groupSelection: Rows corresponding to a particular value of the groupField.
    @type getstate: callable function
    @param getstate: Retrieve starting values from the DataTableState.
    @type setstate: callable function
    @param setstate: Store ending values to the DataTableState.
    @rtype: DataColumn of dict objects
    @return: A column of multisets, one accumulated snapshot per row.
    """

    fieldType = FakeFieldType("object", "any")

    selection = NP("ones", len(dataColumn), dtype=NP.dtype(bool))
    if dataColumn.mask is not None:
        selection = NP("logical_and", selection, NP(dataColumn.mask == defs.VALID))
    if whereMask is not None:
        NP("logical_and", selection, whereMask, selection)
    if groupSelection is not None:
        NP("logical_and", selection, groupSelection, selection)

    multiset = {}
    if getstate is not None:
        startingState = getstate()
        if startingState is not None:
            multiset = startingState
    current = dict(multiset)

    data = NP("empty", len(dataColumn), dtype=NP.dtype(object))
    toPython = dataColumn.fieldType.valueToPython

    for i, x in enumerate(dataColumn.data):
        if selection[i]:
            value = toPython(x)
            if value not in multiset:
                multiset[value] = 0
            multiset[value] += 1
            current = dict(multiset)
        data[i] = current

    if setstate is not None:
        setstate(multiset)

    return DataColumn(fieldType, data, None)
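# A minimal sketch of the running-multiset aggregation above, assuming plain
# Python/NumPy instead of DataColumn/DataTableState.  Each row receives a
# snapshot of the value counts accumulated so far over the selected rows.
import numpy as np

def running_multiset(values, selection, starting_state=None):
    multiset = dict(starting_state or {})
    snapshots = np.empty(len(values), dtype=object)
    for i, (value, keep) in enumerate(zip(values, selection)):
        if keep:
            multiset[value] = multiset.get(value, 0) + 1
        snapshots[i] = dict(multiset)   # copy so later rows don't mutate it
    return snapshots, multiset          # `multiset` is the state to persist

rows = ["a", "b", "a", "c", "a"]
keep = np.array([True, True, False, True, True])
per_row, final_state = running_multiset(rows, keep)   # final_state == {'a': 2, 'b': 1, 'c': 1}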
def _checkValues(self, data, mask):
    values = self.values
    if len(values) == 0:
        return data, mask

    if mask is None:
        missing = NP("zeros", len(data), dtype=NP.dtype(bool))
        invalid = NP("zeros", len(data), dtype=NP.dtype(bool))
    else:
        missing = NP(mask == defs.MISSING)
        invalid = NP(mask == defs.INVALID)

    valid = NP("zeros", len(data), dtype=NP.dtype(bool))
    numberOfValidSpecified = 0

    for value in values:
        v = value.get("value")
        displayValue = value.get("displayValue")
        if displayValue is not None:
            self._displayValue[v] = displayValue

        prop = value.get("property", "valid")

        try:
            v2 = self.stringToValue(v)
        except ValueError:
            raise defs.PmmlValidationError("Improper value in Value specification: \"%s\"" % v)

        if prop == "valid":
            NP("logical_or", valid, NP(data == v2), valid)
            numberOfValidSpecified += 1
        elif prop == "missing":
            NP("logical_or", missing, NP(data == v2), missing)
        elif prop == "invalid":
            NP("logical_or", invalid, NP(data == v2), invalid)

    if numberOfValidSpecified > 0:
        # guilty until proven innocent
        NP("logical_and", valid, NP("logical_not", missing), valid)
        if valid.all():
            return data, None
        mask = NP(NP("ones", len(data), dtype=defs.maskType) * defs.INVALID)
        mask[missing] = defs.MISSING
        mask[valid] = defs.VALID

    else:
        # innocent until proven guilty
        NP("logical_and", invalid, NP("logical_not", missing), invalid)
        if not NP("logical_or", invalid, missing).any():
            return data, None
        mask = NP("zeros", len(data), dtype=defs.maskType)
        mask[missing] = defs.MISSING
        mask[invalid] = defs.INVALID

    return data, mask
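# A hedged sketch of the Value-checking logic above with plain NumPy: if any
# values are declared "valid", everything else becomes INVALID; otherwise only
# explicitly declared "missing"/"invalid" values are flagged.  The integer
# codes below are illustrative stand-ins for defs.VALID/MISSING/INVALID.
import numpy as np

VALID, MISSING, INVALID = 0, 1, 2

def check_values(data, valid_values=(), missing_values=(), invalid_values=()):
    missing = np.isin(data, missing_values)
    invalid = np.isin(data, invalid_values)
    if len(valid_values) > 0:
        valid = np.isin(data, valid_values) & ~missing   # guilty until proven innocent
        mask = np.full(len(data), INVALID, dtype=np.int8)
        mask[missing] = MISSING
        mask[valid] = VALID
    else:
        invalid &= ~missing                              # innocent until proven guilty
        mask = np.zeros(len(data), dtype=np.int8)
        mask[missing] = MISSING
        mask[invalid] = INVALID
    return mask

mask = check_values(np.array([1, 2, 3, 9, 2]), valid_values=[1, 2, 3], missing_values=[9])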
def _fromDataColumn_number(self, dataColumn):
    if dataColumn.mask is None:
        return NP("array", dataColumn.data, dtype=NP.dtype(object))
    else:
        output = NP("empty", len(dataColumn), dtype=NP.dtype(object))
        mask = dataColumn.mask
        for i, x in enumerate(dataColumn.data):
            if mask[i] == defs.VALID:
                output[i] = x
            elif mask[i] == defs.MISSING:
                output[i] = defs.NAN
            else:
                output[i] = None
        return output
def mapReduce(self): """Build a MapReduce-Ready K-means producer. Used by C{optimize} and C{hadoopOptimize}. @rtype: MapReduce @return: An instance of MapReduce that can either be run in pure-Python mode or submitted to Hadoop. """ class KMeansMapReduceApplication(MapReduceKMeans): metadata = {} allChangeThreshold = self.allChangeThreshold KMeansMapReduceApplication.metadata[ "ClusteringModel"] = self.clusteringModel clusterVectors = {} for index, cluster in enumerate( self.clusteringModel.xpath("pmml:Cluster")): clusterName = cluster.get("id", "%d" % (index + 1)) clusterVectors[clusterName] = NP( "array", cluster.childOfTag("Array").values(), dtype=NP.dtype(float)) KMeansMapReduceApplication.metadata["clusterVectors"] = clusterVectors self.KMeansMapReduceApplication = KMeansMapReduceApplication return MapReduce(KMeansMapReduceApplication)
def evaluate(self, dataTable, functionTable, performanceTable, arguments):
    arguments = [x.evaluate(dataTable, functionTable, performanceTable) for x in arguments]
    performanceTable.begin("built-in \"%s\"" % self.name)

    fieldType = self.allBooleanType(arguments, atleast=2)

    data = NP("zeros", len(dataTable), dtype=fieldType.dtype)
    mask = None
    allbad = NP("ones", len(dataTable), dtype=NP.dtype(bool))

    (data, allbad), mask = self.applySkipMissing((data, allbad), mask, arguments)

    if allbad.any():
        if mask is None:
            mask = allbad * defs.MISSING
        else:
            NP("logical_and", allbad, NP(mask == defs.VALID), allbad)
            mask[allbad] = defs.MISSING

    performanceTable.end("built-in \"%s\"" % self.name)
    return DataColumn(fieldType, data, mask)
def functionMax(self, dataColumn, whereMask, groupSelection, getstate, setstate):
    """Finds the maximum of rows in a DataColumn, possibly with an SQL where mask and groupField.

    @type dataColumn: DataColumn
    @param dataColumn: The input data column.
    @type whereMask: 1d Numpy array of bool, or None
    @param whereMask: The result of the SQL where selection.
    @type groupSelection: 1d Numpy array of bool, or None.
    @param groupSelection: Rows corresponding to a particular value of the groupField.
    @type getstate: callable function
    @param getstate: Retrieve starting values from the DataTableState.
    @type setstate: callable function
    @param setstate: Store ending values to the DataTableState.
    @rtype: DataColumn
    @return: A column of running maximum values.
    """

    fieldType = dataColumn.fieldType
    if fieldType.optype not in ("continuous", "ordinal"):
        raise defs.PmmlValidationError("Aggregate function \"max\" requires a continuous or ordinal input field")

    if dataColumn.mask is None:
        selection = NP("ones", len(dataColumn), dtype=NP.dtype(bool))
    else:
        selection = NP(dataColumn.mask == defs.VALID)

    if whereMask is not None:
        NP("logical_and", selection, whereMask, selection)
    if groupSelection is not None:
        NP("logical_and", selection, groupSelection, selection)

    maximum = None
    if getstate is not None:
        startingState = getstate()
        if startingState is not None:
            maximum = startingState

    data = NP("empty", len(dataColumn), dtype=fieldType.dtype)
    mask = NP("zeros", len(dataColumn), dtype=defs.maskType)

    for i, x in enumerate(dataColumn.data):
        if selection[i]:
            if maximum is None or x > maximum:
                maximum = x
        if maximum is None:
            mask[i] = defs.INVALID
        else:
            data[i] = maximum

    if not mask.any():
        mask = None

    if setstate is not None:
        setstate(maximum)

    return DataColumn(fieldType, data, mask)
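# A minimal sketch of the running-maximum aggregation above: rows before the
# first selected value are marked invalid, and every later row carries the
# maximum seen so far.  Plain NumPy stands in for DataColumn and the defs codes.
import numpy as np

VALID, INVALID = 0, 2

def running_max(values, selection, maximum=None):
    data = np.empty(len(values), dtype=float)
    mask = np.full(len(values), VALID, dtype=np.int8)
    for i, (value, keep) in enumerate(zip(values, selection)):
        if keep and (maximum is None or value > maximum):
            maximum = value
        if maximum is None:
            mask[i] = INVALID    # nothing selected yet, so no maximum is defined
        else:
            data[i] = maximum
    return data, mask, maximum   # `maximum` is the state to persist

data, mask, state = running_max([3.0, 7.0, 5.0, 9.0], [False, True, True, True])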
def evaluate(self, dataTable, functionTable, performanceTable, returnUnknowns=False):
    """Evaluate the predicate, using a DataTable as input.

    @type dataTable: DataTable
    @param dataTable: The input DataTable, containing any fields that might be used to evaluate this predicate.
    @type functionTable: FunctionTable
    @param functionTable: The FunctionTable, containing any functions that might be called in this predicate.
    @type performanceTable: PerformanceTable
    @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
    @type returnUnknowns: bool
    @param returnUnknowns: If True, return a "mask" for the selection that indicates which rows are unknown, rather than True or False.
    @rtype: 1d Numpy array of bool or 3-tuple of arrays
    @return: Either a simple selection array or (selection, unknowns, encounteredUnknowns).
    """

    performanceTable.begin("SimpleSetPredicate")

    fieldName = self.get("field")
    dataColumn = dataTable.fields[fieldName]

    fromString = dataColumn.fieldType.stringToValue
    array = [fromString(x) for x in self.childOfClass(Array).values(convertType=False)]

    selection = NP("in1d", dataColumn.data, array)

    if self.get("booleanOperator") == "isNotIn":
        NP("logical_not", selection, selection)

    if returnUnknowns:
        if dataColumn.mask is None:
            unknowns = NP("zeros", len(dataTable), dtype=NP.dtype(bool))
        else:
            unknowns = NP(dataColumn.mask != defs.VALID)

        performanceTable.end("SimpleSetPredicate")
        return selection, unknowns, unknowns

    else:
        if dataColumn.mask is not None:
            NP("logical_and", selection, NP(dataColumn.mask == defs.VALID), selection)

        performanceTable.end("SimpleSetPredicate")
        return selection
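# A short sketch of the SimpleSetPredicate logic above in plain NumPy:
# membership of each row in the Array of values, optionally negated, with
# unknown (non-valid) rows either reported separately or folded into the
# selection.  Names and the `valid` parameter are illustrative assumptions.
import numpy as np

def simple_set_predicate(data, members, is_not_in=False, valid=None, return_unknowns=False):
    selection = np.in1d(data, members)
    if is_not_in:
        np.logical_not(selection, out=selection)
    if return_unknowns:
        unknowns = np.zeros(len(data), dtype=bool) if valid is None else ~valid
        return selection, unknowns, unknowns
    if valid is not None:
        selection &= valid        # unknown rows never match
    return selection

sel = simple_set_predicate(np.array(["a", "b", "c"]), ["a", "c"])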
def initialize(self, state, numberOfRecords, numberOfFields, distributionBased): """First step in a vectorized metric calculation with missing values, called once before all fields and cluster centers. Only modifies the C{state} object. @type state: ad-hoc Python object @param state: State information that persists long enough to span the three steps of a metric calculation. This is a work-around of lxml's refusal to let its Python instances maintain C{self} and it is unrelated to DataTableState. @type numberOfRecords: int @param numberOfRecords: The number of rows in the dataset. @type numberOfFields: int @param numberOfFields: The number of columns in the dataset. @type distributionBased: bool @param distributionBased: If True, use a covariance matrix to scale the distance result. """ state.sumInQuadrature = NP("zeros", numberOfRecords, dtype=NP.dtype(float)) if distributionBased: state.displacements = NP("empty", (numberOfRecords, numberOfFields), dtype=NP.dtype(float)) state.displacementIndex = 0
def _fromDataColumn(self, dataColumn):
    # enumeration uses less memory and, interestingly, a little less time
    # than a list comprehension (80 ns instead of 100 ns per record)
    output = NP("empty", len(dataColumn), dtype=NP.dtype(object))
    if dataColumn.mask is None:
        for i, x in enumerate(dataColumn.data):
            output[i] = self.valueToPython(x)
    else:
        mask = dataColumn.mask
        for i, x in enumerate(dataColumn.data):
            if mask[i] == defs.VALID:
                output[i] = self.valueToPython(x)
            elif mask[i] == defs.MISSING:
                output[i] = defs.NAN
            else:
                output[i] = None
    return output
def initialize(self, state, numberOfRecords, numberOfFields, distributionBased): """First step in a vectorized metric calculation with missing values, called once before all fields and cluster centers. Only modifies the C{state} object. @type state: ad-hoc Python object @param state: State information that persists long enough to span the three steps of a metric calculation. This is a work-around of lxml's refusal to let its Python instances maintain C{self} and it is unrelated to DataTableState. @type numberOfRecords: int @param numberOfRecords: The number of rows in the dataset. @type numberOfFields: int @param numberOfFields: The number of columns in the dataset. @type distributionBased: bool @param distributionBased: If True, use a covariance matrix to scale the distance result. """ state.maximumComponent = NP("zeros", numberOfRecords, dtype=NP.dtype(float)) if distributionBased: raise NotImplementedError("Distribution-based clustering has not been implemented for the %s metric" % self.t)
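# A hedged sketch of the accumulation pattern these initialize() methods set
# up, assuming (it is not shown in this section) that a later per-field step
# adds each field's displacement and a final step turns the accumulated state
# into a distance.  Plain NumPy; the helper names are illustrative only.
import numpy as np

def euclidean_accumulate(state_sum, displacement):
    # "sum in quadrature": add the squared per-field displacement
    return state_sum + displacement ** 2

def chebyshev_accumulate(state_max, displacement):
    # "maximum component": keep the largest per-field displacement
    return np.maximum(state_max, np.abs(displacement))

num_records = 4
sum_in_quadrature = np.zeros(num_records)
maximum_component = np.zeros(num_records)
for column in (np.array([1.0, 0.0, 2.0, 3.0]), np.array([0.5, 4.0, 2.0, 0.0])):
    sum_in_quadrature = euclidean_accumulate(sum_in_quadrature, column)
    maximum_component = chebyshev_accumulate(maximum_component, column)
euclidean_distance = np.sqrt(sum_in_quadrature)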
def _checkIntervals(self, data, mask):
    intervals = self.intervals
    if len(intervals) == 0:
        return data, mask

    # innocent until proven guilty
    invalid = NP("zeros", len(data), dtype=NP.dtype(bool))

    for interval in intervals:
        closure = interval["closure"]
        leftMargin = interval.get("leftMargin")
        rightMargin = interval.get("rightMargin")

        if leftMargin is not None:
            try:
                leftMargin = self.stringToValue(leftMargin)
            except ValueError:
                raise defs.PmmlValidationError("Improper value in Interval leftMargin specification: \"%s\"" % leftMargin)

            if closure in ("openClosed", "openOpen"):
                invalid[NP(data <= leftMargin)] = True
            elif closure in ("closedOpen", "closedClosed"):
                invalid[NP(data < leftMargin)] = True

        if rightMargin is not None:
            try:
                rightMargin = self.stringToValue(rightMargin)
            except ValueError:
                raise defs.PmmlValidationError("Improper value in Interval rightMargin specification: \"%s\"" % rightMargin)

            if closure in ("openOpen", "closedOpen"):
                invalid[NP(data >= rightMargin)] = True
            elif closure in ("openClosed", "closedClosed"):
                invalid[NP(data > rightMargin)] = True

    if not invalid.any():
        return data, mask

    if mask is None:
        return data, NP(invalid * defs.INVALID)
    else:
        # only change what wasn't already marked as MISSING
        NP("logical_and", invalid, NP(mask == defs.VALID), invalid)
        mask[invalid] = defs.INVALID
        return data, mask
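# A minimal sketch of the Interval-checking logic above with plain NumPy:
# values that fall outside a declared interval (honoring open/closed margins)
# are marked INVALID.  The integer code is an illustrative stand-in for
# defs.INVALID.
import numpy as np

INVALID = 2

def check_interval(data, left=None, right=None, closure="closedClosed"):
    invalid = np.zeros(len(data), dtype=bool)
    if left is not None:
        invalid |= (data <= left) if closure.startswith("open") else (data < left)
    if right is not None:
        invalid |= (data >= right) if closure.endswith("Open") else (data > right)
    return invalid

data = np.array([0.5, 1.0, 2.5, 3.0])
invalid = check_interval(data, left=1.0, right=3.0, closure="openClosed")
mask = invalid.astype(np.int8) * INVALID    # [INVALID, INVALID, 0, 0]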
def prepare(self, state, dataTable, functionTable, performanceTable, plotRange): """Prepare a plot element for drawing. This stage consists of calculating all quantities and determing the bounds of the data. These bounds may be unioned with bounds from other plot elements that overlay this plot element, so the drawing (which requires a finalized coordinate system) cannot begin yet. This method modifies C{plotRange}. @type state: ad-hoc Python object @param state: State information that persists long enough to use quantities computed in C{prepare} in the C{draw} stage. This is a work-around of lxml's refusal to let its Python instances maintain C{self} and it is unrelated to DataTableState. @type dataTable: DataTable @param dataTable: Contains the data to plot. @type functionTable: FunctionTable @param functionTable: Defines functions that may be used to transform data for plotting. @type performanceTable: PerformanceTable @param performanceTable: Measures and records performance (time and memory consumption) of the drawing process. @type plotRange: PlotRange @param plotRange: The bounding box of plot coordinates that this function will expand. """ self._saveContext(dataTable) self.checkRoles(["x", "y", "x-errorbar", "x-errorbar-up", "x-errorbar-down", "y-errorbar", "y-errorbar-up", "y-errorbar-down", "weight"]) xExpression = self.xpath("pmml:PlotNumericExpression[@role='x']") yExpression = self.xpath("pmml:PlotNumericExpression[@role='y']") cutExpression = self.xpath("pmml:PlotSelection") exExpression = self.xpath("pmml:PlotNumericExpression[@role='x-errorbar']") exupExpression = self.xpath("pmml:PlotNumericExpression[@role='x-errorbar-up']") exdownExpression = self.xpath("pmml:PlotNumericExpression[@role='x-errorbar-down']") eyExpression = self.xpath("pmml:PlotNumericExpression[@role='y-errorbar']") eyupExpression = self.xpath("pmml:PlotNumericExpression[@role='y-errorbar-up']") eydownExpression = self.xpath("pmml:PlotNumericExpression[@role='y-errorbar-down']") weightExpression = self.xpath("pmml:PlotNumericExpression[@role='weight']") if len(xExpression) != 1 or len(yExpression) != 1: raise defs.PmmlValidationError("PlotScatter requires two PlotNumericExpressions, one with role \"x\", the other with role \"y\"") xValues = xExpression[0].evaluate(dataTable, functionTable, performanceTable) yValues = yExpression[0].evaluate(dataTable, functionTable, performanceTable) if len(cutExpression) == 1: selection = cutExpression[0].select(dataTable, functionTable, performanceTable) else: selection = NP("ones", len(dataTable), NP.dtype(bool)) if len(exExpression) == 0 and len(exupExpression) == 0 and len(exdownExpression) == 0: exup, exdown = None, None elif len(exExpression) == 1 and len(exupExpression) == 0 and len(exdownExpression) == 0: exup = exExpression[0].evaluate(dataTable, functionTable, performanceTable) exdown = None elif len(exExpression) == 0 and len(exupExpression) == 1 and len(exdownExpression) == 1: exup = exupExpression[0].evaluate(dataTable, functionTable, performanceTable) exdown = exdownExpression[0].evaluate(dataTable, functionTable, performanceTable) else: raise defs.PmmlValidationError("Use \"x-errorbar\" for symmetric error bars or \"x-errorbar-up\" and \"x-errorbar-down\" for asymmetric errorbars, but no other combinations") if len(eyExpression) == 0 and len(eyupExpression) == 0 and len(eydownExpression) == 0: eyup, eydown = None, None elif len(eyExpression) == 1 and len(eyupExpression) == 0 and len(eydownExpression) == 0: eyup = eyExpression[0].evaluate(dataTable, 
functionTable, performanceTable) eydown = None elif len(eyExpression) == 0 and len(eyupExpression) == 1 and len(eydownExpression) == 1: eyup = eyupExpression[0].evaluate(dataTable, functionTable, performanceTable) eydown = eydownExpression[0].evaluate(dataTable, functionTable, performanceTable) else: raise defs.PmmlValidationError("Use \"y-errorbar\" for symmetric error bars or \"y-errorbar-up\" and \"y-errorbar-down\" for asymmetric errorbars, but no other combinations") if len(weightExpression) == 1: weight = weightExpression[0].evaluate(dataTable, functionTable, performanceTable) else: weight = None performanceTable.begin("PlotScatter prepare") if xValues.mask is not None: NP("logical_and", selection, NP(xValues.mask == defs.VALID), selection) if yValues.mask is not None: NP("logical_and", selection, NP(yValues.mask == defs.VALID), selection) if exup is not None and exup.mask is not None: NP("logical_and", selection, NP(exup.mask == defs.VALID), selection) if exdown is not None and exdown.mask is not None: NP("logical_and", selection, NP(exdown.mask == defs.VALID), selection) if eyup is not None and eyup.mask is not None: NP("logical_and", selection, NP(eyup.mask == defs.VALID), selection) if eydown is not None and eydown.mask is not None: NP("logical_and", selection, NP(eydown.mask == defs.VALID), selection) state.x = xValues.data[selection] state.y = yValues.data[selection] state.exup, state.exdown, state.eyup, state.eydown = None, None, None, None if exup is not None: state.exup = exup.data[selection] if exdown is not None: state.exdown = exdown.data[selection] if eyup is not None: state.eyup = eyup.data[selection] if eydown is not None: state.eydown = eydown.data[selection] state.weight = None if weight is not None: state.weight = weight.data[selection] stateId = self.get("stateId") if stateId is not None: persistentState = dataTable.state.get(stateId) if persistentState is None: persistentState = {} dataTable.state[stateId] = persistentState else: state.x = NP("concatenate", (persistentState["x"], state.x)) state.y = NP("concatenate", (persistentState["y"], state.y)) if exup is not None: state.exup = NP("concatenate", (persistentState["exup"], state.exup)) if exdown is not None: state.exdown = NP("concatenate", (persistentState["exdown"], state.exdown)) if eyup is not None: state.eyup = NP("concatenate", (persistentState["eyup"], state.eyup)) if eydown is not None: state.eydown = NP("concatenate", (persistentState["eydown"], state.eydown)) if weight is not None: state.weight = NP("concatenate", (persistentState["weight"], state.weight)) persistentState["x"] = state.x persistentState["y"] = state.y if exup is not None: persistentState["exup"] = state.exup if exdown is not None: persistentState["exdown"] = state.exdown if eyup is not None: persistentState["eyup"] = state.eyup if eydown is not None: persistentState["eydown"] = state.eydown if weight is not None: persistentState["weight"] = state.weight plotRange.expand(state.x, state.y, xValues.fieldType, yValues.fieldType) performanceTable.end("PlotScatter prepare")
def prepare(self, state, dataTable, functionTable, performanceTable, plotRange): """Prepare a plot element for drawing. This stage consists of calculating all quantities and determing the bounds of the data. These bounds may be unioned with bounds from other plot elements that overlay this plot element, so the drawing (which requires a finalized coordinate system) cannot begin yet. This method modifies C{plotRange}. @type state: ad-hoc Python object @param state: State information that persists long enough to use quantities computed in C{prepare} in the C{draw} stage. This is a work-around of lxml's refusal to let its Python instances maintain C{self} and it is unrelated to DataTableState. @type dataTable: DataTable @param dataTable: Contains the data to plot. @type functionTable: FunctionTable @param functionTable: Defines functions that may be used to transform data for plotting. @type performanceTable: PerformanceTable @param performanceTable: Measures and records performance (time and memory consumption) of the drawing process. @type plotRange: PlotRange @param plotRange: The bounding box of plot coordinates that this function will expand. """ self.checkRoles(["z(x,y)", "x", "y", "zmean", "zweight"]) performanceTable.begin("PlotHeatMap prepare") self._saveContext(dataTable) zofxy = self.xpath("pmml:PlotFormula[@role='z(x,y)']") xexpr = self.xpath("pmml:PlotNumericExpression[@role='x']") yexpr = self.xpath("pmml:PlotNumericExpression[@role='y']") zmean = self.xpath("pmml:PlotNumericExpression[@role='zmean']") zweight = self.xpath("pmml:PlotNumericExpression[@role='zweight']") cutExpression = self.xpath("pmml:PlotSelection") if len(zofxy) == 1 and len(xexpr) == 0 and len(yexpr) == 0 and len( zmean) == 0 and len(zweight) == 0: xbins = self.get("xbins", convertType=True) xlow = self.get("xlow", convertType=True) xhigh = self.get("xhigh", convertType=True) ybins = self.get("ybins", convertType=True) ylow = self.get("ylow", convertType=True) yhigh = self.get("yhigh", convertType=True) if xbins is None or xlow is None or xhigh is None or ybins is None or ylow is None or yhigh is None: raise defs.PmmlValidationError( "xbins, xlow, xhigh, ybins, ylow, and yhigh are required for HeatMaps of a mathematical formula" ) if xlow >= xhigh or ylow >= yhigh: raise defs.PmmlValidationError( "xlow must be less than xhigh and ylow must be less than yhigh" ) if plotRange.xStrictlyPositive or plotRange.yStrictlyPositive: raise defs.PmmlValidationError( "PlotHeatMap can only be properly displayed in linear x, y coordinates" ) xbinWidth = (xhigh - xlow) / float(xbins) ybinWidth = (yhigh - ylow) / float(ybins) xarray = NP("tile", NP("linspace", xlow, xhigh, xbins, endpoint=True), ybins) yarray = NP("repeat", NP("linspace", ylow, yhigh, ybins, endpoint=True), xbins) sampleTable = DataTable({ "x": "double", "y": "double" }, { "x": xarray, "y": yarray }) parsed = Formula.parse(zofxy[0].text) performanceTable.pause("PlotHeatMap prepare") zdataColumn = parsed.evaluate(sampleTable, functionTable, performanceTable) performanceTable.unpause("PlotHeatMap prepare") if not zdataColumn.fieldType.isnumeric(): raise defs.PmmlValidationError( "PlotFormula z(x,y) must return a numeric expression, not %r" % zdataColumn.fieldType) selection = NP("isfinite", zdataColumn.data) if zdataColumn.mask is not None: NP("logical_and", selection, NP(zdataColumn.mask == defs.VALID), selection) if plotRange.zStrictlyPositive: NP("logical_and", selection, NP(zdataColumn.data > 0.0), selection) gooddata = zdataColumn.data[selection] 
plotRange.zminPush(gooddata.min(), zdataColumn.fieldType, sticky=False) plotRange.zmaxPush(gooddata.max(), zdataColumn.fieldType, sticky=False) state.zdata = zdataColumn.data state.zmask = NP("logical_not", selection) * defs.INVALID elif len(zofxy) == 0 and len(xexpr) == 1 and len(yexpr) == 1: performanceTable.pause("PlotHeatMap prepare") xdataColumn = xexpr[0].evaluate(dataTable, functionTable, performanceTable) ydataColumn = yexpr[0].evaluate(dataTable, functionTable, performanceTable) performanceTable.unpause("PlotHeatMap prepare") xbins = self.get("xbins", convertType=True) xlow = self.get("xlow", convertType=True) xhigh = self.get("xhigh", convertType=True) ybins = self.get("ybins", convertType=True) ylow = self.get("ylow", convertType=True) yhigh = self.get("yhigh", convertType=True) if len(xdataColumn) > 0: if xlow is None: xlow = NP("nanmin", xdataColumn.data) if xhigh is None: xhigh = NP("nanmax", xdataColumn.data) if ylow is None: ylow = NP("nanmin", ydataColumn.data) if yhigh is None: yhigh = NP("nanmax", ydataColumn.data) else: if xlow is None: xlow = 0.0 if xhigh is None: xhigh = 1.0 if ylow is None: ylow = 0.0 if yhigh is None: yhigh = 1.0 if xbins is None: q1, q3 = NP("percentile", xdataColumn.data, [25.0, 75.0]) binWidth = 2.0 * (q3 - q1) / math.pow(len(xdataColumn.data), 1.0 / 3.0) if binWidth > 0.0: xbins = max(10, int(math.ceil((xhigh - xlow) / binWidth))) else: xbins = 10 if ybins is None: q1, q3 = NP("percentile", ydataColumn.data, [25.0, 75.0]) binWidth = 2.0 * (q3 - q1) / math.pow(len(ydataColumn.data), 1.0 / 3.0) if binWidth > 0.0: ybins = max(10, int(math.ceil((yhigh - ylow) / binWidth))) else: ybins = 10 if xlow >= xhigh or ylow >= yhigh: raise defs.PmmlValidationError( "xlow must be less than xhigh and ylow must be less than yhigh" ) if plotRange.xStrictlyPositive or plotRange.yStrictlyPositive: raise defs.PmmlValidationError( "PlotHeatMap can only be properly displayed in linear x, y coordinates" ) persistentState = {} stateId = self.get("stateId") if stateId is not None: if stateId in dataTable.state: persistentState = dataTable.state[stateId] else: dataTable.state[stateId] = persistentState if len(zmean) == 0: if "xbins" in persistentState: xbins = persistentState["xbins"] if "xlow" in persistentState: xlow = persistentState["xlow"] if "xhigh" in persistentState: xhigh = persistentState["xhigh"] if "ybins" in persistentState: ybins = persistentState["ybins"] if "ylow" in persistentState: ylow = persistentState["ylow"] if "yhigh" in persistentState: yhigh = persistentState["yhigh"] persistentState["xbins"] = xbins persistentState["xlow"] = xlow persistentState["xhigh"] = xhigh persistentState["ybins"] = ybins persistentState["ylow"] = ylow persistentState["yhigh"] = yhigh xbinWidth = (xhigh - xlow) / float(xbins) ybinWidth = (yhigh - ylow) / float(ybins) mask = NP("ones", len(dataTable), dtype=NP.dtype(float)) if xdataColumn.mask is not None: NP("multiply", mask, (xdataColumn.mask == defs.VALID), mask) if ydataColumn.mask is not None: NP("multiply", mask, (ydataColumn.mask == defs.VALID), mask) if len(cutExpression) == 1: performanceTable.pause("PlotHeatMap prepare") NP( "multiply", mask, cutExpression[0].select(dataTable, functionTable, performanceTable), mask) performanceTable.unpause("PlotHeatMap prepare") if len(zmean) == 0 and len(zweight) == 0: histogram, xedges, yedges = NP("histogram2d", ydataColumn.data, xdataColumn.data, bins=(ybins, xbins), range=[[ylow, yhigh], [xlow, xhigh]], weights=mask) if len(dataTable) == 0: # work around Numpy <= 1.6.1 bug 
histogram = NP("zeros", (ybins, xbins), dtype=NP.dtype(float)) if "histogram" in persistentState: persistentState["histogram"] = NP( persistentState["histogram"] + histogram) else: persistentState["histogram"] = histogram histogram = persistentState["histogram"] if plotRange.zStrictlyPositive: zmin = 0.1 else: zmin = 0.0 zmax = NP("nanmax", histogram) plotRange.zminPush(zmin, self.zfieldType, sticky=True) if zmax > zmin: plotRange.zmaxPush(zmax, self.zfieldType, sticky=False) elif len(zmean) == 0 and len(zweight) == 1: performanceTable.pause("PlotHeatMap prepare") weightsDataColumn = zweight[0].evaluate( dataTable, functionTable, performanceTable) performanceTable.unpause("PlotHeatMap prepare") if weightsDataColumn.mask is not None: NP("multiply", mask, (weightsDataColumn.mask == defs.VALID), mask) weights = NP(weightsDataColumn.data * mask) histogram, xedges, yedges = NP("histogram2d", ydataColumn.data, xdataColumn.data, bins=(ybins, xbins), range=[[ylow, yhigh], [xlow, xhigh]], weights=weights) if "histogram" in persistentState: persistentState["histogram"] = NP( persistentState["histogram"] + histogram) else: persistentState["histogram"] = histogram histogram = persistentState["histogram"] if plotRange.zStrictlyPositive: w = weights[NP(weights > 0.0)] if len(w) > 0: zmin = 0.1 * NP("nanmin", w) else: zmin = 0.1 else: zmin = 0.0 zmax = NP("nanmax", histogram) plotRange.zminPush(zmin, self.zfieldType, sticky=True) if zmax > zmin: plotRange.zmaxPush(zmax, self.zfieldType, sticky=False) elif len(zmean) == 1 and len(zweight) == 0: performanceTable.pause("PlotHeatMap prepare") zdataColumn = zmean[0].evaluate(dataTable, functionTable, performanceTable) performanceTable.unpause("PlotHeatMap prepare") if zdataColumn.mask is not None: NP("multiply", mask, (zdataColumn.mask == defs.VALID), mask) weights = NP(zdataColumn.data * mask) numer, xedges, yedges = NP("histogram2d", ydataColumn.data, xdataColumn.data, bins=(ybins, xbins), range=[[ylow, yhigh], [xlow, xhigh]], weights=weights) denom, xedges, yedges = NP("histogram2d", ydataColumn.data, xdataColumn.data, bins=(ybins, xbins), range=[[ylow, yhigh], [xlow, xhigh]], weights=mask) if "numer" in persistentState: persistentState["numer"] = NP(persistentState["numer"] + numer) persistentState["denom"] = NP(persistentState["denom"] + denom) else: persistentState["numer"] = numer persistentState["denom"] = denom numer = persistentState["numer"] denom = persistentState["denom"] histogram = numer / denom selection = NP("isfinite", histogram) if plotRange.zStrictlyPositive: NP("logical_and", selection, NP(histogram > 0.0), selection) if NP("count_nonzero", selection) > 0: gooddata = histogram[selection] plotRange.zminPush(gooddata.min(), self.zfieldType, sticky=False) plotRange.zmaxPush(gooddata.max(), self.zfieldType, sticky=False) else: raise defs.PmmlValidationError( "The only allowed combinations of PlotFormula/PlotNumericExpressions are: \"z(x,y)\" (function), \"x y\" (histogram), \"x y zmean\" (mean of z in x y bins), \"x y zweight\" (weighted x y histogram)" ) state.zdata = NP("reshape", histogram, xbins * ybins) state.zmask = None else: raise defs.PmmlValidationError( "The only allowed combinations of PlotFormula/PlotNumericExpressions are: \"z(x,y)\" (function), \"x y\" (histogram), \"x y zmean\" (mean of z in x y bins), \"x y zweight\" (weighted x y histogram)" ) plotRange.xminPush(xlow, self.xyfieldType, sticky=True) plotRange.yminPush(ylow, self.xyfieldType, sticky=True) plotRange.xmaxPush(xhigh, self.xyfieldType, sticky=True) 
plotRange.ymaxPush(yhigh, self.xyfieldType, sticky=True) state.xbins = xbins state.xlow = xlow state.xhigh = xhigh state.ybins = ybins state.ylow = ylow state.yhigh = yhigh performanceTable.end("PlotHeatMap prepare")
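# A short sketch of the binning strategy PlotHeatMap uses above, with plain
# NumPy: the validity mask is passed to histogram2d as weights so that invalid
# rows contribute nothing, and a per-bin mean is the ratio of two histograms.
# The arrays below are illustrative data, not from the library.
import numpy as np

x = np.array([0.1, 0.4, 0.6, 0.9, 0.5])
y = np.array([0.2, 0.8, 0.5, 0.7, 0.1])
z = np.array([1.0, 2.0, 3.0, 4.0, 5.0])
valid = np.array([1.0, 1.0, 0.0, 1.0, 1.0])   # 0.0 drops that row from every bin

counts, yedges, xedges = np.histogram2d(y, x, bins=(2, 2), range=[[0, 1], [0, 1]], weights=valid)
numer, _, _ = np.histogram2d(y, x, bins=(2, 2), range=[[0, 1], [0, 1]], weights=z * valid)
zmean = numer / counts                        # NaN where a bin is empty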
def evaluate(self, dataTable, functionTable, performanceTable, returnUnknowns=False): """Evaluate the predicate, using a DataTable as input. @type dataTable: DataTable @param dataTable: The input DataTable, containing any fields that might be used to evaluate this predicate. @type functionTable: FunctionTable @param functionTable: The FunctionTable, containing any functions that might be called in this predicate. @type performanceTable: PerformanceTable @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation. @type returnUnknowns: bool @param returnUnknowns: If True, return a "mask" for the selection that indicates which rows are unknown, rather than True or False. @rtype: 1d Numpy array of bool or 3-tuple of arrays @return: Either a simple selection array or selection, unknowns, encounteredUnknowns """ performanceTable.begin("SimplePredicate") fieldName = self.get("field") dataColumn = dataTable.fields[fieldName] operator = self.get("operator") if operator == "isMissing": if dataColumn.mask is None: selection = NP("zeros", len(dataTable), dtype=NP.dtype(bool)) else: selection = NP(dataColumn.mask == defs.MISSING) elif operator == "isNotMissing": if dataColumn.mask is None: selection = NP("ones", len(dataTable), dtype=NP.dtype(bool)) else: selection = NP(dataColumn.mask != defs.MISSING) else: try: value = dataColumn.fieldType.stringToValue(self.get("value")) except ValueError as err: raise defs.PmmlValidationError("SimplePredicate.value \"%s\" cannot be cast as %r: %s" % (self.get("value"), dataColumn.fieldType, str(err))) if operator == "equal": selection = NP(dataColumn.data == value) elif operator == "notEqual": selection = NP(dataColumn.data != value) elif operator == "lessThan": if dataColumn.fieldType.optype == "categorical": raise TypeError("Categorical field \"%s\" cannot be compared using %s" % (fieldName, operator)) selection = NP(dataColumn.data < value) elif operator == "lessOrEqual": if dataColumn.fieldType.optype == "categorical": raise TypeError("Categorical field \"%s\" cannot be compared using %s" % (fieldName, operator)) selection = NP(dataColumn.data <= value) elif operator == "greaterThan": if dataColumn.fieldType.optype == "categorical": raise TypeError("Categorical field \"%s\" cannot be compared using %s" % (fieldName, operator)) selection = NP(dataColumn.data > value) elif operator == "greaterOrEqual": if dataColumn.fieldType.optype == "categorical": raise TypeError("Categorical field \"%s\" cannot be compared using %s" % (fieldName, operator)) selection = NP(dataColumn.data >= value) if returnUnknowns: if dataColumn.mask is None: unknowns = NP("zeros", len(dataTable), dtype=NP.dtype(bool)) else: unknowns = NP(dataColumn.mask != defs.VALID) performanceTable.end("SimplePredicate") return selection, unknowns, unknowns else: if dataColumn.mask is not None: NP("logical_and", selection, NP(dataColumn.mask == defs.VALID), selection) performanceTable.end("SimplePredicate") return selection
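# A minimal sketch of SimplePredicate evaluation above with plain NumPy:
# compare a field against a constant, track unknown (non-valid) rows, and
# either report them separately or exclude them from the selection.  The
# `valid` parameter and operator table are illustrative assumptions.
import numpy as np
import operator

OPERATORS = {"equal": operator.eq, "notEqual": operator.ne,
             "lessThan": operator.lt, "lessOrEqual": operator.le,
             "greaterThan": operator.gt, "greaterOrEqual": operator.ge}

def simple_predicate(data, op, value, valid=None, return_unknowns=False):
    selection = OPERATORS[op](data, value)
    if return_unknowns:
        unknowns = np.zeros(len(data), dtype=bool) if valid is None else ~valid
        return selection, unknowns, unknowns
    if valid is not None:
        selection &= valid
    return selection

sel = simple_predicate(np.array([1.0, 2.5, 3.0]), "greaterThan", 2.0)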
def prepare(self, state, dataTable, functionTable, performanceTable, plotRange): """Prepare a plot element for drawing. This stage consists of calculating all quantities and determing the bounds of the data. These bounds may be unioned with bounds from other plot elements that overlay this plot element, so the drawing (which requires a finalized coordinate system) cannot begin yet. This method modifies C{plotRange}. @type state: ad-hoc Python object @param state: State information that persists long enough to use quantities computed in C{prepare} in the C{draw} stage. This is a work-around of lxml's refusal to let its Python instances maintain C{self} and it is unrelated to DataTableState. @type dataTable: DataTable @param dataTable: Contains the data to plot. @type functionTable: FunctionTable @param functionTable: Defines functions that may be used to transform data for plotting. @type performanceTable: PerformanceTable @param performanceTable: Measures and records performance (time and memory consumption) of the drawing process. @type plotRange: PlotRange @param plotRange: The bounding box of plot coordinates that this function will expand. """ self.checkRoles([ "y(x)", "dy/dx", "x(t)", "y(t)", "dx/dt", "dy/dt", "x", "y", "dx", "dy" ]) performanceTable.begin("PlotCurve prepare") self._saveContext(dataTable) yofx = self.xpath("pmml:PlotFormula[@role='y(x)']") dydx = self.xpath("pmml:PlotFormula[@role='dy/dx']") xoft = self.xpath("pmml:PlotFormula[@role='x(t)']") yoft = self.xpath("pmml:PlotFormula[@role='y(t)']") dxdt = self.xpath("pmml:PlotFormula[@role='dx/dt']") dydt = self.xpath("pmml:PlotFormula[@role='dy/dt']") nx = self.xpath("pmml:PlotNumericExpression[@role='x']") ny = self.xpath("pmml:PlotNumericExpression[@role='y']") ndx = self.xpath("pmml:PlotNumericExpression[@role='dx']") ndy = self.xpath("pmml:PlotNumericExpression[@role='dy']") cutExpression = self.xpath("pmml:PlotSelection") if len(yofx) + len(dydx) + len(xoft) + len(yoft) + len(dxdt) + len( dydt) > 0: if len(yofx) == 1 and len(dydx) == 0 and len(xoft) == 0 and len( yoft) == 0 and len(dxdt) == 0 and len(dydt) == 0: expression = (yofx[0].text, ) derivative = (None, ) elif len(yofx) == 1 and len(dydx) == 1 and len(xoft) == 0 and len( yoft) == 0 and len(dxdt) == 0 and len(dydt) == 0: expression = (yofx[0].text, ) derivative = (dydx[0].text, ) elif len(yofx) == 0 and len(dydx) == 0 and len(xoft) == 1 and len( yoft) == 1 and len(dxdt) == 0 and len(dydt) == 0: expression = xoft[0].text, yoft[0].text derivative = None, None elif len(yofx) == 0 and len(dydx) == 0 and len(xoft) == 1 and len( yoft) == 1 and len(dxdt) == 1 and len(dydt) == 1: expression = xoft[0].text, yoft[0].text derivative = dxdt[0].text, dydt[0].text else: raise defs.PmmlValidationError( "The only allowed combinations of PlotFormulae are: \"y(x)\", \"y(x) dy/dx\", \"x(t) y(t)\", and \"x(t) y(t) dx/dt dy/dt\"" ) low = self.get("low", convertType=True) high = self.get("high", convertType=True) if low is None or high is None: raise defs.PmmlValidationError( "The \"low\" and \"high\" attributes are required for PlotCurves defined by formulae" ) samples = self.generateSamples(low, high) loop = self.get("loop", defaultFromXsd=True, convertType=True) state.x, state.y, state.dx, state.dy, xfieldType, yfieldType = self.expressionsToPoints( expression, derivative, samples, loop, functionTable, performanceTable) else: performanceTable.pause("PlotCurve prepare") if len(ndx) == 1: dxdataColumn = ndx[0].evaluate(dataTable, functionTable, performanceTable) else: dxdataColumn 
= None if len(ndy) == 1: dydataColumn = ndy[0].evaluate(dataTable, functionTable, performanceTable) else: dydataColumn = None performanceTable.unpause("PlotCurve prepare") if len(nx) == 0 and len(ny) == 1: performanceTable.pause("PlotCurve prepare") ydataColumn = ny[0].evaluate(dataTable, functionTable, performanceTable) performanceTable.unpause("PlotCurve prepare") if len(cutExpression) == 1: performanceTable.pause("PlotCurve prepare") selection = cutExpression[0].select( dataTable, functionTable, performanceTable) performanceTable.unpause("PlotCurve prepare") else: selection = NP("ones", len(ydataColumn.data), NP.dtype(bool)) if ydataColumn.mask is not None: selection = NP("logical_and", selection, NP(ydataColumn.mask == defs.VALID), selection) if dxdataColumn is not None and dxdataColumn.mask is not None: selection = NP("logical_and", selection, NP(dxdataColumn.mask == defs.VALID), selection) if dydataColumn is not None and dydataColumn.mask is not None: selection = NP("logical_and", selection, NP(dydataColumn.mask == defs.VALID), selection) yarray = ydataColumn.data[selection] xarray = NP("ones", len(yarray), dtype=NP.dtype(float)) xarray[0] = 0.0 xarray = NP("cumsum", xarray) dxarray, dyarray = None, None if dxdataColumn is not None: dxarray = dxdataColumn.data[selection] if dydataColumn is not None: dyarray = dydataColumn.data[selection] xfieldType = self.xfieldType yfieldType = ydataColumn.fieldType elif len(nx) == 1 and len(ny) == 1: performanceTable.pause("PlotCurve prepare") xdataColumn = nx[0].evaluate(dataTable, functionTable, performanceTable) ydataColumn = ny[0].evaluate(dataTable, functionTable, performanceTable) performanceTable.unpause("PlotCurve prepare") if len(cutExpression) == 1: performanceTable.pause("PlotCurve prepare") selection = cutExpression[0].select( dataTable, functionTable, performanceTable) performanceTable.unpause("PlotCurve prepare") else: selection = NP("ones", len(ydataColumn.data), NP.dtype(bool)) if xdataColumn.mask is not None: selection = NP("logical_and", selection, NP(xdataColumn.mask == defs.VALID), selection) if ydataColumn.mask is not None: selection = NP("logical_and", selection, NP(ydataColumn.mask == defs.VALID), selection) if dxdataColumn is not None and dxdataColumn.mask is not None: selection = NP("logical_and", selection, NP(dxdataColumn.mask == defs.VALID), selection) if dydataColumn is not None and dydataColumn.mask is not None: selection = NP("logical_and", selection, NP(dydataColumn.mask == defs.VALID), selection) xarray = xdataColumn.data[selection] yarray = ydataColumn.data[selection] dxarray, dyarray = None, None if dxdataColumn is not None: dxarray = dxdataColumn.data[selection] if dydataColumn is not None: dyarray = dydataColumn.data[selection] xfieldType = xdataColumn.fieldType yfieldType = ydataColumn.fieldType else: raise defs.PmmlValidationError( "The only allowed combinations of PlotNumericExpressions are: \"y(x)\" and \"x(t) y(t)\"" ) persistentState = {} stateId = self.get("stateId") if stateId is not None: if stateId in dataTable.state: persistentState = dataTable.state[stateId] xarray = NP("concatenate", [xarray, persistentState["x"]]) yarray = NP("concatenate", [yarray, persistentState["y"]]) if dxarray is not None: dxarray = NP("concatenate", [dxarray, persistentState["dx"]]) if dyarray is not None: dyarray = NP("concatenate", [dyarray, persistentState["dy"]]) else: dataTable.state[stateId] = persistentState persistentState["x"] = xarray persistentState["y"] = yarray if dxarray is not None: persistentState["dx"] = 
dxarray if dyarray is not None: persistentState["dy"] = dyarray smooth = self.get("smooth", defaultFromXsd=True, convertType=True) if not smooth: if dyarray is not None and dxarray is None: dxarray = NP( (NP("roll", xarray, -1) - NP("roll", xarray, 1)) / 2.0) dyarray = dyarray * dxarray loop = self.get("loop", defaultFromXsd=True, convertType=True) if dxarray is not None and not loop: dxarray[0] = 0.0 dxarray[-1] = 0.0 if dyarray is not None and not loop: dyarray[0] = 0.0 dyarray[-1] = 0.0 state.x = xarray state.y = yarray state.dx = dxarray state.dy = dyarray else: smoothingScale = self.get("smoothingScale", defaultFromXsd=True, convertType=True) loop = self.get("loop", defaultFromXsd=True, convertType=True) samples = self.generateSamples(xarray.min(), xarray.max()) state.x, state.y, state.dx, state.dy = self.pointsToSmoothCurve( xarray, yarray, samples, smoothingScale, loop) if plotRange is not None: plotRange.expand(state.x, state.y, xfieldType, yfieldType) performanceTable.end("PlotCurve prepare")
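# A small sketch of the central-difference trick PlotCurve uses above to turn
# sample spacing into path derivatives: dx at each point is half the distance
# between its neighbors (np.roll wraps around), and the endpoints are zeroed
# when the curve is not a closed loop.
import numpy as np

def path_derivatives(xarray, loop=False):
    dx = (np.roll(xarray, -1) - np.roll(xarray, 1)) / 2.0
    if not loop:
        dx[0] = 0.0
        dx[-1] = 0.0
    return dx

dx = path_derivatives(np.array([0.0, 1.0, 3.0, 6.0]))   # [0.0, 1.5, 2.5, 0.0]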
def format(self, subTable, functionTable, performanceTable, score): """Extract or post-process output for the output field of a DataTable. @type subTable: DataTable @param subTable: The DataTable associated with this local lexical scope. @type functionTable: FunctionTable or None @param functionTable: A table of functions. @type performanceTable: PerformanceTable or None @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation. @type score: dict @param score: Dictionary mapping PMML score "feature" strings to DataColumns. This dictionary always contains a None key, which is the basic feature ("predictedValue"). @rtype: DataColumn @return: The output that would go into an output field of a DataTable. """ performanceTable.begin("OutputField") feature = self.get("feature") if feature is None: dataColumn = subTable.fields[self["name"]] elif feature == "predictedValue": dataColumn = score[None] elif feature == "predictedDisplayValue": original = score[None] toString = original.fieldType.valueToString data = NP("empty", len(subTable), dtype=NP.dtype(object)) for i, x in enumerate(original.data): data[i] = toString(x) dataColumn = DataColumn(FakeFieldType("string", "continuous"), data, None) elif feature == "transformedValue": expression = self.childOfClass(PmmlExpression) if expression is None: raise defs.PmmlValidationError("OutputField with feature \"transformedValue\" requires an EXPRESSION") performanceTable.pause("OutputField") dataColumn = expression.evaluate(subTable, functionTable, performanceTable) performanceTable.unpause("OutputField") elif feature == "decision": decisions = self.childOfTag("Decisions") if decisions is None: raise defs.PmmlValidationError("OutputField with feature \"decision\" requires a Decisions block") performanceTable.pause("OutputField") dataColumn = self.childOfClass(PmmlExpression).evaluate(subTable, functionTable, performanceTable) performanceTable.unpause("OutputField") if dataColumn.mask is None: valid = None else: valid = NP(dataColumn.mask == defs.VALID) fieldType = FakeFieldType("object", "any") data = NP("empty", len(subTable), dtype=fieldType.dtype) mask = NP(NP("ones", len(subTable), dtype=defs.maskType) * defs.MISSING) for decision in decisions.childrenOfTag("Decision"): value = dataColumn.fieldType.stringToValue(decision["value"]) selection = NP(dataColumn.data == value) if valid is not None: NP("logical_and", selection, valid, selection) for i in xrange(len(data)): if selection[i]: data[i] = decision mask[selection] = defs.VALID if not mask.any(): mask = None dataColumn = DataColumn(fieldType, data, mask) elif feature in score: dataColumn = score[feature] else: model = self.getparent() if model is not None: model = model.getparent() if model is None: model = "(orphaned OutputField; no parent model)" else: model = model.t raise defs.PmmlValidationError("Models of type %s do not produce \"%s\" features (or at least, it is not yet implemented by Augustus)" % (model, feature)) dataType = self.get("dataType", dataColumn.fieldType.dataType) optype = self.get("optype", dataColumn.fieldType.optype) if (dataType != dataColumn.fieldType.dataType or optype != dataColumn.fieldType.optype) and feature not in ("predictedDisplayValue", "decision"): dataColumn = FieldCastMethods.cast(FakeFieldType(dataType, optype), dataColumn) if feature is not None: subTable.fields[self.get("displayName", self["name"])] = dataColumn performanceTable.end("OutputField") return dataColumn
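# A hedged sketch of the "decision" feature handled above, with plain
# Python/NumPy: the transformed value of each row is matched against the
# declared Decision values, and unmatched rows stay missing.  The function
# name and the way decisions are represented here are illustrative only.
import numpy as np

def map_decisions(values, decisions):
    # decisions: dict mapping a value to its Decision object (here, a string)
    output = np.empty(len(values), dtype=object)
    missing = np.ones(len(values), dtype=bool)
    for value, decision in decisions.items():
        selection = np.array([v == value for v in values])
        output[selection] = decision
        missing &= ~selection
    return output, missing

output, missing = map_decisions(["accept", "reject", "review"],
                                {"accept": "Decision(accept)", "reject": "Decision(reject)"})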
def draw(self, state, plotCoordinates, plotDefinitions, performanceTable): """Draw the plot element. This stage consists of creating an SVG image of the pre-computed data. @type state: ad-hoc Python object @param state: State information that persists long enough to use quantities computed in C{prepare} in the C{draw} stage. This is a work-around of lxml's refusal to let its Python instances maintain C{self} and it is unrelated to DataTableState. @type plotCoordinates: PlotCoordinates @param plotCoordinates: The coordinate system in which this plot element will be placed. @type plotDefinitions: PlotDefinitions @type plotDefinitions: The dictionary of key-value pairs that forms the <defs> section of the SVG document. @type performanceTable: PerformanceTable @param performanceTable: Measures and records performance (time and memory consumption) of the drawing process. @rtype: SvgBinding @return: An SVG fragment representing the fully drawn plot element. """ svg = SvgBinding.elementMaker performanceTable.begin("PlotBoxAndWhisker draw") vertical = self.get("vertical", defaultFromXsd=True, convertType=True) gap = self.get("gap", defaultFromXsd=True, convertType=True) if state.slicedFieldType is not self.fieldTypeNumeric: if vertical: strings = plotCoordinates.xstrings else: strings = plotCoordinates.ystrings newRanges = [] for string in strings: try: index = state.edges.index(string) except ValueError: newRanges.append(None) else: newRanges.append(state.ranges[index]) state.ranges = newRanges state.edges = [(i - 0.5, i + 0.5) for i in xrange(len(strings))] lowEdge = NP("array", [low if low is not None else float("-inf") for low, high in state.edges], dtype=NP.dtype(float)) highEdge = NP("array", [high if high is not None else float("inf") for low, high in state.edges], dtype=NP.dtype(float)) selection = NP("array", [levels is not None for levels in state.ranges], dtype=NP.dtype(bool)) lowEdge = lowEdge[selection] highEdge = highEdge[selection] lowWhisker = NP("array", [levels[0] for levels in state.ranges if levels is not None], dtype=state.profiledFieldType.dtype) lowBox = NP("array", [levels[1] for levels in state.ranges if levels is not None], dtype=state.profiledFieldType.dtype) midLine = NP("array", [levels[2] for levels in state.ranges if levels is not None], dtype=state.profiledFieldType.dtype) highBox = NP("array", [levels[3] for levels in state.ranges if levels is not None], dtype=state.profiledFieldType.dtype) highWhisker = NP("array", [levels[4] for levels in state.ranges if levels is not None], dtype=state.profiledFieldType.dtype) output = svg.g() if len(lowEdge) > 0: if vertical: Ax = lowEdge Bx = lowEdge Cx = lowEdge Dx = highEdge Ex = highEdge Fx = highEdge Gx = NP(NP(lowEdge + highEdge) / 2.0) Hx = Gx Ix = Gx Jx = Gx Ay = lowBox By = midLine Cy = highBox Dy = lowBox Ey = midLine Fy = highBox Gy = lowWhisker Hy = lowBox Iy = highBox Jy = highWhisker else: Ax = lowBox Bx = midLine Cx = highBox Dx = lowBox Ex = midLine Fx = highBox Gx = lowWhisker Hx = lowBox Ix = highBox Jx = highWhisker Ay = lowEdge By = lowEdge Cy = lowEdge Dy = highEdge Ey = highEdge Fy = highEdge Gy = NP(NP(lowEdge + highEdge) / 2.0) Hy = Gy Iy = Gy Jy = Gy AX, AY = plotCoordinates(Ax, Ay) BX, BY = plotCoordinates(Bx, By) CX, CY = plotCoordinates(Cx, Cy) DX, DY = plotCoordinates(Dx, Dy) EX, EY = plotCoordinates(Ex, Ey) FX, FY = plotCoordinates(Fx, Fy) GX, GY = plotCoordinates(Gx, Gy) HX, HY = plotCoordinates(Hx, Hy) IX, IY = plotCoordinates(Ix, Iy) JX, JY = plotCoordinates(Jx, Jy) if vertical: if gap > 0.0 and 
NP(NP(DX - gap/2.0) - NP(AX + gap/2.0)).min() > 0.0: AX += gap/2.0 BX += gap/2.0 CX += gap/2.0 DX -= gap/2.0 EX -= gap/2.0 FX -= gap/2.0 else: if gap > 0.0 and NP(NP(DY - gap/2.0) - NP(AY + gap/2.0)).min() > 0.0: AY += gap/2.0 BY += gap/2.0 CY += gap/2.0 DY -= gap/2.0 EY -= gap/2.0 FY -= gap/2.0 style = self.getStyleState() strokeStyle = dict((x, style[x]) for x in style if x.startswith("stroke")) strokeStyle["fill"] = "none" style = PlotStyle.toString(style) strokeStyle = PlotStyle.toString(strokeStyle) for i in xrange(len(lowEdge)): pathdata = ["M %r %r" % (HX[i], HY[i]), "L %r %r" % (AX[i], AY[i]), "L %r %r" % (BX[i], BY[i]), "L %r %r" % (CX[i], CY[i]), "L %r %r" % (IX[i], IY[i]), "L %r %r" % (FX[i], FY[i]), "L %r %r" % (EX[i], EY[i]), "L %r %r" % (DX[i], DY[i]), "L %r %r" % (HX[i], HY[i]), "Z"] output.append(svg.path(d=" ".join(pathdata), style=style)) output.append(svg.path(d="M %r %r L %r %r" % (BX[i], BY[i], EX[i], EY[i]), style=strokeStyle)) output.append(svg.path(d="M %r %r L %r %r" % (HX[i], HY[i], GX[i], GY[i]), style=strokeStyle)) output.append(svg.path(d="M %r %r L %r %r" % (IX[i], IY[i], JX[i], JY[i]), style=strokeStyle)) if vertical: width = (DX[i] - AX[i]) / 4.0 output.append(svg.path(d="M %r %r L %r %r" % (GX[i] - width, GY[i], GX[i] + width, GY[i]), style=strokeStyle)) output.append(svg.path(d="M %r %r L %r %r" % (JX[i] - width, JY[i], JX[i] + width, JY[i]), style=strokeStyle)) else: width = (DY[i] - AY[i]) / 4.0 output.append(svg.path(d="M %r %r L %r %r" % (GX[i], GY[i] - width, GX[i], GY[i] + width), style=strokeStyle)) output.append(svg.path(d="M %r %r L %r %r" % (JX[i], JY[i] - width, JX[i], JY[i] + width), style=strokeStyle)) performanceTable.end("PlotBoxAndWhisker draw") svgId = self.get("svgId") if svgId is not None: output["id"] = svgId return output
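# Illustrative sketch (not part of the library): the draw() method above assembles each
# box-and-whisker glyph as an SVG path through the labeled corner points H, A, B, C, I,
# F, E, D and back to H. This standalone helper mirrors that string formatting; the
# point names follow the same convention and the coordinates in the example are made up.
def boxPathData(A, B, C, D, E, F, H, I):
    # closed outline of the box: H -> A -> B -> C -> I -> F -> E -> D -> back to H
    corners = [H, A, B, C, I, F, E, D, H]
    return " ".join(["M %r %r" % corners[0]] + ["L %r %r" % p for p in corners[1:]] + ["Z"])

if __name__ == "__main__":
    print(boxPathData((0.0, 1.0), (0.0, 2.0), (0.0, 3.0), (1.0, 1.0),
                      (1.0, 2.0), (1.0, 3.0), (0.5, 1.0), (0.5, 3.0)))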
def prepare(self, state, dataTable, functionTable, performanceTable, plotRange): """Prepare a plot element for drawing. This stage consists of calculating all quantities and determining the bounds of the data. These bounds may be unioned with bounds from other plot elements that overlay this plot element, so the drawing (which requires a finalized coordinate system) cannot begin yet. This method modifies C{plotRange}. @type state: ad-hoc Python object @param state: State information that persists long enough to use quantities computed in C{prepare} in the C{draw} stage. This is a work-around of lxml's refusal to let its Python instances maintain C{self} and it is unrelated to DataTableState. @type dataTable: DataTable @param dataTable: Contains the data to plot. @type functionTable: FunctionTable @param functionTable: Defines functions that may be used to transform data for plotting. @type performanceTable: PerformanceTable @param performanceTable: Measures and records performance (time and memory consumption) of the drawing process. @type plotRange: PlotRange @param plotRange: The bounding box of plot coordinates that this function will expand. """ self.checkRoles(["sliced", "profiled"]) slicedExpression = self.xpath("pmml:PlotExpression[@role='sliced']") profiledExpression = self.xpath("pmml:PlotNumericExpression[@role='profiled']") cutExpression = self.xpath("pmml:PlotSelection") if len(slicedExpression) != 1: raise defs.PmmlValidationError("PlotBoxAndWhisker requires a PlotExpression with role \"sliced\"") if len(profiledExpression) != 1: raise defs.PmmlValidationError("PlotBoxAndWhisker requires a PlotNumericExpression with role \"profiled\"") slicedDataColumn = slicedExpression[0].evaluate(dataTable, functionTable, performanceTable) profiledDataColumn = profiledExpression[0].evaluate(dataTable, functionTable, performanceTable) if len(cutExpression) == 1: selection = cutExpression[0].select(dataTable, functionTable, performanceTable) else: selection = NP("ones", len(dataTable), NP.dtype(bool)) performanceTable.begin("PlotBoxAndWhisker prepare") self._saveContext(dataTable) if slicedDataColumn.mask is not None: NP("logical_and", selection, NP(slicedDataColumn.mask == defs.VALID), selection) if profiledDataColumn.mask is not None: NP("logical_and", selection, NP(profiledDataColumn.mask == defs.VALID), selection) slicedArray = slicedDataColumn.data[selection] profiledArray = profiledDataColumn.data[selection] persistentState = {} stateId = self.get("stateId") if stateId is not None: if stateId in dataTable.state: persistentState = dataTable.state[stateId] else: dataTable.state[stateId] = persistentState intervals = self.xpath("pmml:Interval") values = self.xpath("pmml:Value") if "binType" not in persistentState: performanceTable.begin("establish binType") binType = PlotHistogram.establishBinType(slicedDataColumn.fieldType, intervals, values) persistentState["binType"] = binType if binType == "nonuniform": persistentState["distributions"] = [NP("empty", 0, dtype=profiledDataColumn.fieldType.dtype) for x in xrange(len(intervals))] elif binType == "explicit": persistentState["distributions"] = [NP("empty", 0, dtype=profiledDataColumn.fieldType.dtype) for x in xrange(len(values))] elif binType == "unique": persistentState["distributions"] = {} elif binType == "scale": numBins = self.get("numBins", convertType=True) low = self.get("low", convertType=True) high = self.get("high", convertType=True) numBins, low, high = PlotHistogram.determineScaleBins(numBins, low, high, slicedArray)
persistentState["low"] = low persistentState["high"] = high persistentState["numBins"] = numBins persistentState["distributions"] = [NP("empty", 0, dtype=profiledDataColumn.fieldType.dtype) for x in xrange(numBins)] performanceTable.end("establish binType") if persistentState["binType"] == "nonuniform": performanceTable.begin("binType nonuniform") distributions = [None] * len(intervals) state.edges = [] lastLimitPoint = None lastClosed = None lastInterval = None for index, interval in enumerate(intervals): selection, lastLimitPoint, lastClosed, lastInterval = PlotHistogram.selectInterval(slicedDataColumn.fieldType, slicedArray, index, len(intervals) - 1, interval, state.edges, lastLimitPoint, lastClosed, lastInterval) if selection is None: distributions[index] = profiledArray else: distributions[index] = profiledArray[selection] persistentState["distributions"] = [NP("concatenate", [x, y]) for x, y in itertools.izip(persistentState["distributions"], distributions)] distributions = persistentState["distributions"] lowEdge = min(low for low, high in state.edges if low is not None) highEdge = max(high for low, high in state.edges if high is not None) state.slicedFieldType = self.fieldTypeNumeric performanceTable.end("binType nonuniform") elif persistentState["binType"] == "explicit": performanceTable.begin("binType explicit") distributions = [None] * len(values) displayValues = [] for index, value in enumerate(values): internalValue = slicedDataColumn.fieldType.stringToValue(value["value"]) displayValues.append(value.get("displayValue", slicedDataColumn.fieldType.valueToString(internalValue, displayValue=True))) selection = NP(slicedArray == internalValue) distributions[index] = profiledArray[selection] persistentState["distributions"] = [NP("concatenate", [x, y]) for x, y in itertools.izip(persistentState["distributions"], distributions)] distributions = persistentState["distributions"] state.edges = displayValues state.slicedFieldType = slicedDataColumn.fieldType performanceTable.end("binType explicit") elif persistentState["binType"] == "unique": performanceTable.begin("binType unique") uniques, inverse = NP("unique", slicedArray, return_inverse=True) persistentDistributions = persistentState["distributions"] for i, u in enumerate(uniques): string = slicedDataColumn.fieldType.valueToString(u, displayValue=False) selection = NP(inverse == i) if string in persistentDistributions: persistentDistributions[string] = NP("concatenate", [persistentDistributions[string], profiledArray[selection]]) else: persistentDistributions[string] = profiledArray[selection] tosort = [(len(distribution), string) for string, distribution in persistentDistributions.items()] tosort.sort(reverse=True) numBins = self.get("numBins", convertType=True) if numBins is not None: tosort = tosort[:numBins] distributions = [persistentDistributions[string] for count, string in tosort] state.edges = [slicedDataColumn.fieldType.valueToString(slicedDataColumn.fieldType.stringToValue(string), displayValue=True) for count, string in tosort] state.slicedFieldType = slicedDataColumn.fieldType performanceTable.end("binType unique") elif persistentState["binType"] == "scale": performanceTable.begin("binType scale") numBins = persistentState["numBins"] low = persistentState["low"] high = persistentState["high"] binWidth = (high - low) / float(numBins) binAssignments = NP("array", NP("floor", NP(NP(slicedArray - low)/binWidth)), dtype=NP.dtype(int)) distributions = [None] * numBins for index in xrange(numBins): selection = 
NP(binAssignments == index) distributions[index] = profiledArray[selection] persistentState["distributions"] = [NP("concatenate", [x, y]) for x, y in itertools.izip(persistentState["distributions"], distributions)] distributions = persistentState["distributions"] state.edges = [(low + i*binWidth, low + (i + 1)*binWidth) for i in xrange(numBins)] lowEdge = low highEdge = high state.slicedFieldType = self.fieldTypeNumeric performanceTable.end("binType scale") levels = self.get("levels", defaultFromXsd=True) lowWhisker = self.get("lowWhisker", defaultFromXsd=True, convertType=True) lowBox = self.get("lowBox", defaultFromXsd=True, convertType=True) midLine = self.get("midLine", defaultFromXsd=True, convertType=True) highBox = self.get("highBox", defaultFromXsd=True, convertType=True) highWhisker = self.get("highWhisker", defaultFromXsd=True, convertType=True) state.ranges = [] minProfiled = None maxProfiled = None for distribution in distributions: if levels == "percentage": if len(distribution) > 0: state.ranges.append(NP("percentile", distribution, [lowWhisker, lowBox, midLine, highBox, highWhisker])) else: state.ranges.append(None) elif levels == "standardDeviation": mu = NP("mean", distribution) sigma = NP("std", distribution, ddof=1) if NP("isfinite", sigma) and sigma > 0.0: state.ranges.append([(lowWhisker - mu)/sigma, (lowBox - mu)/sigma, (midLine - mu)/sigma, (highBox - mu)/sigma, (highWhisker - mu)/sigma]) else: state.ranges.append(None) if state.ranges[-1] is not None: if minProfiled is None: minProfiled = min(state.ranges[-1]) maxProfiled = max(state.ranges[-1]) else: minProfiled = min(minProfiled, min(state.ranges[-1])) maxProfiled = max(maxProfiled, max(state.ranges[-1])) state.profiledFieldType = profiledDataColumn.fieldType if self.get("vertical", defaultFromXsd=True, convertType=True): if state.slicedFieldType is self.fieldTypeNumeric: plotRange.xminPush(lowEdge, state.slicedFieldType, sticky=False) plotRange.xmaxPush(highEdge, state.slicedFieldType, sticky=False) if minProfiled is not None: plotRange.yminPush(minProfiled, state.profiledFieldType, sticky=False) plotRange.ymaxPush(maxProfiled, state.profiledFieldType, sticky=False) else: strings = NP("array", state.edges, dtype=NP.dtype(object)) if minProfiled is not None: values = NP("ones", len(state.edges), dtype=state.profiledFieldType.dtype) * maxProfiled values[0] = minProfiled else: values = NP("zeros", len(state.edges), dtype=state.profiledFieldType.dtype) plotRange.expand(strings, values, state.slicedFieldType, state.profiledFieldType) else: if state.slicedFieldType is self.fieldTypeNumeric: plotRange.yminPush(lowEdge, state.slicedFieldType, sticky=False) plotRange.ymaxPush(highEdge, state.slicedFieldType, sticky=False) if minProfiled is not None: plotRange.xminPush(minProfiled, state.profiledFieldType, sticky=False) plotRange.xmaxPush(maxProfiled, state.profiledFieldType, sticky=False) else: strings = NP("array", state.edges, dtype=NP.dtype(object)) if minProfiled is not None: values = NP("ones", len(state.edges), dtype=state.profiledFieldType.dtype) * maxProfiled values[0] = minProfiled else: values = NP("zeros", len(state.edges), dtype=state.profiledFieldType.dtype) plotRange.expand(values, strings, state.profiledFieldType, state.slicedFieldType) performanceTable.end("PlotBoxAndWhisker prepare")
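# Illustrative sketch (not part of the library): with levels="percentage", prepare()
# above reduces each bin's distribution to five percentiles (low whisker, low box,
# midline, high box, high whisker) that draw() later renders. Standalone NumPy version
# with made-up data; the level values 5, 25, 50, 75, 95 are assumed defaults here.
import numpy

def boxLevels(distribution, levels=(5.0, 25.0, 50.0, 75.0, 95.0)):
    if len(distribution) == 0:
        return None          # empty bins are skipped when drawing
    return numpy.percentile(distribution, list(levels))

if __name__ == "__main__":
    numpy.random.seed(0)
    bins = [numpy.random.normal(loc=i, size=200) for i in range(3)]
    print([boxLevels(b) for b in bins])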
def mapReduce(self): """Build a MapReduce-Ready K-means producer. Used by C{optimize} and C{hadoopOptimize}. @rtype: MapReduce @return: An instance of MapReduce that can either be run in pure-Python mode or submitted to Hadoop. """ class KMeansMapReduceApplication(MapReduceKMeans): metadata = {} allChangeThreshold = self.allChangeThreshold KMeansMapReduceApplication.metadata["ClusteringModel"] = self.clusteringModel clusterVectors = {} for index, cluster in enumerate(self.clusteringModel.xpath("pmml:Cluster")): clusterName = cluster.get("id", "%d" % (index + 1)) clusterVectors[clusterName] = NP("array", cluster.childOfTag("Array").values(), dtype=NP.dtype(float)) KMeansMapReduceApplication.metadata["clusterVectors"] = clusterVectors self.KMeansMapReduceApplication = KMeansMapReduceApplication return MapReduce(KMeansMapReduceApplication)
def smallTrials(self, dataTable, numberOfTrials=5, recordsPerTrial=100, performanceTable=None): """Improve the initial seed with a few small trials on random subsets of the data. Modifies C{self.clusteringModel}. @type dataTable: DataTable @param dataTable: The input data. @type numberOfTrials: int @param numberOfTrials: The number of independent trials with the same number of C{recordsPerTrial}. The trial with the smallest sum of in-cluster variances wins. @type recordsPerTrial: int @param recordsPerTrial: The number of rows to randomly select from the DataTable in each trial. @type performanceTable: PerformanceTable or None @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation. """ if performanceTable is None: performanceTable = FakePerformanceTable() performanceTable.begin("smallTrials") mapReduce = self.mapReduce() self.KMeansMapReduceApplication.metadata["ClusteringModel"] = copy.deepcopy(self.KMeansMapReduceApplication.metadata["ClusteringModel"]) bestVariance = None bestSeed = None for trialNumber in xrange(numberOfTrials): indexes = random.sample(xrange(len(dataTable)), recordsPerTrial) subTable = dataTable.subTable(NP("array", indexes, dtype=NP.dtype(int))) self.randomSeeds(dataTable) mapReduce.metadata["ClusteringModel"] = self.clusteringModel outputRecords, outputKeyValues, numberOfIterations = mapReduce.run([subTable], parallel=False, frozenClass=False, numberOfMappers=1, numberOfReducers=1, iterationLimit=self.iterationLimit) for extension in self.clusteringModel.xpath("pmml:Extension[@name='iterations.smallTrials']"): extension["value"] = repr(int(extension["value"]) + numberOfIterations) mapReduce.metadata["ClusteringModel"]["modelName"] = "smallTrials" mapReduce.metadata["ClusteringModel"].subFields = dict(mapReduce.metadata["ClusteringModel"].subFields) mapReduce.metadata["ClusteringModel"].subFields.update({"affinity": True}) mapReduce.metadata["ClusteringModel"].calculate(subTable) data = subTable.fields["smallTrials.affinity"].data mask = subTable.fields["smallTrials.affinity"].mask if mask is None: variance = NP(data**2).sum() / float(len(subTable)) else: selection = NP(mask == defs.VALID) denom = NP("count_nonzero", selection) if denom > 0: variance = NP(data[selection]**2).sum() / float(denom) else: variance = None if variance is not None and (bestVariance is None or variance < bestVariance): bestVariance = variance bestSeed = mapReduce.metadata["clusterVectors"] if bestSeed is not None: self.explicitSeeds(bestSeed) performanceTable.end("smallTrials")
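# Illustrative sketch (not part of the library): the idea behind smallTrials() is to run
# several cheap clusterings on random subsets and keep the seed whose subset has the
# smallest mean squared distance to its nearest cluster center. This standalone NumPy
# version replaces the MapReduce k-means with a single nearest-center assignment pass,
# so it only sketches the seed-selection logic, not the full producer.
import numpy

def bestSeedBySmallTrials(data, k=3, numberOfTrials=5, recordsPerTrial=100, rng=None):
    rng = numpy.random.RandomState(0) if rng is None else rng
    bestVariance, bestSeed = None, None
    for _ in range(numberOfTrials):
        subset = data[rng.choice(len(data), size=min(recordsPerTrial, len(data)), replace=False)]
        seed = subset[rng.choice(len(subset), size=k, replace=False)]      # random starting centers
        distances = numpy.linalg.norm(subset[:, None, :] - seed[None, :, :], axis=2)
        variance = numpy.mean(distances.min(axis=1) ** 2)                  # in-cluster variance proxy
        if bestVariance is None or variance < bestVariance:
            bestVariance, bestSeed = variance, seed
    return bestSeed

if __name__ == "__main__":
    points = numpy.random.RandomState(1).normal(size=(500, 2))
    print(bestSeedBySmallTrials(points, k=3))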
def endReducerKey(self, key): for clusterName in self.clusterVectors.keys(): if clusterName == key: newPosition = NP("array", [self.numer[fieldName] / self.denom[fieldName] if self.denom[fieldName] > 0.0 else 0.0 for fieldName in self.fieldNames], dtype=NP.dtype(float)) self.emit(clusterName, newPosition) break
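# Illustrative sketch (not part of the library): the reducer above emits a new cluster
# position as the per-field ratio of accumulated sums (numer) to accumulated weights or
# counts (denom), falling back to 0.0 for fields that received no contributions.
import numpy

def newClusterPosition(numer, denom, fieldNames):
    return numpy.array([numer[f] / denom[f] if denom[f] > 0.0 else 0.0 for f in fieldNames],
                       dtype=numpy.dtype(float))

if __name__ == "__main__":
    print(newClusterPosition({"x": 6.0, "y": 0.0}, {"x": 3.0, "y": 0.0}, ["x", "y"]))  # [2.0, 0.0]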
def _selectFirst(self, dataTable, functionTable, performanceTable, segmentation): """Used by C{calculateScore}.""" performanceTable.begin("Segmentation selectFirst") scoresData = NP("empty", len(dataTable), dtype=NP.dtype(object)) scoresMask = NP("zeros", len(dataTable), dtype=defs.maskType) unfilled = NP("ones", len(dataTable), dtype=NP.dtype(bool)) segments = NP("empty", len(dataTable), dtype=NP.dtype(object)) newOutputData = [] for segment in segmentation.childrenOfTag("Segment", iterator=True): performanceTable.pause("Segmentation selectFirst") selection = segment.childOfClass(PmmlPredicate).evaluate( dataTable, functionTable, performanceTable) performanceTable.unpause("Segmentation selectFirst") NP("logical_and", selection, unfilled, selection) if not selection.any(): continue subTable = dataTable.subTable(selection) subModel = segment.childOfClass(PmmlModel) performanceTable.pause("Segmentation selectFirst") subModel.calculate(subTable, functionTable, performanceTable) performanceTable.unpause("Segmentation selectFirst") scoresData[selection] = subTable.score.data if subTable.score.mask is not None: scoresMask[selection] = subTable.score.mask else: scoresMask[selection] = defs.VALID segmentName = segment.get("id") if segmentName is not None: segments[selection] = segmentName for fieldName, dataColumn in subTable.output.items(): if fieldName not in dataTable.output: data = NP("empty", len(dataTable), dtype=dataColumn.fieldType.dtype) data[selection] = dataColumn.data mask = NP( NP("ones", len(dataTable), dtype=defs.maskType) * defs.MISSING) if dataColumn.mask is None: mask[selection] = defs.VALID else: mask[selection] = dataColumn.mask newDataColumn = DataColumn(dataColumn.fieldType, data, mask) newDataColumn._unlock() dataTable.output[fieldName] = newDataColumn newOutputData.append(newDataColumn) else: newDataColumn = dataTable.output[fieldName] newDataColumn.data[selection] = dataColumn.data if dataColumn.mask is None: newDataColumn.mask[selection] = defs.VALID else: newDataColumn.mask[selection] = dataColumn.mask unfilled -= selection if not unfilled.any(): break for newDataColumn in newOutputData: if not newDataColumn.mask.any(): newDataColumn._mask = None newDataColumn._lock() if not scoresMask.any(): scoresMask = None scores = DataColumn(self.scoreType, scoresData, scoresMask) if self.name is None: performanceTable.end("Segmentation selectFirst") return {None: scores} else: performanceTable.end("Segmentation selectFirst") return { None: scores, "segment": DataColumn(self.scoreTypeSegment, segments, None) }
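# Illustrative sketch (not part of the library): selectFirst scores each row with the
# first segment whose predicate matches and never overwrites a row that has already been
# filled. Standalone NumPy version with predicates and models reduced to plain callables
# over a 1-d array of values (no masks, no output fields).
import numpy

def selectFirst(values, segments):
    """segments: list of (predicate, model) pairs, each a callable on a NumPy array."""
    scores = numpy.empty(len(values), dtype=object)
    unfilled = numpy.ones(len(values), dtype=bool)
    for predicate, model in segments:
        selection = numpy.logical_and(predicate(values), unfilled)
        if not selection.any():
            continue
        scores[selection] = model(values[selection])
        unfilled &= ~selection
        if not unfilled.any():
            break
    return scores

if __name__ == "__main__":
    x = numpy.array([-2.0, 0.5, 3.0])
    print(selectFirst(x, [(lambda v: v < 0, lambda v: v * 10),
                          (lambda v: v >= 0, lambda v: v + 1)]))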
def _selectAllMedianMajority(self, dataTable, functionTable, performanceTable, segmentation, which): """Used by C{calculateScore}.""" if which is self.SELECT_ALL: performanceLabel = "Segmentation selectAll" elif which is self.MEDIAN: performanceLabel = "Segmentation median" elif which is self.MAJORITY_VOTE: performanceLabel = "Segmentation majorityVote" elif which is self.WEIGHTED_MAJORITY_VOTE: performanceLabel = "Segmentation weightedMajorityVote" performanceTable.begin(performanceLabel) scores = [[] for x in xrange(len(dataTable))] if which is self.SELECT_ALL: segments = [[] for x in xrange(len(dataTable))] newOutputData = {} for segment in segmentation.childrenOfTag("Segment", iterator=True): performanceTable.pause(performanceLabel) selection = segment.childOfClass(PmmlPredicate).evaluate( dataTable, functionTable, performanceTable) performanceTable.unpause(performanceLabel) if not selection.any(): continue segmentName = segment.get("id") indexes = NP("nonzero", selection)[0] subTable = dataTable.subTable(selection) subModel = segment.childOfClass(PmmlModel) performanceTable.pause(performanceLabel) subModel.calculate(subTable, functionTable, performanceTable) performanceTable.unpause(performanceLabel) if which is self.MEDIAN and subTable.score.fieldType.dataType in ( "string", "boolean", "object"): raise defs.PmmlValidationError( "Segmentation with multipleModelMethod=\"median\" cannot be applied to models that produce dataType \"%s\"" % subTable.score.fieldType.dataType) scoreData = subTable.score.data scoreMask = subTable.score.mask indexesUsed = indexes if which is self.SELECT_ALL: for subIndex, index in enumerate(indexes): if scoreMask is None or scoreMask[subIndex] == defs.VALID: scores[index].append(scoreData[subIndex]) segments[index].append(segmentName) elif which is self.MEDIAN: for subIndex, index in enumerate(indexes): if scoreMask is None or scoreMask[subIndex] == defs.VALID: scores[index].append(scoreData[subIndex]) elif which in (self.MAJORITY_VOTE, self.WEIGHTED_MAJORITY_VOTE): if which is self.MAJORITY_VOTE: weight = 1.0 else: weight = float(segment.get("weight", 1.0)) for subIndex, index in enumerate(indexes): if scoreMask is None or scoreMask[subIndex] == defs.VALID: newValue = scoreData[subIndex] score = scores[index] found = False for pair in score: if pair[0] == newValue: pair[1] += weight found = True break if not found: score.append([newValue, weight]) if which is self.SELECT_ALL: for fieldName, dataColumn in subTable.output.items(): newData = newOutputData.get(fieldName) if newData is None: newData = [[] for x in xrange(len(dataTable))] newOutputData[fieldName] = newData dataColumnData = dataColumn.data dataColumnMask = dataColumn.mask for subIndex, index in enumerate(indexes): if scoreMask is None or scoreMask[ subIndex] == defs.VALID: if dataColumnMask is None or dataColumnMask[ subIndex] == defs.VALID: newData[index].append(dataColumnData[subIndex]) else: newData[index].append(None) if which is self.SELECT_ALL: for fieldName, newData in newOutputData.items(): finalNewData = NP("empty", len(dataTable), dtype=NP.dtype(object)) for index, newDatum in enumerate(newData): finalNewData[index] = tuple(newDatum) dataTable.output[fieldName] = DataColumn( self.scoreType, finalNewData, None) finalScoresData = NP("empty", len(dataTable), dtype=NP.dtype(object)) for index, score in enumerate(scores): finalScoresData[index] = tuple(score) finalScores = DataColumn(self.scoreType, finalScoresData, None) if self.name is None: performanceTable.end(performanceLabel) return 
{None: finalScores} else: finalSegmentsData = NP("empty", len(dataTable), dtype=NP.dtype(object)) for index, segment in enumerate(segments): finalSegmentsData[index] = tuple(segment) performanceTable.end(performanceLabel) return { None: finalScores, "segment": DataColumn(self.scoreTypeSegment, finalSegmentsData, None) } elif which is self.MEDIAN: finalScoresData = NP("empty", len(dataTable), dtype=NP.dtype(object)) finalScoresMask = NP("empty", len(dataTable), dtype=defs.maskType) for index, score in enumerate(scores): if len(score) > 0: finalScoresData[index] = NP("median", score) finalScoresMask[index] = defs.VALID else: finalScoresMask[index] = defs.INVALID if not finalScoresMask.any(): finalScoresMask = None finalScores = DataColumn(self.scoreType, finalScoresData, finalScoresMask) performanceTable.end(performanceLabel) return {None: finalScores} elif which in (self.MAJORITY_VOTE, self.WEIGHTED_MAJORITY_VOTE): finalScoresData = NP("empty", len(dataTable), dtype=NP.dtype(object)) finalScoresMask = NP("empty", len(dataTable), dtype=defs.maskType) cardinality = NP("empty", len(dataTable), dtype=self.scoreTypeCardinality.dtype) for index, score in enumerate(scores): bestN, bestValue = None, None for value, N in score: if bestN is None or N > bestN: bestN = N bestValue = value if bestN is not None: finalScoresData[index] = bestValue finalScoresMask[index] = defs.VALID cardinality[index] = bestN else: finalScoresMask[index] = defs.INVALID cardinality[index] = 0 if not finalScoresMask.any(): finalScoresMask = None finalScores = DataColumn(self.scoreType, finalScoresData, finalScoresMask) if self.name is None: performanceTable.end(performanceLabel) return {None: finalScores} else: finalCardinality = DataColumn(self.scoreTypeCardinality, cardinality, None) performanceTable.end(performanceLabel) return {None: finalScores, "cardinality": finalCardinality}
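# Illustrative sketch (not part of the library): for majorityVote / weightedMajorityVote,
# each row accumulates (value, weight) pairs from every matching segment and the final
# score is the value with the largest total weight, with the winning weight reported as
# the "cardinality". Standalone per-row version of that vote count:
def weightedMajorityVote(votes):
    """votes: list of (value, weight) pairs collected from the matching segments."""
    totals = {}
    for value, weight in votes:
        totals[value] = totals.get(value, 0.0) + weight
    if not totals:
        return None, 0.0                       # no segment matched: score is invalid
    best = max(totals.items(), key=lambda pair: pair[1])
    return best                                # (winning value, cardinality)

if __name__ == "__main__":
    print(weightedMajorityVote([("A", 1.0), ("B", 2.5), ("A", 2.0)]))  # ('A', 3.0)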
def draw(self, state, plotCoordinates, plotDefinitions, performanceTable): """Draw the plot element. This stage consists of creating an SVG image of the pre-computed data. @type state: ad-hoc Python object @param state: State information that persists long enough to use quantities computed in C{prepare} in the C{draw} stage. This is a work-around of lxml's refusal to let its Python instances maintain C{self} and it is unrelated to DataTableState. @type plotCoordinates: PlotCoordinates @param plotCoordinates: The coordinate system in which this plot element will be placed. @type plotDefinitions: PlotDefinitions @param plotDefinitions: The dictionary of key-value pairs that forms the <defs> section of the SVG document. @type performanceTable: PerformanceTable @param performanceTable: Measures and records performance (time and memory consumption) of the drawing process. @rtype: SvgBinding @return: An SVG fragment representing the fully drawn plot element. """ svg = SvgBinding.elementMaker performanceTable.begin("PlotGuideLines draw") output = svg.g() for directive in self.xpath("pmml:PlotVerticalLines | pmml:PlotHorizontalLines | pmml:PlotLine"): style = dict(self.styleDefaults) currentStyle = directive.get("style") if currentStyle is not None: style.update(PlotStyle.toDict(currentStyle)) style["fill"] = "none" style = PlotStyle.toString(style) if directive.hasTag("PlotVerticalLines"): try: x0 = plotCoordinates.xfieldType.stringToValue(directive["x0"]) except ValueError: raise defs.PmmlValidationError("Invalid x0: %r" % directive["x0"]) spacing = float(directive["spacing"]) low = plotCoordinates.innerX1 high = plotCoordinates.innerX2 up = list(NP("arange", x0, high, spacing, dtype=NP.dtype(float))) down = list(NP("arange", x0 - spacing, low, -spacing, dtype=NP.dtype(float))) for x in up + down: x1, y1 = x, float("-inf") X1, Y1 = plotCoordinates(x1, y1) x2, y2 = x, float("inf") X2, Y2 = plotCoordinates(x2, y2) output.append(svg.path(d="M %r %r L %r %r" % (X1, Y1, X2, Y2), style=style)) elif directive.hasTag("PlotHorizontalLines"): try: y0 = plotCoordinates.xfieldType.stringToValue(directive["y0"]) except ValueError: raise defs.PmmlValidationError("Invalid y0: %r" % directive["y0"]) spacing = float(directive["spacing"]) low = plotCoordinates.innerY1 high = plotCoordinates.innerY2 up = list(NP("arange", y0, high, spacing, dtype=NP.dtype(float))) down = list(NP("arange", y0 - spacing, low, -spacing, dtype=NP.dtype(float))) for y in up + down: x1, y1 = float("-inf"), y X1, Y1 = plotCoordinates(x1, y1) x2, y2 = float("inf"), y X2, Y2 = plotCoordinates(x2, y2) output.append(svg.path(d="M %r %r L %r %r" % (X1, Y1, X2, Y2), style=style)) elif directive.hasTag("PlotLine"): try: x1 = plotCoordinates.xfieldType.stringToValue(directive["x1"]) y1 = plotCoordinates.xfieldType.stringToValue(directive["y1"]) x2 = plotCoordinates.xfieldType.stringToValue(directive["x2"]) y2 = plotCoordinates.xfieldType.stringToValue(directive["y2"]) except ValueError: raise defs.PmmlValidationError("Invalid x1, y1, x2, or y2: %r %r %r %r" % (directive["x1"], directive["y1"], directive["x2"], directive["y2"])) X1, Y1 = plotCoordinates(x1, y1) X2, Y2 = plotCoordinates(x2, y2) output.append(svg.path(d="M %r %r L %r %r" % (X1, Y1, X2, Y2), style=style)) svgId = self.get("svgId") if svgId is not None: output["id"] = svgId performanceTable.end("PlotGuideLines draw") return output
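# Illustrative sketch (not part of the library): PlotVerticalLines / PlotHorizontalLines
# generate one guide line per multiple of "spacing" away from the anchor x0 (or y0),
# walking up to the high edge and down to the low edge of the plot window, exactly as
# the two arange calls above do. Standalone version of that position generator:
import numpy

def guideLinePositions(x0, spacing, low, high):
    up = numpy.arange(x0, high, spacing, dtype=float)
    down = numpy.arange(x0 - spacing, low, -spacing, dtype=float)
    return sorted(list(up) + list(down))

if __name__ == "__main__":
    print(guideLinePositions(0.0, 2.5, -4.0, 6.0))   # [-2.5, 0.0, 2.5, 5.0]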
def _sumAverageWeighted(self, dataTable, functionTable, performanceTable, segmentation, which): """Used by C{calculateScore}.""" if which is self.SUM: performanceLabel = "Segmentation sum" elif which is self.AVERAGE: performanceLabel = "Segmentation average" elif which is self.WEIGHTED_AVERAGE: performanceLabel = "Segmentation weightedAverage" performanceTable.begin(performanceLabel) scoresData = NP("zeros", len(dataTable), dtype=NP.dtype(object)) if which is not self.SUM: denominator = NP("zeros", len(dataTable), dtype=NP.dtype(float)) invalid = NP("zeros", len(dataTable), dtype=NP.dtype(bool)) for segment in segmentation.childrenOfTag("Segment", iterator=True): performanceTable.pause(performanceLabel) selection = segment.childOfClass(PmmlPredicate).evaluate( dataTable, functionTable, performanceTable) performanceTable.unpause(performanceLabel) if not selection.any(): continue subTable = dataTable.subTable(selection) subModel = segment.childOfClass(PmmlModel) performanceTable.pause(performanceLabel) subModel.calculate(subTable, functionTable, performanceTable) performanceTable.unpause(performanceLabel) if subTable.score.fieldType.dataType in ("string", "boolean", "object"): raise defs.PmmlValidationError( "Segmentation with multipleModelMethod=\"%s\" cannot be applied to models that produce dataType \"%s\"" % (self.childOfTag("Segmentation").get( "multipleModelMethod"), subTable.score.fieldType.dataType)) # ignore invalid in matches (like the built-in "+" and "avg" Apply functions) if subTable.score.mask is not None: NP("logical_and", selection, NP(subTable.score.mask == defs.VALID), selection) if which is self.SUM: scoresData[selection] += subTable.score.data if which is self.AVERAGE: scoresData[selection] += subTable.score.data denominator[selection] += 1.0 elif which is self.WEIGHTED_AVERAGE: weight = float(segment.get("weight", 1.0)) scoresData[selection] += (subTable.score.data * weight) denominator[selection] += weight if subTable.score.mask is not None: invalid[selection] = NP("logical_or", invalid[selection], NP(subTable.score.mask != defs.VALID)) if which is not self.SUM: NP("logical_or", invalid, NP(denominator == 0.0), invalid) valid = NP("logical_not", invalid) scoresData[valid] /= denominator[valid] if invalid.any(): scoresMask = NP( NP("array", invalid, dtype=defs.maskType) * defs.INVALID) else: scoresMask = None scores = DataColumn(self.scoreType, scoresData, scoresMask) performanceTable.end(performanceLabel) return {None: scores}
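# Illustrative sketch (not part of the library): for average / weightedAverage, each
# matching segment adds (weight * score) into a running numerator and its weight into a
# running denominator; rows whose denominator stays zero are marked invalid. Standalone
# NumPy version with segments reduced to (selection, scores, weight) triples and NaN
# standing in for the INVALID mask:
import numpy

def weightedAverageScores(n, segments):
    numerator = numpy.zeros(n, dtype=float)
    denominator = numpy.zeros(n, dtype=float)
    for selection, scores, weight in segments:
        numerator[selection] += scores * weight
        denominator[selection] += weight
    valid = denominator > 0.0
    result = numpy.empty(n, dtype=float)
    result[valid] = numerator[valid] / denominator[valid]
    result[~valid] = numpy.nan
    return result

if __name__ == "__main__":
    sel = numpy.array([True, True, False])
    print(weightedAverageScores(3, [(sel, numpy.array([1.0, 3.0]), 2.0)]))  # [1. 3. nan]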
def _setup(self): if self.optype != "continuous" and len(self.intervals) > 0: raise defs.PmmlValidationError("Non-continuous fields cannot have Intervals") self._displayValue = {} if self.dataType == "object": # for scoring results that don't fit the PMML pattern self.toDataColumn = self._toDataColumn_object self.fromDataColumn = self._fromDataColumn_object self.dtype = NP.dtype(object) self.stringToValue = self._stringToValue_object self.valueToString = self._valueToString_object self.valueToPython = self._valueToPython elif self.dataType == "string": if self.optype == "categorical": self._stringToValue = {} # TODO: merge categorical and ordinal <Value> handling self._valueToString = {} # into _checkValues(data, mask) self._newValuesAllowed = True for value in self.values: v = value.get("value") displayValue = value.get("displayValue") if displayValue is not None: self._displayValue[v] = displayValue if value.get("property", "valid") == "valid": self._addCategorical(v) if len(self._stringToValue) > 0: self._newValuesAllowed = False self.toDataColumn = self._toDataColumn_internal self.fromDataColumn = self._fromDataColumn self.dtype = NP.int64 self.stringToValue = self._stringToValue_categorical self.valueToString = self._valueToString_categorical self.valueToPython = self._valueToString_categorical elif self.optype == "ordinal": self._stringToValue = {} # TODO: see above self._valueToString = {} self._newValuesAllowed = True for value in self.values: v = value.get("value") displayValue = value.get("displayValue") if displayValue is not None: self._displayValue[v] = displayValue if value.get("property", "valid") == "valid": self._addOrdinal(v) self._newValuesAllowed = False self.toDataColumn = self._toDataColumn_internal self.fromDataColumn = self._fromDataColumn self.dtype = NP.dtype(int) self.stringToValue = self._stringToValue_ordinal self.valueToString = self._valueToString_ordinal self.valueToPython = self._valueToString_ordinal elif self.optype == "continuous": self.toDataColumn = self._toDataColumn_string self.fromDataColumn = self._fromDataColumn_object self.dtype = NP.dtype(object) self.stringToValue = self._stringToValue_string self.valueToString = self._valueToString_string self.valueToPython = self._valueToString_string else: raise defs.PmmlValidationError("Unrecognized optype: %s" % self.optype) elif self.dataType == "integer": self.toDataColumn = self._toDataColumn_number self.fromDataColumn = self._fromDataColumn_number self.dtype = NP.dtype(int) self.stringToValue = self._stringToValue_integer self.valueToString = self._valueToString_integer self.valueToPython = self._valueToPython elif self.dataType == "float": self.toDataColumn = self._toDataColumn_number self.fromDataColumn = self._fromDataColumn_number self.dtype = NP.float32 self.stringToValue = self._stringToValue_float self.valueToString = self._valueToString_float self.valueToPython = self._valueToPython elif self.dataType == "double": self.toDataColumn = self._toDataColumn_number self.fromDataColumn = self._fromDataColumn_number self.dtype = NP.dtype(float) self.stringToValue = self._stringToValue_double self.valueToString = self._valueToString_double self.valueToPython = self._valueToPython elif self.dataType == "boolean": self.toDataColumn = self._toDataColumn_number self.fromDataColumn = self._fromDataColumn_number self.dtype = NP.dtype(bool) self.stringToValue = self._stringToValue_boolean self.valueToString = self._valueToString_boolean self.valueToPython = self._valueToPython elif self.dataType == "date": 
self.toDataColumn = self._toDataColumn_dateTime self.fromDataColumn = self._fromDataColumn self.dtype = NP.int64 self.stringToValue = self._stringToValue_date self.valueToString = self._valueToString_date self.valueToPython = self._valueToPython_date elif self.dataType == "time": self.toDataColumn = self._toDataColumn_dateTime self.fromDataColumn = self._fromDataColumn self.dtype = NP.int64 self.stringToValue = self._stringToValue_time self.valueToString = self._valueToString_time self.valueToPython = self._valueToPython_time elif self.dataType == "dateTime": self.toDataColumn = self._toDataColumn_dateTime self.fromDataColumn = self._fromDataColumn self.dtype = NP.int64 self.stringToValue = self._stringToValue_dateTime self.valueToString = self._valueToString_dateTime self.valueToPython = self._valueToPython_dateTime elif self.dataType == "dateDaysSince[0]": # _offset is the number of seconds between 1/1/1 B.C. and 1/1/1970, using the astronomical convention # that 1 B.C. is "year zero" (which does not exist, even in the proleptic Gregorian calendar) # and that this fictitious year would have been a leap year (366 full days) # http://en.wikipedia.org/wiki/Year_zero#Astronomers self._offset = -62167219200 * self._dateTimeResolution self._factor = 86400 * self._dateTimeResolution # number of microseconds in a day self.toDataColumn = self._toDataColumn_dateTimeNumber self.fromDataColumn = self._fromDataColumn_dateTimeNumber self.dtype = NP.int64 self.stringToValue = self._stringToValue_dateTimeNumber self.valueToString = self._valueToString_dateTimeNumber self.valueToPython = self._valueToPython_dateTimeNumber elif self.dataType == "dateDaysSince[1960]": self._offset = -315619200 * self._dateTimeResolution # number of seconds between 1/1/1960 and 1/1/1970, accounting for leap years/leap seconds self._factor = 86400 * self._dateTimeResolution # number of microseconds in a day self.toDataColumn = self._toDataColumn_dateTimeNumber self.fromDataColumn = self._fromDataColumn_dateTimeNumber self.dtype = NP.int64 self.stringToValue = self._stringToValue_dateTimeNumber self.valueToString = self._valueToString_dateTimeNumber self.valueToPython = self._valueToPython_dateTimeNumber elif self.dataType == "dateDaysSince[1970]": self._offset = 0 self._factor = 86400 * self._dateTimeResolution # number of microseconds in a day self.toDataColumn = self._toDataColumn_dateTimeNumber self.fromDataColumn = self._fromDataColumn_dateTimeNumber self.dtype = NP.int64 self.stringToValue = self._stringToValue_dateTimeNumber self.valueToString = self._valueToString_dateTimeNumber self.valueToPython = self._valueToPython_dateTimeNumber elif self.dataType == "dateDaysSince[1980]": self._offset = 315532800 * self._dateTimeResolution # number of seconds between 1/1/1980 and 1/1/1970, accounting for leap years/leap seconds self._factor = 86400 * self._dateTimeResolution # number of microseconds in a day self.toDataColumn = self._toDataColumn_dateTimeNumber self.fromDataColumn = self._fromDataColumn_dateTimeNumber self.dtype = NP.int64 self.stringToValue = self._stringToValue_dateTimeNumber self.valueToString = self._valueToString_dateTimeNumber self.valueToPython = self._valueToPython_dateTimeNumber elif self.dataType == "timeSeconds": self._offset = 0 self._factor = self._dateTimeResolution # number of microseconds in a second self.toDataColumn = self._toDataColumn_dateTimeNumber self.fromDataColumn = self._fromDataColumn_timeSeconds # reports modulo 1 day self.dtype = NP.int64 self.stringToValue = 
self._stringToValue_dateTimeNumber self.valueToString = self._valueToString_timeSeconds # reports modulo 1 day self.valueToPython = self._valueToPython_timeSeconds # reports modulo 1 day elif self.dataType == "dateTimeSecondsSince[0]": self._offset = -62167219200 * self._dateTimeResolution # number of seconds between 1/1/1 B.C. and 1/1/1970, accounting for leap years/leap seconds self._factor = self._dateTimeResolution # number of microseconds in a second self.toDataColumn = self._toDataColumn_dateTimeNumber self.fromDataColumn = self._fromDataColumn_dateTimeNumber self.dtype = NP.int64 self.stringToValue = self._stringToValue_dateTimeNumber self.valueToString = self._valueToString_dateTimeNumber self.valueToPython = self._valueToPython_dateTimeNumber elif self.dataType == "dateTimeSecondsSince[1960]": self._offset = -315619200 * self._dateTimeResolution # number of seconds between 1/1/1960 and 1/1/1970, accounting for leap years/leap seconds self._factor = self._dateTimeResolution # number of microseconds in a second self.toDataColumn = self._toDataColumn_dateTimeNumber self.fromDataColumn = self._fromDataColumn_dateTimeNumber self.dtype = NP.int64 self.stringToValue = self._stringToValue_dateTimeNumber self.valueToString = self._valueToString_dateTimeNumber self.valueToPython = self._valueToPython_dateTimeNumber elif self.dataType == "dateTimeSecondsSince[1970]": self._offset = 0 self._factor = self._dateTimeResolution # number of microseconds in a second self.toDataColumn = self._toDataColumn_dateTimeNumber self.fromDataColumn = self._fromDataColumn_dateTimeNumber self.dtype = NP.int64 self.stringToValue = self._stringToValue_dateTimeNumber self.valueToString = self._valueToString_dateTimeNumber self.valueToPython = self._valueToPython_dateTimeNumber elif self.dataType == "dateTimeSecondsSince[1980]": self._offset = 315532800 * self._dateTimeResolution # number of seconds between 1/1/1980 and 1/1/1970, accounting for leap years/leap seconds self._factor = self._dateTimeResolution # number of microseconds in a second self.toDataColumn = self._toDataColumn_dateTimeNumber self.fromDataColumn = self._fromDataColumn_dateTimeNumber self.dtype = NP.int64 self.stringToValue = self._stringToValue_dateTimeNumber self.valueToString = self._valueToString_dateTimeNumber self.valueToPython = self._valueToPython_dateTimeNumber else: raise defs.PmmlValidationError("Unrecognized dataType: %s" % self.dataType) self._hash = hash((self.dataType, self.optype, tuple(self.values), tuple(self.intervals), self.isCyclic))
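# Illustrative sketch (not part of the library): the hard-coded epoch offsets above can
# be checked against Python's datetime arithmetic (POSIX timestamps ignore leap seconds,
# so a plain day count is exact). The year-0 offset cannot be checked this way because
# datetime has no year 0.
import datetime

def secondsBetween(epoch, unixEpoch=datetime.datetime(1970, 1, 1)):
    return int((epoch - unixEpoch).total_seconds())

if __name__ == "__main__":
    print(secondsBetween(datetime.datetime(1960, 1, 1)))   # -315619200
    print(secondsBetween(datetime.datetime(1980, 1, 1)))   #  315532800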
def evaluate(self, dataTable, functionTable, performanceTable): """Evaluate the expression, using a DataTable as input. @type dataTable: DataTable @param dataTable: The input DataTable, containing any fields that might be used to evaluate this expression. @type functionTable: FunctionTable @param functionTable: The FunctionTable, containing any functions that might be called in this expression. @type performanceTable: PerformanceTable @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation. @rtype: DataColumn @return: The result of the calculation as a DataColumn. """ function = self["function"] groupField = self.get("groupField") if groupField is None: performanceTable.begin("Aggregate %s" % function) else: performanceTable.begin("Aggregate %s groupField" % function) dataColumn = dataTable.fields[self["field"]] whereMask = self.where(dataTable, functionTable, performanceTable) stateId = self.get("stateId") if groupField is None: if stateId is None: getstate = None setstate = None else: def getstate(): return dataTable.state.get(stateId) def setstate(value): dataTable.state[stateId] = value if function == "count": dataColumn = self.functionCount(dataColumn, whereMask, None, getstate, setstate) elif function == "sum": dataColumn = self.functionSum(dataColumn, whereMask, None, getstate, setstate) elif function == "average": dataColumn = self.functionAverage(dataColumn, whereMask, None, getstate, setstate) elif function == "min": dataColumn = self.functionMin(dataColumn, whereMask, None, getstate, setstate) elif function == "max": dataColumn = self.functionMax(dataColumn, whereMask, None, getstate, setstate) elif function == "multiset": dataColumn = self.functionMultiset(dataColumn, whereMask, None, getstate, setstate) performanceTable.end("Aggregate %s" % function) return dataColumn else: groupColumn = dataTable.fields[groupField] if groupColumn.mask is None: validGroup = groupColumn.data else: validGroup = groupColumn.data[NP(groupColumn.mask == defs.VALID)] if stateId is not None: state = dataTable.state.get(stateId) if state is None: record = {} else: record = state valuesSeen = dict((stringValue, False) for stringValue in record) groupTables = {} groupColumnFieldType = None for groupValue in NP("unique", validGroup): groupSelection = NP(groupColumn.data == groupValue) if groupColumn.mask is not None: NP("logical_and", groupSelection, NP(groupColumn.mask == defs.VALID), groupSelection) groupColumnFieldType = groupColumn.fieldType stringValue = groupColumnFieldType.valueToString(groupValue) if stringValue in record: def getstate(): return record[stringValue] else: getstate = None def setstate(value): record[stringValue] = value valuesSeen[stringValue] = True value = groupColumnFieldType.valueToPython(groupValue) if function == "count": groupTables[value] = self.functionCount(dataColumn, whereMask, groupSelection, getstate, setstate) elif function == "sum": groupTables[value] = self.functionSum(dataColumn, whereMask, groupSelection, getstate, setstate) elif function == "average": groupTables[value] = self.functionAverage(dataColumn, whereMask, groupSelection, getstate, setstate) elif function == "min": groupTables[value] = self.functionMin(dataColumn, whereMask, groupSelection, getstate, setstate) elif function == "max": groupTables[value] = self.functionMax(dataColumn, whereMask, groupSelection, getstate, setstate) elif function == "multiset": groupTables[value] = self.functionMultiset(dataColumn, whereMask, groupSelection, getstate, setstate) if stateId 
is not None: dataTable.state[stateId] = record for stringValue in valuesSeen: if not valuesSeen[stringValue]: value = groupColumnFieldType.valueToPython(groupColumnFieldType.stringToValue(stringValue)) if function == "count": groupTables[value] = self.functionCountFake(record[stringValue], len(dataTable), dataColumn.fieldType) elif function == "sum": groupTables[value] = self.functionSumFake(record[stringValue], len(dataTable), dataColumn.fieldType) elif function == "average": groupTables[value] = self.functionAverageFake(record[stringValue], len(dataTable), dataColumn.fieldType) elif function in ("min", "max"): groupTables[value] = self.functionMinMaxFake(record[stringValue], len(dataTable), dataColumn.fieldType) elif function == "multiset": groupTables[value] = self.functionMultisetFake(record[stringValue], len(dataTable), dataColumn.fieldType) performanceTable.begin("Aggregate %s groupField collect" % function) fieldType = FakeFieldType("object", "any") data = NP("empty", len(dataTable), dtype=NP.dtype(object)) if function == "count": for i in xrange(len(dataTable)): data[i] = dict((value, table.data[i]) for value, table in groupTables.items() if table.data[i] != 0) elif function == "sum": for i in xrange(len(dataTable)): data[i] = dict((value, table.data[i]) for value, table in groupTables.items() if table.data[i] != 0.0) elif function == "average": for i in xrange(len(dataTable)): data[i] = dict((value, table.data[i]) for value, table in groupTables.items() if table.data[i] > 0.0 or table.data[i] <= 0.0) elif function in ("min", "max"): for table in groupTables.values(): if table.mask is None: table._mask = NP("zeros", len(table), dtype=defs.maskType) for i in xrange(len(dataTable)): data[i] = dict((value, table.data[i]) for value, table in groupTables.items() if table.mask[i] == defs.VALID) elif function == "multiset": for i in xrange(len(dataTable)): data[i] = dict((value, table.data[i]) for value, table in groupTables.items() if len(table.data[i]) > 0) performanceTable.end("Aggregate %s groupField collect" % function) performanceTable.end("Aggregate %s groupField" % function) return DataColumn(fieldType, data, None)
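# Illustrative sketch (not part of the library): with a groupField, the aggregate is
# computed once per distinct group value and the per-row result is a dictionary mapping
# group value -> running aggregate up to that row. Standalone version of the "count"
# case, assuming the same cumulative semantics and ignoring masks and persistent state:
import numpy

def runningCountByGroup(groupValues):
    counts = {}
    out = numpy.empty(len(groupValues), dtype=object)
    for i, g in enumerate(groupValues):
        counts[g] = counts.get(g, 0) + 1
        out[i] = dict((k, v) for k, v in counts.items() if v != 0)
    return out

if __name__ == "__main__":
    print(runningCountByGroup(["a", "b", "a"]))  # [{'a': 1}, {'a': 1, 'b': 1}, {'a': 2, 'b': 1}]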
def functionMax(self, dataColumn, whereMask, groupSelection, getstate, setstate): """Finds the maximum of rows in a DataColumn, possibly with an SQL where mask and groupField. @type dataColumn: DataColumn @param dataColumn: The input data column. @type whereMask: 1d Numpy array of bool, or None @param whereMask: The result of the SQL where selection. @type groupSelection: 1d Numpy array of bool, or None. @param groupSelection: Rows corresponding to a particular value of the groupField. @type getstate: callable function @param getstate: Retrieve starting values from the DataTableState. @type setstate: callable function @param setstate: Store ending values to the DataTableState. @rtype: DataColumn @return: A column of running maxima. """ fieldType = dataColumn.fieldType if fieldType.optype not in ("continuous", "ordinal"): raise defs.PmmlValidationError( "Aggregate function \"max\" requires a continuous or ordinal input field" ) if dataColumn.mask is None: selection = NP("ones", len(dataColumn), dtype=NP.dtype(bool)) else: selection = NP(dataColumn.mask == defs.VALID) if whereMask is not None: NP("logical_and", selection, whereMask, selection) if groupSelection is not None: NP("logical_and", selection, groupSelection, selection) maximum = None if getstate is not None: startingState = getstate() if startingState is not None: maximum = startingState data = NP("empty", len(dataColumn), dtype=fieldType.dtype) mask = NP("zeros", len(dataColumn), dtype=defs.maskType) for i, x in enumerate(dataColumn.data): if selection[i]: if maximum is None or x > maximum: maximum = x if maximum is None: mask[i] = defs.INVALID else: data[i] = maximum if not mask.any(): mask = None if setstate is not None: setstate(maximum) return DataColumn(fieldType, data, mask)
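# Illustrative sketch (not part of the library): functionMax is a running maximum over
# the valid, selected rows; rows seen before any valid value are marked invalid.
# Standalone NumPy version with the mask reduced to a boolean "valid" flag:
import numpy

def runningMax(values, valid):
    data = numpy.zeros(len(values), dtype=float)   # invalid rows are left at 0.0
    invalid = numpy.zeros(len(values), dtype=bool)
    maximum = None
    for i, (x, ok) in enumerate(zip(values, valid)):
        if ok and (maximum is None or x > maximum):
            maximum = x
        if maximum is None:
            invalid[i] = True
        else:
            data[i] = maximum
    return data, invalid

if __name__ == "__main__":
    print(runningMax([3.0, 1.0, 5.0, 2.0], [False, True, True, True]))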
def _selectMax(self, dataTable, functionTable, performanceTable, segmentation): """Used by C{calculateScore}.""" performanceTable.begin("Segmentation max") scoresData = NP("empty", len(dataTable), dtype=NP.dtype(object)) filled = NP("zeros", len(dataTable), dtype=NP.dtype(bool)) unfilled = NP("ones", len(dataTable), dtype=NP.dtype(bool)) newOutputData = [] for segment in segmentation.childrenOfTag("Segment", iterator=True): performanceTable.pause("Segmentation max") selection = segment.childOfClass(PmmlPredicate).evaluate(dataTable, functionTable, performanceTable) performanceTable.unpause("Segmentation max") if not selection.any(): continue subTable = dataTable.subTable(selection) subModel = segment.childOfClass(PmmlModel) performanceTable.pause("Segmentation max") subModel.calculate(subTable, functionTable, performanceTable) performanceTable.unpause("Segmentation max") if subTable.score.fieldType.dataType in ("string", "boolean", "object"): raise defs.PmmlValidationError("Segmentation with multipleModelMethod=\"max\" cannot be applied to models that produce dataType \"%s\"" % subTable.score.fieldType.dataType) # ignore invalid in matches (like the built-in "min" Apply function) if subTable.score.mask is not None: NP("logical_and", selection, NP(subTable.score.mask == defs.VALID), selection) selectionFilled = NP("logical_and", selection, filled) selectionUnfilled = NP("logical_and", selection, unfilled) filled_selection = filled[selection] unfilled_selection = unfilled[selection] left, right = subTable.score.data[filled_selection], scoresData[selectionFilled] condition = NP(left > right) scoresData[selectionFilled] = NP("where", condition, left, right) scoresData[selectionUnfilled] = subTable.score.data[unfilled_selection] for fieldName, dataColumn in subTable.output.items(): if fieldName not in dataTable.output: data = NP("empty", len(dataTable), dtype=dataColumn.fieldType.dtype) data[selectionUnfilled] = dataColumn.data mask = NP(NP("ones", len(dataTable), dtype=defs.maskType) * defs.MISSING) if dataColumn.mask is None: mask[selectionUnfilled] = defs.VALID else: mask[selectionUnfilled] = dataColumn.mask newDataColumn = DataColumn(dataColumn.fieldType, data, mask) newDataColumn._unlock() dataTable.output[fieldName] = newDataColumn newOutputData.append(newDataColumn) else: newDataColumn = dataTable.output[fieldName] newDataColumn.data[selectionFilled] = NP("where", condition, dataColumn.data[filled_selection], newDataColumn.data[selectionFilled]) newDataColumn.data[selectionUnfilled] = dataColumn.data[unfilled_selection] if dataColumn.mask is None: newDataColumn.mask[selectionUnfilled] = defs.VALID else: newDataColumn.mask[selectionUnfilled] = dataColumn.mask filled += selectionUnfilled unfilled -= selectionUnfilled for newDataColumn in newOutputData: if not newDataColumn.mask.any(): newDataColumn._mask = None newDataColumn._lock() if filled.all(): scoresMask = None else: scoresMask = NP(NP("logical_not", filled) * defs.MISSING) scores = DataColumn(self.scoreType, scoresData, scoresMask) performanceTable.end("Segmentation max") return {None: scores}
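# Illustrative sketch (not part of the library): multipleModelMethod="max" keeps, for
# each row, the largest score among all segments whose predicate matched that row.
# Standalone NumPy version with segments reduced to (selection, scores) pairs and None
# standing in for the MISSING mask on rows no segment matched:
import numpy

def selectMax(n, segments):
    best = numpy.empty(n, dtype=object)
    best[:] = None
    for selection, scores in segments:
        indexes = numpy.nonzero(selection)[0]
        for subIndex, index in enumerate(indexes):
            if best[index] is None or scores[subIndex] > best[index]:
                best[index] = scores[subIndex]
    return best

if __name__ == "__main__":
    s1 = (numpy.array([True, True, False]), numpy.array([1.0, 4.0]))
    s2 = (numpy.array([True, False, False]), numpy.array([2.0]))
    print(selectMax(3, [s1, s2]))              # [2.0 4.0 None]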
def evaluate(self, dataTable, functionTable, performanceTable): """Evaluate the expression, using a DataTable as input. @type dataTable: DataTable @param dataTable: The input DataTable, containing any fields that might be used to evaluate this expression. @type functionTable: FunctionTable @param functionTable: The FunctionTable, containing any functions that might be called in this expression. @type performanceTable: PerformanceTable @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation. @rtype: DataColumn @return: The result of the calculation as a DataColumn. """ function = self["function"] groupField = self.get("groupField") if groupField is None: performanceTable.begin("Aggregate %s" % function) else: performanceTable.begin("Aggregate %s groupField" % function) dataColumn = dataTable.fields[self["field"]] whereMask = self.where(dataTable, functionTable, performanceTable) stateId = self.get("stateId") if groupField is None: if stateId is None: getstate = None setstate = None else: def getstate(): return dataTable.state.get(stateId) def setstate(value): dataTable.state[stateId] = value if function == "count": dataColumn = self.functionCount(dataColumn, whereMask, None, getstate, setstate) elif function == "sum": dataColumn = self.functionSum(dataColumn, whereMask, None, getstate, setstate) elif function == "average": dataColumn = self.functionAverage(dataColumn, whereMask, None, getstate, setstate) elif function == "min": dataColumn = self.functionMin(dataColumn, whereMask, None, getstate, setstate) elif function == "max": dataColumn = self.functionMax(dataColumn, whereMask, None, getstate, setstate) elif function == "multiset": dataColumn = self.functionMultiset(dataColumn, whereMask, None, getstate, setstate) performanceTable.end("Aggregate %s" % function) return dataColumn else: groupColumn = dataTable.fields[groupField] if groupColumn.mask is None: validGroup = groupColumn.data else: validGroup = groupColumn.data[NP( groupColumn.mask == defs.VALID)] if stateId is not None: state = dataTable.state.get(stateId) if state is None: record = {} else: record = state valuesSeen = dict((stringValue, False) for stringValue in record) groupTables = {} groupColumnFieldType = None for groupValue in NP("unique", validGroup): groupSelection = NP(groupColumn.data == groupValue) if groupColumn.mask is not None: NP("logical_and", groupSelection, NP(groupColumn.mask == defs.VALID), groupSelection) groupColumnFieldType = groupColumn.fieldType stringValue = groupColumnFieldType.valueToString(groupValue) if stringValue in record: def getstate(): return record[stringValue] else: getstate = None def setstate(value): record[stringValue] = value valuesSeen[stringValue] = True value = groupColumnFieldType.valueToPython(groupValue) if function == "count": groupTables[value] = self.functionCount( dataColumn, whereMask, groupSelection, getstate, setstate) elif function == "sum": groupTables[value] = self.functionSum( dataColumn, whereMask, groupSelection, getstate, setstate) elif function == "average": groupTables[value] = self.functionAverage( dataColumn, whereMask, groupSelection, getstate, setstate) elif function == "min": groupTables[value] = self.functionMin( dataColumn, whereMask, groupSelection, getstate, setstate) elif function == "max": groupTables[value] = self.functionMax( dataColumn, whereMask, groupSelection, getstate, setstate) elif function == "multiset": groupTables[value] = self.functionMultiset( dataColumn, whereMask, groupSelection, getstate, setstate) if 
stateId is not None: dataTable.state[stateId] = record for stringValue in valuesSeen: if not valuesSeen[stringValue]: value = groupColumnFieldType.valueToPython( groupColumnFieldType.stringToValue(stringValue)) if function == "count": groupTables[value] = self.functionCountFake( record[stringValue], len(dataTable), dataColumn.fieldType) elif function == "sum": groupTables[value] = self.functionSumFake( record[stringValue], len(dataTable), dataColumn.fieldType) elif function == "average": groupTables[value] = self.functionAverageFake( record[stringValue], len(dataTable), dataColumn.fieldType) elif function in ("min", "max"): groupTables[value] = self.functionMinMaxFake( record[stringValue], len(dataTable), dataColumn.fieldType) elif function == "multiset": groupTables[value] = self.functionMultisetFake( record[stringValue], len(dataTable), dataColumn.fieldType) performanceTable.begin("Aggregate %s groupField collect" % function) fieldType = FakeFieldType("object", "any") data = NP("empty", len(dataTable), dtype=NP.dtype(object)) if function == "count": for i in xrange(len(dataTable)): data[i] = dict((value, table.data[i]) for value, table in groupTables.items() if table.data[i] != 0) elif function == "sum": for i in xrange(len(dataTable)): data[i] = dict((value, table.data[i]) for value, table in groupTables.items() if table.data[i] != 0.0) elif function == "average": for i in xrange(len(dataTable)): data[i] = dict( (value, table.data[i]) for value, table in groupTables.items() if table.data[i] > 0.0 or table.data[i] <= 0.0) elif function in ("min", "max"): for table in groupTables.values(): if table.mask is None: table._mask = NP("zeros", len(table), dtype=defs.maskType) for i in xrange(len(dataTable)): data[i] = dict((value, table.data[i]) for value, table in groupTables.items() if table.mask[i] == defs.VALID) elif function == "multiset": for i in xrange(len(dataTable)): data[i] = dict((value, table.data[i]) for value, table in groupTables.items() if len(table.data[i]) > 0) performanceTable.end("Aggregate %s groupField collect" % function) performanceTable.end("Aggregate %s groupField" % function) return DataColumn(fieldType, data, None)
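For illustration, the grouped, stateful aggregation above can be reduced to the following numpy-only sketch: it keeps one running total per group value and, like the collect phase, snapshots the non-zero totals into a per-row dictionary. The name grouped_cumulative_sum, the plain-dict state, and the choice of the "sum" function are assumptions for illustration, not the Augustus Aggregate API.

import numpy as np

def grouped_cumulative_sum(values, groups, where=None):
    # One running total per group value; each output row is a snapshot of the
    # totals accumulated so far, keyed by group (mirrors the collect phase above).
    if where is None:
        where = np.ones(len(values), dtype=bool)
    totals = {}
    out = np.empty(len(values), dtype=object)
    for i in range(len(values)):
        if where[i]:
            totals[groups[i]] = totals.get(groups[i], 0.0) + float(values[i])
        out[i] = {g: t for g, t in totals.items() if t != 0.0}
    return out

# example: values grouped by "a"/"b"
print(grouped_cumulative_sum(np.array([1.0, 2.0, 3.0, 4.0]),
                             np.array(["a", "b", "a", "b"])))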
def _toDataColumn_number(self, data, mask): data, mask = self._checkNumpy(data, mask) if isinstance(data, NP.ndarray) and (mask is None or isinstance(mask, NP.ndarray)) and data.dtype == self.dtype: mask2 = NP("isnan", data) if mask is None: mask = NP("array", mask2, defs.maskType) * defs.MISSING else: mask[mask2] = defs.MISSING else: data, mask = self._checkNonNumpy(data, mask) try: data = NP("array", data, dtype=self.dtype) # mask is handled in the else statement after the except block except (ValueError, TypeError): data2 = NP("empty", len(data), dtype=self.dtype) if mask is None: mask2 = NP("zeros", len(data), dtype=defs.maskType) else: mask2 = NP("fromiter", ((defs.VALID if not m else defs.MISSING) for m in mask), dtype=defs.maskType, count=len(mask)) for i, v in enumerate(data): try: data2[i] = v if mask2[i] == defs.VALID and ((isinstance(v, float) and math.isnan(v)) or (isinstance(v, basestring) and v.upper() == "NAN")): mask2[i] = defs.MISSING if v is None: raise TypeError except (ValueError, TypeError): data2[i] = defs.PADDING if mask2[i] == defs.VALID: if (isinstance(v, float) and math.isnan(v)) or (isinstance(v, basestring) and v.upper() == "NAN"): mask2[i] = defs.MISSING else: mask2[i] = defs.INVALID if not mask2.any(): mask2 = None data, mask = data2, mask2 else: mask2 = NP("isnan", data) if mask is None: mask = NP("array", mask2, defs.maskType) else: mask = NP(NP("array", NP("logical_or", mask2, NP("fromiter", (m != 0 for m in mask), dtype=NP.dtype(bool), count=len(mask))), defs.maskType) * defs.MISSING) if not mask.any(): mask = None data, mask = self._checkValues(data, mask) data, mask = self._checkIntervals(data, mask) return DataColumn(self, data, mask)
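A minimal sketch of the same masked-conversion idea in plain numpy, with integer mask codes standing in for defs.VALID/MISSING/INVALID: NaN entries (including the string "NaN") become MISSING, while None and unparseable entries become INVALID, roughly as in the method above.

import math
import numpy as np

VALID, MISSING, INVALID = 0, 1, 2   # stand-ins for defs.VALID/MISSING/INVALID

def to_masked_float(values):
    data = np.zeros(len(values), dtype=float)
    mask = np.full(len(values), VALID, dtype=np.uint8)
    for i, v in enumerate(values):
        try:
            if v is None:
                raise TypeError
            x = float(v)
            if math.isnan(x):
                mask[i] = MISSING     # NaN (or the string "NaN") means missing
            else:
                data[i] = x
        except (TypeError, ValueError):
            mask[i] = INVALID         # None or unparseable means invalid
    return data, (None if (mask == VALID).all() else mask)

print(to_masked_float(["1.5", None, "oops", "NaN"]))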
def compare(self, dataTable, functionTable, performanceTable, centerString, defaultCompareFunction, anyInvalid): """Compare input data with a cluster center along the direction of this field. Cluster distances are computed in two steps: this C{compare} function, which determines the distance in the direction of a field, and the metric, which combines results from each field. @type dataTable: DataTable @param dataTable: The input data. @type functionTable: FunctionTable @param functionTable: A table of functions. @type performanceTable: PerformanceTable @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation. @type centerString: string @param centerString: The center of the cluster in this field, represented as a string. @type defaultCompareFunction: string @param defaultCompareFunction: The C{compareFunction} defined at the model level, which may be overridden on a per-field basis. @type anyInvalid: 1d Numpy array of bool @param anyInvalid: Mask for invalid data, accumulated with each C{compare} call. This method modifies it. @rtype: 1d Numpy array of numbers @return: The distances or similarities between the input data and the cluster center, along the direction of this field. """ performanceTable.begin("ClusteringField") dataColumn = dataTable.fields[self["field"]] if dataColumn.mask is not None: # even though DataColumns are immutable, we're allowed to change the invalid values # because they're not defined; set them so that x - y = 0, and hence they'll be # effectively skipped in summations without any extra work dataColumn._unlock() dataColumn.data[NP(dataColumn.mask != defs.VALID)] = dataColumn.fieldType.stringToValue(centerString) dataColumn._lock() compareFunction = self.get("compareFunction", defaultCompareFunction) if compareFunction == "absDiff": result = NP("absolute", NP(dataColumn.data - dataColumn.fieldType.stringToValue(centerString))) elif compareFunction == "gaussSim": similarityScale = self.get("similarityScale") if similarityScale is None: raise defs.PmmlValidationError("If compareFunction is \"gaussSim\", a similarityScale must be provided") s = float(similarityScale) z = NP(dataColumn.data - dataColumn.fieldType.stringToValue(centerString)) result = NP("exp", NP((-self.LOG2/s**2) * NP(z**2))) elif compareFunction == "delta": result = NP(dataColumn.data != dataColumn.fieldType.stringToValue(centerString)) elif compareFunction == "equal": result = NP(dataColumn.data == dataColumn.fieldType.stringToValue(centerString)) elif compareFunction == "table": if dataColumn.fieldType.dataType != "integer": raise defs.PmmlValidationError("If compareFunction is \"table\", the data must be integers") matrix = self.xpath("pmml:Comparisons/pmml:Matrix") if len(matrix) != 1: raise defs.PmmlValidationError("If compareFunction is \"table\", ClusteringFields needs a Comparisons/Matrix") values = matrix[0].values(convertType=False) centerValue = dataColumn.fieldType.stringToValue(centerString) try: row = values[centerValue] except IndexError: raise defs.PmmlValidationError("Cluster center component is %s, but this is an invalid row index for the Comparisons/Matrix (0-indexed)" % centerString) result = NP("empty", len(dataTable), dtype=NP.dtype(float)) valid = NP("zeros", len(dataTable), dtype=NP.dtype(bool)) for j, value in enumerate(row): selection = NP(dataColumn.data == j) result[selection] = dataColumn.fieldType.stringToValue(value) NP("logical_or", valid, selection, valid) NP("logical_or", anyInvalid, NP("logical_not", valid), anyInvalid) 
performanceTable.end("ClusteringField") return result
def _sumAverageWeighted(self, dataTable, functionTable, performanceTable, segmentation, which): """Used by C{calculateScore}.""" if which is self.SUM: performanceLabel = "Segmentation sum" elif which is self.AVERAGE: performanceLabel = "Segmentation average" elif which is self.WEIGHTED_AVERAGE: performanceLabel = "Segmentation weightedAverage" performanceTable.begin(performanceLabel) scoresData = NP("zeros", len(dataTable), dtype=NP.dtype(object)) if which is not self.SUM: denominator = NP("zeros", len(dataTable), dtype=NP.dtype(float)) invalid = NP("zeros", len(dataTable), dtype=NP.dtype(bool)) for segment in segmentation.childrenOfTag("Segment", iterator=True): performanceTable.pause(performanceLabel) selection = segment.childOfClass(PmmlPredicate).evaluate(dataTable, functionTable, performanceTable) performanceTable.unpause(performanceLabel) if not selection.any(): continue subTable = dataTable.subTable(selection) subModel = segment.childOfClass(PmmlModel) performanceTable.pause(performanceLabel) subModel.calculate(subTable, functionTable, performanceTable) performanceTable.unpause(performanceLabel) if subTable.score.fieldType.dataType in ("string", "boolean", "object"): raise defs.PmmlValidationError("Segmentation with multipleModelMethod=\"%s\" cannot be applied to models that produce dataType \"%s\"" % (self.childOfTag("Segmentation").get("multipleModelMethod"), subTable.score.fieldType.dataType)) # ignore invalid in matches (like the built-in "+" and "avg" Apply functions) if subTable.score.mask is not None: NP("logical_and", selection, NP(subTable.score.mask == defs.VALID), selection) if which is self.SUM: scoresData[selection] += subTable.score.data if which is self.AVERAGE: scoresData[selection] += subTable.score.data denominator[selection] += 1.0 elif which is self.WEIGHTED_AVERAGE: weight = float(segment.get("weight", 1.0)) scoresData[selection] += (subTable.score.data * weight) denominator[selection] += weight if subTable.score.mask is not None: invalid[selection] = NP("logical_or", invalid[selection], NP(subTable.score.mask != defs.VALID)) if which is not self.SUM: NP("logical_or", invalid, NP(denominator == 0.0), invalid) valid = NP("logical_not", invalid) scoresData[valid] /= denominator[valid] if invalid.any(): scoresMask = NP(NP("array", invalid, dtype=defs.maskType) * defs.INVALID) else: scoresMask = None scores = DataColumn(self.scoreType, scoresData, scoresMask) performanceTable.end(performanceLabel) return {None: scores}
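Stripped of the PMML plumbing, the weightedAverage combination above accumulates a per-row numerator and weight denominator and flags rows that no segment matched. A hedged numpy sketch, with segments represented as (selection, scores, weight) triples purely for illustration:

import numpy as np

def weighted_average(n, segments):
    numerator = np.zeros(n)
    denominator = np.zeros(n)
    for selection, scores, weight in segments:
        numerator[selection] += scores[selection] * weight
        denominator[selection] += weight
    invalid = denominator == 0.0          # rows matched by no segment
    out = np.zeros(n)
    out[~invalid] = numerator[~invalid] / denominator[~invalid]
    return out, invalid

selA = np.array([True, True, False])
selB = np.array([False, True, True])
print(weighted_average(3, [(selA, np.array([1.0, 2.0, 0.0]), 2.0),
                           (selB, np.array([0.0, 4.0, 6.0]), 1.0)]))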
def format(self, subTable, functionTable, performanceTable, score): """Extract or post-process output for the output field of a DataTable. @type subTable: DataTable @param subTable: The DataTable associated with this local lexical scope. @type functionTable: FunctionTable or None @param functionTable: A table of functions. @type performanceTable: PerformanceTable or None @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation. @type score: dict @param score: Dictionary mapping PMML score "feature" strings to DataColumns. This dictionary always contains a None key, which is the basic feature ("predictedValue"). @rtype: DataColumn @return: The output that would go into an output field of a DataTable. """ performanceTable.begin("OutputField") feature = self.get("feature") if feature is None: dataColumn = subTable.fields[self["name"]] elif feature == "predictedValue": dataColumn = score[None] elif feature == "predictedDisplayValue": original = score[None] toString = original.fieldType.valueToString data = NP("empty", len(subTable), dtype=NP.dtype(object)) for i, x in enumerate(original.data): data[i] = toString(x) dataColumn = DataColumn(FakeFieldType("string", "continuous"), data, None) elif feature == "transformedValue": expression = self.childOfClass(PmmlExpression) if expression is None: raise defs.PmmlValidationError( "OutputField with feature \"transformedValue\" requires an EXPRESSION" ) performanceTable.pause("OutputField") dataColumn = expression.evaluate(subTable, functionTable, performanceTable) performanceTable.unpause("OutputField") elif feature == "decision": decisions = self.childOfTag("Decisions") if decisions is None: raise defs.PmmlValidationError( "OutputField with feature \"decision\" requires a Decisions block" ) performanceTable.pause("OutputField") dataColumn = self.childOfClass(PmmlExpression).evaluate( subTable, functionTable, performanceTable) performanceTable.unpause("OutputField") if dataColumn.mask is None: valid = None else: valid = NP(dataColumn.mask == defs.VALID) fieldType = FakeFieldType("object", "any") data = NP("empty", len(subTable), dtype=fieldType.dtype) mask = NP( NP("ones", len(subTable), dtype=defs.maskType) * defs.MISSING) for decision in decisions.childrenOfTag("Decision"): value = dataColumn.fieldType.stringToValue(decision["value"]) selection = NP(dataColumn.data == value) if valid is not None: NP("logical_and", selection, valid, selection) for i in xrange(len(data)): if selection[i]: data[i] = decision mask[selection] = defs.VALID if not mask.any(): mask = None dataColumn = DataColumn(fieldType, data, mask) elif feature in score: dataColumn = score[feature] else: model = self.getparent() if model is not None: model = model.getparent() if model is None: model = "(orphaned OutputField; no parent model)" else: model = model.t raise defs.PmmlValidationError( "Models of type %s do not produce \"%s\" features (or at least, it is not yet implemented by Augustus)" % (model, feature)) dataType = self.get("dataType", dataColumn.fieldType.dataType) optype = self.get("optype", dataColumn.fieldType.optype) if (dataType != dataColumn.fieldType.dataType or optype != dataColumn.fieldType.optype) and feature not in ( "predictedDisplayValue", "decision"): dataColumn = FieldCastMethods.cast(FakeFieldType(dataType, optype), dataColumn) if feature is not None: subTable.fields[self.get("displayName", self["name"])] = dataColumn performanceTable.end("OutputField") return dataColumn
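As an illustration of the "decision" feature handled above, the lookup can be sketched as mapping each row's score value to a Decision entry and leaving unmatched rows missing; the plain dict passed as decisions is a stand-in for the <Decisions> block, not the OutputField API.

import numpy as np

def apply_decisions(score_values, decisions):
    # decisions: {score value: decision label}; unmatched rows stay missing
    out = np.empty(len(score_values), dtype=object)
    missing = np.ones(len(score_values), dtype=bool)
    for value, decision in decisions.items():
        selection = score_values == value
        out[selection] = decision
        missing[selection] = False
    return out, missing

print(apply_decisions(np.array([0, 1, 2, 1]), {0: "reject", 1: "accept"}))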
def _selectFirst(self, dataTable, functionTable, performanceTable, segmentation): """Used by C{calculateScore}.""" performanceTable.begin("Segmentation selectFirst") scoresData = NP("empty", len(dataTable), dtype=NP.dtype(object)) scoresMask = NP("zeros", len(dataTable), dtype=defs.maskType) unfilled = NP("ones", len(dataTable), dtype=NP.dtype(bool)) segments = NP("empty", len(dataTable), dtype=NP.dtype(object)) newOutputData = [] for segment in segmentation.childrenOfTag("Segment", iterator=True): performanceTable.pause("Segmentation selectFirst") selection = segment.childOfClass(PmmlPredicate).evaluate(dataTable, functionTable, performanceTable) performanceTable.unpause("Segmentation selectFirst") NP("logical_and", selection, unfilled, selection) if not selection.any(): continue subTable = dataTable.subTable(selection) subModel = segment.childOfClass(PmmlModel) performanceTable.pause("Segmentation selectFirst") subModel.calculate(subTable, functionTable, performanceTable) performanceTable.unpause("Segmentation selectFirst") scoresData[selection] = subTable.score.data if subTable.score.mask is not None: scoresMask[selection] = subTable.score.mask else: scoresMask[selection] = defs.VALID segmentName = segment.get("id") if segmentName is not None: segments[selection] = segmentName for fieldName, dataColumn in subTable.output.items(): if fieldName not in dataTable.output: data = NP("empty", len(dataTable), dtype=dataColumn.fieldType.dtype) data[selection] = dataColumn.data mask = NP(NP("ones", len(dataTable), dtype=defs.maskType) * defs.MISSING) if dataColumn.mask is None: mask[selection] = defs.VALID else: mask[selection] = dataColumn.mask newDataColumn = DataColumn(dataColumn.fieldType, data, mask) newDataColumn._unlock() dataTable.output[fieldName] = newDataColumn newOutputData.append(newDataColumn) else: newDataColumn = dataTable.output[fieldName] newDataColumn.data[selection] = dataColumn.data if dataColumn.mask is None: newDataColumn.mask[selection] = defs.VALID else: newDataColumn.mask[selection] = dataColumn.mask unfilled -= selection if not unfilled.any(): break for newDataColumn in newOutputData: if not newDataColumn.mask.any(): newDataColumn._mask = None newDataColumn._lock() if not scoresMask.any(): scoresMask = None scores = DataColumn(self.scoreType, scoresData, scoresMask) if self.name is None: performanceTable.end("Segmentation selectFirst") return {None: scores} else: performanceTable.end("Segmentation selectFirst") return {None: scores, "segment": DataColumn(self.scoreTypeSegment, segments, None)}
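The selectFirst rule above amounts to: the first segment (in document order) whose predicate matches a row supplies that row's score, and later segments cannot overwrite it. A minimal numpy sketch, with segments represented as (selection, scores) pairs for illustration:

import numpy as np

def select_first(n, segments):
    out = np.empty(n, dtype=object)
    unfilled = np.ones(n, dtype=bool)
    for selection, scores in segments:
        take = selection & unfilled       # only rows no earlier segment claimed
        out[take] = scores[take]
        unfilled &= ~take
        if not unfilled.any():
            break
    return out, unfilled                  # rows still unfilled have no score

selA = np.array([True, False, True])
selB = np.array([True, True, False])
print(select_first(3, [(selA, np.array([10, 0, 30])),
                       (selB, np.array([1, 2, 3]))]))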
def calculateScore(self, dataTable, functionTable, performanceTable): """Calculate the score of this model. This method is called by C{calculate} to separate operations that are performed by all models (in C{calculate}) from operations that are performed by specific models (in C{calculateScore}). @type dataTable: DataTable @param dataTable: The DataTable representing this model's lexical scope. @type functionTable: FunctionTable or None @param functionTable: A table of functions. @type performanceTable: PerformanceTable or None @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation. @rtype: dict @return: A dictionary mapping PMML score "feature" strings to DataColumns; the None key is the basic feature ("predictedValue"). """ performanceTable.begin("ClusteringModel") performanceTable.begin("set up") distributionBased = (self["modelClass"] == "distributionBased") clusteringFields = self.xpath( "pmml:ClusteringField[not(@isCenterField='false')]") fieldWeights = [ clusteringField.get("fieldWeight", defaultFromXsd=True, convertType=True) for clusteringField in clusteringFields ] for fieldWeight in fieldWeights: if fieldWeight < 0.0: raise defs.PmmlValidationError( "ClusteringField fieldWeights must all be non-negative (encountered %g)" % fieldWeight) clusters = self.xpath("pmml:Cluster") comparisonMeasure = self.childOfClass(ComparisonMeasure) defaultCompareFunction = comparisonMeasure.get("compareFunction", defaultFromXsd=True) metric = comparisonMeasure.childOfClass(PmmlClusteringMetric) metrictag = metric.t performanceTable.end("set up") for clusteringField in clusteringFields: dataType = dataTable.fields[ clusteringField["field"]].fieldType.dataType if dataType == "string": raise defs.PmmlValidationError( "ClusteringField \"%s\" has dataType \"%s\", which cannot be used for clustering" % (clusteringField["field"], dataType)) missingValueWeights = self.childOfTag("MissingValueWeights") if missingValueWeights is None: adjustM = None else: performanceTable.begin("MissingValueWeights") missingWeights = missingValueWeights.childOfClass( PmmlArray).values(convertType=True) sumNMqi = NP("zeros", len(dataTable), dtype=NP.dtype(float)) for clusteringField, missingWeight in zip(clusteringFields, missingWeights): clusteringField.addToAdjustM(dataTable, functionTable, performanceTable, sumNMqi, missingWeight) adjustM = NP(sum(missingWeights) / sumNMqi) adjustM[NP(sumNMqi == 0.0)] = 1.0 performanceTable.end("MissingValueWeights") anyInvalid = NP("zeros", len(dataTable), dtype=NP.dtype(bool)) for clusteringField in clusteringFields: mask = dataTable.fields[clusteringField["field"]].mask if mask is not None: NP("logical_or", anyInvalid, NP(mask == defs.INVALID), anyInvalid) bestClusterId = None bestClusterAffinity = None allClusterAffinities = {} for index, cluster in enumerate(clusters): array = cluster.childOfClass(PmmlArray) if array is None: raise defs.PmmlValidationError( "Cluster must have an array to designate its center") centerStrings = array.values(convertType=False) if len(centerStrings) != len(clusteringFields): raise defs.PmmlValidationError( "Cluster array has %d components, but there are %d ClusteringFields with isCenterField=true" % (len(centerStrings), len(clusteringFields))) performanceTable.begin(metrictag) if distributionBased: matrix = cluster.xpath("pmml:Covariances/pmml:Matrix") if len(matrix) != 1: raise defs.PmmlValidationError( "In distribution-based clustering, all clusters must have a Covariances/Matrix" ) try: covarianceMatrix = NP("array", matrix[0].values(), dtype=NP.dtype(float)) except ValueError: raise 
defs.PmmlValidationError( "Covariances/Matrix must contain real numbers for distribution-based clustering" ) else: covarianceMatrix = None state = self._State() metric.initialize(state, len(dataTable), len(clusteringFields), distributionBased) for clusteringField, centerString, fieldWeight in zip( clusteringFields, centerStrings, fieldWeights): if isinstance(metric, PmmlClusteringMetricBinary): metric.accumulateBinary( state, dataTable.fields[clusteringField["field"]], centerString, distributionBased) else: performanceTable.pause(metrictag) cxy = clusteringField.compare(dataTable, functionTable, performanceTable, centerString, defaultCompareFunction, anyInvalid) performanceTable.unpause(metrictag) metric.accumulate(state, cxy, fieldWeight, distributionBased) distance = metric.finalizeDistance(state, adjustM, distributionBased, covarianceMatrix) del state performanceTable.end(metrictag) if index == 0: bestClusterId = NP("ones", len(dataTable), dtype=NP.dtype(int)) # 1-based index bestClusterAffinity = distance better = NP(distance < bestClusterAffinity) bestClusterId[better] = index + 1 # 1-based index bestClusterAffinity[better] = distance[better] allClusterAffinities[cluster.get("id", "%d" % (index + 1))] = distance if not anyInvalid.any(): scoreMask = None else: scoreMask = NP(anyInvalid * defs.INVALID) performanceTable.begin("set scores") score = {} performanceTable.begin("predictedValue") fieldType = FakeFieldType("string", "categorical") clusterIdentifiers = NP("empty", len(dataTable), dtype=fieldType.dtype) for index, cluster in enumerate(clusters): value = fieldType.stringToValue( cluster.get("id", "%d" % (index + 1))) clusterIdentifiers[NP(bestClusterId == (index + 1))] = value score[None] = DataColumn(fieldType, clusterIdentifiers, scoreMask) performanceTable.end("predictedValue") if self.subFields["predictedDisplayValue"]: performanceTable.begin("predictedDisplayValue") fieldType = FakeFieldType("string", "categorical") clusterNames = NP("empty", len(dataTable), dtype=fieldType.dtype) for index, cluster in enumerate(clusters): value = fieldType.stringToValue(cluster.get("name", "")) clusterNames[NP(bestClusterId == (index + 1))] = value score["predictedDisplayValue"] = DataColumn( fieldType, clusterNames, scoreMask) performanceTable.end("predictedDisplayValue") if self.subFields["entity"]: performanceTable.begin("entity") fieldType = FakeFieldType("object", "any") entities = NP("empty", len(dataTable), dtype=fieldType.dtype) for index, cluster in enumerate(clusters): value = fieldType.stringToValue(cluster.get("name", "")) indexPlusOne = index + 1 for i in xrange(len(entities)): if bestClusterId[i] == indexPlusOne: entities[i] = cluster score["entity"] = DataColumn(fieldType, entities, scoreMask) performanceTable.end("entity") if self.subFields["clusterId"]: performanceTable.begin("clusterId") fieldType = FakeFieldType("integer", "continuous") score["clusterId"] = DataColumn(fieldType, bestClusterId, scoreMask) performanceTable.end("clusterId") if self.subFields["entityId"]: performanceTable.begin("entityId") fieldType = FakeFieldType("integer", "continuous") score["entityId"] = DataColumn(fieldType, bestClusterId, scoreMask) performanceTable.end("entityId") if self.subFields["clusterAffinity"]: performanceTable.begin("clusterAffinity") fieldType = FakeFieldType("double", "continuous") score["clusterAffinity"] = DataColumn(fieldType, bestClusterAffinity, scoreMask) performanceTable.end("clusterAffinity") if self.subFields["affinity"]: performanceTable.begin("affinity") fieldType 
= FakeFieldType("double", "continuous") score["affinity"] = DataColumn(fieldType, bestClusterAffinity, scoreMask) performanceTable.end("affinity") if self.subFields["all"]: performanceTable.begin("all") fieldType = FakeFieldType("double", "continuous") for identifier, distance in allClusterAffinities.items(): score["all.%s" % identifier] = DataColumn( fieldType, distance, scoreMask) performanceTable.end("all") performanceTable.end("set scores") performanceTable.end("ClusteringModel") return score
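The heart of the scoring loop above is keeping, per row, the 1-based index of the cluster with the smallest distance seen so far. A simplified numpy sketch using plain Euclidean distance (the configurable metric, field weights, and missing-value adjustment are deliberately factored out of this illustration):

import numpy as np

def nearest_cluster(points, centers):
    best_id = np.ones(len(points), dtype=int)          # 1-based, like bestClusterId
    best_affinity = None
    for index, center in enumerate(centers):
        distance = np.sqrt(((points - center)**2).sum(axis=1))
        if index == 0:
            best_affinity = distance.copy()
            continue
        better = distance < best_affinity
        best_id[better] = index + 1
        best_affinity[better] = distance[better]
    return best_id, best_affinity

points = np.array([[0.0, 0.0], [5.0, 5.0]])
centers = np.array([[0.0, 1.0], [4.0, 4.0]])
print(nearest_cluster(points, centers))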
def _selectMax(self, dataTable, functionTable, performanceTable, segmentation): """Used by C{calculateScore}.""" performanceTable.begin("Segmentation max") scoresData = NP("empty", len(dataTable), dtype=NP.dtype(object)) filled = NP("zeros", len(dataTable), dtype=NP.dtype(bool)) unfilled = NP("ones", len(dataTable), dtype=NP.dtype(bool)) newOutputData = [] for segment in segmentation.childrenOfTag("Segment", iterator=True): performanceTable.pause("Segmentation max") selection = segment.childOfClass(PmmlPredicate).evaluate( dataTable, functionTable, performanceTable) performanceTable.unpause("Segmentation max") if not selection.any(): continue subTable = dataTable.subTable(selection) subModel = segment.childOfClass(PmmlModel) performanceTable.pause("Segmentation max") subModel.calculate(subTable, functionTable, performanceTable) performanceTable.unpause("Segmentation max") if subTable.score.fieldType.dataType in ("string", "boolean", "object"): raise defs.PmmlValidationError( "Segmentation with multipleModelMethod=\"max\" cannot be applied to models that produce dataType \"%s\"" % subTable.score.fieldType.dataType) # ignore invalid in matches (like the built-in "min" Apply function) if subTable.score.mask is not None: NP("logical_and", selection, NP(subTable.score.mask == defs.VALID), selection) selectionFilled = NP("logical_and", selection, filled) selectionUnfilled = NP("logical_and", selection, unfilled) filled_selection = filled[selection] unfilled_selection = unfilled[selection] left, right = subTable.score.data[filled_selection], scoresData[ selectionFilled] condition = NP(left > right) scoresData[selectionFilled] = NP("where", condition, left, right) scoresData[selectionUnfilled] = subTable.score.data[ unfilled_selection] for fieldName, dataColumn in subTable.output.items(): if fieldName not in dataTable.output: data = NP("empty", len(dataTable), dtype=dataColumn.fieldType.dtype) data[selectionUnfilled] = dataColumn.data mask = NP( NP("ones", len(dataTable), dtype=defs.maskType) * defs.MISSING) if dataColumn.mask is None: mask[selectionUnfilled] = defs.VALID else: mask[selectionUnfilled] = dataColumn.mask newDataColumn = DataColumn(dataColumn.fieldType, data, mask) newDataColumn._unlock() dataTable.output[fieldName] = newDataColumn newOutputData.append(newDataColumn) else: newDataColumn = dataTable.output[fieldName] newDataColumn.data[selectionFilled] = NP( "where", condition, dataColumn.data[filled_selection], newDataColumn.data[selectionFilled]) newDataColumn.data[selectionUnfilled] = dataColumn.data[ unfilled_selection] if dataColumn.mask is None: newDataColumn.mask[selectionUnfilled] = defs.VALID else: newDataColumn.mask[selectionUnfilled] = dataColumn.mask filled += selectionUnfilled unfilled -= selectionUnfilled for newDataColumn in newOutputData: if not newDataColumn.mask.any(): newDataColumn._mask = None newDataColumn._lock() if filled.all(): scoresMask = None else: scoresMask = NP(NP("logical_not", filled) * defs.MISSING) scores = DataColumn(self.scoreType, scoresData, scoresMask) performanceTable.end("Segmentation max") return {None: scores}
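In the same spirit, the max combination keeps, per row, the largest score among the segments whose predicate matched it and marks rows matched by no segment as missing; a hedged numpy sketch with (selection, scores) pairs standing in for segments:

import numpy as np

def select_max(n, segments):
    out = np.full(n, -np.inf)
    matched = np.zeros(n, dtype=bool)
    for selection, scores in segments:
        out[selection] = np.maximum(out[selection], scores[selection])
        matched |= selection
    out[~matched] = np.nan                # rows no segment matched stay missing
    return out, matched

selA = np.array([True, True, False])
selB = np.array([True, False, True])
print(select_max(3, [(selA, np.array([1.0, 5.0, 0.0])),
                     (selB, np.array([3.0, 0.0, 2.0]))]))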