def applyWithoutMask(self, data, mask, argument):
    """Fold one argument into the running XOR over every row.

    @type data: 2-tuple of 1d Numpy arrays
    @param data: The running C{(result, allbad)} pair; both arrays are updated in place.
    @type mask: 1d Numpy array or None
    @param mask: The running mask; returned unchanged.
    @type argument: object with C{data} and C{mask} attributes
    @param argument: The evaluated argument to combine.
    @rtype: 2-tuple
    @return: C{((result, allbad), mask)}.
    """
    result, allbad = data
    NP("logical_xor", result, argument.data, result)

    if argument.mask is None:
        # An argument without a mask is valid in every row, so no row can
        # remain "bad in every argument" after it has been applied.
        allbad[:] = False
    else:
        # A row stays "all bad" only if this argument is not VALID there too.
        NP("logical_and", allbad, NP(argument.mask != defs.VALID), allbad)

    return (result, allbad), mask
def evaluate(self, dataTable, functionTable, performanceTable):
    """Compute the indicator column for this element from a DataTable.

    The referenced field is compared with this element's C{value}
    attribute; the comparison result is cast to this element's own
    field type and then passed through C{mapMissingTo} handling.

    @type dataTable: DataTable
    @param dataTable: The input DataTable; must contain the field named by the C{field} attribute.
    @type functionTable: FunctionTable
    @param functionTable: Not used by this method.
    @type performanceTable: PerformanceTable
    @param performanceTable: Receives a begin/end pair labelled "NormDiscrete".
    @rtype: DataColumn
    @return: The indicator values as a DataColumn of this element's field type.
    """
    performanceTable.begin("NormDiscrete")

    source = dataTable.fields[self["field"]]
    target = source.fieldType.stringToValue(self["value"])

    matches = NP(source.data == target)
    indicator = NP("array", matches, dtype=self._fieldType.dtype)

    indicator, indicatorMask = FieldCastMethods.applyMapMissingTo(
        self._fieldType, indicator, source.mask, self.get("mapMissingTo"))

    performanceTable.end("NormDiscrete")
    return DataColumn(self._fieldType, indicator, indicatorMask)
def select(self, dataTable, functionTable, performanceTable):
    """Evaluate the child predicate or expression into a boolean selection.

    A child predicate, if present, takes precedence and its result is
    returned directly. Otherwise the child expression is evaluated, must
    be boolean, and rows whose mask is not VALID are forced to False.

    @type dataTable: DataTable
    @param dataTable: The data to select from.
    @type functionTable: FunctionTable
    @param functionTable: Forwarded to the child's C{evaluate}.
    @type performanceTable: PerformanceTable
    @param performanceTable: Forwarded to the child's C{evaluate}.
    @rtype: 1d Numpy array of bool
    @return: The selection.
    @raise PmmlValidationError: If the child expression does not evaluate to a boolean type.
    """
    childPredicate = self.childOfClass(PmmlPredicate)
    if childPredicate is not None:
        return childPredicate.evaluate(dataTable, functionTable, performanceTable)

    childExpression = self.childOfClass(PmmlExpression)
    column = childExpression.evaluate(dataTable, functionTable, performanceTable)

    if not column.fieldType.isboolean():
        raise defs.PmmlValidationError(
            "PlotSelection must evaluate to boolean, not %r" % column.fieldType)

    # The data array is modified in place below, so it has to be unlocked first.
    column._unlock()

    if column.mask is not None:
        isValid = NP(column.mask == defs.VALID)
        NP("logical_and", column.data, isValid, column.data)

    return column.data
def mapper(self, dataTable):
    """Score the rows with the clustering model and emit per-cluster sums.

    For each cluster, emits the cluster name with a dictionary holding
    C{"numer"} (per-field sum of the rows assigned to the cluster) and
    C{"denom"} (per-field count of those rows).

    @type dataTable: DataTable
    @param dataTable: The input rows; scoring is done on a sub-table.
    """
    # Work on a sub-table so that the results of the calculation do not propagate.
    scratch = dataTable.subTable()
    self.metadata["ClusteringModel"].calculate(scratch, performanceTable=self.performanceTable)

    scores = scratch.score.data
    scoreMask = scratch.score.mask
    toValue = scratch.score.fieldType.stringToValue

    for position, cluster in enumerate(self.clusters):
        # Clusters without an explicit id are named by their 1-based position.
        clusterName = cluster.get("id", "%d" % (position + 1))
        clusterValue = toValue(clusterName)

        members = NP(scores == clusterValue)
        if scoreMask is not None:
            NP("logical_and", members, NP(scoreMask == defs.VALID), members)
        memberCount = members.sum()

        numer = dict.fromkeys(self.fieldNames, 0.0)
        denom = dict.fromkeys(self.fieldNames, 0.0)
        for fieldName in self.fieldNames:
            fieldValues = scratch.fields[fieldName].data
            numer[fieldName] += fieldValues[members].sum()
            denom[fieldName] += memberCount

        self.emit(clusterName, {"numer": numer, "denom": denom})
def evaluate(self, dataTable, functionTable, performanceTable, arguments):
    """Evaluate this built-in boolean function over all of its arguments.

    Rows that are still flagged in C{allbad} after all arguments have
    been applied are marked MISSING in the output mask.

    @type dataTable: DataTable
    @param dataTable: The input DataTable.
    @type functionTable: FunctionTable
    @param functionTable: Forwarded to each argument's C{evaluate}.
    @type performanceTable: PerformanceTable
    @param performanceTable: Receives a begin/end pair labelled with this function's name.
    @type arguments: list of expressions
    @param arguments: The unevaluated arguments; at least two, all boolean.
    @rtype: DataColumn
    @return: The result of the function.
    """
    evaluated = [arg.evaluate(dataTable, functionTable, performanceTable) for arg in arguments]

    timerLabel = "built-in \"%s\"" % self.name
    performanceTable.begin(timerLabel)

    fieldType = self.allBooleanType(evaluated, atleast=2)
    numRows = len(dataTable)

    data = NP("zeros", numRows, dtype=fieldType.dtype)
    allbad = NP("ones", numRows, dtype=NP.dtype(bool))

    (data, allbad), mask = self.applySkipMissing((data, allbad), None, evaluated)

    if allbad.any():
        if mask is None:
            mask = allbad * defs.MISSING
        else:
            # Only rows that are still VALID get downgraded to MISSING.
            NP("logical_and", allbad, NP(mask == defs.VALID), allbad)
            mask[allbad] = defs.MISSING

    performanceTable.end(timerLabel)
    return DataColumn(fieldType, data, mask)
def applyInvalidValueTreatment(mask, invalidValueTreatment, overwrite=False):
    """Turn INVALID mask entries into MISSING when the treatment is "asMissing".

    Unless C{overwrite} is True, the input mask is left untouched and a
    modified copy is returned.

    Example use::

        mask = dataColumn.mask
        mask = FieldCastMethods.applyInvalidValueTreatment(mask, pmml.get("invalidValueTreatment"))
        return DataColumn(dataColumn.fieldType, dataColumn.data, mask)

    @type mask: 1d Numpy array of dtype defs.maskType, or None
    @param mask: The mask; None is returned as-is.
    @type invalidValueTreatment: string
    @param invalidValueTreatment: Only "asMissing" has an effect; any other value returns C{mask} unchanged.
    @type overwrite: bool
    @param overwrite: If True, make the original mask writable and modify it in place.
    @rtype: 1d Numpy array of dtype defs.maskType, or None
    @return: The resulting mask.
    """
    if mask is None or invalidValueTreatment != "asMissing":
        return mask

    if not overwrite:
        mask = NP("copy", mask)
    mask.setflags(write=True)

    invalidRows = NP(mask == defs.INVALID)
    mask[invalidRows] = defs.MISSING
    return mask
def generateSamples(self, low, high):
    """Generate the sample points of an interval.

    The number of samples and the sampling method are read from this
    element's C{numSamples} and C{samplingMethod} attributes.

    @type low: number
    @param low: Minimum value to sample.
    @type high: number
    @param high: Maximum value to sample.
    @rtype: 1d Numpy array
    @return: Evenly spaced samples including both endpoints ("uniform") or sorted random samples ("random").
    @raise NotImplementedError: For any other sampling method.
    """
    numSamples = self.get("numSamples", defaultFromXsd=True, convertType=True)
    samplingMethod = self.get("samplingMethod", defaultFromXsd=True)

    if samplingMethod == "uniform":
        return NP("linspace", low, high, numSamples, endpoint=True)

    if samplingMethod == "random":
        unitSamples = NP(NP.random.rand(numSamples))
        stretched = NP(unitSamples * (high - low))
        samples = NP(stretched + low)
        samples.sort()
        return samples

    raise NotImplementedError("TODO: add 'adaptive'")
def _checkNumpy(self, data, mask, tryToCast=True):
    """Normalize Numpy input (plain or masked arrays) into a data/mask pair.

    A masked array is split into its data and, if no explicit C{mask}
    was given, its own mask. One-dimensional shape is enforced and the
    data is optionally cast to this type's dtype. A mask of a foreign
    dtype is converted so that every nonzero entry becomes MISSING.

    @type data: any
    @param data: The input data; non-Numpy input is passed through.
    @type mask: 1d Numpy array or None
    @param mask: An explicit mask, which takes precedence over a masked array's own mask.
    @type tryToCast: bool
    @param tryToCast: If True, attempt to cast the data to C{self.dtype}; a failed cast is deliberately ignored.
    @rtype: 2-tuple
    @return: C{(data, mask)}.
    @raise TypeError: If the data is not one-dimensional or the mask shape differs from the data shape.
    """
    if isinstance(data, NP.ma.MaskedArray):
        if mask is None:
            embedded = NP.ma.getmask(data)
            # getmask never returns None: an array without a mask yields the
            # nomask sentinel, which must not be mistaken for a real mask.
            if embedded is not NP.ma.nomask:
                mask = embedded
        data = NP.ma.getdata(data)

    if isinstance(data, NP.ndarray):
        if len(data.shape) != 1:
            raise TypeError("DataColumns cannot be built from n > 1 dimensional arrays")

        if tryToCast and data.dtype != self.dtype:
            try:
                data = NP("array", data, dtype=self.dtype)
            except (TypeError, ValueError):
                # Best effort only: keep the data with its current dtype.
                pass

        if isinstance(mask, NP.ndarray):
            if mask.shape != data.shape:
                raise TypeError("Mask, if provided, must have the same shape as data")

            if mask.dtype != defs.maskType:
                mask = NP(NP(mask != 0) * defs.MISSING)

    return data, mask
def subDataColumn(self, selection=None):
    """Return or filter this DataColumn with C{selection}.

    If C{selection} is None, this function returns a shallow copy of
    the DataColumn: a new Python object that shares the (potentially
    large) data and mask arrays, so it is cheap enough for
    performance-critical situations.

    @type selection: 1d Numpy array of dtype bool, or None
    @param selection: If None, simply re-wrap the DataColumn; otherwise, use it to index the data and mask. A selection that yields a single element is wrapped into a length-1 array.
    @rtype: DataColumn
    @return: A DataColumn of the same length or shorter.
    """
    if selection is None:
        return DataColumn(self._fieldType, self._data, self._mask)

    subData = self.data[selection]
    subMask = None if self.mask is None else self.mask[selection]

    if not isinstance(subData, NP.ndarray):
        # The selection picked out a single element; re-wrap as arrays.
        subData = NP("array", [subData])
        # Identity test: "!= None" on a Numpy object is a value comparison.
        if subMask is not None:
            subMask = NP("array", [subMask])

    return DataColumn(self._fieldType, subData, subMask)
def _toDataColumn_dateTime(self, data, mask):
    """Convert raw values into a DataColumn of this date/time type.

    Rows that are already masked, are float NaN, or are the string
    "NAN" (case-insensitive) become MISSING; rows that
    C{stringToValue} cannot convert become INVALID. Both get PADDING
    as their data value.

    @type data: any sequence
    @param data: The raw values.
    @type mask: 1d Numpy array or None
    @param mask: An optional input mask; any truthy entry marks the row as missing.
    @rtype: DataColumn
    @return: The converted column.
    """
    data, mask = self._checkNumpy(data, mask, tryToCast=False)
    data, mask = self._checkNonNumpy(data, mask)

    numRows = len(data)
    converted = NP("empty", numRows, dtype=self.dtype)
    flags = NP("zeros", numRows, dtype=defs.maskType)

    for i, x in enumerate(data):
        isMissing = ((mask is not None and mask[i])
                     or (isinstance(x, float) and math.isnan(x))
                     or (isinstance(x, basestring) and x.upper() == "NAN"))
        if isMissing:
            converted[i] = defs.PADDING
            flags[i] = defs.MISSING
            continue

        try:
            converted[i] = self.stringToValue(x)
        except (ValueError, TypeError):
            converted[i] = defs.PADDING
            flags[i] = defs.INVALID

    # Drop the mask entirely when no row was flagged.
    if not flags.any():
        flags = None

    data, mask = self._checkValues(converted, flags)
    data, mask = self._checkIntervals(data, mask)
    return DataColumn(self, data, mask)
def _toDataColumn_dateTimeNumber(self, data, mask):
    """Convert numeric input into a DataColumn of this type.

    The input is first converted as a plain number, then scaled by
    C{self._factor} and shifted by C{self._offset} before the usual
    value and interval checks.

    @type data: any sequence
    @param data: The raw numeric values.
    @type mask: 1d Numpy array or None
    @param mask: An optional input mask.
    @rtype: DataColumn
    @return: The converted column.
    """
    numeric = self._toDataColumn_number(data, mask)

    scaled = NP(numeric.data * self._factor)
    shifted = NP(scaled + self._offset)

    data, mask = self._checkValues(shifted, numeric.mask)
    data, mask = self._checkIntervals(data, mask)
    return DataColumn(self, data, mask)
def functionAverageFake(self, value, howmany, fieldType):
    """Build a constant "average" column from a fixed numerator and denominator.

    Every row holds C{value[0] / value[1]}. When the denominator is
    zero, every row is masked INVALID.

    @type value: 2-item sequence of numbers
    @param value: The numerator (index 0) and denominator (index 1).
    @type howmany: int
    @param howmany: Number of rows.
    @type fieldType: FieldType
    @param fieldType: Ignored; the result is always double/continuous.
    @rtype: DataColumn
    @return: The faked results.
    """
    # The requested type is deliberately replaced.
    fieldType = FakeFieldType("double", "continuous")

    numerator = NP("empty", howmany, dtype=fieldType.dtype)
    denominator = NP("empty", howmany, dtype=fieldType.dtype)
    numerator[:] = value[0]
    denominator[:] = value[1]

    data = NP(numerator / denominator)

    mask = None
    if value[1] == 0:
        mask = NP("empty", howmany, dtype=defs.maskType)
        mask[:] = defs.INVALID

    return DataColumn(fieldType, data, mask)
def maskInvalid(self, data, mask):
    """Helper method to flag NaN and infinite values as INVALID after a potentially dangerous operation.

    Example::

        result = NP("log", dataColumn.data)    # log(0) = -inf, log(-x) = nan
        resultMask = self.maskInvalid(result, dataColumn.mask)
        return DataColumn(fakeFieldType, result, resultMask)

    The input C{data} and C{mask} are never modified by this method; a
    substitute mask is returned. Rows that are already non-VALID keep
    their original flag.

    @type data: 1d Numpy array
    @param data: The dataset that may contain NaN and infinite values.
    @type mask: 1d Numpy array of C{defs.maskType}, or None
    @param mask: The original mask.
    @rtype: 1d Numpy array of C{defs.maskType}, or None
    @return: The new mask, or None if no entry is flagged.
    """
    bad = NP("logical_not", NP("isfinite", data))

    if bad.any():
        if mask is None:
            mask = bad * defs.INVALID
        else:
            # Only rows that are currently VALID are downgraded to INVALID.
            NP("logical_and", bad, NP(mask == defs.VALID), bad)
            # Always work on a private copy so that the caller's mask is
            # left untouched even when it happens to be writable.
            mask = NP("copy", mask)
            mask.setflags(write=True)
            mask[bad] = defs.INVALID

    if mask is not None and not mask.any():
        mask = None

    return mask
def evaluate(self, dataTable, functionTable, performanceTable, returnUnknowns=False):
    """Evaluate the always-false predicate: no row is selected.

    @type dataTable: DataTable
    @param dataTable: The input DataTable; only its length is used.
    @type functionTable: FunctionTable
    @param functionTable: Not used by this method.
    @type performanceTable: PerformanceTable
    @param performanceTable: Receives a begin/end pair labelled "Predicate False".
    @type returnUnknowns: bool
    @param returnUnknowns: If True, return a 3-tuple instead of the bare selection.
    @rtype: 1d Numpy array of bool or 3-tuple of arrays
    @return: Either an all-False selection array, or C{(selection, unknowns, encounteredUnknowns)} with all entries False.
    """
    performanceTable.begin("Predicate False")

    numRows = len(dataTable)
    selection = NP("zeros", numRows, dtype=NP.dtype(bool))

    if returnUnknowns:
        unknowns = NP("zeros", numRows, dtype=NP.dtype(bool))
        # The same array object serves as both the second and third item.
        outcome = (selection, unknowns, unknowns)
    else:
        outcome = selection

    performanceTable.end("Predicate False")
    return outcome
def applyWithMask(self, data, mask, argument, mask2):
    """Fold one argument into the running XOR for the rows selected by C{mask2}.

    @type data: 2-tuple of 1d Numpy arrays
    @param data: The running C{(result, allbad)} pair; both arrays are updated in place.
    @type mask: 1d Numpy array or None
    @param mask: The running mask; returned unchanged.
    @type argument: object with C{data} and C{mask} attributes
    @param argument: The evaluated argument to combine.
    @type mask2: 1d Numpy array of bool
    @param mask2: Selects the rows to which this argument is applied.
    @rtype: 2-tuple
    @return: C{((result, allbad), mask)}.
    """
    result, allbad = data
    result[mask2] = NP("logical_xor", result[mask2], argument.data[mask2])

    if argument.mask is None:
        # An argument without a mask is valid in every selected row, so
        # none of those rows can remain "bad in every argument".
        allbad[mask2] = False
    else:
        # Compare the argument's mask (not the boolean accumulator) with
        # VALID, matching the whole-array variant of this operation.
        allbad[mask2] = NP("logical_and", allbad[mask2], NP(argument.mask[mask2] != defs.VALID))

    return (result, allbad), mask
def calculate(self, dataTable, functionTable=None, performanceTable=None):
    """Perform a calculation directly, without constructing a DataTable first.

    This method is intended for performance-critical cases where the
    DataTable would be built without having to analyze the PMML for
    field type context.

    This method modifies the input DataTable (its C{score}, C{fields}
    and C{output}) and passes the FunctionTable on to the sub-steps,
    which may modify it.

    @type dataTable: DataTable
    @param dataTable: The pre-built DataTable.
    @type functionTable: FunctionTable or None
    @param functionTable: A table of functions. If None, a fresh FunctionTable is created.
    @type performanceTable: PerformanceTable or None
    @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation. If None, a FakePerformanceTable is used.
    @rtype: DataTable or DataColumn
    @return: The input DataTable if the model is not scorable; otherwise C{dataTable.score}. NOTE(review): the two paths return different types -- confirm which one callers expect.
    """
    if functionTable is None:
        functionTable = FunctionTable()
    if performanceTable is None:
        performanceTable = FakePerformanceTable()

    # A non-scorable model yields a score column that is PADDING everywhere
    # and flagged INVALID in every row.
    if not self.get("isScorable", defaultFromXsd=True, convertType=True):
        dataTable.score = DataColumn(self.scoreType, NP(NP("ones", len(dataTable), dtype=self.scoreType.dtype) * defs.PADDING), NP(NP("ones", len(dataTable), dtype=defs.maskType) * defs.INVALID))
        return dataTable

    # Intermediate fields are computed on a sub-table of the input.
    subTable = dataTable.subTable()

    for miningField in self.xpath("pmml:MiningSchema/pmml:MiningField"):
        miningField.replaceField(subTable, functionTable, performanceTable)

    for calculable in self.calculableTrans():
        calculable.calculate(subTable, functionTable, performanceTable)

    score = self.calculateScore(subTable, functionTable, performanceTable)

    # The entry stored under the key None is the primary score.
    dataTable.score = score[None]

    # A named model also publishes its scores as fields: the primary score
    # under the model name, the others as "<name>.<key>".
    if self.name is not None:
        for key, value in score.items():
            if key is None:
                dataTable.fields[self.name] = value
            else:
                dataTable.fields["%s.%s" % (self.name, key)] = value

    for outputField in self.xpath("pmml:Output/pmml:OutputField"):
        # displayName falls back to the field's name.
        displayName = outputField.get("displayName", outputField["name"])
        dataTable.output[displayName] = outputField.format(subTable, functionTable, performanceTable, score)

    # Copy every output entry of the sub-table to the caller's table.
    for fieldName in subTable.output:
        dataTable.output[fieldName] = subTable.output[fieldName]

    return dataTable.score
def finalizeDistance(self, state, adjustM, distributionBased, covarianceMatrix):
    """Third and final step in a vectorized metric calculation, called once after all fields and cluster centers.

    When C{adjustM} is None the returned array is C{state.sumInQuadrature}
    itself, so the distribution-based scaling below modifies it in place.
    In the distribution-based case C{state.displacements} is also
    normalized in place.

    @type state: ad-hoc Python object
    @param state: State information that persists across the steps of a metric calculation; C{sumInQuadrature} and C{displacements} are read here.
    @type adjustM: 1d Numpy array of numbers, or None
    @param adjustM: The "adjustM" factor applied to the accumulated sum, if given.
    @type distributionBased: bool
    @param distributionBased: If True, use the covariance matrix to scale the result.
    @type covarianceMatrix: Numpy matrix
    @param covarianceMatrix: The covariance matrix used if C{distributionBased}.
    @rtype: 1d Numpy array of numbers
    @return: The finalized metric for every row.
    """
    if adjustM is None:
        result = state.sumInQuadrature
    else:
        result = NP(state.sumInQuadrature * adjustM)

    if not distributionBased:
        return result

    # Length of each row's displacement vector.
    squares = NP(state.displacements**2)
    normalizations = NP("sqrt", NP("sum", squares, axis=1))

    # Zero-length displacements cannot be normalized and are skipped.
    selection = NP(normalizations > 0.0)
    columnOfNorms = normalizations[:, NP.newaxis]
    state.displacements[selection] = state.displacements[selection] / columnOfNorms[selection]

    projected = NP(state.displacements.dot(covarianceMatrix))
    lengthOfSigma = NP("sum", NP(projected * state.displacements), axis=1)

    result[selection] = NP(result[selection] / lengthOfSigma[selection])
    return result
def _fromDataColumn_number(self, dataColumn):
    """Convert a numeric DataColumn into an object array.

    VALID rows keep their value, MISSING rows become C{defs.NAN} and
    all other rows become None.

    @type dataColumn: DataColumn
    @param dataColumn: The column to convert.
    @rtype: 1d Numpy array of dtype object
    @return: The converted values.
    """
    flags = dataColumn.mask
    if flags is None:
        return NP("array", dataColumn.data, dtype=NP.dtype(object))

    output = NP("empty", len(dataColumn), dtype=NP.dtype(object))
    for i, x in enumerate(dataColumn.data):
        flag = flags[i]
        if flag == defs.VALID:
            output[i] = x
        elif flag == defs.MISSING:
            output[i] = defs.NAN
        else:
            output[i] = None
    return output
def applyMapMissingTo(fieldType, data, mask, mapMissingTo, overwrite=False):
    """Replace MISSING values with a given substitute and mark them VALID.

    Unless C{overwrite} is True, the inputs are left untouched and
    modified copies are returned. If no flagged entries remain after
    the substitution, the returned mask is None.

    Example use::

        data, mask = dataColumn.data, dataColumn.mask
        data, mask = FieldCastMethods.applyMapMissingTo(dataColumn.fieldType, data, mask, "-999")
        return DataColumn(dataColumn.fieldType, data, mask)

    @type fieldType: FieldType
    @param fieldType: The data fieldType (to interpret C{mapMissingTo}).
    @type data: 1d Numpy array
    @param data: The data.
    @type mask: 1d Numpy array of dtype defs.maskType, or None
    @param mask: The mask; if None, nothing is done.
    @type mapMissingTo: string or None
    @param mapMissingTo: The replacement value, represented as a string (e.g. directly from a PMML attribute); if None, nothing is done.
    @type overwrite: bool
    @param overwrite: If True, make the original data and mask writable and modify them in place.
    @rtype: 2-tuple
    @return: The new data and mask.
    @raise PmmlValidationError: If C{mapMissingTo} cannot be converted by C{fieldType}.
    """
    if mask is None or mapMissingTo is None:
        return data, mask

    missingRows = NP(mask == defs.MISSING)

    try:
        replacement = fieldType.stringToValue(mapMissingTo)
    except ValueError as err:
        raise defs.PmmlValidationError("mapMissingTo string \"%s\" cannot be cast as %r: %s" % (mapMissingTo, fieldType, str(err)))

    if overwrite:
        data.setflags(write=True)
        mask.setflags(write=True)
    else:
        data = NP("copy", data)
        mask = NP("copy", mask)

    data[missingRows] = replacement
    mask[missingRows] = defs.VALID

    # A mask without any flagged entry is represented as None.
    if not mask.any():
        mask = None

    return data, mask
def logpdf(self, array):
    """Vectorized logarithm of the Gaussian probability density function.

    The mean and variance are read from this element's C{mean} and
    C{variance} attributes.

    @type array: 1d Numpy array of numbers
    @param array: The input vector.
    @rtype: 1d Numpy array of numbers
    @return: -(x - mean)^2 / (2 variance) - ln(sqrt(2 pi variance)) for all input values x.
    """
    mean = float(self.attrib["mean"])
    twovariance = 2.0 * float(self.attrib["variance"])

    squaredDeviation = NP("square", NP(array - mean))
    exponent = NP(NP("negative", squaredDeviation) / twovariance)
    logNormalization = math.log(math.sqrt(math.pi * twovariance))

    return NP(exponent - logNormalization)
def singleton(self, inputData, inputMask=None, inputState=None):
    """Create a single-row DataTable for event-based processes.

    This is an alternative to the DataTable constructor: it creates a
    DataTable with only one row and it uses the Python data type of
    each C{inputData} value to define a field type, rather than an
    explicit C{context}. Fields are added in sorted name order.

    @type inputData: dict-like mapping from strings to single values (not lists)
    @param inputData: A single data record.
    @type inputMask: dict-like mapping from strings to single C{defs.maskType} values (not lists), or None
    @param inputMask: A single mask; fields that are absent or map to None get no mask.
    @type inputState: DataTableState or None
    @param inputState: Initial state of the DataTable. To continue a previous calculation, use the C{dataTable.state} from the previous calculation.
    @rtype: DataTable
    @return: The one-row DataTable.
    """
    dataColumns = OrderedDict()

    for fieldName in sorted(inputData.keys()):
        value = inputData[fieldName]

        if isinstance(value, basestring):
            fieldType = FakeFieldType("string", "continuous")
        elif isinstance(value, bool):
            # bool is a subclass of int, so it has to be tested before int;
            # otherwise True/False would be typed as integers.
            fieldType = FakeFieldType("boolean", "continuous")
        elif isinstance(value, float):
            fieldType = FakeFieldType("double", "continuous")
        elif isinstance(value, int):
            fieldType = FakeFieldType("integer", "continuous")
        # TODO: PMML date types (when passed a datetype.datetype object)
        else:
            fieldType = FakeFieldType("object", "any")

        data = NP("empty", 1, dtype=fieldType.dtype)
        data[0] = value

        if inputMask is None or inputMask.get(fieldName) is None:
            mask = None
        else:
            mask = NP("empty", 1, dtype=defs.maskType)
            mask[0] = inputMask.get(fieldName)

        dataColumns[fieldName] = DataColumn(fieldType, data, mask)

    # Bypass the regular constructor and configure the table directly.
    dataTable = DataTable.__new__(DataTable)
    dataTable._configure(dataColumns, inputState)
    return dataTable
def accumulate(self, state, cxy, fieldWeight, distributionBased):
    """Second step in a vectorized metric calculation, called for each field and cluster center.

    Keeps, per row, the largest weighted component seen so far in
    C{state.maximumComponent} (updated in place).

    @type state: ad-hoc Python object
    @param state: State information that persists across the steps of a metric calculation.
    @type cxy: 1d Numpy array of numbers
    @param cxy: Comparison distance or similarity for all rows.
    @type fieldWeight: number
    @param fieldWeight: The weight of this field.
    @type distributionBased: bool
    @param distributionBased: Not used by this method.
    """
    weightedComponent = NP(cxy * fieldWeight)
    NP("maximum", state.maximumComponent, weightedComponent, state.maximumComponent)
def _toDataColumn_string(self, data, mask):
    """Convert raw values into a DataColumn of this string type.

    The input is first converted as generic objects. None and float
    NaN values are then flagged MISSING, and every other non-string
    value is replaced by its C{repr}.

    @type data: any sequence
    @param data: The raw values.
    @type mask: 1d Numpy array or None
    @param mask: An optional input mask.
    @rtype: DataColumn
    @return: The converted column.
    """
    dataColumn = self._toDataColumn_object(data, mask)

    data = dataColumn.data
    mask = dataColumn.mask
    # Both arrays are edited in place below.
    data.setflags(write=True)
    if mask is not None:
        mask.setflags(write=True)

    if mask is not None:
        for i, x in enumerate(dataColumn.data):
            # Only rows that are currently VALID are downgraded to MISSING.
            # A None/NaN in a row that is already flagged falls through to
            # the branch below and is stored as its repr.
            if (x is None or (isinstance(x, float) and math.isnan(x))) and mask[i] == defs.VALID:
                mask[i] = defs.MISSING
            elif not isinstance(x, basestring):
                data[i] = repr(x)

    else:
        for i, x in enumerate(dataColumn.data):
            if x is None or (isinstance(x, float) and math.isnan(x)):
                # The mask is created lazily, on the first missing value.
                if mask is None:
                    mask = NP("zeros", len(data), dtype=defs.maskType)
                mask[i] = defs.MISSING
            elif not isinstance(x, basestring):
                data[i] = repr(x)

        if mask is not None:
            dataColumn._mask = mask

    data, mask = self._checkValues(data, mask)
    data, mask = self._checkIntervals(data, mask)

    return DataColumn(self, data, mask)
def zmaxPush(self, zmax, fieldType, sticky=False):
    """Possibly enlarge the z range of the bounding box by raising its maximum.

    The push is ignored unless C{zmax} is finite, passes the
    C{zStrictlyPositive} requirement (when set, only values greater
    than zero are accepted), and exceeds the current C{self.zmax}.
    When an accepted push is "sticky", the value is also recorded in
    C{self.zmaxSticky}.

    @type zmax: number
    @param zmax: The candidate z maximum.
    @type fieldType: FieldType
    @param fieldType: The FieldType of z; checked with C{_checkFieldTypeZ}.
    @type sticky: bool
    @param sticky: Label this zmax as a "sticky" zmax.
    """
    self._checkFieldTypeZ(fieldType)

    if not NP("isfinite", zmax):
        return
    if self.zStrictlyPositive and not zmax > 0.0:
        return
    if self.zmax is not None and not zmax > self.zmax:
        return

    self.zmax = zmax
    if sticky:
        self.zmaxSticky = zmax
def yminPush(self, ymin, fieldType, sticky=False):
    """Possibly enlarge the y range of the bounding box by lowering its minimum.

    The push is ignored unless C{ymin} is finite, passes the
    C{yStrictlyPositive} requirement (when set, only values greater
    than zero are accepted), and is below the current C{self.ymin}.
    When an accepted push is "sticky", the value is also recorded in
    C{self.yminSticky}.

    @type ymin: number
    @param ymin: The candidate y minimum.
    @type fieldType: FieldType
    @param fieldType: The FieldType of y; checked with C{_checkFieldTypeY}.
    @type sticky: bool
    @param sticky: Label this ymin as a "sticky" ymin.
    """
    self._checkFieldTypeY(fieldType)

    if not NP("isfinite", ymin):
        return
    if self.yStrictlyPositive and not ymin > 0.0:
        return
    if self.ymin is not None and not ymin < self.ymin:
        return

    self.ymin = ymin
    if sticky:
        self.yminSticky = ymin
def endReducerKey(self, key):
    """Emit the new position of the cluster named C{key}, if it is a known cluster.

    Each coordinate is the accumulated numerator divided by the
    accumulated denominator for that field, or 0.0 when the
    denominator is not positive.

    @type key: string
    @param key: The reducer key, expected to be a cluster name.
    """
    for clusterName in self.clusterVectors:
        if clusterName == key:
            coordinates = []
            for fieldName in self.fieldNames:
                if self.denom[fieldName] > 0.0:
                    coordinates.append(self.numer[fieldName] / self.denom[fieldName])
                else:
                    coordinates.append(0.0)

            newPosition = NP("array", coordinates, dtype=NP.dtype(float))
            self.emit(clusterName, newPosition)
            break
def mapReduce(self):
    """Build a MapReduce-ready K-means producer.

    A fresh application class is created on every call, carrying the
    clustering model and the current cluster vectors in its
    C{metadata}; it is also stored as C{self.KMeansMapReduceApplication}.

    @rtype: MapReduce
    @return: A MapReduce instance wrapping the application class.
    """
    # Current cluster centers, keyed by cluster id (or 1-based position).
    clusterVectors = {}
    clusters = self.clusteringModel.xpath("pmml:Cluster")
    for position, cluster in enumerate(clusters):
        clusterName = cluster.get("id", "%d" % (position + 1))
        coordinates = cluster.childOfTag("Array").values()
        clusterVectors[clusterName] = NP("array", coordinates, dtype=NP.dtype(float))

    class KMeansMapReduceApplication(MapReduceKMeans):
        metadata = {}
        allChangeThreshold = self.allChangeThreshold

    appMetadata = KMeansMapReduceApplication.metadata
    appMetadata["ClusteringModel"] = self.clusteringModel
    appMetadata["clusterVectors"] = clusterVectors

    self.KMeansMapReduceApplication = KMeansMapReduceApplication
    return MapReduce(KMeansMapReduceApplication)
def determineScaleBins(numBins, low, high, array):
    """Settle the C{numBins}, C{low}, and C{high} of a histogram.

    Explicitly given values are kept; missing ones are derived from
    the dataset:

      - C{low}, C{high} default to the extrema of the dataset, pushed
        outward by 20% of the range.
      - C{numBins} defaults to the Freedman-Diaconis heuristic, but
        never fewer than 10 bins.

    A degenerate range (C{low == high}) is widened by 1.0 on each side.

    @type numBins: int or None
    @param numBins: Input number of bins.
    @type low: number or None
    @param low: Low edge.
    @type high: number or None
    @param high: High edge.
    @type array: 1d Numpy array of numbers
    @param array: Dataset used to derive the implicit values.
    @rtype: 3-tuple
    @return: C{numBins}, C{low}, C{high}
    @raise PmmlValidationError: If both edges are explicit and C{high} is below C{low}.
    """
    lowIsImplicit = low is None
    highIsImplicit = high is None

    if lowIsImplicit:
        low = float(array.min())
    if highIsImplicit:
        high = float(array.max())

    if low == high:
        low, high = low - 1.0, high + 1.0

    elif high < low:
        # An implicit edge gives way to the explicit one.
        if lowIsImplicit:
            low = high - 1.0
        elif highIsImplicit:
            high = low + 1.0
        else:
            raise defs.PmmlValidationError(
                "PlotHistogram attributes low and high must be in the right order: low = %g, high = %g" % (low, high))

    else:
        # Pad implicit edges; the margin is based on the unpadded range.
        margin = 0.2 * (high - low)
        if lowIsImplicit:
            low = low - margin
        if highIsImplicit:
            high = high + margin

    if numBins is None:
        # the Freedman-Diaconis rule
        q1, q3 = NP("percentile", array, [25.0, 75.0])
        binWidth = 2.0 * (q3 - q1) / math.pow(len(array), 1.0 / 3.0)
        numBins = 10
        if binWidth > 0.0:
            numBins = max(10, int(math.ceil((high - low) / binWidth)))

    return numBins, low, high
def evaluate(self, dataTable, functionTable, performanceTable, arguments):
    """Evaluate the logical negation of C{Between}.

    The column produced by C{Between.evaluate} is inverted in place
    and locked again before it is returned.

    @type dataTable: DataTable
    @param dataTable: The input DataTable.
    @type functionTable: FunctionTable
    @param functionTable: Forwarded to C{Between.evaluate}.
    @type performanceTable: PerformanceTable
    @param performanceTable: Forwarded to C{Between.evaluate}.
    @type arguments: list
    @param arguments: Forwarded to C{Between.evaluate}.
    @rtype: DataColumn
    @return: The negated result.
    """
    # NOTE(review): Between.evaluate is called on the class without an
    # instance; if it is an ordinary instance method this call is missing
    # its first argument -- confirm against Between's definition.
    negated = Between.evaluate(dataTable, functionTable, performanceTable, arguments)

    negated._unlock()
    NP("logical_not", negated.data, negated.data)
    negated._lock()

    return negated
def finalizeDistance(self, state, adjustM, distributionBased, covarianceMatrix):
    """Third and final step in a vectorized metric calculation, called once after all fields and cluster centers.

    Computes C{(a11 + a00) / (a11 + a10 + a01 + a00)} from the counts
    accumulated in C{state}.

    @type state: ad-hoc Python object
    @param state: State information that persists across the steps of a metric calculation; C{a11}, C{a10}, C{a01} and C{a00} are read here.
    @type adjustM: 1d Numpy array of numbers
    @param adjustM: Not used by this method.
    @type distributionBased: bool
    @param distributionBased: Not used by this method.
    @type covarianceMatrix: Numpy matrix
    @param covarianceMatrix: Not used by this method.
    @rtype: 1d Numpy array of numbers
    @return: The ratio for every row.
    """
    agreements = NP(state.a11 + state.a00)
    total = NP(NP(NP(state.a11 + state.a10) + state.a01) + state.a00)
    return NP(agreements / total)