def _toDataColumn_dateTime(self, data, mask):
    """Convert raw date/time input into a DataColumn of this FieldType.

    Every element is parsed with C{self.stringToValue}.  Elements that
    are flagged by the incoming mask, are a float NaN, or are the string
    "NaN" (any capitalization) become MISSING; elements that fail to
    parse become INVALID.  Both are filled with C{defs.PADDING}.

    @type data: sequence or 1d Numpy array
    @param data: The raw values.
    @type mask: sequence, 1d Numpy array, or None
    @param mask: Truthy entries mark values that are already missing.
    @rtype: DataColumn
    @return: The converted column, after C{_checkValues} and C{_checkIntervals}.
    """
    data, mask = self._checkNumpy(data, mask, tryToCast=False)
    data, mask = self._checkNonNumpy(data, mask)

    size = len(data)
    converted = NP("empty", size, dtype=self.dtype)
    flags = NP("zeros", size, dtype=defs.maskType)

    for index, item in enumerate(data):
        alreadyMasked = mask is not None and mask[index]
        looksLikeNaN = (isinstance(item, float) and math.isnan(item)) or \
                       (isinstance(item, basestring) and item.upper() == "NAN")

        if alreadyMasked or looksLikeNaN:
            converted[index] = defs.PADDING
            flags[index] = defs.MISSING
            continue

        try:
            converted[index] = self.stringToValue(item)
        except (ValueError, TypeError):
            # unparsable input is kept as a padded, INVALID entry
            converted[index] = defs.PADDING
            flags[index] = defs.INVALID

    # drop the mask entirely when nothing was flagged
    data = converted
    mask = flags if flags.any() else None

    data, mask = self._checkValues(data, mask)
    data, mask = self._checkIntervals(data, mask)
    return DataColumn(self, data, mask)
def _checkNumpy(self, data, mask, tryToCast=True):
    """Normalize Numpy inputs before they are turned into a DataColumn.

    Masked arrays are split into plain data plus mask (an explicitly
    passed C{mask} takes precedence over the embedded one), Numpy data
    must be one-dimensional, and a Numpy mask is converted to
    C{defs.maskType} with every non-zero entry mapped to MISSING.

    @type data: any
    @param data: The raw data; only Numpy arrays are inspected.
    @type mask: any
    @param mask: The raw mask, or None.
    @type tryToCast: bool
    @param tryToCast: If True, attempt to cast Numpy data to C{self.dtype}; a failed cast leaves the data unchanged.
    @rtype: 2-tuple
    @return: The (possibly replaced) data and mask.
    @raise TypeError: If the data are not 1-dimensional or the mask shape differs from the data shape.
    """
    if isinstance(data, NP.ma.MaskedArray):
        if mask is None:
            embedded = NP.ma.getmask(data)
            if embedded is not None:
                mask = embedded
        data = NP.ma.getdata(data)

    if isinstance(data, NP.ndarray):
        if len(data.shape) != 1:
            raise TypeError("DataColumns cannot be built from n > 1 dimensional arrays")

        needsCast = tryToCast and data.dtype != self.dtype
        if needsCast:
            try:
                data = NP("array", data, dtype=self.dtype)
            except (TypeError, ValueError):
                # deliberate best effort: keep the uncast array for the caller to handle
                pass

    if isinstance(mask, NP.ndarray):
        if mask.shape != data.shape:
            raise TypeError("Mask, if provided, must have the same shape as data")

        if mask.dtype != defs.maskType:
            nonzero = NP(mask != 0)
            mask = NP(nonzero * defs.MISSING)

    return data, mask
def mapper(self, dataTable):
    """Score a DataTable and emit per-cluster partial sums.

    For every cluster, emits the cluster name together with a dictionary
    holding C{"numer"} (per-field sum of the rows assigned to the
    cluster) and C{"denom"} (per-field count of those rows).

    @type dataTable: DataTable
    @param dataTable: The input data; it is not modified because the calculation runs on a sub-table.
    """
    # ensure that the results of this calculation do not get propagated
    dataTable = dataTable.subTable()
    self.metadata["ClusteringModel"].calculate(dataTable, performanceTable=self.performanceTable)

    data = dataTable.score.data
    mask = dataTable.score.mask
    stringToValue = dataTable.score.fieldType.stringToValue

    for index, cluster in enumerate(self.clusters):
        # clusters without an explicit id are named by their 1-based position
        clusterName = cluster.get("id", "%d" % (index + 1))

        selection = NP(data == stringToValue(clusterName))
        if mask is not None:
            NP("logical_and", selection, NP(mask == defs.VALID), selection)
        rowCount = selection.sum()

        numer = dict.fromkeys(self.fieldNames, 0.0)
        denom = dict.fromkeys(self.fieldNames, 0.0)
        for fieldName in self.fieldNames:
            numer[fieldName] += dataTable.fields[fieldName].data[selection].sum()
            denom[fieldName] += rowCount

        self.emit(clusterName, {"numer": numer, "denom": denom})
def evaluate(self, dataTable, functionTable, performanceTable):
    """Evaluate the NormDiscrete expression on a DataTable.

    The result is an indicator column: rows whose C{field} equals
    C{value} compare as True, cast to this expression's dtype, with
    missing inputs optionally replaced through C{mapMissingTo}.

    @type dataTable: DataTable
    @param dataTable: The input DataTable, containing the field named by the C{field} attribute.
    @type functionTable: FunctionTable
    @param functionTable: The FunctionTable (not used by this expression).
    @type performanceTable: PerformanceTable
    @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
    @rtype: DataColumn
    @return: The result of the calculation as a DataColumn.
    """
    performanceTable.begin("NormDiscrete")

    inputColumn = dataTable.fields[self["field"]]
    target = inputColumn.fieldType.stringToValue(self["value"])

    matches = NP(inputColumn.data == target)
    indicator = NP("array", matches, dtype=self._fieldType.dtype)

    data, mask = FieldCastMethods.applyMapMissingTo(
        self._fieldType, indicator, inputColumn.mask, self.get("mapMissingTo"))

    performanceTable.end("NormDiscrete")
    return DataColumn(self._fieldType, data, mask)
def applyInvalidValueTreatment(mask, invalidValueTreatment, overwrite=False):
    """Turn INVALID mask entries into MISSING when the treatment is "asMissing".

    Unless C{overwrite} is True the input mask is left untouched and a
    modified copy is returned.  Example use::

        mask = dataColumn.mask
        mask = FieldCastMethods.applyInvalidValueTreatment(mask, pmml.get("invalidValueTreatment"))
        return DataColumn(dataColumn.fieldType, dataColumn.data, mask)

    @type mask: 1d Numpy array of dtype defs.maskType, or None
    @param mask: The mask.
    @type invalidValueTreatment: string
    @param invalidValueTreatment: One of "returnInvalid", "asIs", "asMissing"; only "asMissing" has an effect.
    @type overwrite: bool
    @param overwrite: If True, make the original mask writable and overwrite it in place.
    @rtype: 1d Numpy array of dtype defs.maskType, or None
    @return: The new mask (the input object itself when nothing had to change).
    """
    if mask is None or invalidValueTreatment != "asMissing":
        return mask

    if not overwrite:
        mask = NP("copy", mask)
    mask.setflags(write=True)

    mask[NP(mask == defs.INVALID)] = defs.MISSING
    return mask
def generateSamples(self, low, high):
    """Build the array of sample positions used by C{prepare}.

    @type low: number
    @param low: Minimum value to sample.
    @type high: number
    @param high: Maximum value to sample.
    @rtype: 1d Numpy array
    @return: Evenly spaced ("uniform") or sorted uniformly random ("random") samples of the interval.
    @raise NotImplementedError: For any other C{samplingMethod} (e.g. "adaptive").
    """
    numSamples = self.get("numSamples", defaultFromXsd=True, convertType=True)
    samplingMethod = self.get("samplingMethod", defaultFromXsd=True)

    if samplingMethod == "uniform":
        return NP("linspace", low, high, numSamples, endpoint=True)

    if samplingMethod == "random":
        unit = NP(NP.random.rand(numSamples))
        scaled = NP(unit * (high - low))
        samples = NP(scaled + low)
        samples.sort()
        return samples

    raise NotImplementedError("TODO: add 'adaptive'")
def select(self, dataTable, functionTable, performanceTable):
    """Evaluate the child predicate or expression as a row selection.

    A child predicate, if present, is evaluated directly.  Otherwise the
    child expression must produce a boolean column; rows that are not
    VALID in that column are treated as not selected.

    @type dataTable: DataTable
    @param dataTable: The data to evaluate against.
    @type functionTable: FunctionTable
    @param functionTable: Functions that may be used by the predicate or expression.
    @type performanceTable: PerformanceTable
    @param performanceTable: Measures and records performance of the evaluation.
    @rtype: 1d Numpy array of bool
    @return: The result of the expression or predicate as a Numpy mask.
    @raise PmmlValidationError: If the expression does not evaluate to a boolean field type.
    """
    predicate = self.childOfClass(PmmlPredicate)
    if predicate is not None:
        return predicate.evaluate(dataTable, functionTable, performanceTable)

    expression = self.childOfClass(PmmlExpression)
    dataColumn = expression.evaluate(dataTable, functionTable, performanceTable)

    if not dataColumn.fieldType.isboolean():
        raise defs.PmmlValidationError("PlotSelection must evaluate to boolean, not %r" % dataColumn.fieldType)

    # the column's data array is updated in place below, so it is unlocked first
    dataColumn._unlock()

    selection = dataColumn.data
    validity = dataColumn.mask
    if validity is not None:
        NP("logical_and", selection, NP(validity == defs.VALID), selection)

    return selection
def _toDataColumn_dateTimeNumber(self, data, mask):
    """Convert numeric input into a DataColumn of this date/time FieldType.

    The input is first converted as a plain number, then rescaled as
    C{value * self._factor + self._offset}.

    @type data: sequence or 1d Numpy array
    @param data: The raw numeric values.
    @type mask: sequence, 1d Numpy array, or None
    @param mask: The raw mask.
    @rtype: DataColumn
    @return: The converted column, after C{_checkValues} and C{_checkIntervals}.
    """
    numeric = self._toDataColumn_number(data, mask)

    scaled = NP(numeric.data * self._factor)
    data = NP(scaled + self._offset)
    mask = numeric.mask

    data, mask = self._checkValues(data, mask)
    data, mask = self._checkIntervals(data, mask)
    return DataColumn(self, data, mask)
def subDataColumn(self, selection=None):
    """Return or filter this DataColumn with C{selection}.

    If C{selection} is None, this function returns a shallow copy of
    the DataColumn.  It has a new Python C{id}, but the potentially
    large numerical array is not copied.  This function can therefore
    be used in performance-critical situations.

    If indexing with C{selection} yields a single element rather than
    an array, the element (and its mask entry, if any) is wrapped in a
    length-1 array so that the result is always array-backed.

    @type selection: 1d Numpy array of dtype bool, or None
    @param selection: If None, simply return the DataColumn; otherwise, use it to index the data and mask.
    @rtype: DataColumn
    @return: A DataColumn of the same length or shorter.
    """
    if selection is None:
        return DataColumn(self._fieldType, self._data, self._mask)

    subData = self.data[selection]
    if self.mask is None:
        subMask = None
    else:
        subMask = self.mask[selection]

    if not isinstance(subData, NP.ndarray):
        subData = NP("array", [subData])
        # identity test: "!= None" on a Numpy value is an equality
        # comparison, not the intended "is a mask present" check
        if subMask is not None:
            subMask = NP("array", [subMask])

    return DataColumn(self._fieldType, subData, subMask)
def mapReduce(self):
    """Build a MapReduce-ready K-means producer.

    A fresh C{MapReduceKMeans} subclass is created whose C{metadata}
    carries the clustering model and the current cluster vectors; the
    subclass is also stored as C{self.KMeansMapReduceApplication}.
    Used by C{optimize} and C{hadoopOptimize}.

    @rtype: MapReduce
    @return: An instance of MapReduce that can either be run in pure-Python mode or submitted to Hadoop.
    """
    class KMeansMapReduceApplication(MapReduceKMeans):
        metadata = {}
        allChangeThreshold = self.allChangeThreshold

    metadata = KMeansMapReduceApplication.metadata
    metadata["ClusteringModel"] = self.clusteringModel

    # clusters without an explicit id are named by their 1-based position
    clusters = self.clusteringModel.xpath("pmml:Cluster")
    metadata["clusterVectors"] = dict(
        (cluster.get("id", "%d" % (index + 1)),
         NP("array", cluster.childOfTag("Array").values(), dtype=NP.dtype(float)))
        for index, cluster in enumerate(clusters))

    self.KMeansMapReduceApplication = KMeansMapReduceApplication
    return MapReduce(KMeansMapReduceApplication)
def applyWithoutMask(self, data, mask, argument):
    """Fold one argument into a running XOR over all rows.

    Both arrays of the running state are updated in place: the value
    array is XORed with the argument's data, and the C{allbad} array
    stays True only where the argument is not VALID.

    @type data: 2-tuple of 1d Numpy arrays
    @param data: The running state C{(values, allbad)}.
    @type mask: any
    @param mask: Passed through unchanged.
    @type argument: DataColumn
    @param argument: The next argument to combine.
    @rtype: 2-tuple
    @return: C{((values, allbad), mask)}.
    """
    values, allbad = data
    NP("logical_xor", values, argument.data, values)

    argumentMask = argument.mask
    if argumentMask is not None:
        NP("logical_and", allbad, NP(argumentMask != defs.VALID), allbad)

    return (values, allbad), mask
def evaluate(self, dataTable, functionTable, performanceTable, returnUnknowns=False):
    """Evaluate the always-false predicate: no row is selected.

    @type dataTable: DataTable
    @param dataTable: The input DataTable; only its length is used.
    @type functionTable: FunctionTable
    @param functionTable: The FunctionTable (not used by this predicate).
    @type performanceTable: PerformanceTable
    @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
    @type returnUnknowns: bool
    @param returnUnknowns: If True, also return arrays describing which rows are unknown.
    @rtype: 1d Numpy array of bool or 3-tuple of arrays
    @return: Either an all-False selection array or C{(selection, unknowns, encounteredUnknowns)}, where the last two are the same all-False array object.
    """
    performanceTable.begin("Predicate False")

    numRows = len(dataTable)
    allFalse = NP("zeros", numRows, dtype=NP.dtype(bool))

    if returnUnknowns:
        unknowns = NP("zeros", numRows, dtype=NP.dtype(bool))
        result = (allFalse, unknowns, unknowns)
    else:
        result = allFalse

    performanceTable.end("Predicate False")
    return result
def functionAverageFake(self, value, howmany, fieldType):
    """Produce a constant average column when it is known that there are no matches.

    @type value: indexable pair of numbers
    @param value: C{value[0]} is the numerator and C{value[1]} the denominator of the constant average.
    @type howmany: int
    @param howmany: Number of rows.
    @type fieldType: FieldType
    @param fieldType: Ignored; the result is always of type double/continuous.
    @rtype: DataColumn
    @return: The faked results; every row is INVALID if the denominator is zero.
    """
    fieldType = FakeFieldType("double", "continuous")

    numerator = NP("empty", howmany, dtype=fieldType.dtype)
    numerator[:] = value[0]

    denominator = NP("empty", howmany, dtype=fieldType.dtype)
    denominator[:] = value[1]

    data = NP(numerator / denominator)

    mask = None
    if value[1] == 0:
        # division by zero: flag every row instead of exposing inf/nan as valid
        mask = NP("empty", howmany, dtype=defs.maskType)
        mask[:] = defs.INVALID

    return DataColumn(fieldType, data, mask)
def cusum(self, testDistributions, fieldName, dataColumn, state, performanceTable):
    """Calculate the score of a CUSUM TestStatistic.

    CUSUM is a stateful running sum of log-likelihood ratios
    (alternate minus baseline), floored at C{resetValue}: each row
    depends on the result of the previous row.  Rows that are not VALID
    repeat the previous value.  If the model has a C{stateId}
    attribute, the sum starts from the value stored in C{state} under
    that key and the final value is stored back.

    @type testDistributions: PmmlBinding
    @param testDistributions: The <TestDistributions> element.
    @type fieldName: string
    @param fieldName: The field name (not used in this calculation).
    @type dataColumn: DataColumn
    @param dataColumn: The field.
    @type state: DataTableState
    @param state: The persistent state object, used to initialize the start state and save the end state of the cumulative sum.
    @type performanceTable: PerformanceTable
    @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
    @rtype: dict
    @return: A dictionary mapping PMML "feature" strings to DataColumns; CUSUM only defines the None key ("predictedValue").
    @raise PmmlValidationError: If the Baseline or the Alternate is not a Gaussian or Poisson distribution.
    """
    baseline = testDistributions.xpath("pmml:Baseline/pmml:GaussianDistribution | pmml:Baseline/pmml:PoissonDistribution")
    alternate = testDistributions.xpath("pmml:Alternate/pmml:GaussianDistribution | pmml:Alternate/pmml:PoissonDistribution")

    if len(baseline) == 0 or len(alternate) == 0:
        raise defs.PmmlValidationError("BaselineModel CUSUM requires a Baseline and an Alternate that are either GaussianDistribution or PoissonDistribution")

    ratios = alternate[0].logpdf(dataColumn.data) - baseline[0].logpdf(dataColumn.data)

    numRows = len(dataColumn)
    if dataColumn.mask is None:
        good = NP("ones", numRows, dtype=NP.dtype(bool))
    else:
        good = NP(dataColumn.mask == defs.VALID)

    # resume from a previously saved sum if one exists, otherwise start at zero
    stateId = self.get("stateId")
    last = state.get(stateId) if stateId is not None else None
    if last is None:
        last = 0.0

    resetValue = testDistributions.get("resetValue", defaultFromXsd=True, convertType=True)

    output = NP("empty", numRows, dtype=NP.dtype(float))

    performanceTable.begin("fill CUSUM")
    for index in xrange(numRows):
        if good[index]:
            last = max(resetValue, last + ratios[index])
        output[index] = last
    performanceTable.end("fill CUSUM")

    if stateId is not None:
        state[stateId] = last

    return {None: DataColumn(self.scoreType, output, None)}
def applyWithMask(self, data, mask, argument, mask2):
    """Fold one argument into a running XOR, restricted to the rows in C{mask2}.

    Both arrays of the running state are updated in place, but only on
    the rows picked by C{mask2}: the value array is XORed with the
    argument's data, and the C{allbad} array stays True only where the
    argument is not VALID.

    @type data: 2-tuple of 1d Numpy arrays
    @param data: The running state C{(values, allbad)}.
    @type mask: any
    @param mask: Passed through unchanged.
    @type argument: DataColumn
    @param argument: The next argument to combine.
    @type mask2: 1d Numpy array usable as an index
    @param mask2: The rows to update.
    @rtype: 2-tuple
    @return: C{((values, allbad), mask)}.
    """
    data, allbad = data
    data[mask2] = NP("logical_xor", data[mask2], argument.data[mask2])

    if argument.mask is not None:
        # Compare the argument's mask (not the boolean allbad array) with
        # VALID, mirroring applyWithoutMask.  The previous form compared
        # allbad with VALID and only gave the same answer because VALID
        # happens to be the zero mask value.
        allbad[mask2] = NP("logical_and", allbad[mask2], NP(argument.mask[mask2] != defs.VALID))

    return (data, allbad), mask
def functionMax(self, dataColumn, whereMask, groupSelection, getstate, setstate):
    """Finds the running maximum of rows in a DataColumn, possibly with an SQL where mask and groupField.

    Row C{i} of the result holds the largest selected value seen up to
    and including row C{i} (starting from the saved state, if any).
    Rows before the first selected value are INVALID.

    @type dataColumn: DataColumn
    @param dataColumn: The input data column.
    @type whereMask: 1d Numpy array of bool, or None
    @param whereMask: The result of the SQL where selection.
    @type groupSelection: 1d Numpy array of bool, or None.
    @param groupSelection: Rows corresponding to a particular value of the groupField.
    @type getstate: callable function
    @param getstate: Retrieve starting values from the DataTableState.
    @type setstate: callable function
    @param setstate: Store ending values to the DataTableState.
    @rtype: DataColumn
    @return: A column of maximized rows.
    @raise PmmlValidationError: If the input field is neither continuous nor ordinal.
    """
    fieldType = dataColumn.fieldType

    if fieldType.optype not in ("continuous", "ordinal"):
        # the message used to name "min" (copied from the minimum aggregate)
        raise defs.PmmlValidationError("Aggregate function \"max\" requires a continuous or ordinal input field")

    # a row participates only if it is VALID and passes both optional filters
    if dataColumn.mask is None:
        selection = NP("ones", len(dataColumn), dtype=NP.dtype(bool))
    else:
        selection = NP(dataColumn.mask == defs.VALID)

    if whereMask is not None:
        NP("logical_and", selection, whereMask, selection)

    if groupSelection is not None:
        NP("logical_and", selection, groupSelection, selection)

    maximum = None
    if getstate is not None:
        startingState = getstate()
        if startingState is not None:
            maximum = startingState

    data = NP("empty", len(dataColumn), dtype=fieldType.dtype)
    mask = NP("zeros", len(dataColumn), dtype=defs.maskType)

    for i, x in enumerate(dataColumn.data):
        if selection[i]:
            if maximum is None or x > maximum:
                maximum = x
        if maximum is None:
            # nothing has been selected yet, so there is no maximum to report
            mask[i] = defs.INVALID
        else:
            data[i] = maximum

    if not mask.any():
        mask = None

    if setstate is not None:
        setstate(maximum)

    return DataColumn(fieldType, data, mask)
def calculate(self, dataTable, functionTable=None, performanceTable=None):
    """Perform a calculation directly on a pre-built DataTable.

    This method is intended for performance-critical cases where the
    DataTable would be built without having to analyze the PMML for
    field type context.  It modifies the input DataTable (its C{score},
    C{fields} and C{output}) and the FunctionTable.

    NOTE(review): a non-scorable model returns the C{dataTable} itself,
    while the normal path returns C{dataTable.score}; confirm which one
    callers rely on before unifying.

    @type dataTable: DataTable
    @param dataTable: The pre-built DataTable.
    @type functionTable: FunctionTable or None
    @param functionTable: A table of functions; a new FunctionTable is created if None.
    @type performanceTable: PerformanceTable or None
    @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation; a FakePerformanceTable is used if None.
    @rtype: DataTable or DataColumn
    @return: The input DataTable if the model is not scorable, otherwise the score DataColumn.
    """
    functionTable = FunctionTable() if functionTable is None else functionTable
    performanceTable = FakePerformanceTable() if performanceTable is None else performanceTable

    if not self.get("isScorable", defaultFromXsd=True, convertType=True):
        # non-scorable models yield an all-INVALID score filled with PADDING
        padding = NP(NP("ones", len(dataTable), dtype=self.scoreType.dtype) * defs.PADDING)
        allInvalid = NP(NP("ones", len(dataTable), dtype=defs.maskType) * defs.INVALID)
        dataTable.score = DataColumn(self.scoreType, padding, allInvalid)
        return dataTable

    # all intermediate fields live in a sub-table
    subTable = dataTable.subTable()

    for miningField in self.xpath("pmml:MiningSchema/pmml:MiningField"):
        miningField.replaceField(subTable, functionTable, performanceTable)

    for calculable in self.calculableTrans():
        calculable.calculate(subTable, functionTable, performanceTable)

    score = self.calculateScore(subTable, functionTable, performanceTable)
    dataTable.score = score[None]

    if self.name is not None:
        # named models also publish their score features as fields
        for key, value in score.items():
            target = self.name if key is None else "%s.%s" % (self.name, key)
            dataTable.fields[target] = value

    for outputField in self.xpath("pmml:Output/pmml:OutputField"):
        displayName = outputField.get("displayName", outputField["name"])
        dataTable.output[displayName] = outputField.format(subTable, functionTable, performanceTable, score)

    for fieldName in subTable.output:
        dataTable.output[fieldName] = subTable.output[fieldName]

    return dataTable.score
def finalizeDistance(self, state, adjustM, distributionBased, covarianceMatrix):
    """Third and final step in a vectorized metric calculation, called once after all fields and cluster centers.

    Note that when C{adjustM} is None the returned array is
    C{state.sumInQuadrature} itself, so the distribution-based rescaling
    below writes into it; C{state.displacements} is normalized in place
    as well.

    @type state: ad-hoc Python object
    @param state: State information that spans the three steps of a metric calculation; C{sumInQuadrature} and C{displacements} are read here.
    @type adjustM: 1d Numpy array of numbers, or None
    @param adjustM: The "adjustM" value, intended to adjust for missing values, as defined in the PMML specification.
    @type distributionBased: bool
    @param distributionBased: If True, use a covariance matrix to scale the distance result.
    @type covarianceMatrix: Numpy matrix
    @param covarianceMatrix: The covariance matrix to scale the result if C{distributionBased}.
    @rtype: 1d Numpy array of numbers
    @return: The array of distances or similarities for center-based clustering, and number of standard deviations for distribution-based clustering.
    """
    if adjustM is None:
        result = state.sumInQuadrature
    else:
        result = NP(state.sumInQuadrature * adjustM)

    if not distributionBased:
        return result

    displacements = state.displacements

    # normalize each non-zero displacement vector to unit length
    squares = NP(displacements**2)
    normalizations = NP("sqrt", NP("sum", squares, axis=1))
    selection = NP(normalizations > 0.0)
    displacements[selection] = displacements[selection] / (normalizations[:, NP.newaxis])[selection]

    # scale the result by the covariance quadratic form along each unit vector
    projected = NP(displacements.dot(covarianceMatrix))
    lengthOfSigma = NP("sum", NP(projected * displacements), axis=1)
    result[selection] = NP(result[selection] / lengthOfSigma[selection])

    return result
def _checkValues(self, data, mask):
    """Apply this FieldType's <Value> specifications to a data/mask pair.

    Each entry of C{self.values} names a value and a property ("valid",
    "missing" or "invalid"; "valid" if unspecified).  If at least one
    valid value is listed, every row that does not match a valid value
    is INVALID (or MISSING); otherwise rows are VALID unless they match
    an invalid or missing value.  As a side effect, any C{displayValue}
    is recorded in C{self._displayValue}.

    @type data: 1d Numpy array
    @param data: The data; returned unchanged.
    @type mask: 1d Numpy array of dtype defs.maskType, or None
    @param mask: The incoming mask.
    @rtype: 2-tuple
    @return: The data and the new mask (None if every row is valid).
    @raise PmmlValidationError: If a listed value cannot be converted with C{stringToValue}.
    """
    values = self.values
    # no <Value> specifications: nothing to check
    if len(values) == 0:
        return data, mask

    # start from whatever the incoming mask already says
    if mask is None:
        missing = NP("zeros", len(data), dtype=NP.dtype(bool))
        invalid = NP("zeros", len(data), dtype=NP.dtype(bool))
    else:
        missing = NP(mask == defs.MISSING)
        invalid = NP(mask == defs.INVALID)
    valid = NP("zeros", len(data), dtype=NP.dtype(bool))

    numberOfValidSpecified = 0
    for value in values:
        v = value.get("value")
        displayValue = value.get("displayValue")
        if displayValue is not None:
            self._displayValue[v] = displayValue

        prop = value.get("property", "valid")
        try:
            v2 = self.stringToValue(v)
        except ValueError:
            raise defs.PmmlValidationError("Improper value in Value specification: \"%s\"" % v)

        # accumulate (in place) the rows that match this value
        if prop == "valid":
            NP("logical_or", valid, NP(data == v2), valid)
            numberOfValidSpecified += 1
        elif prop == "missing":
            NP("logical_or", missing, NP(data == v2), missing)
        elif prop == "invalid":
            NP("logical_or", invalid, NP(data == v2), invalid)

    if numberOfValidSpecified > 0:
        # guilty until proven innocent
        # missing takes precedence over valid
        NP("logical_and", valid, NP("logical_not", missing), valid)
        if valid.all():
            return data, None
        mask = NP(NP("ones", len(data), dtype=defs.maskType) * defs.INVALID)
        mask[missing] = defs.MISSING
        mask[valid] = defs.VALID

    else:
        # innocent until proven guilty
        # missing takes precedence over invalid
        NP("logical_and", invalid, NP("logical_not", missing), invalid)
        if not NP("logical_or", invalid, missing).any():
            return data, None
        mask = NP("zeros", len(data), dtype=defs.maskType)
        mask[missing] = defs.MISSING
        mask[invalid] = defs.INVALID

    return data, mask
def functionAverage(self, dataColumn, whereMask, groupSelection, getstate, setstate):
    """Computes the running average of rows in a DataColumn, possibly with an SQL where mask and groupField.

    Row C{i} of the result is the cumulative sum of the selected values
    divided by the cumulative count of selected rows, both including any
    saved starting state.  Rows whose quotient is not finite are INVALID.

    @type dataColumn: DataColumn
    @param dataColumn: The input data column.
    @type whereMask: 1d Numpy array of bool, or None
    @param whereMask: The result of the SQL where selection.
    @type groupSelection: 1d Numpy array of bool, or None.
    @param groupSelection: Rows corresponding to a particular value of the groupField.
    @type getstate: callable function
    @param getstate: Retrieve starting values (a numerator, denominator pair) from the DataTableState.
    @type setstate: callable function
    @param setstate: Store ending values (a numerator, denominator pair) to the DataTableState.
    @rtype: DataColumn
    @return: A column of averaged rows.
    @raise PmmlValidationError: If the input field is not "integer", "float" or "double".
    """
    fieldType = FakeFieldType("double", "continuous")

    if dataColumn.fieldType.dataType not in ("integer", "float", "double"):
        raise defs.PmmlValidationError("Aggregate function \"average\" requires a numeric input field: \"integer\", \"float\", \"double\"")

    numRows = len(dataColumn)

    # the denominator starts as 1.0 per row and is zeroed for unselected rows
    denominator = NP("ones", numRows, dtype=fieldType.dtype)

    if dataColumn.mask is not None:
        NP("logical_and", denominator, NP(dataColumn.mask == defs.VALID), denominator)
    for restriction in (whereMask, groupSelection):
        if restriction is not None:
            NP("logical_and", denominator, restriction, denominator)

    numerator = NP("multiply", denominator, dataColumn.data)

    if getstate is not None and numRows > 0:
        startingState = getstate()
        if startingState is not None:
            # seed the first row so that the cumulative sums continue the saved totals
            startingNumerator, startingDenominator = startingState
            numerator[0] += startingNumerator
            denominator[0] += startingDenominator

    numerator = NP("cumsum", numerator)
    denominator = NP("cumsum", denominator)

    data = NP(numerator / denominator)
    notFinite = NP("logical_not", NP("isfinite", data))
    mask = NP(notFinite * defs.INVALID)
    if not mask.any():
        mask = None

    if setstate is not None and numRows > 0:
        setstate((numerator[-1], denominator[-1]))

    return DataColumn(fieldType, data, mask)
def functionMultiset(self, dataColumn, whereMask, groupSelection, getstate, setstate):
    """Derives a running multiset of rows in a DataColumn, possibly with an SQL where mask and groupField.

    Row C{i} of the result is a dictionary mapping each value seen so
    far (as a Python object) to the number of times it was selected.
    Consecutive unselected rows share the same dictionary object.  The
    starting state, if any, is updated in place.

    @type dataColumn: DataColumn
    @param dataColumn: The input data column.
    @type whereMask: 1d Numpy array of bool, or None
    @param whereMask: The result of the SQL where selection.
    @type groupSelection: 1d Numpy array of bool, or None.
    @param groupSelection: Rows corresponding to a particular value of the groupField.
    @type getstate: callable function
    @param getstate: Retrieve starting values from the DataTableState.
    @type setstate: callable function
    @param setstate: Store ending values to the DataTableState.
    @rtype: DataColumn of dict objects
    @return: A column of multisetted rows.
    """
    fieldType = FakeFieldType("object", "any")

    numRows = len(dataColumn)
    selection = NP("ones", numRows, dtype=NP.dtype(bool))
    if dataColumn.mask is not None:
        selection = NP("logical_and", selection, NP(dataColumn.mask == defs.VALID))
    for restriction in (whereMask, groupSelection):
        if restriction is not None:
            NP("logical_and", selection, restriction, selection)

    multiset = {}
    if getstate is not None:
        startingState = getstate()
        if startingState is not None:
            multiset = startingState

    # each row receives a snapshot, so that later counts do not leak into earlier rows
    snapshot = dict(multiset)

    data = NP("empty", numRows, dtype=NP.dtype(object))
    toPython = dataColumn.fieldType.valueToPython

    for index, item in enumerate(dataColumn.data):
        if selection[index]:
            key = toPython(item)
            if key not in multiset:
                multiset[key] = 0
            multiset[key] += 1
            snapshot = dict(multiset)
        data[index] = snapshot

    if setstate is not None:
        setstate(multiset)

    return DataColumn(fieldType, data, None)
def pointsToSmoothCurve(xarray, yarray, samples, smoothingScale, loop):
    """Fit a smooth line through a set of numeric points with a characteristic smoothing scale.

    At every sample position a straight line is fitted to all points by
    weighted least squares, the weights being a Gaussian kernel centered
    on the sample.  The fitted value and slope at the sample become the
    curve position and derivative.

    @type xarray: 1d Numpy array of numbers
    @param xarray: Array of x values.
    @type yarray: 1d Numpy array of numbers
    @param yarray: Array of y values.
    @type samples: 1d Numpy array of numbers
    @param samples: Locations at which to fit the C{xarray} and C{yarray} with best-fit positions and derivatives.
    @type smoothingScale: number
    @param smoothingScale: Standard deviation of the Gaussian kernel used to smooth the locally linear fit.
    @type loop: bool
    @param loop: If False, zero the derivatives at both ends, disconnecting the end of the fitted curve from the beginning.
    @rtype: 4-tuple of 1d Numpy arrays
    @return: C{xlist}, C{ylist}, C{dxlist}, C{dylist} appropriate for C{formatPathdata}.
    """
    positions = []
    slopes = []

    for sample in samples:
        # Gaussian kernel weights centered on this sample
        offsets = NP(xarray - sample)
        exponent = NP(NP(-0.5 * NP("power", offsets, 2)) / NP(smoothingScale * smoothingScale))
        weights = NP(NP(NP("exp", exponent) / smoothingScale) / (math.sqrt(2.0*math.pi)))

        # weighted moments for the closed-form linear least-squares solution
        sum1 = weights.sum()
        sumx = NP(weights * xarray).sum()
        sumxx = NP(weights * NP(xarray * xarray)).sum()
        sumy = NP(weights * yarray).sum()
        sumxy = NP(weights * NP(xarray * yarray)).sum()

        delta = (sum1 * sumxx) - (sumx * sumx)
        intercept = ((sumxx * sumy) - (sumx * sumxy)) / delta
        slope = ((sum1 * sumxy) - (sumx * sumy)) / delta

        positions.append(intercept + (sample * slope))
        slopes.append(slope)

    xlist = samples
    ylist = NP("array", positions, dtype=NP.dtype(float))

    # central difference of the sample positions, wrapping around at the ends
    dxlist = NP((NP("roll", xlist, -1) - NP("roll", xlist, 1)) / 2.0)
    dylist = NP("array", slopes, dtype=NP.dtype(float)) * dxlist

    if not loop:
        for derivative in (dxlist, dylist):
            derivative[0] = 0.0
            derivative[-1] = 0.0

    return xlist, ylist, dxlist, dylist
def generateSamples(self, low, high):
    """Used by C{prepare} to generate an array of samples.

    @type low: number
    @param low: Minimum value to sample.
    @type high: number
    @param high: Maximum value to sample.
    @rtype: 1d Numpy array
    @return: An array of uniform (evenly spaced, endpoints included) or random (sorted) samples of the interval.
    @raise NotImplementedError: If C{samplingMethod} is neither "uniform" nor "random".
    """
    numSamples = self.get("numSamples", defaultFromXsd=True, convertType=True)
    samplingMethod = self.get("samplingMethod", defaultFromXsd=True)

    if samplingMethod not in ("uniform", "random"):
        raise NotImplementedError("TODO: add 'adaptive'")

    if samplingMethod == "uniform":
        samples = NP("linspace", low, high, numSamples, endpoint=True)
    else:
        # draw from [0, 1), stretch to [low, high), then put in ascending order
        width = high - low
        samples = NP(NP(NP(NP.random.rand(numSamples)) * width) + low)
        samples.sort()

    return samples
def _fromDataColumn_number(self, dataColumn):
    """Convert a numeric DataColumn into a Numpy array of Python objects.

    VALID rows keep their value, MISSING rows become C{defs.NAN} and
    every other row becomes None.

    @type dataColumn: DataColumn
    @param dataColumn: The column to convert.
    @rtype: 1d Numpy array of dtype object
    @return: The converted values.
    """
    mask = dataColumn.mask
    if mask is None:
        return NP("array", dataColumn.data, dtype=NP.dtype(object))

    output = NP("empty", len(dataColumn), dtype=NP.dtype(object))
    for index, item in enumerate(dataColumn.data):
        flag = mask[index]
        if flag == defs.VALID:
            output[index] = item
        elif flag == defs.MISSING:
            output[index] = defs.NAN
        else:
            output[index] = None
    return output
def logpdf(self, array):
    """Vectorized logarithm of the Gaussian probability density function (PDF).

    Computes C{-(x - mean)**2 / (2*variance) - ln(sqrt(2*pi*variance))}
    with the mean and variance taken from this element's attributes.

    @type array: 1d Numpy array of numbers
    @param array: The input vector.
    @rtype: 1d Numpy array of numbers
    @return: The result of ln(PDF_Gaussian(x)) for all input values x.
    """
    mean = float(self.attrib["mean"])
    twovariance = 2.0 * float(self.attrib["variance"])

    squaredDeviation = NP("square", NP(array - mean))
    exponent = NP(NP("negative", squaredDeviation) / twovariance)
    logNormalization = math.log(math.sqrt(math.pi * twovariance))

    return NP(exponent - logNormalization)
def applyMapMissingTo(fieldType, data, mask, mapMissingTo, overwrite=False):
    """Replace MISSING values with a given substitute and mark them VALID.

    Unless C{overwrite} is True the inputs are left untouched and
    modified copies are returned.  Example use::

        data, mask = dataColumn.data, dataColumn.mask
        data, mask = FieldCastMethods.applyMapMissingTo(dataColumn.fieldType, data, mask, "-999")
        return DataColumn(dataColumn.fieldType, data, mask)

    @type fieldType: FieldType
    @param fieldType: The data fieldType (to interpret C{mapMissingTo}).
    @type data: 1d Numpy array
    @param data: The data.
    @type mask: 1d Numpy array of dtype defs.maskType, or None
    @param mask: The mask.
    @type mapMissingTo: string or None
    @param mapMissingTo: The replacement value, represented as a string (e.g. directly from a PMML attribute); None means no replacement.
    @type overwrite: bool
    @param overwrite: If True, make the original data and mask writable and overwrite them in place.
    @rtype: 2-tuple of 1d Numpy arrays
    @return: The new data and mask; the mask is None if no flagged rows remain.
    @raise PmmlValidationError: If C{mapMissingTo} cannot be converted by C{fieldType}.
    """
    if mask is None or mapMissingTo is None:
        return data, mask

    selection = NP(mask == defs.MISSING)

    try:
        mappedValue = fieldType.stringToValue(mapMissingTo)
    except ValueError as err:
        raise defs.PmmlValidationError("mapMissingTo string \"%s\" cannot be cast as %r: %s" % (mapMissingTo, fieldType, str(err)))

    if overwrite:
        for array in (data, mask):
            array.setflags(write=True)
    else:
        data = NP("copy", data)
        mask = NP("copy", mask)

    data[selection] = mappedValue
    mask[selection] = defs.VALID

    # drop the mask entirely when nothing is flagged any more
    if not mask.any():
        mask = None

    return data, mask
def _toDataColumn_number(self, data, mask):
    """Convert raw numeric input into a DataColumn of this FieldType.

    Three paths are taken, from fastest to slowest:

      1. Numpy data that already has C{self.dtype}: NaN entries are
         flagged MISSING (an incoming Numpy mask is updated in place).
      2. Anything that Numpy can cast to C{self.dtype} as a whole: NaN
         entries and non-zero incoming mask entries are flagged MISSING.
      3. Element-by-element conversion: NaN (float or the string "NaN")
         becomes MISSING, anything unconvertible (including None)
         becomes INVALID and is filled with C{defs.PADDING}.

    @type data: sequence or 1d Numpy array
    @param data: The raw values.
    @type mask: sequence, 1d Numpy array, or None
    @param mask: The raw mask.
    @rtype: DataColumn
    @return: The converted column, after C{_checkValues} and C{_checkIntervals}.
    """
    data, mask = self._checkNumpy(data, mask)

    if isinstance(data, NP.ndarray) and (mask is None or isinstance(mask, NP.ndarray)) and data.dtype == self.dtype:
        # path 1: data are already in the right Numpy format
        mask2 = NP("isnan", data)
        if mask is None:
            mask = NP("array", mask2, defs.maskType) * defs.MISSING
        else:
            mask[mask2] = defs.MISSING

    else:
        data, mask = self._checkNonNumpy(data, mask)

        try:
            data = NP("array", data, dtype=self.dtype)
            # mask is handled in the else statement after the except block

        except (ValueError, TypeError):
            # path 3: the bulk cast failed, so convert one element at a time
            data2 = NP("empty", len(data), dtype=self.dtype)
            if mask is None:
                mask2 = NP("zeros", len(data), dtype=defs.maskType)
            else:
                mask2 = NP("fromiter", ((defs.VALID if not m else defs.MISSING) for m in mask), dtype=defs.maskType, count=len(mask))

            for i, v in enumerate(data):
                try:
                    data2[i] = v
                    if mask2[i] == defs.VALID and ((isinstance(v, float) and math.isnan(v)) or (isinstance(v, basestring) and v.upper() == "NAN")):
                        mask2[i] = defs.MISSING
                    if v is None:
                        # None may be accepted by the assignment above; force the failure path
                        raise TypeError
                except (ValueError, TypeError):
                    data2[i] = defs.PADDING
                    if mask2[i] == defs.VALID:
                        if (isinstance(v, float) and math.isnan(v)) or (isinstance(v, basestring) and v.upper() == "NAN"):
                            mask2[i] = defs.MISSING
                        else:
                            mask2[i] = defs.INVALID

            if not mask2.any():
                mask2 = None

            data, mask = data2, mask2

        else:
            # path 2: the bulk cast succeeded
            mask2 = NP("isnan", data)
            if mask is None:
                # scale the 0/1 flags by MISSING, as every other branch
                # does, instead of relying on MISSING being equal to 1
                mask = NP(NP("array", mask2, defs.maskType) * defs.MISSING)
            else:
                mask = NP(NP("array", NP("logical_or", mask2, NP("fromiter", (m != 0 for m in mask), dtype=NP.dtype(bool), count=len(mask))), defs.maskType) * defs.MISSING)
            if not mask.any():
                mask = None

    data, mask = self._checkValues(data, mask)
    data, mask = self._checkIntervals(data, mask)
    return DataColumn(self, data, mask)
def singleton(self, inputData, inputMask=None, inputState=None):
    """Create a single-row DataTable for event-based processes.

    This method is an alternative to the DataTable constructor: it
    creates a DataTable with only one row and it uses the Python data
    type of each C{inputData} value to define a field type, rather than
    an explicit C{context}.  Strings map to "string", floats to
    "double", bools to "boolean", ints to "integer" and anything else to
    a generic "object" field.

    @type inputData: dict-like mapping from strings to single values (not lists)
    @param inputData: A single data record.
    @type inputMask: dict-like mapping from strings to single C{defs.maskType} values (not lists), or None
    @param inputMask: A single mask.
    @type inputState: DataTableState or None
    @param inputState: Initial state of the DataTable.  To continue a previous calculation, use the C{dataTable.state} from the previous calculation.
    @rtype: DataTable
    @return: A one-row DataTable whose fields are ordered by field name.
    """
    dataColumns = OrderedDict()

    for fieldName in sorted(inputData.keys()):
        value = inputData[fieldName]

        if isinstance(value, basestring):
            fieldType = FakeFieldType("string", "continuous")
        elif isinstance(value, float):
            fieldType = FakeFieldType("double", "continuous")
        elif isinstance(value, bool):
            # bool is a subclass of int, so it must be tested before int;
            # otherwise True/False would be typed as "integer"
            fieldType = FakeFieldType("boolean", "continuous")
        elif isinstance(value, int):
            fieldType = FakeFieldType("integer", "continuous")

        # TODO: PMML date types (when passed a datetype.datetype object)

        else:
            fieldType = FakeFieldType("object", "any")

        data = NP("empty", 1, dtype=fieldType.dtype)
        data[0] = value

        if inputMask is None or inputMask.get(fieldName) is None:
            mask = None
        else:
            mask = NP("empty", 1, dtype=defs.maskType)
            mask[0] = inputMask.get(fieldName)

        dataColumns[fieldName] = DataColumn(fieldType, data, mask)

    # bypass the regular constructor and configure the table directly
    dataTable = DataTable.__new__(DataTable)
    dataTable._configure(dataColumns, inputState)
    return dataTable
def _stringToValue_date(self, string): regex = re.match(self._iso8601_date, string) if regex is None: raise ValueError("invalid ISO 8601 date string: \"%s\"" % string) year = regex.group(1) month = regex.group(3) day = regex.group(5) try: if year is not None and month is not None and day is not None: dateTimeObject = datetime.datetime(int(year), int(month), int(day)) elif year is not None and month is not None: dateTimeObject = datetime.datetime(int(year), int(month), 1) elif year is not None: dateTimeObject = datetime.datetime(int(year), 1, 1) else: raise ValueError except ValueError: raise ValueError("invalid ISO 8601 date string: \"%s\"" % string) td = dateTimeObject - self._dateTimeOrigin return NP.int64(td.days*86400 * self._dateTimeResolution)
def _toDataColumn_string(self, data, mask):
    """Convert raw input into a string DataColumn of this FieldType.

    The input is first converted with C{_toDataColumn_object}; then
    C{None} and NaN entries are labeled MISSING and every other
    non-string entry is replaced by its C{repr}.

    @type data: 1d Numpy array or other sequence
    @param data: The values to convert.
    @type mask: 1d Numpy array, other sequence, or None
    @param mask: Optional input mask.
    @rtype: DataColumn
    @return: The converted column, after value and interval checks.
    """
    def isMissingLike(value):
        return value is None or (isinstance(value, float) and math.isnan(value))

    column = self._toDataColumn_object(data, mask)
    data = column.data
    mask = column.mask

    # the arrays are edited in place below, so they must be writeable
    data.setflags(write=True)

    if mask is not None:
        mask.setflags(write=True)
        for i, x in enumerate(column.data):
            if isMissingLike(x) and mask[i] == defs.VALID:
                mask[i] = defs.MISSING
            elif not isinstance(x, basestring):
                data[i] = repr(x)

    else:
        for i, x in enumerate(column.data):
            if isMissingLike(x):
                # create the mask lazily, on the first missing entry
                if mask is None:
                    mask = NP("zeros", len(data), dtype=defs.maskType)
                mask[i] = defs.MISSING
            elif not isinstance(x, basestring):
                data[i] = repr(x)

        if mask is not None:
            column._mask = mask

    data, mask = self._checkValues(data, mask)
    data, mask = self._checkIntervals(data, mask)
    return DataColumn(self, data, mask)
def zmaxPush(self, zmax, fieldType, sticky=False):
    """Possibly enlarge the z range of the bounding box by raising its maximum.

    A "sticky" value means that the final bounding box will not be
    expanded beyond it, if it turns out to be the most extreme.  This
    is used, for example, in the layout of a vertical histogram: the
    xmin and xmax of the plot window should align with those of the
    histogram unless an overlaying graphic pushes the boundary farther,
    while the ymax should be inflated beyond the tallest bin.

    If C{zStrictlyPositive} is True, non-positive C{zmax} values are ignored.

    @type zmax: number
    @param zmax: The new C{zmax}, if it is larger than the currently largest C{zmax}.
    @type fieldType: FieldType
    @param fieldType: The FieldType of z.  Only homogeneous FieldTypes are allowed.
    @type sticky: bool
    @param sticky: Label this zmax as a "sticky" zmax.
    @raise PmmlValidationError: If any z FieldTypes differ, this function will raise an error.
    """
    self._checkFieldTypeZ(fieldType)

    if not NP("isfinite", zmax):
        return
    if self.zStrictlyPositive and not zmax > 0.0:
        return
    if self.zmax is not None and not zmax > self.zmax:
        return

    # zmax is finite, allowed, and a new extreme
    self.zmax = zmax
    if sticky:
        self.zmaxSticky = zmax
def yminPush(self, ymin, fieldType, sticky=False):
    """Possibly enlarge the y range of the bounding box by lowering its minimum.

    A "sticky" value means that the final bounding box will not be
    expanded beyond it, if it turns out to be the most extreme.  This
    is used, for example, in the layout of a vertical histogram: the
    xmin and xmax of the plot window should align with those of the
    histogram unless an overlaying graphic pushes the boundary farther,
    while the ymax should be inflated beyond the tallest bin.

    If C{yStrictlyPositive} is True, non-positive C{ymin} values are ignored.

    @type ymin: number
    @param ymin: The new C{ymin}, if it is smaller than the currently smallest C{ymin}.
    @type fieldType: FieldType
    @param fieldType: The FieldType of y.  Only homogeneous FieldTypes are allowed.
    @type sticky: bool
    @param sticky: Label this ymin as a "sticky" ymin.
    @raise PmmlValidationError: If any y FieldTypes differ, this function will raise an error.
    """
    self._checkFieldTypeY(fieldType)

    if not NP("isfinite", ymin):
        return
    if self.yStrictlyPositive and not ymin > 0.0:
        return
    if self.ymin is not None and not ymin < self.ymin:
        return

    # ymin is finite, allowed, and a new extreme
    self.ymin = ymin
    if sticky:
        self.yminSticky = ymin
def accumulate(self, state, cxy, fieldWeight, distributionBased):
    """Second step in a vectorized metric calculation, called for each field and cluster center.

    Only modifies the C{state} object: C{state.maximumComponent} is
    updated in place with the element-wise maximum of itself and the
    weighted comparison values.

    @type state: ad-hoc Python object
    @param state: State information that persists long enough to span the three steps of a metric calculation.  This is a work-around of lxml's refusal to let its Python instances maintain C{self} and it is unrelated to DataTableState.
    @type cxy: 1d Numpy array of numbers
    @param cxy: Comparison distance or similarity for all rows.
    @type fieldWeight: number
    @param fieldWeight: The weight of this field.
    @type distributionBased: bool
    @param distributionBased: If True, use a covariance matrix to scale the distance result.  Not used by this method.
    """
    runningMaximum = state.maximumComponent
    weighted = NP(cxy * fieldWeight)
    # the third argument is the output array, so the update is in place
    NP("maximum", runningMaximum, weighted, runningMaximum)
def _fromDataColumn(self, dataColumn):
    """Convert a DataColumn into a Numpy object array of Python values.

    VALID rows are converted with C{self.valueToPython}, MISSING rows
    become C{defs.NAN}, and all other rows become C{None}.

    @type dataColumn: DataColumn
    @param dataColumn: The column to convert.
    @rtype: 1d Numpy array of objects
    @return: One Python value per row.
    """
    # enumeration uses less memory and, interestingly, a little less time
    # than a list comprehension (80 ns instead of 100 ns per record)
    converted = NP("empty", len(dataColumn), dtype=NP.dtype(object))
    mask = dataColumn.mask

    if mask is None:
        for i, x in enumerate(dataColumn.data):
            converted[i] = self.valueToPython(x)
        return converted

    for i, x in enumerate(dataColumn.data):
        flag = mask[i]
        if flag == defs.VALID:
            converted[i] = self.valueToPython(x)
        elif flag == defs.MISSING:
            converted[i] = defs.NAN
        else:
            converted[i] = None
    return converted
def endReducerKey(self, key):
    """Emit the new position of the cluster named C{key}, if it is a known cluster.

    Each component of the position is C{numer / denom} for one field,
    or 0.0 where the denominator is not positive.

    @type key: string
    @param key: The reducer key that has just been completed.
    """
    for clusterName in self.clusterVectors.keys():
        if clusterName == key:
            components = []
            for fieldName in self.fieldNames:
                denominator = self.denom[fieldName]
                if denominator > 0.0:
                    components.append(self.numer[fieldName] / denominator)
                else:
                    components.append(0.0)

            self.emit(clusterName, NP("array", components, dtype=NP.dtype(float)))
            break
def determineScaleBins(numBins, low, high, array):
    """Determine the C{numBins}, C{low}, and C{high} of the histogram.

    Explicitly set values always override implicit values derived from
    the dataset.

      - C{low}, C{high} implicit values are the extrema of the dataset,
        padded outward by 20% of the range.
      - C{numBins} implicit value is the Freedman-Diaconis heuristic for
        number of histogram bins, with a minimum of 10.

    @type numBins: int or None
    @param numBins: Input number of bins.
    @type low: number or None
    @param low: Low edge.
    @type high: number or None
    @param high: High edge.
    @type array: 1d Numpy array of numbers
    @param array: Dataset to use to implicitly derive values.
    @rtype: 3-tuple
    @return: C{numBins}, C{low}, C{high}
    @raise PmmlValidationError: If both edges are explicit and C{high} is less than C{low}.
    """
    lowIsImplicit = low is None
    highIsImplicit = high is None

    if lowIsImplicit:
        low = float(array.min())
    if highIsImplicit:
        high = float(array.max())

    if low == high:
        # degenerate range: open it up by one unit on each side
        low, high = low - 1.0, high + 1.0

    elif high < low:
        # an explicit edge lies on the wrong side of the data
        if lowIsImplicit:
            low = high - 1.0
        elif highIsImplicit:
            high = low + 1.0
        else:
            raise defs.PmmlValidationError(
                "PlotHistogram attributes low and high must be in the right order: low = %g, high = %g" % (low, high))

    else:
        # pad only the implicit edges, by 20% of the unpadded range
        padding = 0.2 * (high - low)
        if lowIsImplicit:
            low = low - padding
        if highIsImplicit:
            high = high + padding

    if numBins is None:
        # the Freedman-Diaconis rule
        q1, q3 = NP("percentile", array, [25.0, 75.0])
        binWidth = 2.0 * (q3 - q1) / math.pow(len(array), 1.0 / 3.0)
        if binWidth > 0.0:
            numBins = max(10, int(math.ceil((high - low) / binWidth)))
        else:
            numBins = 10

    return numBins, low, high
def evaluate(self, dataTable, functionTable, performanceTable, arguments):
    """Evaluate the function: element-wise floor division of two arguments.

    Rows whose denominator equals zero are labeled INVALID.

    @type dataTable: DataTable
    @param dataTable: The input DataTable.
    @type functionTable: FunctionTable
    @param functionTable: The FunctionTable passed on to the arguments.
    @type performanceTable: PerformanceTable
    @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
    @type arguments: list of expressions
    @param arguments: Exactly two expressions: numerator and denominator.
    @rtype: DataColumn
    @return: The quotient, with a mask combining both inputs and zero denominators.
    """
    evaluated = [x.evaluate(dataTable, functionTable, performanceTable) for x in arguments]

    timerLabel = "built-in \"%s\"" % self.name
    performanceTable.begin(timerLabel)

    fieldType = self.fieldTypeFromSignature(evaluated)
    numerator, denominator = evaluated

    divideByZero = NP(NP(denominator.data == 0.0) * defs.INVALID)
    if not divideByZero.any():
        divideByZero = None

    mask = DataColumn.mapAnyMissingInvalid([divideByZero, numerator.mask, denominator.mask])
    quotient = NP("floor_divide", numerator.data, denominator.data)
    dataColumn = DataColumn(fieldType, quotient, mask)

    performanceTable.end(timerLabel)
    return dataColumn
def evaluate(self, dataTable, functionTable, performanceTable, arguments):
    """Evaluate the function: the logical negation of C{Between}.

    @type dataTable: DataTable
    @param dataTable: The input DataTable.
    @type functionTable: FunctionTable
    @param functionTable: The FunctionTable passed on to C{Between}.
    @type performanceTable: PerformanceTable
    @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
    @type arguments: list of expressions
    @param arguments: The arguments passed on to C{Between}.
    @rtype: DataColumn
    @return: The C{Between} result with its data inverted in place.
    """
    result = Between.evaluate(dataTable, functionTable, performanceTable, arguments)

    # the column is unlocked only for the duration of the in-place inversion
    result._unlock()
    values = result.data
    NP("logical_not", values, values)
    result._lock()

    return result
def evaluate(self, dataTable, functionTable, performanceTable, arguments):
    """Evaluate a boolean function of two or more arguments, skipping missing values.

    The combination itself is delegated to C{self.applySkipMissing};
    rows that it leaves flagged in C{allbad} are labeled MISSING
    unless they already carry a non-VALID mask value.

    @type dataTable: DataTable
    @param dataTable: The input DataTable.
    @type functionTable: FunctionTable
    @param functionTable: The FunctionTable passed on to the arguments.
    @type performanceTable: PerformanceTable
    @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
    @type arguments: list of expressions
    @param arguments: At least two boolean expressions.
    @rtype: DataColumn
    @return: The combined boolean column.
    """
    evaluated = [x.evaluate(dataTable, functionTable, performanceTable) for x in arguments]

    timerLabel = "built-in \"%s\"" % self.name
    performanceTable.begin(timerLabel)

    fieldType = self.allBooleanType(evaluated, atleast=2)
    numberOfRows = len(dataTable)

    data = NP("zeros", numberOfRows, dtype=fieldType.dtype)
    mask = None
    allbad = NP("ones", numberOfRows, dtype=NP.dtype(bool))

    (data, allbad), mask = self.applySkipMissing((data, allbad), mask, evaluated)

    if allbad.any():
        if mask is None:
            mask = allbad * defs.MISSING
        else:
            # only relabel rows that are still VALID
            NP("logical_and", allbad, NP(mask == defs.VALID), allbad)
            mask[allbad] = defs.MISSING

    performanceTable.end(timerLabel)
    return DataColumn(fieldType, data, mask)
def __call__(self, x, y):
    """Transform the point x, y from this inner coordinate system all
    the way out to the outermost global coordinates, the coordinates
    of the SVG file.

    After applying C{self._fx} and C{self._fy}, infinite results are
    replaced by stand-in values before the parent transformation.

    @type x: number
    @param x: The horizontal position in this coordinate system.
    @type y: number
    @param y: The vertical position in this coordinate system.
    @rtype: 2-tuple of numbers
    @return: The X, Y position in the outermost global coordinates.
    """
    def replaceInfinities(coordinate):
        # NOTE(review): the same outerY* stand-ins are applied to both
        # x and y; if outerX* counterparts exist, x should presumably
        # use them -- confirm against the class constructor.
        if isinstance(coordinate, NP.ndarray):
            infinite = NP("isinf", coordinate)
            minusInfinity = NP("logical_and", infinite, NP(coordinate < 0.0))
            # all infinities first, then overwrite the negative ones
            coordinate[infinite] = self.outerYPlusInfinity
            coordinate[minusInfinity] = self.outerYMinusInfinity
            return coordinate
        if coordinate == float("inf"):
            return self.outerYPlusInfinity
        if coordinate == float("-inf"):
            return self.outerYMinusInfinity
        return coordinate

    passThrough = (NP.ndarray, NP.double)
    if not isinstance(x, passThrough):
        x = NP.double(x)
    if not isinstance(y, passThrough):
        y = NP.double(y)

    x, y = self._fx(x), self._fy(y)

    x = replaceInfinities(x)
    y = replaceInfinities(y)

    x, y = super(PlotCoordinatesWindow, self).__call__(x, y)
    return x, y
def _toDataColumn_object(self, data, mask):
    """Convert raw input into an object DataColumn of this FieldType.

    Input that is already a Numpy array of C{self.dtype} (with a Numpy
    or absent mask) is used as is.  Otherwise the data are converted
    and a mask is built in which float NaN entries and non-zero input
    mask entries are MISSING.

    @type data: 1d Numpy array or other sequence
    @param data: The values to convert.
    @type mask: 1d Numpy array, other sequence, or None
    @param mask: Optional input mask.
    @rtype: DataColumn
    @return: The converted column, after value and interval checks.
    """
    def isFloatNaN(value):
        return isinstance(value, float) and math.isnan(value)

    data, mask = self._checkNumpy(data, mask)

    readyToUse = isinstance(data, NP.ndarray) and (mask is None or isinstance(mask, NP.ndarray)) and data.dtype == self.dtype
    if not readyToUse:
        data, mask = self._checkNonNumpy(data, mask)
        data = NP.array(data, dtype=self.dtype)

        if mask is None:
            flags = (defs.MISSING if isFloatNaN(d) else defs.VALID for d in data)
            mask = NP("fromiter", flags, dtype=defs.maskType, count=len(data))
        else:
            flags = (defs.MISSING if (m != 0 or isFloatNaN(data[i])) else defs.VALID for i, m in enumerate(mask))
            mask = NP("fromiter", flags, dtype=defs.maskType, count=len(mask))

        if not mask.any():
            mask = None

    data, mask = self._checkValues(data, mask)
    data, mask = self._checkIntervals(data, mask)
    return DataColumn(self, data, mask)
def outliersAsMissing(mask, originalMask, selection, overwrite=False):
    """Label all rows specified by a selection as MISSING.

    This function does not modify the original mask (unless
    C{overwrite} is True), but it returns a substitute.  When a mask
    is given, C{selection} is narrowed in place to the rows that are
    currently VALID.

    Example use::

        mask = dataColumn.mask
        mask = FieldCastMethods.outliersAsMissing(mask, dataColumn.mask, dataColumn.data < MINIMUM_CUT)
        mask = FieldCastMethods.outliersAsMissing(mask, dataColumn.mask, dataColumn.data > MAXIMUM_CUT)
        return DataColumn(dataColumn.fieldType, dataColumn.data, mask)

    It can also be used in conjunction with other FieldCastMethods.

    @type mask: 1d Numpy array of type defs.maskType, or None
    @param mask: The mask to be updated.
    @type originalMask: 1d Numpy array of type defs.maskType, or None
    @param originalMask: The original mask.
    @type selection: 1d Numpy array of bool
    @param selection: The rows to label as MISSING.
    @type overwrite: bool
    @param overwrite: If True, unlock and overwrite the original mask.
    @rtype: 1d Numpy array of type defs.maskType
    @return: The new mask.
    """
    if mask is None:
        return selection * defs.MISSING

    # only rows that are still VALID may be relabeled
    NP("logical_and", selection, NP(mask == defs.VALID), selection)

    if mask is originalMask:
        # the original must not be touched unless explicitly requested
        if not overwrite:
            mask = NP("copy", mask)
        mask.setflags(write=True)

    mask[selection] = defs.MISSING
    return mask
def _checkIntervals(self, data, mask):
    """Label values that fall outside of this FieldType's intervals as INVALID.

    Rows that are already masked keep their existing label.  A given
    mask is updated in place.

    @type data: 1d Numpy array
    @param data: The values to check.
    @type mask: 1d Numpy array of C{defs.maskType}, or None
    @param mask: The current mask.
    @rtype: 2-tuple
    @return: C{data} and the (possibly new) mask.
    @raise PmmlValidationError: If a leftMargin or rightMargin cannot be converted with C{stringToValue}.
    """
    intervals = self.intervals
    if len(intervals) == 0:
        return data, mask

    leftOpen = ("openClosed", "openOpen")
    leftClosed = ("closedOpen", "closedClosed")
    rightOpen = ("openOpen", "closedOpen")
    rightClosed = ("openClosed", "closedClosed")

    # innocent until proven guilty
    # NOTE(review): a row is flagged if it violates ANY interval, so
    # several intervals act as an intersection -- confirm that this is
    # the intended reading of multiple Interval elements.
    outside = NP("zeros", len(data), dtype=NP.dtype(bool))

    for interval in intervals:
        closure = interval["closure"]
        leftText = interval.get("leftMargin")
        rightText = interval.get("rightMargin")

        if leftText is not None:
            try:
                leftEdge = self.stringToValue(leftText)
            except ValueError:
                raise defs.PmmlValidationError("Improper value in Interval leftMargin specification: \"%s\"" % leftText)

            if closure in leftOpen:
                outside[NP(data <= leftEdge)] = True
            elif closure in leftClosed:
                outside[NP(data < leftEdge)] = True

        if rightText is not None:
            try:
                rightEdge = self.stringToValue(rightText)
            except ValueError:
                raise defs.PmmlValidationError("Improper value in Interval rightMargin specification: \"%s\"" % rightText)

            if closure in rightOpen:
                outside[NP(data >= rightEdge)] = True
            elif closure in rightClosed:
                outside[NP(data > rightEdge)] = True

    if not outside.any():
        return data, mask

    if mask is None:
        return data, NP(outside * defs.INVALID)

    # only change what wasn't already marked as MISSING
    NP("logical_and", outside, NP(mask == defs.VALID), outside)
    mask[outside] = defs.INVALID
    return data, mask
def evaluate(self, dataTable, functionTable, performanceTable, arguments):
    """Evaluate a boolean function of two or more arguments, skipping missing values.

    The combination itself is delegated to C{self.applySkipMissing};
    rows that it leaves flagged in C{allbad} are labeled MISSING
    unless they already carry a non-VALID mask value.

    @type dataTable: DataTable
    @param dataTable: The input DataTable.
    @type functionTable: FunctionTable
    @param functionTable: The FunctionTable passed on to the arguments.
    @type performanceTable: PerformanceTable
    @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
    @type arguments: list of expressions
    @param arguments: At least two boolean expressions.
    @rtype: DataColumn
    @return: The combined boolean column.
    """
    columns = [argument.evaluate(dataTable, functionTable, performanceTable) for argument in arguments]

    label = "built-in \"%s\"" % self.name
    performanceTable.begin(label)

    fieldType = self.allBooleanType(columns, atleast=2)
    length = len(dataTable)

    data = NP("zeros", length, dtype=fieldType.dtype)
    mask = None
    allbad = NP("ones", length, dtype=NP.dtype(bool))

    (data, allbad), mask = self.applySkipMissing((data, allbad), mask, columns)

    if allbad.any():
        if mask is not None:
            # only relabel rows that are still VALID
            NP("logical_and", allbad, NP(mask == defs.VALID), allbad)
            mask[allbad] = defs.MISSING
        else:
            mask = allbad * defs.MISSING

    performanceTable.end(label)
    return DataColumn(fieldType, data, mask)
def __call__(self, x, y):
    """Transform the point x, y from this inner coordinate system all
    the way out to the outermost global coordinates, the coordinates
    of the SVG file.

    This coordinate system adds C{self.xoffset} and C{self.yoffset}
    before handing the point to the parent transformation.

    @type x: number
    @param x: The horizontal position in this coordinate system.
    @type y: number
    @param y: The vertical position in this coordinate system.
    @rtype: 2-tuple of numbers
    @return: The X, Y position in the outermost global coordinates.
    """
    passThrough = (NP.ndarray, NP.double)
    if not isinstance(x, passThrough):
        x = NP.double(x)
    if not isinstance(y, passThrough):
        y = NP.double(y)

    shiftedX = self.xoffset + x
    shiftedY = self.yoffset + y

    x, y = super(PlotCoordinatesOffset, self).__call__(shiftedX, shiftedY)
    return x, y
def maskInvalid(self, data, mask):
    """Helper method to replace NaN and infinite values with INVALID
    after a potentially dangerous operation.

    Example::

        result = NP("log", dataColumn.data)    # log(0) = -inf, log(-x) = nan
        resultMask = self.maskInvalid(result, dataColumn.mask)
        return DataColumn(fakeFieldType, result, resultMask)

    The input C{data} and C{mask} are not modified by this method; a
    substitute mask is returned.

    @type data: 1d Numpy array
    @param data: The dataset that may contain NaN and infinite values.
    @type mask: 1d Numpy array of C{defs.maskType}, or None
    @param mask: The original mask.
    @rtype: 1d Numpy array of C{defs.maskType}, or None
    @return: The new mask, or None if every row is valid.
    """
    bad = NP("logical_not", NP("isfinite", data))

    if bad.any():
        if mask is None:
            mask = bad * defs.INVALID
        else:
            # only relabel rows that are still VALID
            NP("logical_and", bad, NP(mask == defs.VALID), bad)
            if bad.any():
                # always write to a copy: the caller's mask must stay
                # untouched whether or not it happens to be writeable
                mask = NP("copy", mask)
                mask.setflags(write=True)
                mask[bad] = defs.INVALID

    if mask is not None and not mask.any():
        mask = None

    return mask
def _toDataColumn_internal(self, data, mask):
    """Convert raw input into a DataColumn by passing every value through C{self.stringToValue}.

    A fast path converts all values at once; if any value is rejected,
    a row-by-row fallback replaces float NaN entries by
    C{defs.PADDING} labeled MISSING and unconvertible entries by
    C{defs.PADDING} labeled INVALID (unless the row was already
    masked).

    @type data: 1d Numpy array or other sequence
    @param data: The values to convert.
    @type mask: 1d Numpy array, other sequence, or None
    @param mask: Optional input mask.
    @rtype: DataColumn
    @return: The converted column.
    """
    data, mask = self._checkNumpy(data, mask, tryToCast=False)
    data, mask = self._checkNonNumpy(data, mask)

    try:
        data = NP("fromiter", (self.stringToValue(d) for d in data), dtype=self.dtype, count=len(data))
        # mask is handled in the else statement after the except block

    except (ValueError, TypeError):
        # TypeError is caught here as well as ValueError so that a value
        # of the wrong Python type triggers the row-by-row fallback (which
        # already handles both) instead of aborting the whole conversion
        data2 = NP("empty", len(data), dtype=self.dtype)
        if mask is None:
            mask2 = NP("zeros", len(data), dtype=defs.maskType)
        else:
            mask2 = NP("fromiter", (defs.VALID if not m else defs.MISSING for m in mask), dtype=defs.maskType, count=len(mask))

        for i, v in enumerate(data):
            if isinstance(v, float) and math.isnan(v):
                data2[i] = defs.PADDING
                mask2[i] = defs.MISSING
            else:
                try:
                    data2[i] = self.stringToValue(v)
                except (ValueError, TypeError):
                    data2[i] = defs.PADDING
                    # keep the label of rows that were already masked,
                    # as _toDataColumn_number does
                    if mask2[i] == defs.VALID:
                        mask2[i] = defs.INVALID

        if not mask2.any():
            mask2 = None

        data, mask = data2, mask2

    else:
        if mask is not None and not isinstance(mask, NP.ndarray):
            mask = NP("array", mask, dtype=defs.maskType)

    # this is the only _toDataColumn that doesn't check values and intervals
    # because these were checked in _setup for categorical and ordinal strings
    return DataColumn(self, data, mask)
def initialize(self, state, numberOfRecords, numberOfFields, distributionBased):
    """First step in a vectorized metric calculation with missing
    values, called once before all fields and cluster centers.

    Only modifies the C{state} object: C{state.maximumComponent} is
    set to an array of zeros, one per record.

    @type state: ad-hoc Python object
    @param state: State information that persists long enough to span the three steps of a metric calculation.  This is a work-around of lxml's refusal to let its Python instances maintain C{self} and it is unrelated to DataTableState.
    @type numberOfRecords: int
    @param numberOfRecords: The number of rows in the dataset.
    @type numberOfFields: int
    @param numberOfFields: The number of columns in the dataset.
    @type distributionBased: bool
    @param distributionBased: If True, use a covariance matrix to scale the distance result.
    @raise NotImplementedError: If C{distributionBased} is True.
    """
    zeros = NP("zeros", numberOfRecords, dtype=NP.dtype(float))
    state.maximumComponent = zeros

    if not distributionBased:
        return

    raise NotImplementedError("Distribution-based clustering has not been implemented for the %s metric" % self.t)
def evaluate(self, dataTable, functionTable, performanceTable, returnUnknowns=False):
    """Evaluate the predicate, using a DataTable as input.

    @type dataTable: DataTable
    @param dataTable: The input DataTable, containing any fields that might be used to evaluate this predicate.
    @type functionTable: FunctionTable
    @param functionTable: The FunctionTable, containing any functions that might be called in this predicate.
    @type performanceTable: PerformanceTable
    @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
    @type returnUnknowns: bool
    @param returnUnknowns: If True, return a "mask" for the selection that indicates which rows are unknown, rather than True or False.
    @rtype: 1d Numpy array of bool or 3-tuple of arrays
    @return: Either a simple selection array or selection, unknowns, encounteredUnknowns
    """
    performanceTable.begin("SimpleSetPredicate")

    dataColumn = dataTable.fields[self.get("field")]
    convert = dataColumn.fieldType.stringToValue

    # the set members, converted to the field's internal representation
    members = []
    for text in self.childOfClass(Array).values(convertType=False):
        members.append(convert(text))

    selection = NP("in1d", dataColumn.data, members)
    if self.get("booleanOperator") == "isNotIn":
        NP("logical_not", selection, selection)

    if not returnUnknowns:
        # rows that are not VALID can never be selected
        if dataColumn.mask is not None:
            NP("logical_and", selection, NP(dataColumn.mask == defs.VALID), selection)
        performanceTable.end("SimpleSetPredicate")
        return selection

    if dataColumn.mask is None:
        unknowns = NP("zeros", len(dataTable), dtype=NP.dtype(bool))
    else:
        unknowns = NP(dataColumn.mask != defs.VALID)

    performanceTable.end("SimpleSetPredicate")
    # the same array serves as both unknowns and encounteredUnknowns
    return selection, unknowns, unknowns
def initialize(self, state, numberOfRecords, numberOfFields, distributionBased):
    """First step in a vectorized metric calculation with missing
    values, called once before all fields and cluster centers.

    Only modifies the C{state} object: C{state.sumInQuadrature} is set
    to an array of zeros and, for distribution-based clustering, an
    uninitialized records-by-fields C{state.displacements} array is
    allocated with C{state.displacementIndex} set to 0.

    @type state: ad-hoc Python object
    @param state: State information that persists long enough to span the three steps of a metric calculation.  This is a work-around of lxml's refusal to let its Python instances maintain C{self} and it is unrelated to DataTableState.
    @type numberOfRecords: int
    @param numberOfRecords: The number of rows in the dataset.
    @type numberOfFields: int
    @param numberOfFields: The number of columns in the dataset.
    @type distributionBased: bool
    @param distributionBased: If True, use a covariance matrix to scale the distance result.
    """
    floatType = NP.dtype(float)
    state.sumInQuadrature = NP("zeros", numberOfRecords, dtype=floatType)

    if not distributionBased:
        return

    shape = (numberOfRecords, numberOfFields)
    state.displacements = NP("empty", shape, dtype=floatType)
    state.displacementIndex = 0
def _stringToValue_time(self, string): regex = re.match(self._iso8601_time, string) if regex is None: raise ValueError("invalid ISO 8601 time string: \"%s\"" % string) hour = regex.group(1) minute = regex.group(2) second = regex.group(4) subsecond = regex.group(5) timezone = regex.group(6) timezoneOffset = 0 try: if hour is not None and minute is not None and second is not None: if subsecond is None: microsecond = 0 else: microsecond = int(round(float(subsecond) * 1e6)) dateTimeObject = datetime.datetime(1970, 1, 1, int(hour), int(minute), int(second), microsecond) elif hour is not None and minute is not None: if subsecond is not None: raise ValueError dateTimeObject = datetime.datetime(1970, 1, 1, int(hour), int(minute)) if timezone is not None: regex2 = re.match(self._timezone, timezone) if regex2 is not None: sign, hourOffset, minuteOffset = regex2.groups() timezoneOffset = ((int(hourOffset) * 60) + int(minuteOffset)) * 60 * self._dateTimeResolution # microseconds if sign == "-": timezoneOffset *= -1 except ValueError: raise ValueError("invalid ISO 8601 time string: \"%s\"" % string) td = dateTimeObject - self._dateTimeOrigin return NP.int64(td.seconds * self._dateTimeResolution + td.microseconds - timezoneOffset)
def calculateScore(self, dataTable, functionTable, performanceTable):
    """Calculate the score of this model.

    This method is called by C{calculate} to separate operations that
    are performed by all models (in C{calculate}) from operations that
    are performed by specific models (in C{calculateScore}).

    For every row, the cluster with the smallest distance is selected;
    rows with an INVALID value in any clustering field are labeled
    INVALID in every output.

    @type dataTable: DataTable
    @param dataTable: The DataTable representing this model's lexical scope.
    @type functionTable: FunctionTable or None
    @param functionTable: A table of functions.
    @type performanceTable: PerformanceTable or None
    @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
    @rtype: dict of DataColumns
    @return: The predicted value under key C{None}, plus one entry per requested sub-field.
    @raise PmmlValidationError: If a field weight is negative, a clustering field is a string, or a cluster lacks a valid center array or covariance matrix.
    """
    performanceTable.begin("ClusteringModel")

    performanceTable.begin("set up")

    distributionBased = (self["modelClass"] == "distributionBased")
    clusteringFields = self.xpath("pmml:ClusteringField[not(@isCenterField='false')]")
    fieldWeights = [clusteringField.get("fieldWeight", defaultFromXsd=True, convertType=True) for clusteringField in clusteringFields]
    for fieldWeight in fieldWeights:
        if fieldWeight < 0.0:
            raise defs.PmmlValidationError("ClusteringField fieldWeights must all be non-negative (encountered %g)" % fieldWeight)
    clusters = self.xpath("pmml:Cluster")
    comparisonMeasure = self.childOfClass(ComparisonMeasure)
    defaultCompareFunction = comparisonMeasure.get("compareFunction", defaultFromXsd=True)
    metric = comparisonMeasure.childOfClass(PmmlClusteringMetric)
    metrictag = metric.t

    performanceTable.end("set up")

    for clusteringField in clusteringFields:
        dataType = dataTable.fields[clusteringField["field"]].fieldType.dataType
        if dataType == "string":
            raise defs.PmmlValidationError("ClusteringField \"%s\" has dataType \"%s\", which cannot be used for clustering" % (clusteringField["field"], dataType))

    missingValueWeights = self.childOfTag("MissingValueWeights")
    if missingValueWeights is None:
        adjustM = None

    else:
        performanceTable.begin("MissingValueWeights")

        missingWeights = missingValueWeights.childOfClass(PmmlArray).values(convertType=True)

        sumNMqi = NP("zeros", len(dataTable), dtype=NP.dtype(float))
        for clusteringField, missingWeight in zip(clusteringFields, missingWeights):
            clusteringField.addToAdjustM(dataTable, functionTable, performanceTable, sumNMqi, missingWeight)

        adjustM = NP(sum(missingWeights) / sumNMqi)
        # rows with a zero denominator get a neutral adjustment factor
        adjustM[NP(sumNMqi == 0.0)] = 1.0

        performanceTable.end("MissingValueWeights")

    # a row is invalid if any of its clustering fields is INVALID
    anyInvalid = NP("zeros", len(dataTable), dtype=NP.dtype(bool))
    for clusteringField in clusteringFields:
        mask = dataTable.fields[clusteringField["field"]].mask
        if mask is not None:
            NP("logical_or", anyInvalid, NP(mask == defs.INVALID), anyInvalid)

    bestClusterId = None
    bestClusterAffinity = None
    allClusterAffinities = {}

    for index, cluster in enumerate(clusters):
        array = cluster.childOfClass(PmmlArray)
        if array is None:
            raise defs.PmmlValidationError("Cluster must have an array to designate its center")

        centerStrings = array.values(convertType=False)
        if len(centerStrings) != len(clusteringFields):
            raise defs.PmmlValidationError("Cluster array has %d components, but there are %d ClusteringFields with isCenterField=true" % (len(centerStrings), len(clusteringFields)))

        performanceTable.begin(metrictag)

        if distributionBased:
            matrix = cluster.xpath("pmml:Covariances/pmml:Matrix")
            if len(matrix) != 1:
                raise defs.PmmlValidationError("In distribution-based clustering, all clusters must have a Covariances/Matrix")
            try:
                covarianceMatrix = NP("array", matrix[0].values(), dtype=NP.dtype(float))
            except ValueError:
                raise defs.PmmlValidationError("Covariances/Matrix must contain real numbers for distribution-based clustering")

        else:
            covarianceMatrix = None

        state = self._State()
        metric.initialize(state, len(dataTable), len(clusteringFields), distributionBased)

        for clusteringField, centerString, fieldWeight in zip(clusteringFields, centerStrings, fieldWeights):
            if isinstance(metric, PmmlClusteringMetricBinary):
                metric.accumulateBinary(state, dataTable.fields[clusteringField["field"]], centerString, distributionBased)
            else:
                # the comparison is timed separately from the metric
                performanceTable.pause(metrictag)
                cxy = clusteringField.compare(dataTable, functionTable, performanceTable, centerString, defaultCompareFunction, anyInvalid)
                performanceTable.unpause(metrictag)

                metric.accumulate(state, cxy, fieldWeight, distributionBased)

        distance = metric.finalizeDistance(state, adjustM, distributionBased, covarianceMatrix)
        del state

        performanceTable.end(metrictag)

        if index == 0:
            bestClusterId = NP("ones", len(dataTable), dtype=NP.dtype(int))   # 1-based index
            # copy: bestClusterAffinity is updated in place below, and the
            # first cluster's own distance array (stored in
            # allClusterAffinities) must not be overwritten by those updates
            bestClusterAffinity = NP("copy", distance)

        better = NP(distance < bestClusterAffinity)
        bestClusterId[better] = index + 1   # 1-based index
        bestClusterAffinity[better] = distance[better]

        allClusterAffinities[cluster.get("id", "%d" % (index + 1))] = distance

    if not anyInvalid.any():
        scoreMask = None
    else:
        scoreMask = NP(anyInvalid * defs.INVALID)

    performanceTable.begin("set scores")
    score = {}

    performanceTable.begin("predictedValue")
    fieldType = FakeFieldType("string", "categorical")
    clusterIdentifiers = NP("empty", len(dataTable), dtype=fieldType.dtype)
    for index, cluster in enumerate(clusters):
        value = fieldType.stringToValue(cluster.get("id", "%d" % (index + 1)))
        clusterIdentifiers[NP(bestClusterId == (index + 1))] = value
    score[None] = DataColumn(fieldType, clusterIdentifiers, scoreMask)
    performanceTable.end("predictedValue")

    if self.subFields["predictedDisplayValue"]:
        performanceTable.begin("predictedDisplayValue")
        fieldType = FakeFieldType("string", "categorical")
        clusterNames = NP("empty", len(dataTable), dtype=fieldType.dtype)
        for index, cluster in enumerate(clusters):
            value = fieldType.stringToValue(cluster.get("name", ""))
            clusterNames[NP(bestClusterId == (index + 1))] = value
        score["predictedDisplayValue"] = DataColumn(fieldType, clusterNames, scoreMask)
        performanceTable.end("predictedDisplayValue")

    if self.subFields["entity"]:
        performanceTable.begin("entity")
        fieldType = FakeFieldType("object", "any")
        entities = NP("empty", len(dataTable), dtype=fieldType.dtype)
        for index, cluster in enumerate(clusters):
            indexPlusOne = index + 1
            # the Cluster elements themselves are stored, one row at a time
            for i in xrange(len(entities)):
                if bestClusterId[i] == indexPlusOne:
                    entities[i] = cluster
        score["entity"] = DataColumn(fieldType, entities, scoreMask)
        performanceTable.end("entity")

    if self.subFields["clusterId"]:
        performanceTable.begin("clusterId")
        fieldType = FakeFieldType("integer", "continuous")
        score["clusterId"] = DataColumn(fieldType, bestClusterId, scoreMask)
        performanceTable.end("clusterId")

    if self.subFields["entityId"]:
        performanceTable.begin("entityId")
        fieldType = FakeFieldType("integer", "continuous")
        score["entityId"] = DataColumn(fieldType, bestClusterId, scoreMask)
        performanceTable.end("entityId")

    if self.subFields["clusterAffinity"]:
        performanceTable.begin("clusterAffinity")
        fieldType = FakeFieldType("double", "continuous")
        score["clusterAffinity"] = DataColumn(fieldType, bestClusterAffinity, scoreMask)
        performanceTable.end("clusterAffinity")

    if self.subFields["affinity"]:
        performanceTable.begin("affinity")
        fieldType = FakeFieldType("double", "continuous")
        score["affinity"] = DataColumn(fieldType, bestClusterAffinity, scoreMask)
        performanceTable.end("affinity")

    if self.subFields["all"]:
        performanceTable.begin("all")
        fieldType = FakeFieldType("double", "continuous")
        for identifier, distance in allClusterAffinities.items():
            score["all.%s" % identifier] = DataColumn(fieldType, distance, scoreMask)
        performanceTable.end("all")

    performanceTable.end("set scores")
    performanceTable.end("ClusteringModel")
    return score