def evaluate(self, dataTable, functionTable, performanceTable):
    """Build an indicator column marking rows where a field equals this element's value.

    The field is looked up by this element's "field" attribute and compared
    against the "value" attribute (converted with the field's own
    C{stringToValue}).  The comparison result is cast to C{self._fieldType}
    and passed through C{FieldCastMethods.applyMapMissingTo} with the source
    field's mask and this element's "mapMissingTo" attribute.

    @type dataTable: DataTable
    @param dataTable: The input DataTable; its C{fields} must contain the referenced field.
    @type functionTable: FunctionTable
    @param functionTable: Not used here; present for the common evaluate signature.
    @type performanceTable: PerformanceTable
    @param performanceTable: Receives begin/end calls under the key "NormDiscrete".
    @rtype: DataColumn
    @return: The indicator values as a DataColumn of type C{self._fieldType}.
    """
    performanceTable.begin("NormDiscrete")

    sourceColumn = dataTable.fields[self["field"]]
    target = sourceColumn.fieldType.stringToValue(self["value"])

    matches = NP(sourceColumn.data == target)
    indicator = NP("array", matches, dtype=self._fieldType.dtype)
    indicator, indicatorMask = FieldCastMethods.applyMapMissingTo(
        self._fieldType, indicator, sourceColumn.mask, self.get("mapMissingTo"))

    performanceTable.end("NormDiscrete")
    return DataColumn(self._fieldType, indicator, indicatorMask)
def evaluate(self, dataTable, functionTable, performanceTable, text=None):
    """Evaluate the expression, using a DataTable as input.

    The formula text is parsed with C{Formula.parse} and the parsed
    expression is evaluated.  If the result carries a mask, this element's
    "invalidValueTreatment" and "mapMissingTo" attributes are applied to it;
    a result without a mask is returned unchanged.

    @type dataTable: DataTable
    @param dataTable: The input DataTable, containing any fields that might be used to evaluate this expression.
    @type functionTable: FunctionTable
    @param functionTable: The FunctionTable, containing any functions that might be called in this expression.
    @type performanceTable: PerformanceTable
    @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
    @type text: string or None
    @param text: If None, use the text of this Formula object; otherwise, use C{text} instead.
    @rtype: DataColumn
    @return: The result of the calculation as a DataColumn.
    """
    if text is None:
        text = self.text

    performanceTable.begin("Formula parse")
    parsed = Formula.parse(text)
    performanceTable.end("Formula parse")

    performanceTable.begin("Formula evaluate")
    dataColumn = parsed.evaluate(dataTable, functionTable, performanceTable)

    if dataColumn.mask is None:
        # No mask means there is nothing to treat or map.  The timer opened
        # above must still be closed before leaving, otherwise the
        # "Formula evaluate" entry is left open on this path.
        performanceTable.end("Formula evaluate")
        return dataColumn

    data = dataColumn.data
    mask = dataColumn.mask
    mask = FieldCastMethods.applyInvalidValueTreatment(mask, self.get("invalidValueTreatment"))
    data, mask = FieldCastMethods.applyMapMissingTo(dataColumn.fieldType, data, mask, self.get("mapMissingTo"))

    performanceTable.end("Formula evaluate")
    return DataColumn(dataColumn.fieldType, data, mask)
def _toDataColumn_string(self, data, mask):
    """Convert input into a DataColumn whose non-missing entries are strings.

    The input is first converted with C{_toDataColumn_object}.  Entries that
    are None or a float NaN are flagged C{defs.MISSING}; any other entry
    that is not a C{basestring} is replaced by its C{repr}.  When the
    converted column has no mask, one is allocated only once the first
    missing entry is seen.  Values and intervals are then checked.

    @param data: Input values, in any form accepted by C{_toDataColumn_object}.
    @param mask: Input mask, or None.
    @rtype: DataColumn
    @return: A new DataColumn of this field type.
    """
    converted = self._toDataColumn_object(data, mask)
    data, mask = converted.data, converted.mask

    # the arrays are edited in place below
    data.setflags(write=True)
    if mask is not None:
        mask.setflags(write=True)

    def isAbsent(item):
        return item is None or (isinstance(item, float) and math.isnan(item))

    if mask is not None:
        for index, item in enumerate(converted.data):
            # only rows currently marked valid are downgraded to missing
            if isAbsent(item) and mask[index] == defs.VALID:
                mask[index] = defs.MISSING
            elif not isinstance(item, basestring):
                data[index] = repr(item)

    else:
        for index, item in enumerate(converted.data):
            if isAbsent(item):
                if mask is None:
                    mask = NP("zeros", len(data), dtype=defs.maskType)
                mask[index] = defs.MISSING
            elif not isinstance(item, basestring):
                data[index] = repr(item)

        if mask is not None:
            converted._mask = mask

    data, mask = self._checkValues(data, mask)
    data, mask = self._checkIntervals(data, mask)
    return DataColumn(self, data, mask)
def evaluate(self, dataTable, functionTable, performanceTable, arguments):
    """Evaluate two argument expressions and combine them with Numpy's "arctan2".

    @type dataTable: DataTable
    @param dataTable: The input DataTable, passed to each argument expression.
    @type functionTable: FunctionTable
    @param functionTable: The FunctionTable, passed to each argument expression.
    @type performanceTable: PerformanceTable
    @param performanceTable: Receives begin/end calls under this built-in's name.
    @type arguments: list of expressions
    @param arguments: Exactly two expressions; the first supplies the first "arctan2" operand.
    @rtype: DataColumn
    @return: The "arctan2" values, masked by C{DataColumn.mapAnyMissingInvalid} of both argument masks.
    """
    columns = []
    for expression in arguments:
        columns.append(expression.evaluate(dataTable, functionTable, performanceTable))

    timerKey = "built-in \"%s\"" % self.name
    performanceTable.begin(timerKey)

    fieldType = self.fieldTypeFromSignature(columns)
    first, second = columns
    angles = NP("arctan2", first.data, second.data)
    combinedMask = DataColumn.mapAnyMissingInvalid([first.mask, second.mask])
    result = DataColumn(fieldType, angles, combinedMask)

    performanceTable.end(timerKey)
    return result
def _toDataColumn_dateTime(self, data, mask):
    """Convert input into a DataColumn by applying C{stringToValue} row by row.

    A row is flagged C{defs.MISSING} if the incoming mask is set for it, if
    it is a float NaN, or if it is a string equal to "NAN" ignoring case.
    A row whose conversion raises ValueError or TypeError is flagged
    C{defs.INVALID}.  Flagged rows hold C{defs.PADDING}.  The mask is
    dropped when no row is flagged.  Values and intervals are then checked.

    @param data: Input values.
    @param mask: Input mask, or None.
    @rtype: DataColumn
    @return: A new DataColumn of this field type.
    """
    data, mask = self._checkNumpy(data, mask, tryToCast=False)
    data, mask = self._checkNonNumpy(data, mask)

    values = NP("empty", len(data), dtype=self.dtype)
    flags = NP("zeros", len(data), dtype=defs.maskType)

    for index, item in enumerate(data):
        if (mask is not None and mask[index]) or (isinstance(item, float) and math.isnan(item)) or (isinstance(item, basestring) and item.upper() == "NAN"):
            values[index] = defs.PADDING
            flags[index] = defs.MISSING
            continue

        try:
            values[index] = self.stringToValue(item)
        except (ValueError, TypeError):
            values[index] = defs.PADDING
            flags[index] = defs.INVALID

    data = values
    mask = flags if flags.any() else None

    data, mask = self._checkValues(data, mask)
    data, mask = self._checkIntervals(data, mask)
    return DataColumn(self, data, mask)
def _toDataColumn_dateTimeNumber(self, data, mask):
    """Convert numeric input into a DataColumn, rescaled by this type's factor and offset.

    The input is converted with C{_toDataColumn_number}; its data are then
    multiplied by C{self._factor} and shifted by C{self._offset} before
    values and intervals are checked.

    @param data: Input values, in any form accepted by C{_toDataColumn_number}.
    @param mask: Input mask, or None.
    @rtype: DataColumn
    @return: A new DataColumn of this field type.
    """
    numeric = self._toDataColumn_number(data, mask)

    scaled = NP(numeric.data * self._factor)
    shifted = NP(scaled + self._offset)

    checkedData, checkedMask = self._checkValues(shifted, numeric.mask)
    checkedData, checkedMask = self._checkIntervals(checkedData, checkedMask)
    return DataColumn(self, checkedData, checkedMask)
def functionAverageFake(self, value, howmany, fieldType):
    """Produce a constant column of averages when it is known that there are no matches.

    @type value: 2-item sequence of numbers
    @param value: The (numerator, denominator) pair; every row gets numerator / denominator.
    @type howmany: int
    @param howmany: Number of rows.
    @type fieldType: FieldType
    @param fieldType: Ignored; the result is always a continuous double.
    @rtype: DataColumn
    @return: The faked results; every row is C{defs.INVALID} if the denominator equals zero, otherwise there is no mask.
    """
    doubleType = FakeFieldType("double", "continuous")

    top = NP("empty", howmany, dtype=doubleType.dtype)
    bottom = NP("empty", howmany, dtype=doubleType.dtype)
    top[:] = value[0]
    bottom[:] = value[1]
    ratio = NP(top / bottom)

    mask = None
    if value[1] == 0:
        mask = NP("empty", howmany, dtype=defs.maskType)
        mask[:] = defs.INVALID

    return DataColumn(doubleType, ratio, mask)
def evaluate(self, dataTable, functionTable, performanceTable, arguments):
    """Evaluate a boolean built-in over at least two arguments via C{applySkipMissing}.

    A zero-filled result and an all-True "allbad" tracker are handed to
    C{self.applySkipMissing} together with the evaluated arguments.  Rows
    still flagged in "allbad" afterwards are marked C{defs.MISSING}; if a
    mask already exists, only rows it currently marks C{defs.VALID} are
    changed.

    @type dataTable: DataTable
    @param dataTable: The input DataTable; its length sets the number of rows.
    @type functionTable: FunctionTable
    @param functionTable: The FunctionTable, passed to each argument expression.
    @type performanceTable: PerformanceTable
    @param performanceTable: Receives begin/end calls under this built-in's name.
    @type arguments: list of expressions
    @param arguments: The argument expressions (checked with C{allBooleanType(..., atleast=2)}).
    @rtype: DataColumn
    @return: The combined result.
    """
    columns = [expression.evaluate(dataTable, functionTable, performanceTable) for expression in arguments]

    timerKey = "built-in \"%s\"" % self.name
    performanceTable.begin(timerKey)

    fieldType = self.allBooleanType(columns, atleast=2)
    rowCount = len(dataTable)

    data = NP("zeros", rowCount, dtype=fieldType.dtype)
    allbad = NP("ones", rowCount, dtype=NP.dtype(bool))
    (data, allbad), mask = self.applySkipMissing((data, allbad), None, columns)

    if allbad.any():
        if mask is not None:
            # restrict to rows the existing mask still considers valid
            NP("logical_and", allbad, NP(mask == defs.VALID), allbad)
            mask[allbad] = defs.MISSING
        else:
            mask = allbad * defs.MISSING

    performanceTable.end(timerKey)
    return DataColumn(fieldType, data, mask)
def evaluate(self, dataTable, functionTable, performanceTable, arguments):
    """Evaluate the arguments and take the cosine of the product of the first two.

    @type dataTable: DataTable
    @param dataTable: The input DataTable, passed to each argument expression.
    @type functionTable: FunctionTable
    @param functionTable: The FunctionTable, passed to each argument expression.
    @type performanceTable: PerformanceTable
    @param performanceTable: Receives begin/end calls under this built-in's name.
    @type arguments: list of expressions
    @param arguments: The argument expressions; the first two are used.
    @rtype: DataColumn
    @return: Numpy "cos" of the element-wise product, masked by C{DataColumn.mapAnyMissingInvalid} of both argument masks.
    """
    columns = [expression.evaluate(dataTable, functionTable, performanceTable) for expression in arguments]

    timerKey = "built-in \"%s\"" % self.name
    performanceTable.begin(timerKey)

    fieldType = self.fieldTypeFromSignature(columns)
    product = columns[0].data * columns[1].data
    cosines = NP("cos", product)
    combinedMask = DataColumn.mapAnyMissingInvalid([columns[0].mask, columns[1].mask])
    result = DataColumn(fieldType, cosines, combinedMask)

    performanceTable.end(timerKey)
    return result
def functionAverage(self, dataColumn, whereMask, groupSelection, getstate, setstate):
    """Compute a running average of a DataColumn, possibly with an SQL where mask and groupField.

    Each row gets a weight of 1 if it is valid and selected, otherwise 0.
    The result for each row is the cumulative sum of weighted data divided
    by the cumulative sum of weights.  Rows whose quotient is not finite
    are flagged C{defs.INVALID}.

    @type dataColumn: DataColumn
    @param dataColumn: The input data column; its dataType must be "integer", "float", or "double".
    @type whereMask: 1d Numpy array of bool, or None
    @param whereMask: The result of the SQL where selection.
    @type groupSelection: 1d Numpy array of bool, or None.
    @param groupSelection: Rows corresponding to a particular value of the groupField.
    @type getstate: callable function, or None
    @param getstate: Returns a starting (numerator, denominator) pair or None; consulted only if the column is non-empty.
    @type setstate: callable function, or None
    @param setstate: Called with the final (numerator, denominator) pair; called only if the column is non-empty.
    @rtype: DataColumn
    @return: A column of averaged rows, as a continuous double.
    @raise PmmlValidationError: If the input field is not numeric.
    """
    fieldType = FakeFieldType("double", "continuous")

    if dataColumn.fieldType.dataType not in ("integer", "float", "double"):
        raise defs.PmmlValidationError("Aggregate function \"average\" requires a numeric input field: \"integer\", \"float\", \"double\"")

    rowCount = len(dataColumn)
    weights = NP("ones", rowCount, dtype=fieldType.dtype)

    restrictions = []
    if dataColumn.mask is not None:
        restrictions.append(NP(dataColumn.mask == defs.VALID))
    restrictions.extend(r for r in (whereMask, groupSelection) if r is not None)
    for restriction in restrictions:
        NP("logical_and", weights, restriction, weights)

    sums = NP("multiply", weights, dataColumn.data)

    if getstate is not None and rowCount > 0:
        carried = getstate()
        if carried is not None:
            carriedNumerator, carriedDenominator = carried
            # seeding the first row carries the earlier totals through the cumulative sums
            sums[0] += carriedNumerator
            weights[0] += carriedDenominator

    sums = NP("cumsum", sums)
    weights = NP("cumsum", weights)

    data = NP(sums / weights)
    mask = NP(NP("logical_not", NP("isfinite", data)) * defs.INVALID)
    if not mask.any():
        mask = None

    if setstate is not None and rowCount > 0:
        setstate((sums[-1], weights[-1]))

    return DataColumn(fieldType, data, mask)
def calculate(self, dataTable, functionTable=None, performanceTable=None):
    """Perform a calculation directly on a pre-built DataTable.

    This method modifies the input DataTable (its C{score}, C{fields}, and
    C{output}) and the FunctionTable.  The model itself is evaluated on
    C{dataTable.subTable()}.

    @type dataTable: DataTable
    @param dataTable: The pre-built DataTable.
    @type functionTable: FunctionTable or None
    @param functionTable: A table of functions; a new C{FunctionTable()} is created if None.
    @type performanceTable: PerformanceTable or None
    @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation; a C{FakePerformanceTable()} is created if None.
    @rtype: DataTable or DataColumn
    @return: If the model's "isScorable" attribute is false, the input DataTable (with an all-invalid score); otherwise C{dataTable.score}.  NOTE(review): the two paths return different types and earlier documentation promised a DataTable on both; confirm what callers expect before unifying.
    """
    if functionTable is None:
        functionTable = FunctionTable()
    if performanceTable is None:
        performanceTable = FakePerformanceTable()

    scorable = self.get("isScorable", defaultFromXsd=True, convertType=True)
    if not scorable:
        # non-scorable models yield padding values that are all flagged invalid
        padding = NP(NP("ones", len(dataTable), dtype=self.scoreType.dtype) * defs.PADDING)
        allInvalid = NP(NP("ones", len(dataTable), dtype=defs.maskType) * defs.INVALID)
        dataTable.score = DataColumn(self.scoreType, padding, allInvalid)
        return dataTable

    subTable = dataTable.subTable()

    for miningField in self.xpath("pmml:MiningSchema/pmml:MiningField"):
        miningField.replaceField(subTable, functionTable, performanceTable)

    for calculable in self.calculableTrans():
        calculable.calculate(subTable, functionTable, performanceTable)

    score = self.calculateScore(subTable, functionTable, performanceTable)
    dataTable.score = score[None]

    if self.name is not None:
        # publish each score feature as a field: "<name>" for the None key, "<name>.<key>" otherwise
        for key, value in score.items():
            target = self.name if key is None else "%s.%s" % (self.name, key)
            dataTable.fields[target] = value

    for outputField in self.xpath("pmml:Output/pmml:OutputField"):
        displayName = outputField.get("displayName", outputField["name"])
        dataTable.output[displayName] = outputField.format(subTable, functionTable, performanceTable, score)

    for fieldName in subTable.output:
        dataTable.output[fieldName] = subTable.output[fieldName]

    return dataTable.score
def evaluate(self, dataTable, functionTable, performanceTable, arguments):
    """Apply Numpy's "arctan2" to the data of two evaluated arguments.

    @type dataTable: DataTable
    @param dataTable: The input DataTable, passed to each argument expression.
    @type functionTable: FunctionTable
    @param functionTable: The FunctionTable, passed to each argument expression.
    @type performanceTable: PerformanceTable
    @param performanceTable: Receives begin/end calls under this built-in's name.
    @type arguments: list of expressions
    @param arguments: Exactly two expressions, used as the first and second "arctan2" operands in that order.
    @rtype: DataColumn
    @return: The result, masked wherever C{DataColumn.mapAnyMissingInvalid} flags either argument.
    """
    evaluated = [argument.evaluate(dataTable, functionTable, performanceTable) for argument in arguments]

    label = "built-in \"%s\"" % self.name
    performanceTable.begin(label)

    fieldType = self.fieldTypeFromSignature(evaluated)
    numeratorColumn, denominatorColumn = evaluated

    masks = [numeratorColumn.mask, denominatorColumn.mask]
    output = DataColumn(fieldType,
                        NP("arctan2", numeratorColumn.data, denominatorColumn.data),
                        DataColumn.mapAnyMissingInvalid(masks))

    performanceTable.end(label)
    return output
def functionMultiset(self, dataColumn, whereMask, groupSelection, getstate, setstate):
    """Derive a running multiset of a DataColumn, possibly with an SQL where mask and groupField.

    Each output row holds a dict mapping values (converted with the field's
    C{valueToPython}) to the number of selected rows seen so far with that
    value.  A fresh dict copy is made whenever a selected row changes the
    counts; unselected rows reuse the most recent copy.

    @type dataColumn: DataColumn
    @param dataColumn: The input data column.
    @type whereMask: 1d Numpy array of bool, or None
    @param whereMask: The result of the SQL where selection.
    @type groupSelection: 1d Numpy array of bool, or None.
    @param groupSelection: Rows corresponding to a particular value of the groupField.
    @type getstate: callable function, or None
    @param getstate: Returns the starting dict or None; a returned dict is updated in place.
    @type setstate: callable function, or None
    @param setstate: Called with the final dict.
    @rtype: DataColumn of dict objects
    @return: A column of multisetted rows, with no mask.
    """
    fieldType = FakeFieldType("object", "any")

    selection = NP("ones", len(dataColumn), dtype=NP.dtype(bool))
    if dataColumn.mask is not None:
        selection = NP("logical_and", selection, NP(dataColumn.mask == defs.VALID))
    for restriction in (whereMask, groupSelection):
        if restriction is not None:
            NP("logical_and", selection, restriction, selection)

    tally = {}
    if getstate is not None:
        previous = getstate()
        if previous is not None:
            tally = previous
    snapshot = dict(tally)

    data = NP("empty", len(dataColumn), dtype=NP.dtype(object))
    toPython = dataColumn.fieldType.valueToPython

    for index, raw in enumerate(dataColumn.data):
        if selection[index]:
            key = toPython(raw)
            tally[key] = tally.get(key, 0) + 1
            snapshot = dict(tally)
        data[index] = snapshot

    if setstate is not None:
        setstate(tally)

    return DataColumn(fieldType, data, None)
def _toDataColumn_number(self, data, mask):
    """Convert input into a numeric DataColumn, flagging missing and invalid rows.

    Three paths are taken, depending on the input:

      - Numpy input that already has this type's dtype: NaN rows are
        flagged C{defs.MISSING} (in the given mask, or in a new one).
      - Input that Numpy can cast in bulk: NaN rows and rows set in the
        given mask are flagged C{defs.MISSING}.
      - Input that fails the bulk cast: rows are assigned one by one; NaN
        floats and "NAN" strings (any case) become C{defs.MISSING}, None
        and unconvertible values become C{defs.INVALID}, and flagged rows
        hold C{defs.PADDING}.

    In the last two paths the mask is dropped when no row is flagged.
    Values and intervals are checked before returning.

    @param data: Input values.
    @param mask: Input mask, or None.
    @rtype: DataColumn
    @return: A new DataColumn of this field type.
    """
    def looksLikeNaN(v):
        return (isinstance(v, float) and math.isnan(v)) or (isinstance(v, basestring) and v.upper() == "NAN")

    data, mask = self._checkNumpy(data, mask)

    if isinstance(data, NP.ndarray) and (mask is None or isinstance(mask, NP.ndarray)) and data.dtype == self.dtype:
        mask2 = NP("isnan", data)
        if mask is None:
            mask = NP("array", mask2, defs.maskType) * defs.MISSING
        else:
            mask[mask2] = defs.MISSING

    else:
        data, mask = self._checkNonNumpy(data, mask)

        try:
            data = NP("array", data, dtype=self.dtype)
            # mask is handled in the else statement after the except block

        except (ValueError, TypeError):
            # bulk cast failed: fall back to converting one row at a time
            data2 = NP("empty", len(data), dtype=self.dtype)
            if mask is None:
                mask2 = NP("zeros", len(data), dtype=defs.maskType)
            else:
                mask2 = NP("fromiter", ((defs.VALID if not m else defs.MISSING) for m in mask), dtype=defs.maskType, count=len(mask))

            for i, v in enumerate(data):
                try:
                    data2[i] = v
                    if mask2[i] == defs.VALID and looksLikeNaN(v):
                        mask2[i] = defs.MISSING
                    if v is None:
                        # route None through the same handler as unconvertible values
                        raise TypeError

                except (ValueError, TypeError):
                    data2[i] = defs.PADDING
                    if mask2[i] == defs.VALID:
                        if looksLikeNaN(v):
                            mask2[i] = defs.MISSING
                        else:
                            mask2[i] = defs.INVALID

            if not mask2.any():
                mask2 = None

            data, mask = data2, mask2

        else:
            mask2 = NP("isnan", data)
            if mask is None:
                # Scale by defs.MISSING like every other branch here, rather
                # than relying on a cast True happening to equal MISSING.
                mask = NP(NP("array", mask2, defs.maskType) * defs.MISSING)
            else:
                mask = NP(NP("array", NP("logical_or", mask2, NP("fromiter", (m != 0 for m in mask), dtype=NP.dtype(bool), count=len(mask))), defs.maskType) * defs.MISSING)
            if not mask.any():
                mask = None

    data, mask = self._checkValues(data, mask)
    data, mask = self._checkIntervals(data, mask)
    return DataColumn(self, data, mask)
def evaluate(self, dataTable, functionTable, performanceTable, arguments):
    """Apply Numpy's "arctanh" to the first evaluated argument.

    @type dataTable: DataTable
    @param dataTable: The input DataTable, passed to each argument expression.
    @type functionTable: FunctionTable
    @param functionTable: The FunctionTable, passed to each argument expression.
    @type performanceTable: PerformanceTable
    @param performanceTable: Receives begin/end calls under this built-in's name.
    @type arguments: list of expressions
    @param arguments: The argument expressions; only the first one's data and mask are used.
    @rtype: DataColumn
    @return: The "arctanh" values, with the mask produced by C{self.maskInvalid} from the result and the argument's mask.
    """
    columns = [expression.evaluate(dataTable, functionTable, performanceTable) for expression in arguments]

    timerKey = "built-in \"%s\"" % self.name
    performanceTable.begin(timerKey)

    fieldType = self.fieldTypeFromSignature(columns)
    operand = columns[0]
    values = NP("arctanh", operand.data)
    valuesMask = self.maskInvalid(values, operand.mask)

    performanceTable.end(timerKey)
    return DataColumn(fieldType, values, valuesMask)
def evaluate(self, dataTable, functionTable, performanceTable, arguments):
    """Apply Numpy's "sin" to the first evaluated argument.

    @type dataTable: DataTable
    @param dataTable: The input DataTable, passed to each argument expression.
    @type functionTable: FunctionTable
    @param functionTable: The FunctionTable, passed to each argument expression.
    @type performanceTable: PerformanceTable
    @param performanceTable: Receives begin/end calls under this built-in's name.
    @type arguments: list of expressions
    @param arguments: The argument expressions; only the first one's data and mask are used.
    @rtype: DataColumn
    @return: The "sin" values, carrying the argument's mask unchanged.
    """
    columns = [expression.evaluate(dataTable, functionTable, performanceTable) for expression in arguments]

    timerKey = "built-in \"%s\"" % self.name
    performanceTable.begin(timerKey)

    fieldType = self.fieldTypeFromSignature(columns)
    operand = columns[0]
    result = DataColumn(fieldType, NP("sin", operand.data), operand.mask)

    performanceTable.end(timerKey)
    return result
def evaluate(self, dataTable, functionTable, performanceTable, arguments):
    """Floor-divide the first evaluated argument by the second.

    Rows whose divisor equals 0.0 are flagged C{defs.INVALID}; that flag is
    combined with both argument masks by C{DataColumn.mapAnyMissingInvalid}.

    @type dataTable: DataTable
    @param dataTable: The input DataTable, passed to each argument expression.
    @type functionTable: FunctionTable
    @param functionTable: The FunctionTable, passed to each argument expression.
    @type performanceTable: PerformanceTable
    @param performanceTable: Receives begin/end calls under this built-in's name.
    @type arguments: list of expressions
    @param arguments: Exactly two expressions: dividend, then divisor.
    @rtype: DataColumn
    @return: The Numpy "floor_divide" result with the combined mask.
    """
    columns = [expression.evaluate(dataTable, functionTable, performanceTable) for expression in arguments]

    timerKey = "built-in \"%s\"" % self.name
    performanceTable.begin(timerKey)

    fieldType = self.fieldTypeFromSignature(columns)
    dividend, divisor = columns

    divisionByZero = NP(NP(divisor.data == 0.0) * defs.INVALID)
    if not divisionByZero.any():
        divisionByZero = None

    combinedMask = DataColumn.mapAnyMissingInvalid([divisionByZero, dividend.mask, divisor.mask])
    quotient = NP("floor_divide", dividend.data, divisor.data)
    result = DataColumn(fieldType, quotient, combinedMask)

    performanceTable.end(timerKey)
    return result
def functionSum(self, dataColumn, whereMask, groupSelection, getstate, setstate):
    """Compute a running sum of a DataColumn, possibly with an SQL where mask and groupField.

    Rows that are not valid or not selected contribute zero; the result for
    each row is the cumulative sum up to and including that row.

    @type dataColumn: DataColumn
    @param dataColumn: The input data column; its dataType must be "integer", "float", or "double".
    @type whereMask: 1d Numpy array of bool, or None
    @param whereMask: The result of the SQL where selection.
    @type groupSelection: 1d Numpy array of bool, or None.
    @param groupSelection: Rows corresponding to a particular value of the groupField.
    @type getstate: callable function, or None
    @param getstate: Returns a starting sum or None; consulted only if the column is non-empty.
    @type setstate: callable function, or None
    @param setstate: Called with the final sum; called only if the column is non-empty.
    @rtype: DataColumn
    @return: A column of added rows, as a continuous double with no mask.
    @raise PmmlValidationError: If the input field is not numeric.
    """
    fieldType = FakeFieldType("double", "continuous")

    if dataColumn.fieldType.dataType not in ("integer", "float", "double"):
        raise defs.PmmlValidationError("Aggregate function \"sum\" requires a numeric input field: \"integer\", \"float\", \"double\"")

    rowCount = len(dataColumn)
    terms = NP("ones", rowCount, dtype=fieldType.dtype)

    restrictions = []
    if dataColumn.mask is not None:
        restrictions.append(NP(dataColumn.mask == defs.VALID))
    restrictions.extend(r for r in (whereMask, groupSelection) if r is not None)
    for restriction in restrictions:
        NP("logical_and", terms, restriction, terms)

    # turn the 0/1 weights into the per-row contributions
    NP("multiply", terms, dataColumn.data, terms)

    if getstate is not None and rowCount > 0:
        carried = getstate()
        if carried is not None:
            terms[0] += carried

    data = NP("cumsum", terms)

    if setstate is not None and rowCount > 0:
        setstate(data[-1])

    return DataColumn(fieldType, data, None)
def singleton(self, inputData, inputMask=None, inputState=None):
    """Create a single-row DataTable for event-based processes.

    This method is an alternative to the DataTable constructor: it creates
    a DataTable with only one row and it uses the Python data type of each
    C{inputData} value to define a field type, rather than an explicit
    C{context}.  Strings become "string", bools "boolean", floats "double",
    ints "integer" (all with optype "continuous"), and anything else
    becomes an "object" field with optype "any".  Fields are added in
    sorted name order.

    @type inputData: dict-like mapping from strings to single values (not lists)
    @param inputData: A single data record.
    @type inputMask: dict-like mapping from strings to single C{defs.maskType} values (not lists), or None
    @param inputMask: A single mask.  A field that is absent from it, or mapped to None, gets no mask.
    @type inputState: DataTableState or None
    @param inputState: Initial state of the DataTable.  To continue a previous calculation, use the C{dataTable.state} from the previous calculation.
    @rtype: DataTable
    @return: The new one-row DataTable.
    """
    dataColumns = OrderedDict()
    for fieldName in sorted(inputData.keys()):
        value = inputData[fieldName]

        if isinstance(value, basestring):
            fieldType = FakeFieldType("string", "continuous")
        elif isinstance(value, bool):
            # bool is a subclass of int, so it has to be tested before int;
            # otherwise True/False would be typed as "integer".
            fieldType = FakeFieldType("boolean", "continuous")
        elif isinstance(value, float):
            fieldType = FakeFieldType("double", "continuous")
        elif isinstance(value, int):
            fieldType = FakeFieldType("integer", "continuous")
        # TODO: PMML date types (when passed a datetype.datetype object)
        else:
            fieldType = FakeFieldType("object", "any")

        data = NP("empty", 1, dtype=fieldType.dtype)
        data[0] = value

        if inputMask is None or inputMask.get(fieldName) is None:
            mask = None
        else:
            mask = NP("empty", 1, dtype=defs.maskType)
            mask[0] = inputMask.get(fieldName)

        dataColumns[fieldName] = DataColumn(fieldType, data, mask)

    # bypass __init__ and configure the table directly from the prepared columns
    dataTable = DataTable.__new__(DataTable)
    dataTable._configure(dataColumns, inputState)
    return dataTable
def functionMinMaxFake(self, value, howmany, fieldType):
    """Produce a constant min/max column when it is known that there are no matches.

    @type value: number
    @param value: The value assigned to every row.
    @type howmany: int
    @param howmany: Number of rows.
    @type fieldType: FieldType
    @param fieldType: The type of the resulting column; its C{dtype} is used for the array.
    @rtype: DataColumn
    @return: The faked results, with no mask.
    """
    constant = NP("empty", howmany, dtype=fieldType.dtype)
    constant[:] = value

    noMask = None
    return DataColumn(fieldType, constant, noMask)
def functionMultisetFake(self, value, howmany, fieldType):
    """Produce a constant multiset column when it is known that there are no matches.

    @type value: any
    @param value: The value assigned to every row.
    @type howmany: int
    @param howmany: Number of rows.
    @type fieldType: FieldType
    @param fieldType: Ignored; the result always has dataType "object" and optype "any".
    @rtype: DataColumn
    @return: The faked results, with no mask.
    """
    objectType = FakeFieldType("object", "any")

    constant = NP("empty", howmany, dtype=objectType.dtype)
    constant[:] = value

    return DataColumn(objectType, constant, None)
def evaluate(self, dataTable, functionTable, performanceTable, arguments):
    """Compute Numpy's "floor_divide" of two evaluated arguments, flagging zero divisors.

    @type dataTable: DataTable
    @param dataTable: The input DataTable, passed to each argument expression.
    @type functionTable: FunctionTable
    @param functionTable: The FunctionTable, passed to each argument expression.
    @type performanceTable: PerformanceTable
    @param performanceTable: Receives begin/end calls under this built-in's name.
    @type arguments: list of expressions
    @param arguments: Exactly two expressions: the dividend and the divisor.
    @rtype: DataColumn
    @return: The quotient; rows with a divisor equal to 0.0 are C{defs.INVALID}, merged with both argument masks by C{DataColumn.mapAnyMissingInvalid}.
    """
    evaluated = [argument.evaluate(dataTable, functionTable, performanceTable) for argument in arguments]

    label = "built-in \"%s\"" % self.name
    performanceTable.begin(label)

    fieldType = self.fieldTypeFromSignature(evaluated)
    numerator, denominator = evaluated

    zeroFlags = NP(NP(denominator.data == 0.0) * defs.INVALID)
    masks = [zeroFlags if zeroFlags.any() else None, numerator.mask, denominator.mask]

    output = DataColumn(fieldType,
                        NP("floor_divide", numerator.data, denominator.data),
                        DataColumn.mapAnyMissingInvalid(masks))

    performanceTable.end(label)
    return output
def zValue(self, testDistributions, fieldName, dataColumn, state, performanceTable):
    """Calculate the score of a zValue TestStatistic.

    The score is (data - mean) / sqrt(variance), using the first child of
    <Baseline> that has both "mean" and "variance" attributes.

    @type testDistributions: PmmlBinding
    @param testDistributions: The <TestDistributions> element.
    @type fieldName: string
    @param fieldName: The field name (for error messages).
    @type dataColumn: DataColumn
    @param dataColumn: The field.
    @type state: DataTableState
    @param state: The persistent state object (not used).
    @type performanceTable: PerformanceTable or None
    @param performanceTable: A PerformanceTable (not used by this method).
    @rtype: dict
    @return: A dictionary with the single key None, mapped to the z-values as a DataColumn carrying the input's mask.
    @raise TypeError: If the field's dataType is not numeric.
    @raise PmmlValidationError: If no suitable distribution is found or the variance is not positive.
    """
    nonNumeric = ("object", "string", "boolean", "date", "time", "dateTime")
    if dataColumn.fieldType.dataType in nonNumeric:
        raise TypeError("Field \"%s\" has dataType \"%s\", which is incompatible with BaselineModel.zValue" % (fieldName, dataColumn.fieldType.dataType))

    candidates = testDistributions.xpath("pmml:Baseline/*[@mean and @variance]")
    if len(candidates) == 0:
        raise defs.PmmlValidationError("BaselineModel zValue requires a distribution with a mean and a variance")

    chosen = candidates[0]
    mean = float(chosen.get("mean"))
    variance = float(chosen.get("variance"))

    if variance <= 0.0:
        raise defs.PmmlValidationError("Variance must be positive, not %g" % variance)

    centered = NP(dataColumn.data - mean)
    zScores = NP(centered / math.sqrt(variance))
    return {None: DataColumn(self.scoreType, zScores, dataColumn.mask)}
def evaluate(self, dataTable, functionTable, performanceTable, arguments):
    """Evaluate the "like" built-in: match each row of the first argument against a regex.

    The second argument must be a Constant holding the regular expression;
    it is compiled once.  Each row of the first argument is tested with
    C{match} (anchored at the start of the string).  For a field whose
    optype is not "continuous", rows are first converted with the field's
    C{valueToString}.

    @type dataTable: DataTable
    @param dataTable: The input DataTable; its length sets the number of rows tested.
    @type functionTable: FunctionTable
    @param functionTable: The FunctionTable, passed to the first argument's expression.
    @type performanceTable: PerformanceTable
    @param performanceTable: Receives begin/pause/unpause/end calls under this built-in's name.
    @type arguments: list of expressions
    @param arguments: Exactly two expressions: the value to test and the constant pattern.
    @rtype: DataColumn
    @return: A boolean column that is True where the pattern matches, carrying the first argument's mask.
    @raise PmmlValidationError: If the argument count is wrong, the pattern is not a Constant, or the pattern does not compile.
    """
    performanceTable.begin("built-in \"%s\"" % self.name)

    fieldType = self._typeReverseMap[BOOL]

    if len(arguments) != 2:
        raise defs.PmmlValidationError("Function \"like\" requires exactly two arguments")

    if isinstance(arguments[1], Constant):
        pattern = arguments[1].evaluateOne(convertType=False)
        try:
            pattern = re.compile(pattern)
        except re.error as err:
            # re.error is the exception class raised for a bad pattern; the
            # sre_constants name used here before is a module, which an
            # except clause can never match.
            raise defs.PmmlValidationError("Could not compile regex pattern \"%s\": %s" % (pattern, str(err)))
    else:
        raise defs.PmmlValidationError("Function \"like\" requires its second argument (the regex pattern) to be a Constant")

    # do not charge the argument's own evaluation time to this built-in
    performanceTable.pause("built-in \"%s\"" % self.name)
    test = arguments[0].evaluate(dataTable, functionTable, performanceTable)
    performanceTable.unpause("built-in \"%s\"" % self.name)

    matches = pattern.match
    d = test.data
    if test.fieldType.optype == "continuous":
        data = NP("fromiter", (matches(d[i]) is not None for i in xrange(len(dataTable))), dtype=fieldType.dtype, count=len(dataTable))
    else:
        ds = test.fieldType.valueToString
        data = NP("fromiter", (matches(ds(d[i])) is not None for i in xrange(len(dataTable))), dtype=fieldType.dtype, count=len(dataTable))

    performanceTable.end("built-in \"%s\"" % self.name)
    return DataColumn(fieldType, data, test.mask)
def _toDataColumn_object(self, data, mask):
    """Convert input into a DataColumn of this type's dtype, flagging float NaNs as missing.

    Numpy input that already has this type's dtype (with a Numpy or absent
    mask) is used as is.  Otherwise the data are converted to an array, and
    a mask is built in which a row is C{defs.MISSING} if the incoming mask
    is nonzero for it or its value is a float NaN; that mask is dropped when
    no row is flagged.  Values and intervals are then checked.

    @param data: Input values.
    @param mask: Input mask, or None.
    @rtype: DataColumn
    @return: A new DataColumn of this field type.
    """
    data, mask = self._checkNumpy(data, mask)

    alreadyInternal = isinstance(data, NP.ndarray) and (mask is None or isinstance(mask, NP.ndarray)) and data.dtype == self.dtype

    if not alreadyInternal:
        data, mask = self._checkNonNumpy(data, mask)
        data = NP.array(data, dtype=self.dtype)

        def isFloatNaN(item):
            return isinstance(item, float) and math.isnan(item)

        if mask is None:
            flags = (defs.MISSING if isFloatNaN(d) else defs.VALID for d in data)
            mask = NP("fromiter", flags, dtype=defs.maskType, count=len(data))
        else:
            flags = (defs.MISSING if (m != 0 or isFloatNaN(data[i])) else defs.VALID for i, m in enumerate(mask))
            mask = NP("fromiter", flags, dtype=defs.maskType, count=len(mask))

        if not mask.any():
            mask = None

    data, mask = self._checkValues(data, mask)
    data, mask = self._checkIntervals(data, mask)
    return DataColumn(self, data, mask)
def functionCount(self, dataColumn, whereMask, groupSelection, getstate, setstate):
    """Compute a running count of a DataColumn, possibly with an SQL where mask and groupField.

    Each valid, selected row counts as 1; the result for each row is the
    cumulative count up to and including that row.

    @type dataColumn: DataColumn
    @param dataColumn: The input data column.
    @type whereMask: 1d Numpy array of bool, or None
    @param whereMask: The result of the SQL where selection.
    @type groupSelection: 1d Numpy array of bool, or None.
    @param groupSelection: Rows corresponding to a particular value of the groupField.
    @type getstate: callable function, or None
    @param getstate: Returns a starting count or None; consulted only if the column is non-empty.
    @type setstate: callable function, or None
    @param setstate: Called with the final count; called only if the column is non-empty.
    @rtype: DataColumn
    @return: A column of counted rows, as a continuous integer with no mask.
    """
    fieldType = FakeFieldType("integer", "continuous")

    rowCount = len(dataColumn)
    increments = NP("ones", rowCount, dtype=fieldType.dtype)

    restrictions = []
    if dataColumn.mask is not None:
        restrictions.append(NP(dataColumn.mask == defs.VALID))
    restrictions.extend(r for r in (whereMask, groupSelection) if r is not None)
    for restriction in restrictions:
        NP("logical_and", increments, restriction, increments)

    if getstate is not None and rowCount > 0:
        carried = getstate()
        if carried is not None:
            increments[0] += carried

    data = NP("cumsum", increments)

    if setstate is not None and rowCount > 0:
        setstate(data[-1])

    return DataColumn(fieldType, data, None)
def evaluate(self, dataTable, functionTable, performanceTable, arguments):
    """Test whether the first argument lies between the second and third, inclusive.

    Each bound is compared in one of three ways: row by row on the raw
    values (for "object" data, or "string" data when both the tested field
    and the bound are "continuous"), row by row on C{valueToString}
    conversions (for other "string" data), or with a Numpy ufunc.

    @type dataTable: DataTable
    @param dataTable: The input DataTable; its length sets the number of rows compared.
    @type functionTable: FunctionTable
    @param functionTable: The FunctionTable, passed to each argument expression.
    @type performanceTable: PerformanceTable
    @param performanceTable: Receives begin/end calls under this built-in's name.
    @type arguments: list of expressions
    @param arguments: Exactly three expressions: the value to test, the low bound, and the high bound.
    @rtype: DataColumn
    @return: True where low <= test <= high, masked by C{DataColumn.mapAnyMissingInvalid} of all three argument masks.
    """
    columns = [expression.evaluate(dataTable, functionTable, performanceTable) for expression in arguments]

    timerKey = "built-in \"%s\"" % self.name
    performanceTable.begin(timerKey)

    fieldType = self.fieldTypeFromSignature(columns)
    test, low, high = columns
    testType = test.fieldType

    def compareWithBound(bound, rowwise, ufuncName):
        rowCount = len(dataTable)

        if testType.dataType == "object" or (testType.dataType == "string" and testType.optype == "continuous" and bound.fieldType.optype == "continuous"):
            lhs = test.data
            rhs = bound.data
            return NP("fromiter", (rowwise(lhs[i], rhs[i]) for i in xrange(rowCount)), dtype=fieldType.dtype, count=rowCount)

        if testType.dataType == "string":
            lhs = test.data
            rhs = bound.data
            lhsToString = testType.valueToString
            rhsToString = bound.fieldType.valueToString
            return NP("fromiter", (rowwise(lhsToString(lhs[i]), rhsToString(rhs[i])) for i in xrange(rowCount)), dtype=fieldType.dtype, count=rowCount)

        return NP(ufuncName, test.data, bound.data)

    data = compareWithBound(low, lambda a, b: a >= b, "greater_equal")
    datahigh = compareWithBound(high, lambda a, b: a <= b, "less_equal")

    # both conditions must hold; the result overwrites the low-bound array
    NP("logical_and", data, datahigh, data)

    performanceTable.end(timerKey)
    return DataColumn(fieldType, data, DataColumn.mapAnyMissingInvalid([test.mask, low.mask, high.mask]))
def buildManually(self, fieldTypes, internalArrays, internalMasks=None, inputState=None):
    """Create a DataTable from pre-built Numpy arrays filled with internal values rather than user-friendly values.

    For experts only.  Columns are added in sorted field-name order.

    @type fieldTypes: dict of str to FieldTypes
    @param fieldTypes: Maps field names to their FieldType.
    @type internalArrays: dict of str to 1d Numpy arrays.
    @param internalArrays: Maps field names to the internal data.
    @type internalMasks: dict of str to 1d Numpy arrays, or None
    @param internalMasks: Maps field names to the masks, or None for no masks.
    @type inputState: DataTableState or None
    @param inputState: Initial state of the DataTable.  To continue a previous calculation, use the C{dataTable.state} from the previous calculation.
    @rtype: DataTable
    @return: The new DataTable.
    @raise ValueError: If the C{fieldTypes}, C{internalArrays}, or C{internalMasks} have different field names, this function raises an error.
    """
    if internalMasks is None:
        internalMasks = dict((x, None) for x in internalArrays)

    if set(fieldTypes) != set(internalArrays) or set(fieldTypes) != set(internalMasks):
        raise ValueError("Mismatch between fieldType names, internalArray names, or internalMasks names")

    # An OrderedDict (as in singleton) keeps the sorted insertion order;
    # a plain dict would not preserve it on the Python 2 this code targets.
    dataColumns = OrderedDict()
    for name in sorted(fieldTypes):
        dataColumns[name] = DataColumn(fieldTypes[name], internalArrays[name], internalMasks[name])

    # bypass __init__ and configure the table directly from the prepared columns
    dataTable = DataTable.__new__(DataTable)
    dataTable._configure(dataColumns, inputState)
    return dataTable
def _toDataColumn_internal(self, data, mask):
    """Convert input into a DataColumn by mapping every value through C{stringToValue}.

    A bulk conversion is tried first.  If it raises ValueError, rows are
    converted one at a time: float NaNs become C{defs.MISSING}, rows whose
    conversion raises ValueError or TypeError become C{defs.INVALID}, and
    flagged rows hold C{defs.PADDING}; the mask is dropped when no row is
    flagged.  If the bulk conversion succeeds, a non-Numpy mask is converted
    to a Numpy array.

    Unlike the other conversions, values and intervals are not checked
    here, because these were checked in _setup for categorical and ordinal
    strings.

    @param data: Input values.
    @param mask: Input mask, or None.
    @rtype: DataColumn
    @return: A new DataColumn of this field type.
    """
    data, mask = self._checkNumpy(data, mask, tryToCast=False)
    data, mask = self._checkNonNumpy(data, mask)

    try:
        data = NP("fromiter", (self.stringToValue(d) for d in data), dtype=self.dtype, count=len(data))
        # on success, the mask is finished in the else clause below

    except ValueError:
        values = NP("empty", len(data), dtype=self.dtype)
        if mask is None:
            flags = NP("zeros", len(data), dtype=defs.maskType)
        else:
            flags = NP("fromiter", (defs.VALID if not m else defs.MISSING for m in mask), dtype=defs.maskType, count=len(mask))

        for index, item in enumerate(data):
            if isinstance(item, float) and math.isnan(item):
                values[index] = defs.PADDING
                flags[index] = defs.MISSING
                continue

            try:
                values[index] = self.stringToValue(item)
            except (ValueError, TypeError):
                values[index] = defs.PADDING
                flags[index] = defs.INVALID

        data = values
        mask = flags if flags.any() else None

    else:
        if mask is not None and not isinstance(mask, NP.ndarray):
            mask = NP("array", mask, dtype=defs.maskType)

    return DataColumn(self, data, mask)
def evaluate(self, dataTable, functionTable, performanceTable):
    """Look up the function named by the "function" attribute and apply it to the child expressions.

    The function's result is post-processed with this element's
    "invalidValueTreatment" and "mapMissingTo" attributes.

    @type dataTable: DataTable
    @param dataTable: The input DataTable, containing any fields that might be used to evaluate this expression.
    @type functionTable: FunctionTable
    @param functionTable: The FunctionTable in which the function is looked up.
    @type performanceTable: PerformanceTable
    @param performanceTable: Receives begin/pause/unpause/end calls under the key "Apply".
    @rtype: DataColumn
    @return: The result of the calculation as a DataColumn.
    @raise LookupError: If the function is not found in the FunctionTable.
    """
    performanceTable.begin("Apply")

    functionName = self.get("function")
    function = functionTable.get(functionName)
    if function is None:
        raise LookupError("Apply references function \"%s\", but it does not exist" % self.get("function"))

    arguments = self.childrenOfClass(PmmlExpression)

    # the function's own evaluation time is not charged to "Apply"
    performanceTable.pause("Apply")
    result = function.evaluate(dataTable, functionTable, performanceTable, arguments)
    performanceTable.unpause("Apply")

    treatedMask = FieldCastMethods.applyInvalidValueTreatment(result.mask, self.get("invalidValueTreatment"))
    data, treatedMask = FieldCastMethods.applyMapMissingTo(result.fieldType, result.data, treatedMask, self.get("mapMissingTo"))

    performanceTable.end("Apply")
    return DataColumn(result.fieldType, data, treatedMask)
def _selectMax(self, dataTable, functionTable, performanceTable, segmentation):
    """Used by C{calculateScore}.

    Runs the model of every segment whose predicate selects at least one
    row and keeps, row by row, the largest score seen so far.  Output
    fields produced by the sub-models are merged into C{dataTable.output}
    along the way.

    @type dataTable: DataTable
    @param dataTable: The input DataTable; its C{output} is modified in place.
    @type functionTable: FunctionTable
    @param functionTable: Passed through to the predicates and sub-models.
    @type performanceTable: PerformanceTable
    @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
    @param segmentation: The element whose C{Segment} children are evaluated.
    @rtype: dict
    @return: A dictionary mapping None to the DataColumn of maximum scores.
    @raise defs.PmmlValidationError: If a sub-model produces a score of dataType string, boolean, or object.
    """
    performanceTable.begin("Segmentation max")

    # scoresData collects the running maximum; filled/unfilled track which
    # rows have received a score from at least one segment so far
    scoresData = NP("empty", len(dataTable), dtype=NP.dtype(object))
    filled = NP("zeros", len(dataTable), dtype=NP.dtype(bool))
    unfilled = NP("ones", len(dataTable), dtype=NP.dtype(bool))

    # output columns created here; they are unlocked while being filled and re-locked at the end
    newOutputData = []
    for segment in segmentation.childrenOfTag("Segment", iterator=True):
        # the predicate and sub-model keep their own timers, so pause this one around them
        performanceTable.pause("Segmentation max")
        selection = segment.childOfClass(PmmlPredicate).evaluate(dataTable, functionTable, performanceTable)
        performanceTable.unpause("Segmentation max")
        if not selection.any():
            continue

        subTable = dataTable.subTable(selection)
        subModel = segment.childOfClass(PmmlModel)
        performanceTable.pause("Segmentation max")
        subModel.calculate(subTable, functionTable, performanceTable)
        performanceTable.unpause("Segmentation max")

        if subTable.score.fieldType.dataType in ("string", "boolean", "object"):
            raise defs.PmmlValidationError("Segmentation with multipleModelMethod=\"max\" cannot be applied to models that produce dataType \"%s\"" % subTable.score.fieldType.dataType)

        # ignore invalid in matches (like the built-in "min" Apply function)
        # NOTE(review): this narrows `selection` in place AFTER subTable was built from it, so
        # the boolean indexes below may no longer line up with subTable's rows -- confirm
        if subTable.score.mask is not None:
            NP("logical_and", selection, NP(subTable.score.mask == defs.VALID), selection)

        # full-length selections: selected rows that already have / do not yet have a score
        selectionFilled = NP("logical_and", selection, filled)
        selectionUnfilled = NP("logical_and", selection, unfilled)
        # the same split, restricted to the selected rows only
        filled_selection = filled[selection]
        unfilled_selection = unfilled[selection]

        # rows scored before keep the larger of the new and the stored score
        left, right = subTable.score.data[filled_selection], scoresData[selectionFilled]
        condition = NP(left > right)
        scoresData[selectionFilled] = NP("where", condition, left, right)
        # rows scored for the first time simply take the new score
        scoresData[selectionUnfilled] = subTable.score.data[unfilled_selection]

        for fieldName, dataColumn in subTable.output.items():
            if fieldName not in dataTable.output:
                # first time this output field is seen: create a full-length column whose mask defaults to MISSING
                data = NP("empty", len(dataTable), dtype=dataColumn.fieldType.dtype)
                # NOTE(review): unlike the else-branch below, dataColumn.data (and dataColumn.mask) are
                # not subset with unfilled_selection here; the sizes differ whenever some selected rows
                # were already filled -- confirm whether that is intended
                data[selectionUnfilled] = dataColumn.data

                mask = NP(NP("ones", len(dataTable), dtype=defs.maskType) * defs.MISSING)
                if dataColumn.mask is None:
                    mask[selectionUnfilled] = defs.VALID
                else:
                    mask[selectionUnfilled] = dataColumn.mask

                newDataColumn = DataColumn(dataColumn.fieldType, data, mask)
                newDataColumn._unlock()
                dataTable.output[fieldName] = newDataColumn
                newOutputData.append(newDataColumn)

            else:
                # existing output field: follow the same winner as the score for rows scored
                # before, and copy the values for rows scored for the first time
                newDataColumn = dataTable.output[fieldName]

                newDataColumn.data[selectionFilled] = NP("where", condition, dataColumn.data[filled_selection], newDataColumn.data[selectionFilled])
                newDataColumn.data[selectionUnfilled] = dataColumn.data[unfilled_selection]

                if dataColumn.mask is None:
                    newDataColumn.mask[selectionUnfilled] = defs.VALID
                else:
                    newDataColumn.mask[selectionUnfilled] = dataColumn.mask

        # move the newly scored rows from unfilled to filled
        filled += selectionUnfilled
        unfilled -= selectionUnfilled

    for newDataColumn in newOutputData:
        # a mask with no nonzero entries is replaced by None before the column is locked again
        if not newDataColumn.mask.any():
            newDataColumn._mask = None
        newDataColumn._lock()

    # rows that never received a score are flagged MISSING
    if filled.all():
        scoresMask = None
    else:
        scoresMask = NP(NP("logical_not", filled) * defs.MISSING)

    scores = DataColumn(self.scoreType, scoresData, scoresMask)

    performanceTable.end("Segmentation max")
    return {None: scores}
def evaluate(self, dataTable, functionTable, performanceTable, arguments):
    """Evaluate this built-in function on the given argument expressions.

    Every argument is evaluated first; the result holds the cosine of the
    element-wise product of the first two arguments' data, and its mask is
    derived from the masks of those two arguments.

    @type dataTable: DataTable
    @param dataTable: The input DataTable, passed to each argument expression.
    @type functionTable: FunctionTable
    @param functionTable: The FunctionTable, passed to each argument expression.
    @type performanceTable: PerformanceTable
    @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
    @type arguments: list of expressions
    @param arguments: The argument expressions; the first two are used in the calculation.
    @rtype: DataColumn
    @return: The result of the calculation as a DataColumn.
    """
    evaluated = []
    for expression in arguments:
        evaluated.append(expression.evaluate(dataTable, functionTable, performanceTable))

    # the arguments are evaluated before this function's own timer starts
    timer = "built-in \"%s\"" % self.name
    performanceTable.begin(timer)

    resultType = self.fieldTypeFromSignature(evaluated)
    first = evaluated[0]
    second = evaluated[1]

    values = NP("cos", first.data * second.data)
    combinedMask = DataColumn.mapAnyMissingInvalid([first.mask, second.mask])
    result = DataColumn(resultType, values, combinedMask)

    performanceTable.end(timer)
    return result
def evaluate(self, dataTable, functionTable, performanceTable):
    """Evaluate the expression, using a DataTable as input.

    Without a C{groupField}, the aggregate named by C{function} is applied
    to the whole field and its DataColumn is returned directly.  With a
    C{groupField}, the aggregate is computed separately for each valid
    group value and the result is an object column holding, for each row,
    a dictionary from group value to that group's aggregate at that row.

    If C{stateId} is given, starting values are read from and ending
    values are written to C{dataTable.state} under that key.

    @type dataTable: DataTable
    @param dataTable: The input DataTable, containing any fields that might be used to evaluate this expression.
    @type functionTable: FunctionTable
    @param functionTable: The FunctionTable, containing any functions that might be called in this expression.
    @type performanceTable: PerformanceTable
    @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
    @rtype: DataColumn
    @return: The result of the calculation as a DataColumn.
    """
    function = self["function"]
    groupField = self.get("groupField")

    if groupField is None:
        performanceTable.begin("Aggregate %s" % function)
    else:
        performanceTable.begin("Aggregate %s groupField" % function)

    dataColumn = dataTable.fields[self["field"]]
    whereMask = self.where(dataTable, functionTable, performanceTable)
    stateId = self.get("stateId")

    if groupField is None:
        # ungrouped: the state (if any) is a single value stored under stateId
        if stateId is None:
            getstate = None
            setstate = None
        else:
            def getstate():
                return dataTable.state.get(stateId)

            def setstate(value):
                dataTable.state[stateId] = value

        if function == "count":
            dataColumn = self.functionCount(dataColumn, whereMask, None, getstate, setstate)
        elif function == "sum":
            dataColumn = self.functionSum(dataColumn, whereMask, None, getstate, setstate)
        elif function == "average":
            dataColumn = self.functionAverage(dataColumn, whereMask, None, getstate, setstate)
        elif function == "min":
            dataColumn = self.functionMin(dataColumn, whereMask, None, getstate, setstate)
        elif function == "max":
            dataColumn = self.functionMax(dataColumn, whereMask, None, getstate, setstate)
        elif function == "multiset":
            dataColumn = self.functionMultiset(dataColumn, whereMask, None, getstate, setstate)

        performanceTable.end("Aggregate %s" % function)
        return dataColumn

    else:
        groupColumn = dataTable.fields[groupField]
        if groupColumn.mask is None:
            validGroup = groupColumn.data
        else:
            validGroup = groupColumn.data[NP(groupColumn.mask == defs.VALID)]

        # record maps the string form of each group value to that group's state and
        # valuesSeen notes which stored groups occur in this batch.  Both must exist even
        # when there is no stateId, because the loop below reads and writes them
        # unconditionally; without a stateId the record is simply discarded afterwards.
        record = {}
        valuesSeen = {}
        if stateId is not None:
            state = dataTable.state.get(stateId)
            if state is not None:
                record = state
            valuesSeen = dict((stringValue, False) for stringValue in record)

        groupTables = {}
        # set before the loop so that it is also available when this batch has no valid
        # group values but the stored state still has groups to replay below
        groupColumnFieldType = groupColumn.fieldType
        for groupValue in NP("unique", validGroup):
            groupSelection = NP(groupColumn.data == groupValue)
            if groupColumn.mask is not None:
                NP("logical_and", groupSelection, NP(groupColumn.mask == defs.VALID), groupSelection)

            stringValue = groupColumnFieldType.valueToString(groupValue)

            # only groups that already have stored state get a getstate callback
            if stringValue in record:
                def getstate():
                    return record[stringValue]
            else:
                getstate = None

            def setstate(value):
                record[stringValue] = value

            valuesSeen[stringValue] = True
            value = groupColumnFieldType.valueToPython(groupValue)

            if function == "count":
                groupTables[value] = self.functionCount(dataColumn, whereMask, groupSelection, getstate, setstate)
            elif function == "sum":
                groupTables[value] = self.functionSum(dataColumn, whereMask, groupSelection, getstate, setstate)
            elif function == "average":
                groupTables[value] = self.functionAverage(dataColumn, whereMask, groupSelection, getstate, setstate)
            elif function == "min":
                groupTables[value] = self.functionMin(dataColumn, whereMask, groupSelection, getstate, setstate)
            elif function == "max":
                groupTables[value] = self.functionMax(dataColumn, whereMask, groupSelection, getstate, setstate)
            elif function == "multiset":
                groupTables[value] = self.functionMultiset(dataColumn, whereMask, groupSelection, getstate, setstate)

        if stateId is not None:
            dataTable.state[stateId] = record

            # groups present in the stored state but absent from this batch are
            # represented by the "Fake" variants, built from the stored state alone
            for stringValue in valuesSeen:
                if not valuesSeen[stringValue]:
                    value = groupColumnFieldType.valueToPython(groupColumnFieldType.stringToValue(stringValue))

                    if function == "count":
                        groupTables[value] = self.functionCountFake(record[stringValue], len(dataTable), dataColumn.fieldType)
                    elif function == "sum":
                        groupTables[value] = self.functionSumFake(record[stringValue], len(dataTable), dataColumn.fieldType)
                    elif function == "average":
                        groupTables[value] = self.functionAverageFake(record[stringValue], len(dataTable), dataColumn.fieldType)
                    elif function in ("min", "max"):
                        groupTables[value] = self.functionMinMaxFake(record[stringValue], len(dataTable), dataColumn.fieldType)
                    elif function == "multiset":
                        groupTables[value] = self.functionMultisetFake(record[stringValue], len(dataTable), dataColumn.fieldType)

        # collect the per-group columns into one dictionary per row, leaving
        # out groups that have nothing to report at that row
        performanceTable.begin("Aggregate %s groupField collect" % function)

        fieldType = FakeFieldType("object", "any")
        data = NP("empty", len(dataTable), dtype=NP.dtype(object))

        if function == "count":
            for i in xrange(len(dataTable)):
                data[i] = dict((value, table.data[i]) for value, table in groupTables.items() if table.data[i] != 0)

        elif function == "sum":
            for i in xrange(len(dataTable)):
                data[i] = dict((value, table.data[i]) for value, table in groupTables.items() if table.data[i] != 0.0)

        elif function == "average":
            for i in xrange(len(dataTable)):
                # the two-sided comparison is false only for NaN, so NaN entries are dropped
                data[i] = dict((value, table.data[i]) for value, table in groupTables.items() if table.data[i] > 0.0 or table.data[i] <= 0.0)

        elif function in ("min", "max"):
            # give every table a mask so that the row-wise test below can index it
            for table in groupTables.values():
                if table.mask is None:
                    table._mask = NP("zeros", len(table), dtype=defs.maskType)
            for i in xrange(len(dataTable)):
                data[i] = dict((value, table.data[i]) for value, table in groupTables.items() if table.mask[i] == defs.VALID)

        elif function == "multiset":
            for i in xrange(len(dataTable)):
                data[i] = dict((value, table.data[i]) for value, table in groupTables.items() if len(table.data[i]) > 0)

        performanceTable.end("Aggregate %s groupField collect" % function)
        performanceTable.end("Aggregate %s groupField" % function)
        return DataColumn(fieldType, data, None)
def _selectFirst(self, dataTable, functionTable, performanceTable, segmentation):
    """Used by C{calculateScore}.

    Each row is scored by the first segment whose predicate selects it;
    rows already scored are excluded from all later segments.  Output
    fields of the sub-models are merged into C{dataTable.output}.

    @type dataTable: DataTable
    @param dataTable: The input DataTable; its C{output} is modified in place.
    @type functionTable: FunctionTable
    @param functionTable: Passed through to the predicates and sub-models.
    @type performanceTable: PerformanceTable
    @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
    @param segmentation: The element whose C{Segment} children are evaluated in order.
    @rtype: dict
    @return: A dictionary mapping None to the score DataColumn and, if C{self.name} is not None, "segment" to a DataColumn of segment ids.
    """
    timer = "Segmentation selectFirst"
    performanceTable.begin(timer)

    numRows = len(dataTable)
    scoresData = NP("empty", numRows, dtype=NP.dtype(object))
    scoresMask = NP("zeros", numRows, dtype=defs.maskType)
    unfilled = NP("ones", numRows, dtype=NP.dtype(bool))
    segments = NP("empty", numRows, dtype=NP.dtype(object))

    # output columns created here stay unlocked until all segments are processed
    createdColumns = []
    for segment in segmentation.childrenOfTag("Segment", iterator=True):
        performanceTable.pause(timer)
        selection = segment.childOfClass(PmmlPredicate).evaluate(dataTable, functionTable, performanceTable)
        performanceTable.unpause(timer)

        # only rows not claimed by an earlier segment remain eligible
        NP("logical_and", selection, unfilled, selection)
        if not selection.any():
            continue

        subTable = dataTable.subTable(selection)
        subModel = segment.childOfClass(PmmlModel)
        performanceTable.pause(timer)
        subModel.calculate(subTable, functionTable, performanceTable)
        performanceTable.unpause(timer)

        scoresData[selection] = subTable.score.data
        subMask = subTable.score.mask
        scoresMask[selection] = defs.VALID if subMask is None else subMask

        segmentName = segment.get("id")
        if segmentName is not None:
            segments[selection] = segmentName

        for fieldName, subColumn in subTable.output.items():
            if fieldName in dataTable.output:
                # the field already exists: overwrite only the selected rows
                target = dataTable.output[fieldName]
                target.data[selection] = subColumn.data
                target.mask[selection] = defs.VALID if subColumn.mask is None else subColumn.mask
                continue

            # new field: start from a full-length column that is MISSING everywhere
            columnData = NP("empty", numRows, dtype=subColumn.fieldType.dtype)
            columnData[selection] = subColumn.data

            columnMask = NP(NP("ones", numRows, dtype=defs.maskType) * defs.MISSING)
            columnMask[selection] = defs.VALID if subColumn.mask is None else subColumn.mask

            target = DataColumn(subColumn.fieldType, columnData, columnMask)
            target._unlock()
            dataTable.output[fieldName] = target
            createdColumns.append(target)

        unfilled -= selection
        if not unfilled.any():
            # every row has a score; later segments cannot contribute
            break

    for column in createdColumns:
        # a mask with no nonzero entries is replaced by None before re-locking
        if not column.mask.any():
            column._mask = None
        column._lock()

    if not scoresMask.any():
        scoresMask = None

    scores = DataColumn(self.scoreType, scoresData, scoresMask)

    unnamed = self.name is None
    performanceTable.end(timer)

    result = {None: scores}
    if not unnamed:
        result["segment"] = DataColumn(self.scoreTypeSegment, segments, None)
    return result
def _fromDataColumn_timeSeconds(self, dataColumn):
    """Rescale a DataColumn's stored values and hand them to C{_fromDataColumn_number}.

    The stored values have C{self._offset} subtracted, are reduced modulo
    C{self._microsecondsPerDay}, and are divided by C{self._factor}.

    @type dataColumn: DataColumn
    @param dataColumn: The column whose data are transformed; its mask is reused unchanged.
    @return: Whatever C{_fromDataColumn_number} returns for the transformed column.
    """
    shifted = NP(dataColumn.data - self._offset)
    withinDay = NP("mod", shifted, self._microsecondsPerDay)
    scaled = NP(withinDay / float(self._factor))

    rescaledColumn = DataColumn(self, scaled, dataColumn.mask)
    return self._fromDataColumn_number(rescaledColumn)
def functionMax(self, dataColumn, whereMask, groupSelection, getstate, setstate):
    """Finds the maximum of rows in a DataColumn, possibly with an SQL where mask and groupField.

    The result is a running maximum: each output row holds the largest
    selected value seen up to and including that row (starting from the
    stored state, if any).  Rows before any value is available are
    marked INVALID.

    @type dataColumn: DataColumn
    @param dataColumn: The input data column.
    @type whereMask: 1d Numpy array of bool, or None
    @param whereMask: The result of the SQL where selection.
    @type groupSelection: 1d Numpy array of bool, or None.
    @param groupSelection: Rows corresponding to a particular value of the groupField.
    @type getstate: callable function, or None
    @param getstate: Retrieve starting values from the DataTableState.
    @type setstate: callable function, or None
    @param setstate: Store ending values to the DataTableState.
    @rtype: DataColumn
    @return: A column of maximized rows.
    @raise defs.PmmlValidationError: If the input field is neither continuous nor ordinal.
    """
    fieldType = dataColumn.fieldType
    if fieldType.optype not in ("continuous", "ordinal"):
        raise defs.PmmlValidationError("Aggregate function \"max\" requires a continuous or ordinal input field")

    # a row takes part only if it is valid and passes both optional selections
    if dataColumn.mask is None:
        selection = NP("ones", len(dataColumn), dtype=NP.dtype(bool))
    else:
        selection = NP(dataColumn.mask == defs.VALID)

    if whereMask is not None:
        NP("logical_and", selection, whereMask, selection)
    if groupSelection is not None:
        NP("logical_and", selection, groupSelection, selection)

    # resume from the stored maximum, if there is one
    maximum = None
    if getstate is not None:
        startingState = getstate()
        if startingState is not None:
            maximum = startingState

    data = NP("empty", len(dataColumn), dtype=fieldType.dtype)
    mask = NP("zeros", len(dataColumn), dtype=defs.maskType)

    for i, x in enumerate(dataColumn.data):
        if selection[i]:
            if maximum is None or x > maximum:
                maximum = x

        # every row reports the maximum so far, whether or not it was selected itself
        if maximum is None:
            mask[i] = defs.INVALID
        else:
            data[i] = maximum

    # a mask with no nonzero entries is dropped entirely
    if not mask.any():
        mask = None

    if setstate is not None:
        setstate(maximum)

    return DataColumn(fieldType, data, mask)