def maskInvalid(self, data, mask):
    """Helper method to replace NaN and infinite values with
    INVALID after a potentially dangerous operation.

    Example::

        result = NP("log", dataColumn.data)    # log(0) = -inf, log(-x) = nan
        resultMask = self.maskInvalid(result, dataColumn.mask)
        return DataColumn(fakeFieldType, result, resultMask)

    The input C{data} and C{mask} are not modified by this
    method; a substitute mask is returned.  (The input mask is
    copied before any entry is changed, whether or not it is
    writeable.)

    @type data: 1d Numpy array
    @param data: The dataset that may contain NaN and infinite values.
    @type mask: 1d Numpy array of C{defs.maskType}, or None
    @param mask: The original mask.
    @rtype: 1d Numpy array of C{defs.maskType}, or None
    @return: The new mask, or None if no entry of the resulting mask is set.
    """

    bad = NP("logical_not", NP("isfinite", data))
    if bad.any():
        if mask is None:
            mask = bad * defs.INVALID
        else:
            # Only rows that are still VALID may be downgraded to INVALID;
            # rows that already carry another mask value keep it.
            NP("logical_and", bad, NP(mask == defs.VALID), bad)
            if bad.any():
                # Work on a private copy so that the caller's mask is never
                # altered, as promised by the docstring.
                mask = NP("copy", mask)
                mask.setflags(write=True)
                mask[bad] = defs.INVALID

    if mask is not None and not mask.any():
        mask = None

    return mask
def _toDataColumn_number(self, data, mask):
    """Build a DataColumn of this numeric type from C{data} and C{mask}.

    Three paths are taken, depending on the input:

      - C{data} is already a Numpy array of C{self.dtype} (and C{mask}
        is None or a Numpy array): NaN entries are flagged MISSING.
      - C{data} can be converted to a Numpy array in one step: NaN
        entries and non-zero input mask entries are flagged.
      - the one-step conversion raises ValueError or TypeError: items
        are converted one by one; items that cannot be stored are
        replaced by C{defs.PADDING} and flagged MISSING (NaN-like) or
        INVALID (anything else).

    The result is then passed through C{_checkValues} and
    C{_checkIntervals}.

    @type data: 1d Numpy array or another sequence of values
    @param data: The raw values.
    @type mask: 1d Numpy array, another sequence of flags, or None
    @param mask: The raw mask.
    @rtype: DataColumn
    @return: The new column.
    """

    def looksLikeNaN(value):
        # A float NaN or the string "nan" in any capitalization.
        return (isinstance(value, float) and math.isnan(value)) or (isinstance(value, basestring) and value.upper() == "NAN")

    data, mask = self._checkNumpy(data, mask)

    alreadyTyped = isinstance(data, NP.ndarray) and (mask is None or isinstance(mask, NP.ndarray)) and data.dtype == self.dtype

    if alreadyTyped:
        nanRows = NP("isnan", data)
        if mask is None:
            mask = NP("array", nanRows, defs.maskType) * defs.MISSING
        else:
            mask[nanRows] = defs.MISSING

    else:
        data, mask = self._checkNonNumpy(data, mask)
        try:
            data = NP("array", data, dtype=self.dtype)
            # mask is handled in the else statement after the except block

        except (ValueError, TypeError):
            # Item-by-item conversion.
            converted = NP("empty", len(data), dtype=self.dtype)
            if mask is None:
                convertedMask = NP("zeros", len(data), dtype=defs.maskType)
            else:
                convertedMask = NP("fromiter", ((defs.VALID if not m else defs.MISSING) for m in mask), dtype=defs.maskType, count=len(mask))

            for index, value in enumerate(data):
                try:
                    converted[index] = value
                    if convertedMask[index] == defs.VALID and looksLikeNaN(value):
                        convertedMask[index] = defs.MISSING
                    if value is None:
                        raise TypeError

                except (ValueError, TypeError):
                    converted[index] = defs.PADDING
                    if convertedMask[index] == defs.VALID:
                        if looksLikeNaN(value):
                            convertedMask[index] = defs.MISSING
                        else:
                            convertedMask[index] = defs.INVALID

            if not convertedMask.any():
                convertedMask = None

            data, mask = converted, convertedMask

        else:
            # The one-step conversion succeeded.
            nanRows = NP("isnan", data)
            if mask is None:
                mask = NP("array", nanRows, defs.maskType)
            else:
                flagged = NP("fromiter", (m != 0 for m in mask), dtype=NP.dtype(bool), count=len(mask))
                mask = NP(NP("array", NP("logical_or", nanRows, flagged), defs.maskType) * defs.MISSING)
            if not mask.any():
                mask = None

    data, mask = self._checkValues(data, mask)
    data, mask = self._checkIntervals(data, mask)
    return DataColumn(self, data, mask)
def evaluate(self, dataTable, functionTable, performanceTable, arguments):
    """Evaluate this built-in function on already-parsed argument expressions.

    Every argument is evaluated first; the combination itself is
    delegated to C{self.applySkipMissing}.  Rows whose C{allbad} flag
    is still set afterwards are marked MISSING, but only where the
    mask was VALID.

    @type dataTable: DataTable
    @param dataTable: The input data; its length sets the output length.
    @type functionTable: FunctionTable
    @param functionTable: Passed through to the argument expressions.
    @type performanceTable: PerformanceTable
    @param performanceTable: Used to time this built-in.
    @type arguments: list of expressions
    @param arguments: Objects with an C{evaluate} method; at least two are required by C{allBooleanType}.
    @rtype: DataColumn
    @return: The result column.
    """

    evaluated = [argument.evaluate(dataTable, functionTable, performanceTable) for argument in arguments]

    timerLabel = "built-in \"%s\"" % self.name
    performanceTable.begin(timerLabel)

    fieldType = self.allBooleanType(evaluated, atleast=2)
    rowCount = len(dataTable)

    data = NP("zeros", rowCount, dtype=fieldType.dtype)
    allbad = NP("ones", rowCount, dtype=NP.dtype(bool))

    (data, allbad), mask = self.applySkipMissing((data, allbad), None, evaluated)

    if allbad.any():
        if mask is not None:
            # leave rows alone that already carry a non-VALID mask value
            NP("logical_and", allbad, NP(mask == defs.VALID), allbad)
            mask[allbad] = defs.MISSING
        else:
            mask = allbad * defs.MISSING

    performanceTable.end(timerLabel)
    return DataColumn(fieldType, data, mask)
def _toDataColumn_dateTime(self, data, mask):
    """Build a DataColumn of this date/time type from C{data} and C{mask}.

    Every item is converted with C{self.stringToValue}.  Items that
    are masked in the input, are a float NaN, or are the string "nan"
    (any capitalization) become MISSING; items whose conversion raises
    ValueError or TypeError become INVALID.  Both kinds are stored as
    C{defs.PADDING}.  The result is then passed through
    C{_checkValues} and C{_checkIntervals}.

    @type data: sequence
    @param data: The raw values.
    @type mask: sequence of flags, or None
    @param mask: The raw mask; any true entry means "missing".
    @rtype: DataColumn
    @return: The new column.
    """

    data, mask = self._checkNumpy(data, mask, tryToCast=False)
    data, mask = self._checkNonNumpy(data, mask)

    size = len(data)
    values = NP("empty", size, dtype=self.dtype)
    flags = NP("zeros", size, dtype=defs.maskType)

    for index, item in enumerate(data):
        if (mask is not None and mask[index]) or (isinstance(item, float) and math.isnan(item)) or (isinstance(item, basestring) and item.upper() == "NAN"):
            values[index] = defs.PADDING
            flags[index] = defs.MISSING
            continue

        try:
            values[index] = self.stringToValue(item)
        except (ValueError, TypeError):
            values[index] = defs.PADDING
            flags[index] = defs.INVALID

    data = values
    if flags.any():
        mask = flags
    else:
        mask = None

    data, mask = self._checkValues(data, mask)
    data, mask = self._checkIntervals(data, mask)
    return DataColumn(self, data, mask)
def functionMax(self, dataColumn, whereMask, groupSelection, getstate, setstate):
    """Finds the maximum of rows in a DataColumn, possibly with an SQL where mask and groupField.

    The output is a running maximum: row C{i} holds the largest
    selected value seen up to and including row C{i} (seeded with the
    stored state, if any).  Rows before the first selected value are
    INVALID when there is no stored state.

    @type dataColumn: DataColumn
    @param dataColumn: The input data column.
    @type whereMask: 1d Numpy array of bool, or None
    @param whereMask: The result of the SQL where selection.
    @type groupSelection: 1d Numpy array of bool, or None.
    @param groupSelection: Rows corresponding to a particular value of the groupField.
    @type getstate: callable function
    @param getstate: Retrieve starting values from the DataTableState.
    @type setstate: callable function
    @param setstate: Store ending values to the DataTableState.
    @rtype: DataColumn
    @return: A column of maximized rows.
    @raise defs.PmmlValidationError: If the input field is neither continuous nor ordinal.
    """

    fieldType = dataColumn.fieldType

    if fieldType.optype not in ("continuous", "ordinal"):
        # The message previously named the "min" aggregate (copy-paste slip).
        raise defs.PmmlValidationError("Aggregate function \"max\" requires a continuous or ordinal input field")

    # Rows that take part in the aggregate: VALID, and passing both filters.
    if dataColumn.mask is None:
        selection = NP("ones", len(dataColumn), dtype=NP.dtype(bool))
    else:
        selection = NP(dataColumn.mask == defs.VALID)

    if whereMask is not None:
        NP("logical_and", selection, whereMask, selection)

    if groupSelection is not None:
        NP("logical_and", selection, groupSelection, selection)

    maximum = None
    if getstate is not None:
        startingState = getstate()
        if startingState is not None:
            maximum = startingState

    data = NP("empty", len(dataColumn), dtype=fieldType.dtype)
    mask = NP("zeros", len(dataColumn), dtype=defs.maskType)

    for i, x in enumerate(dataColumn.data):
        if selection[i]:
            if maximum is None or x > maximum:
                maximum = x
        if maximum is None:
            # nothing has been selected yet and there was no stored state
            mask[i] = defs.INVALID
        else:
            data[i] = maximum

    if not mask.any():
        mask = None

    if setstate is not None:
        setstate(maximum)

    return DataColumn(fieldType, data, mask)
def functionAverage(self, dataColumn, whereMask, groupSelection, getstate, setstate):
    """Averages rows in a DataColumn, possibly with an SQL where mask and groupField.

    The output is a running average: row C{i} holds the mean of all
    selected values up to and including row C{i} (seeded with the
    stored numerator and denominator, if any).  Rows whose running
    average is not finite (for example, nothing selected yet) are
    INVALID.

    @type dataColumn: DataColumn
    @param dataColumn: The input data column.
    @type whereMask: 1d Numpy array of bool, or None
    @param whereMask: The result of the SQL where selection.
    @type groupSelection: 1d Numpy array of bool, or None.
    @param groupSelection: Rows corresponding to a particular value of the groupField.
    @type getstate: callable function
    @param getstate: Retrieve starting values from the DataTableState.
    @type setstate: callable function
    @param setstate: Store ending values to the DataTableState.
    @rtype: DataColumn
    @return: A column of averaged rows.
    @raise defs.PmmlValidationError: If the input field is not "integer", "float" or "double".
    """

    fieldType = FakeFieldType("double", "continuous")

    if dataColumn.fieldType.dataType not in ("integer", "float", "double"):
        raise defs.PmmlValidationError("Aggregate function \"average\" requires a numeric input field: \"integer\", \"float\", \"double\"")

    # denominator is 1.0 for rows that take part in the average, 0.0 otherwise
    denominator = NP("ones", len(dataColumn), dtype=fieldType.dtype)
    if dataColumn.mask is not None:
        NP("logical_and", denominator, NP(dataColumn.mask == defs.VALID), denominator)

    if whereMask is not None:
        NP("logical_and", denominator, whereMask, denominator)

    if groupSelection is not None:
        NP("logical_and", denominator, groupSelection, denominator)

    numerator = NP("multiply", denominator, dataColumn.data)
    # 0 * NaN and 0 * inf are NaN, so an excluded row holding a non-finite
    # value would poison the cumulative sum for every row after it.
    # Force excluded rows to contribute exactly zero.
    numerator[NP(denominator == 0.0)] = 0.0

    if getstate is not None and len(dataColumn) > 0:
        startingState = getstate()
        if startingState is not None:
            startingNumerator, startingDenominator = startingState
            numerator[0] += startingNumerator
            denominator[0] += startingDenominator

    numerator = NP("cumsum", numerator)
    denominator = NP("cumsum", denominator)

    data = NP(numerator / denominator)
    mask = NP(NP("logical_not", NP("isfinite", data)) * defs.INVALID)
    if not mask.any():
        mask = None

    if setstate is not None and len(dataColumn) > 0:
        setstate((numerator[-1], denominator[-1]))

    return DataColumn(fieldType, data, mask)
def functionAverage(self, dataColumn, whereMask, groupSelection, getstate, setstate):
    """Averages rows in a DataColumn, possibly with an SQL where mask and groupField.

    The output is a running average: row C{i} holds the mean of all
    selected values up to and including row C{i} (seeded with the
    stored numerator and denominator, if any).  Rows whose running
    average is not finite (for example, nothing selected yet) are
    INVALID.

    @type dataColumn: DataColumn
    @param dataColumn: The input data column.
    @type whereMask: 1d Numpy array of bool, or None
    @param whereMask: The result of the SQL where selection.
    @type groupSelection: 1d Numpy array of bool, or None.
    @param groupSelection: Rows corresponding to a particular value of the groupField.
    @type getstate: callable function
    @param getstate: Retrieve starting values from the DataTableState.
    @type setstate: callable function
    @param setstate: Store ending values to the DataTableState.
    @rtype: DataColumn
    @return: A column of averaged rows.
    @raise defs.PmmlValidationError: If the input field is not "integer", "float" or "double".
    """

    fieldType = FakeFieldType("double", "continuous")

    if dataColumn.fieldType.dataType not in ("integer", "float", "double"):
        raise defs.PmmlValidationError("Aggregate function \"average\" requires a numeric input field: \"integer\", \"float\", \"double\"")

    # denominator is 1.0 for rows that take part in the average, 0.0 otherwise
    denominator = NP("ones", len(dataColumn), dtype=fieldType.dtype)
    if dataColumn.mask is not None:
        NP("logical_and", denominator, NP(dataColumn.mask == defs.VALID), denominator)

    if whereMask is not None:
        NP("logical_and", denominator, whereMask, denominator)

    if groupSelection is not None:
        NP("logical_and", denominator, groupSelection, denominator)

    numerator = NP("multiply", denominator, dataColumn.data)
    # 0 * NaN and 0 * inf are NaN, so an excluded row holding a non-finite
    # value would poison the cumulative sum for every row after it.
    # Force excluded rows to contribute exactly zero.
    numerator[NP(denominator == 0.0)] = 0.0

    if getstate is not None and len(dataColumn) > 0:
        startingState = getstate()
        if startingState is not None:
            startingNumerator, startingDenominator = startingState
            numerator[0] += startingNumerator
            denominator[0] += startingDenominator

    numerator = NP("cumsum", numerator)
    denominator = NP("cumsum", denominator)

    data = NP(numerator / denominator)
    mask = NP(NP("logical_not", NP("isfinite", data)) * defs.INVALID)
    if not mask.any():
        mask = None

    if setstate is not None and len(dataColumn) > 0:
        setstate((numerator[-1], denominator[-1]))

    return DataColumn(fieldType, data, mask)
def applyMapMissingTo(fieldType, data, mask, mapMissingTo, overwrite=False):
    """Replace MISSING values with a given substitute.

    Unless C{overwrite} is True, the original data and mask are left
    untouched and copies are returned.

    Example use::

        data, mask = dataColumn.data, dataColumn.mask
        data, mask = FieldCastMethods.applyMapMissingTo(dataColumn.fieldType, data, mask, "-999")
        return DataColumn(dataColumn.fieldType, data, mask)

    It can also be used in conjunction with other FieldCastMethods.

    @type fieldType: FieldType
    @param fieldType: The data fieldType (to interpret C{mapMissingTo}).
    @type data: 1d Numpy array
    @param data: The data.
    @type mask: 1d Numpy array of dtype defs.maskType, or None
    @param mask: The mask.
    @type mapMissingTo: string, or None
    @param mapMissingTo: The replacement value, represented as a string (e.g. directly from a PMML attribute).  If None, nothing is replaced.
    @type overwrite: bool
    @param overwrite: If True, unlock (make writeable) and overwrite the original data and mask instead of copying them.
    @rtype: 2-tuple of 1d Numpy arrays
    @return: The new data and mask; the mask is None if no entry of it is set.
    @raise defs.PmmlValidationError: If C{mapMissingTo} cannot be cast to C{fieldType}.
    """

    # Nothing is missing, or there is nothing to map it to.
    if mask is None or mapMissingTo is None:
        return data, mask

    missingRows = NP(mask == defs.MISSING)

    try:
        replacement = fieldType.stringToValue(mapMissingTo)
    except ValueError as err:
        raise defs.PmmlValidationError("mapMissingTo string \"%s\" cannot be cast as %r: %s" % (mapMissingTo, fieldType, str(err)))

    if not overwrite:
        data = NP("copy", data)
        mask = NP("copy", mask)
    else:
        data.setflags(write=True)
        mask.setflags(write=True)

    data[missingRows] = replacement
    mask[missingRows] = defs.VALID

    if not mask.any():
        mask = None

    return data, mask
def evaluate(self, dataTable, functionTable, performanceTable, arguments):
    """Evaluate this built-in as a floor division of its two arguments.

    Rows whose right-hand argument equals zero are marked INVALID;
    that mask is merged with the masks of both arguments by
    C{DataColumn.mapAnyMissingInvalid}.

    @type dataTable: DataTable
    @param dataTable: The input data.
    @type functionTable: FunctionTable
    @param functionTable: Passed through to the argument expressions.
    @type performanceTable: PerformanceTable
    @param performanceTable: Used to time this built-in.
    @type arguments: list of expressions
    @param arguments: Exactly two objects with an C{evaluate} method: numerator and denominator.
    @rtype: DataColumn
    @return: The result column.
    """

    evaluated = [argument.evaluate(dataTable, functionTable, performanceTable) for argument in arguments]

    timerLabel = "built-in \"%s\"" % self.name
    performanceTable.begin(timerLabel)

    fieldType = self.fieldTypeFromSignature(evaluated)
    numerator, divisor = evaluated

    divideByZero = NP(NP(divisor.data == 0.0) * defs.INVALID)
    if not divideByZero.any():
        divideByZero = None

    mask = DataColumn.mapAnyMissingInvalid([divideByZero, numerator.mask, divisor.mask])
    quotient = NP("floor_divide", numerator.data, divisor.data)
    result = DataColumn(fieldType, quotient, mask)

    performanceTable.end(timerLabel)
    return result
def _checkIntervals(self, data, mask):
    """Mark values that lie outside of this field's intervals as INVALID.

    When several intervals are declared, they are treated as
    alternative valid ranges: a value is INVALID only if it lies
    outside every one of them.  (Previously a value outside any single
    interval was rejected, so two disjoint intervals invalidated the
    whole column.)  With a single interval the result is unchanged.

    A given C{mask} is modified in place, and only entries that are
    currently VALID are changed.

    @type data: 1d Numpy array
    @param data: The values to check.
    @type mask: 1d Numpy array of C{defs.maskType}, or None
    @param mask: The current mask.
    @rtype: 2-tuple
    @return: C{data} (unchanged) and the possibly updated mask.
    @raise defs.PmmlValidationError: If a leftMargin or rightMargin cannot be converted by C{stringToValue}.
    """

    intervals = self.intervals
    if len(intervals) == 0:
        return data, mask

    # guilty until proven innocent: a value must fall inside at least one interval
    invalid = NP("ones", len(data), dtype=NP.dtype(bool))

    for interval in intervals:
        closure = interval["closure"]
        leftMargin = interval.get("leftMargin")
        rightMargin = interval.get("rightMargin")

        # rows that violate this particular interval
        outside = NP("zeros", len(data), dtype=NP.dtype(bool))

        if leftMargin is not None:
            try:
                leftMargin = self.stringToValue(leftMargin)
            except ValueError:
                raise defs.PmmlValidationError("Improper value in Interval leftMargin specification: \"%s\"" % leftMargin)

            if closure in ("openClosed", "openOpen"):
                outside[NP(data <= leftMargin)] = True
            elif closure in ("closedOpen", "closedClosed"):
                outside[NP(data < leftMargin)] = True

        if rightMargin is not None:
            try:
                rightMargin = self.stringToValue(rightMargin)
            except ValueError:
                raise defs.PmmlValidationError("Improper value in Interval rightMargin specification: \"%s\"" % rightMargin)

            if closure in ("openOpen", "closedOpen"):
                outside[NP(data >= rightMargin)] = True
            elif closure in ("openClosed", "closedClosed"):
                outside[NP(data > rightMargin)] = True

        # still invalid only if this interval rejects the row as well
        NP("logical_and", invalid, outside, invalid)

    if not invalid.any():
        return data, mask

    if mask is None:
        return data, NP(invalid * defs.INVALID)
    else:
        NP("logical_and", invalid, NP(mask == defs.VALID), invalid)   # only change what wasn't already marked as MISSING
        mask[invalid] = defs.INVALID
        return data, mask
def _toDataColumn_object(self, data, mask):
    """Build a DataColumn of this object type from C{data} and C{mask}.

    If C{data} is already a Numpy array of C{self.dtype} (and C{mask}
    is None or a Numpy array), both are used as they are.  Otherwise
    C{data} is converted to an array and a new mask is built in which
    float NaN items and non-zero input mask entries are MISSING.  The
    result is then passed through C{_checkValues} and
    C{_checkIntervals}.

    @type data: 1d Numpy array or another sequence
    @param data: The raw values.
    @type mask: 1d Numpy array, another sequence of flags, or None
    @param mask: The raw mask.
    @rtype: DataColumn
    @return: The new column.
    """

    data, mask = self._checkNumpy(data, mask)

    usableAsIs = isinstance(data, NP.ndarray) and (mask is None or isinstance(mask, NP.ndarray)) and data.dtype == self.dtype

    if not usableAsIs:
        data, mask = self._checkNonNumpy(data, mask)
        data = NP.array(data, dtype=self.dtype)

        if mask is None:
            flags = (defs.MISSING if (isinstance(d, float) and math.isnan(d)) else defs.VALID for d in data)
            mask = NP("fromiter", flags, dtype=defs.maskType, count=len(data))
        else:
            flags = (defs.MISSING if (m != 0 or (isinstance(data[i], float) and math.isnan(data[i]))) else defs.VALID for i, m in enumerate(mask))
            mask = NP("fromiter", flags, dtype=defs.maskType, count=len(mask))

        if not mask.any():
            mask = None

    # in both cases: check values and intervals before returning
    data, mask = self._checkValues(data, mask)
    data, mask = self._checkIntervals(data, mask)
    return DataColumn(self, data, mask)
def evaluate(self, dataTable, functionTable, performanceTable, arguments):
    """Evaluate this built-in function on already-parsed argument expressions.

    Every argument is evaluated first; the combination itself is
    delegated to C{self.applySkipMissing}.  Rows whose C{allbad} flag
    is still set afterwards are marked MISSING, but only where the
    mask was VALID.

    @type dataTable: DataTable
    @param dataTable: The input data; its length sets the output length.
    @type functionTable: FunctionTable
    @param functionTable: Passed through to the argument expressions.
    @type performanceTable: PerformanceTable
    @param performanceTable: Used to time this built-in.
    @type arguments: list of expressions
    @param arguments: Objects with an C{evaluate} method; at least two are required by C{allBooleanType}.
    @rtype: DataColumn
    @return: The result column.
    """

    evaluated = [argument.evaluate(dataTable, functionTable, performanceTable) for argument in arguments]

    timerLabel = "built-in \"%s\"" % self.name
    performanceTable.begin(timerLabel)

    fieldType = self.allBooleanType(evaluated, atleast=2)
    rowCount = len(dataTable)

    data = NP("zeros", rowCount, dtype=fieldType.dtype)
    allbad = NP("ones", rowCount, dtype=NP.dtype(bool))

    (data, allbad), mask = self.applySkipMissing((data, allbad), None, evaluated)

    if allbad.any():
        if mask is not None:
            # leave rows alone that already carry a non-VALID mask value
            NP("logical_and", allbad, NP(mask == defs.VALID), allbad)
            mask[allbad] = defs.MISSING
        else:
            mask = allbad * defs.MISSING

    performanceTable.end(timerLabel)
    return DataColumn(fieldType, data, mask)
def _toDataColumn_internal(self, data, mask):
    """Build a DataColumn by converting every item with C{self.stringToValue}.

    A fast path converts all items in one step.  If that raises
    ValueError or TypeError, items are converted one by one: float NaN
    items become MISSING, items that cannot be converted become
    INVALID (unless the input mask already flagged them, in which case
    they stay MISSING), and both kinds are stored as C{defs.PADDING}.

    @type data: sequence
    @param data: The raw values.
    @type mask: sequence of flags, or None
    @param mask: The raw mask; any true entry means "missing".
    @rtype: DataColumn
    @return: The new column.
    """

    data, mask = self._checkNumpy(data, mask, tryToCast=False)
    data, mask = self._checkNonNumpy(data, mask)

    try:
        data = NP("fromiter", (self.stringToValue(d) for d in data), dtype=self.dtype, count=len(data))
        # mask is handled in the else statement after the except block

    except (ValueError, TypeError):
        # TypeError is caught here as well, matching the per-item handler
        # below; otherwise one unconvertible item would abort the whole
        # column instead of being flagged INVALID.
        data2 = NP("empty", len(data), dtype=self.dtype)
        if mask is None:
            mask2 = NP("zeros", len(data), dtype=defs.maskType)
        else:
            mask2 = NP("fromiter", (defs.VALID if not m else defs.MISSING for m in mask), dtype=defs.maskType, count=len(mask))

        for i, v in enumerate(data):
            if isinstance(v, float) and math.isnan(v):
                data2[i] = defs.PADDING
                mask2[i] = defs.MISSING
            else:
                try:
                    data2[i] = self.stringToValue(v)
                except (ValueError, TypeError):
                    data2[i] = defs.PADDING
                    # only change what wasn't already marked as MISSING
                    if mask2[i] == defs.VALID:
                        mask2[i] = defs.INVALID

        if not mask2.any():
            mask2 = None

        data, mask = data2, mask2

    else:
        if mask is not None and not isinstance(mask, NP.ndarray):
            mask = NP("array", mask, dtype=defs.maskType)

    # this is the only _toDataColumn that doesn't check values and intervals because these were checked in _setup for categorical and ordinal strings

    return DataColumn(self, data, mask)
def evaluate(self, dataTable, functionTable, performanceTable, arguments):
    """Evaluate this built-in as a floor division of its two arguments.

    Rows whose right-hand argument equals zero are marked INVALID;
    that mask is merged with the masks of both arguments by
    C{DataColumn.mapAnyMissingInvalid}.

    @type dataTable: DataTable
    @param dataTable: The input data.
    @type functionTable: FunctionTable
    @param functionTable: Passed through to the argument expressions.
    @type performanceTable: PerformanceTable
    @param performanceTable: Used to time this built-in.
    @type arguments: list of expressions
    @param arguments: Exactly two objects with an C{evaluate} method: numerator and denominator.
    @rtype: DataColumn
    @return: The result column.
    """

    evaluated = [argument.evaluate(dataTable, functionTable, performanceTable) for argument in arguments]

    timerLabel = "built-in \"%s\"" % self.name
    performanceTable.begin(timerLabel)

    fieldType = self.fieldTypeFromSignature(evaluated)
    numerator, divisor = evaluated

    divideByZero = NP(NP(divisor.data == 0.0) * defs.INVALID)
    if not divideByZero.any():
        divideByZero = None

    mask = DataColumn.mapAnyMissingInvalid([divideByZero, numerator.mask, divisor.mask])
    quotient = NP("floor_divide", numerator.data, divisor.data)
    result = DataColumn(fieldType, quotient, mask)

    performanceTable.end(timerLabel)
    return result
def _selectFirst(self, dataTable, functionTable, performanceTable, segmentation):
    """Used by C{calculateScore}.

    Each row is scored by the first Segment (in document order) whose
    predicate selects it; later Segments only see rows that are still
    unfilled.  Output fields produced by the sub-models are merged
    into C{dataTable.output}.

    @type dataTable: DataTable
    @param dataTable: The input data.
    @type functionTable: FunctionTable
    @param functionTable: Passed through to predicates and sub-models.
    @type performanceTable: PerformanceTable
    @param performanceTable: Used to time this method; paused while predicates and sub-models run.
    @type segmentation: PMML element
    @param segmentation: The Segmentation element whose Segment children are evaluated.
    @rtype: dict
    @return: A dictionary with the scores under key None and, if C{self.name} is not None, the matching segment ids under key "segment".
    """

    performanceTable.begin("Segmentation selectFirst")

    scoresData = NP("empty", len(dataTable), dtype=NP.dtype(object))
    scoresMask = NP("zeros", len(dataTable), dtype=defs.maskType)
    unfilled = NP("ones", len(dataTable), dtype=NP.dtype(bool))
    segments = NP("empty", len(dataTable), dtype=NP.dtype(object))

    newOutputData = []
    for segment in segmentation.childrenOfTag("Segment", iterator=True):
        performanceTable.pause("Segmentation selectFirst")
        selection = segment.childOfClass(PmmlPredicate).evaluate(dataTable, functionTable, performanceTable)
        performanceTable.unpause("Segmentation selectFirst")

        # only rows that no earlier segment has claimed
        NP("logical_and", selection, unfilled, selection)
        if not selection.any():
            continue

        subTable = dataTable.subTable(selection)
        subModel = segment.childOfClass(PmmlModel)
        performanceTable.pause("Segmentation selectFirst")
        subModel.calculate(subTable, functionTable, performanceTable)
        performanceTable.unpause("Segmentation selectFirst")

        scoresData[selection] = subTable.score.data
        if subTable.score.mask is not None:
            scoresMask[selection] = subTable.score.mask
        else:
            scoresMask[selection] = defs.VALID

        segmentName = segment.get("id")
        if segmentName is not None:
            segments[selection] = segmentName

        for fieldName, dataColumn in subTable.output.items():
            if fieldName not in dataTable.output:
                # first time this output field is seen: everything outside
                # of the current selection starts out as MISSING
                data = NP("empty", len(dataTable), dtype=dataColumn.fieldType.dtype)
                data[selection] = dataColumn.data

                mask = NP(NP("ones", len(dataTable), dtype=defs.maskType) * defs.MISSING)
                if dataColumn.mask is None:
                    mask[selection] = defs.VALID
                else:
                    mask[selection] = dataColumn.mask

                newDataColumn = DataColumn(dataColumn.fieldType, data, mask)
                newDataColumn._unlock()
                dataTable.output[fieldName] = newDataColumn
                newOutputData.append(newDataColumn)

            else:
                newDataColumn = dataTable.output[fieldName]

                newDataColumn.data[selection] = dataColumn.data
                if dataColumn.mask is None:
                    newDataColumn.mask[selection] = defs.VALID
                else:
                    newDataColumn.mask[selection] = dataColumn.mask

        # Clear the rows that were just filled.  Boolean index assignment is
        # used instead of "unfilled -= selection" because subtracting boolean
        # arrays raises TypeError in NumPy 1.13 and later; selection is a
        # subset of unfilled here, so the result is the same.
        unfilled[selection] = False
        if not unfilled.any():
            break

    for newDataColumn in newOutputData:
        if not newDataColumn.mask.any():
            newDataColumn._mask = None
        newDataColumn._lock()

    if not scoresMask.any():
        scoresMask = None

    scores = DataColumn(self.scoreType, scoresData, scoresMask)

    if self.name is None:
        performanceTable.end("Segmentation selectFirst")
        return {None: scores}
    else:
        performanceTable.end("Segmentation selectFirst")
        return {None: scores, "segment": DataColumn(self.scoreTypeSegment, segments, None)}
def _selectAllMedianMajority(self, dataTable, functionTable, performanceTable, segmentation, which):
    """Used by C{calculateScore}.

    Evaluates every Segment whose predicate selects at least one row
    and combines the VALID sub-model scores per row, depending on
    C{which}:

      - C{self.SELECT_ALL}: a tuple of all scores (and, if
        C{self.name} is not None, a tuple of the segment ids under key
        "segment"); sub-model output fields are collected as tuples in
        C{dataTable.output}.
      - C{self.MEDIAN}: the median of the scores; rows without any
        score are INVALID.
      - C{self.MAJORITY_VOTE} / C{self.WEIGHTED_MAJORITY_VOTE}: the
        score with the largest total weight (the first one wins a
        tie); if C{self.name} is not None, that weight is returned
        under key "cardinality".

    @type dataTable: DataTable
    @param dataTable: The input data.
    @type functionTable: FunctionTable
    @param functionTable: Passed through to predicates and sub-models.
    @type performanceTable: PerformanceTable
    @param performanceTable: Used to time this method; paused while predicates and sub-models run.
    @type segmentation: PMML element
    @param segmentation: The Segmentation element whose Segment children are evaluated.
    @type which: one of the method constants named above
    @param which: The combination method.  NOTE(review): any other value leaves C{performanceLabel} unassigned, so the C{begin} call would fail; presumably the caller never passes one - confirm.
    @rtype: dict
    @return: A dictionary mapping None to the combined score column, plus the optional "segment" or "cardinality" column.
    @raise defs.PmmlValidationError: For MEDIAN, if a sub-model produces "string", "boolean" or "object" scores.
    """

    if which is self.SELECT_ALL:
        performanceLabel = "Segmentation selectAll"
    elif which is self.MEDIAN:
        performanceLabel = "Segmentation median"
    elif which is self.MAJORITY_VOTE:
        performanceLabel = "Segmentation majorityVote"
    elif which is self.WEIGHTED_MAJORITY_VOTE:
        performanceLabel = "Segmentation weightedMajorityVote"
    performanceTable.begin(performanceLabel)

    # One accumulator list per row.  For SELECT_ALL and MEDIAN it holds plain
    # scores; for the majority votes it holds [value, weight] pairs.
    scores = [[] for x in xrange(len(dataTable))]
    if which is self.SELECT_ALL:
        segments = [[] for x in xrange(len(dataTable))]

    newOutputData = {}
    for segment in segmentation.childrenOfTag("Segment", iterator=True):
        performanceTable.pause(performanceLabel)
        selection = segment.childOfClass(PmmlPredicate).evaluate(dataTable, functionTable, performanceTable)
        performanceTable.unpause(performanceLabel)
        if not selection.any():
            continue

        segmentName = segment.get("id")
        # positions in dataTable of the selected rows; the n-th entry is
        # paired below with the n-th row of subTable
        indexes = NP("nonzero", selection)[0]

        subTable = dataTable.subTable(selection)
        subModel = segment.childOfClass(PmmlModel)
        performanceTable.pause(performanceLabel)
        subModel.calculate(subTable, functionTable, performanceTable)
        performanceTable.unpause(performanceLabel)

        if which is self.MEDIAN and subTable.score.fieldType.dataType in ("string", "boolean", "object"):
            raise defs.PmmlValidationError("Segmentation with multipleModelMethod=\"median\" cannot be applied to models that produce dataType \"%s\"" % subTable.score.fieldType.dataType)

        scoreData = subTable.score.data
        scoreMask = subTable.score.mask
        indexesUsed = indexes    # not read anywhere else in this method
        if which is self.SELECT_ALL:
            for subIndex, index in enumerate(indexes):
                if scoreMask is None or scoreMask[subIndex] == defs.VALID:
                    scores[index].append(scoreData[subIndex])
                    segments[index].append(segmentName)

        elif which is self.MEDIAN:
            for subIndex, index in enumerate(indexes):
                if scoreMask is None or scoreMask[subIndex] == defs.VALID:
                    scores[index].append(scoreData[subIndex])

        elif which in (self.MAJORITY_VOTE, self.WEIGHTED_MAJORITY_VOTE):
            if which is self.MAJORITY_VOTE:
                weight = 1.0
            else:
                weight = float(segment.get("weight", 1.0))
            for subIndex, index in enumerate(indexes):
                if scoreMask is None or scoreMask[subIndex] == defs.VALID:
                    newValue = scoreData[subIndex]
                    score = scores[index]
                    # add the weight to an existing [value, weight] pair,
                    # or start a new pair for a value not seen before
                    found = False
                    for pair in score:
                        if pair[0] == newValue:
                            pair[1] += weight
                            found = True
                            break
                    if not found:
                        score.append([newValue, weight])

        if which is self.SELECT_ALL:
            for fieldName, dataColumn in subTable.output.items():
                newData = newOutputData.get(fieldName)
                if newData is None:
                    newData = [[] for x in xrange(len(dataTable))]
                    newOutputData[fieldName] = newData

                dataColumnData = dataColumn.data
                dataColumnMask = dataColumn.mask
                for subIndex, index in enumerate(indexes):
                    # same condition as for the scores above; a non-VALID
                    # output value is recorded as None
                    if scoreMask is None or scoreMask[subIndex] == defs.VALID:
                        if dataColumnMask is None or dataColumnMask[subIndex] == defs.VALID:
                            newData[index].append(dataColumnData[subIndex])
                        else:
                            newData[index].append(None)

    if which is self.SELECT_ALL:
        # freeze the per-row lists into tuples
        for fieldName, newData in newOutputData.items():
            finalNewData = NP("empty", len(dataTable), dtype=NP.dtype(object))
            for index, newDatum in enumerate(newData):
                finalNewData[index] = tuple(newDatum)
            dataTable.output[fieldName] = DataColumn(self.scoreType, finalNewData, None)

        finalScoresData = NP("empty", len(dataTable), dtype=NP.dtype(object))
        for index, score in enumerate(scores):
            finalScoresData[index] = tuple(score)
        finalScores = DataColumn(self.scoreType, finalScoresData, None)

        if self.name is None:
            performanceTable.end(performanceLabel)
            return {None: finalScores}
        else:
            finalSegmentsData = NP("empty", len(dataTable), dtype=NP.dtype(object))
            for index, segment in enumerate(segments):
                finalSegmentsData[index] = tuple(segment)

            performanceTable.end(performanceLabel)
            return {None: finalScores, "segment": DataColumn(self.scoreTypeSegment, finalSegmentsData, None)}

    elif which is self.MEDIAN:
        finalScoresData = NP("empty", len(dataTable), dtype=NP.dtype(object))
        # every entry of this uninitialized mask is assigned in the loop below
        finalScoresMask = NP("empty", len(dataTable), dtype=defs.maskType)
        for index, score in enumerate(scores):
            if len(score) > 0:
                finalScoresData[index] = NP("median", score)
                finalScoresMask[index] = defs.VALID
            else:
                finalScoresMask[index] = defs.INVALID

        if not finalScoresMask.any():
            finalScoresMask = None
        finalScores = DataColumn(self.scoreType, finalScoresData, finalScoresMask)

        performanceTable.end(performanceLabel)
        return {None: finalScores}

    elif which in (self.MAJORITY_VOTE, self.WEIGHTED_MAJORITY_VOTE):
        finalScoresData = NP("empty", len(dataTable), dtype=NP.dtype(object))
        # every entry of this uninitialized mask is assigned in the loop below
        finalScoresMask = NP("empty", len(dataTable), dtype=defs.maskType)
        cardinality = NP("empty", len(dataTable), dtype=self.scoreTypeCardinality.dtype)
        for index, score in enumerate(scores):
            # strict ">" keeps the first value encountered in case of a tie
            bestN, bestValue = None, None
            for value, N in score:
                if bestN is None or N > bestN:
                    bestN = N
                    bestValue = value

            if bestN is not None:
                finalScoresData[index] = bestValue
                finalScoresMask[index] = defs.VALID
                cardinality[index] = bestN
            else:
                finalScoresMask[index] = defs.INVALID
                cardinality[index] = 0

        if not finalScoresMask.any():
            finalScoresMask = None
        finalScores = DataColumn(self.scoreType, finalScoresData, finalScoresMask)

        if self.name is None:
            performanceTable.end(performanceLabel)
            return {None: finalScores}
        else:
            finalCardinality = DataColumn(self.scoreTypeCardinality, cardinality, None)

            performanceTable.end(performanceLabel)
            return {None: finalScores, "cardinality": finalCardinality}
def _sumAverageWeighted(self, dataTable, functionTable, performanceTable, segmentation, which):
    """Used by C{calculateScore}.

    Evaluates every Segment whose predicate selects at least one row
    and adds up the sub-model scores per row.  Sub-model scores that
    are not VALID are ignored.  For the two average methods, the sum
    is divided by the number (or total weight) of contributions, and
    rows without any contribution are INVALID.

    @type dataTable: DataTable
    @param dataTable: The input data.
    @type functionTable: FunctionTable
    @param functionTable: Passed through to predicates and sub-models.
    @type performanceTable: PerformanceTable
    @param performanceTable: Used to time this method; paused while predicates and sub-models run.
    @type segmentation: PMML element
    @param segmentation: The Segmentation element whose Segment children are evaluated.
    @type which: C{self.SUM}, C{self.AVERAGE} or C{self.WEIGHTED_AVERAGE}
    @param which: The combination method.
    @rtype: dict
    @return: A dictionary mapping None to the combined score column.
    @raise defs.PmmlValidationError: If a sub-model produces "string", "boolean" or "object" scores.
    """

    if which is self.SUM:
        performanceLabel = "Segmentation sum"
    elif which is self.AVERAGE:
        performanceLabel = "Segmentation average"
    elif which is self.WEIGHTED_AVERAGE:
        performanceLabel = "Segmentation weightedAverage"
    performanceTable.begin(performanceLabel)

    scoresData = NP("zeros", len(dataTable), dtype=NP.dtype(object))
    if which is not self.SUM:
        denominator = NP("zeros", len(dataTable), dtype=NP.dtype(float))
    invalid = NP("zeros", len(dataTable), dtype=NP.dtype(bool))

    for segment in segmentation.childrenOfTag("Segment", iterator=True):
        performanceTable.pause(performanceLabel)
        selection = segment.childOfClass(PmmlPredicate).evaluate(dataTable, functionTable, performanceTable)
        performanceTable.unpause(performanceLabel)
        if not selection.any():
            continue

        subTable = dataTable.subTable(selection)
        subModel = segment.childOfClass(PmmlModel)
        performanceTable.pause(performanceLabel)
        subModel.calculate(subTable, functionTable, performanceTable)
        performanceTable.unpause(performanceLabel)

        if subTable.score.fieldType.dataType in ("string", "boolean", "object"):
            raise defs.PmmlValidationError("Segmentation with multipleModelMethod=\"%s\" cannot be applied to models that produce dataType \"%s\"" % (self.childOfTag("Segmentation").get("multipleModelMethod"), subTable.score.fieldType.dataType))

        scoreData = subTable.score.data
        scoreMask = subTable.score.mask

        # ignore invalid in matches (like the built-in "+" and "avg" Apply functions)
        if scoreMask is not None:
            # The sub-table has one row per selected row, so its mask cannot
            # be combined with the full-length selection directly (the shapes
            # differ).  Map it back through the selected positions instead,
            # and drop the corresponding scores.
            good = NP(scoreMask == defs.VALID)
            selection[NP("nonzero", selection)[0]] = good
            scoreData = scoreData[good]

        if which is self.SUM:
            scoresData[selection] += scoreData
        if which is self.AVERAGE:
            scoresData[selection] += scoreData
            denominator[selection] += 1.0
        elif which is self.WEIGHTED_AVERAGE:
            weight = float(segment.get("weight", 1.0))
            scoresData[selection] += (scoreData * weight)
            denominator[selection] += weight

        # No update of "invalid" is needed here: rows with a non-VALID
        # sub-model score were removed from the selection above, so such
        # scores are ignored rather than propagated.

    if which is not self.SUM:
        # rows that received no contribution at all cannot be averaged
        NP("logical_or", invalid, NP(denominator == 0.0), invalid)
        valid = NP("logical_not", invalid)
        scoresData[valid] /= denominator[valid]

    if invalid.any():
        scoresMask = NP(NP("array", invalid, dtype=defs.maskType) * defs.INVALID)
    else:
        scoresMask = None

    scores = DataColumn(self.scoreType, scoresData, scoresMask)

    performanceTable.end(performanceLabel)
    return {None: scores}
def format(self, subTable, functionTable, performanceTable, score):
    """Extract or post-process output for the output field of a DataTable.

    The "feature" attribute selects what is returned: a field of
    C{subTable} (no feature), the predicted value, its display string,
    a transformed value, a Decision element, or any other feature
    present in C{score}.  Unless the feature is
    "predictedDisplayValue" or "decision", the result is cast if the
    "dataType" or "optype" attribute differs from the column's type.
    When a feature is given, the result is also stored in
    C{subTable.fields} under the displayName (or name).

    @type subTable: DataTable
    @param subTable: The DataTable associated with this local lexical scope.
    @type functionTable: FunctionTable or None
    @param functionTable: A table of functions.
    @type performanceTable: PerformanceTable or None
    @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
    @type score: dict
    @param score: Dictionary mapping PMML score "feature" strings to DataColumns.  This dictionary always contains a None key, which is the basic feature ("predictedValue").
    @rtype: DataColumn
    @return: The output that would go into an output field of a DataTable.
    @raise defs.PmmlValidationError: If a required child element is absent or the feature is not available in C{score}.
    """

    performanceTable.begin("OutputField")

    feature = self.get("feature")

    if feature is None:
        dataColumn = subTable.fields[self["name"]]

    elif feature == "predictedValue":
        dataColumn = score[None]

    elif feature == "predictedDisplayValue":
        predicted = score[None]
        asString = predicted.fieldType.valueToString
        strings = NP("empty", len(subTable), dtype=NP.dtype(object))
        for position, value in enumerate(predicted.data):
            strings[position] = asString(value)
        dataColumn = DataColumn(FakeFieldType("string", "continuous"), strings, None)

    elif feature == "transformedValue":
        expression = self.childOfClass(PmmlExpression)
        if expression is None:
            raise defs.PmmlValidationError("OutputField with feature \"transformedValue\" requires an EXPRESSION")

        performanceTable.pause("OutputField")
        dataColumn = expression.evaluate(subTable, functionTable, performanceTable)
        performanceTable.unpause("OutputField")

    elif feature == "decision":
        decisions = self.childOfTag("Decisions")
        if decisions is None:
            raise defs.PmmlValidationError("OutputField with feature \"decision\" requires a Decisions block")

        performanceTable.pause("OutputField")
        dataColumn = self.childOfClass(PmmlExpression).evaluate(subTable, functionTable, performanceTable)
        performanceTable.unpause("OutputField")

        valid = None
        if dataColumn.mask is not None:
            valid = NP(dataColumn.mask == defs.VALID)

        fieldType = FakeFieldType("object", "any")
        chosen = NP("empty", len(subTable), dtype=fieldType.dtype)
        # every row is MISSING until one of the Decision values matches it
        chosenMask = NP(NP("ones", len(subTable), dtype=defs.maskType) * defs.MISSING)

        for decision in decisions.childrenOfTag("Decision"):
            value = dataColumn.fieldType.stringToValue(decision["value"])

            selection = NP(dataColumn.data == value)
            if valid is not None:
                NP("logical_and", selection, valid, selection)

            # the Decision element itself is stored, one row at a time
            for position in xrange(len(chosen)):
                if selection[position]:
                    chosen[position] = decision

            chosenMask[selection] = defs.VALID

        if not chosenMask.any():
            chosenMask = None

        dataColumn = DataColumn(fieldType, chosen, chosenMask)

    elif feature in score:
        dataColumn = score[feature]

    else:
        # name the model two levels up for the error message
        parent = self.getparent()
        if parent is not None:
            owner = parent.getparent()
        else:
            owner = None

        if owner is None:
            model = "(orphaned OutputField; no parent model)"
        else:
            model = owner.t

        raise defs.PmmlValidationError("Models of type %s do not produce \"%s\" features (or at least, it is not yet implemented by Augustus)" % (model, feature))

    dataType = self.get("dataType", dataColumn.fieldType.dataType)
    optype = self.get("optype", dataColumn.fieldType.optype)

    typeDiffers = (dataType != dataColumn.fieldType.dataType or optype != dataColumn.fieldType.optype)
    if typeDiffers and feature not in ("predictedDisplayValue", "decision"):
        dataColumn = FieldCastMethods.cast(FakeFieldType(dataType, optype), dataColumn)

    if feature is not None:
        subTable.fields[self.get("displayName", self["name"])] = dataColumn

    performanceTable.end("OutputField")
    return dataColumn
def _sumAverageWeighted(self, dataTable, functionTable, performanceTable, segmentation, which):
    """Combine segment scores by sum, average, or weighted average.

    Used by C{calculateScore}.  Every Segment whose predicate selects at
    least one row has its sub-model calculated on the selected rows, and
    the resulting scores are accumulated row by row.  For the two
    averaging methods the accumulated value is divided by the accumulated
    count (or weight); rows with a zero denominator are marked INVALID.

    @type dataTable: DataTable
    @param dataTable: The DataTable being scored.
    @type functionTable: FunctionTable
    @param functionTable: A table of functions, passed on to predicates and sub-models.
    @type performanceTable: PerformanceTable
    @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
    @type segmentation: PMML element
    @param segmentation: The element whose "Segment" children are combined.
    @type which: singleton Python object
    @param which: One of C{self.SUM}, C{self.AVERAGE}, C{self.WEIGHTED_AVERAGE}.
    @rtype: dict
    @return: A dictionary with a single None key mapping to the combined score DataColumn.
    @raise PmmlValidationError: If a sub-model produces a "string", "boolean", or "object" score.
    """

    # NOTE(review): performanceLabel stays unbound if `which` is none of the
    # three expected singletons.
    if which is self.SUM:
        performanceLabel = "Segmentation sum"
    elif which is self.AVERAGE:
        performanceLabel = "Segmentation average"
    elif which is self.WEIGHTED_AVERAGE:
        performanceLabel = "Segmentation weightedAverage"
    performanceTable.begin(performanceLabel)

    scoresData = NP("zeros", len(dataTable), dtype=NP.dtype(object))
    if which is not self.SUM:
        denominator = NP("zeros", len(dataTable), dtype=NP.dtype(float))
    invalid = NP("zeros", len(dataTable), dtype=NP.dtype(bool))

    for segment in segmentation.childrenOfTag("Segment", iterator=True):
        # predicates and sub-models are timed under their own labels
        performanceTable.pause(performanceLabel)
        selection = segment.childOfClass(PmmlPredicate).evaluate(dataTable, functionTable, performanceTable)
        performanceTable.unpause(performanceLabel)
        if not selection.any():
            continue

        subTable = dataTable.subTable(selection)
        subModel = segment.childOfClass(PmmlModel)
        performanceTable.pause(performanceLabel)
        subModel.calculate(subTable, functionTable, performanceTable)
        performanceTable.unpause(performanceLabel)

        if subTable.score.fieldType.dataType in ("string", "boolean", "object"):
            raise defs.PmmlValidationError("Segmentation with multipleModelMethod=\"%s\" cannot be applied to models that produce dataType \"%s\"" % (self.childOfTag("Segmentation").get("multipleModelMethod"), subTable.score.fieldType.dataType))

        # ignore invalid in matches (like the built-in "+" and "avg" Apply functions)
        # NOTE(review): `selection` has one entry per row of dataTable, while
        # the sub-table score presumably has one entry per *selected* row; if
        # so, the element-wise operations below only line up when no masked
        # score is present -- confirm against DataTable.subTable.
        if subTable.score.mask is not None:
            NP("logical_and", selection, NP(subTable.score.mask == defs.VALID), selection)

        if which is self.SUM:
            scoresData[selection] += subTable.score.data
        if which is self.AVERAGE:
            scoresData[selection] += subTable.score.data
            denominator[selection] += 1.0
        elif which is self.WEIGHTED_AVERAGE:
            # the Segment's "weight" attribute defaults to 1.0
            weight = float(segment.get("weight", 1.0))
            scoresData[selection] += (subTable.score.data * weight)
            denominator[selection] += weight

        # NOTE(review): by this point `selection` has already been narrowed
        # to VALID rows above; verify that this marking does what is intended.
        if subTable.score.mask is not None:
            invalid[selection] = NP("logical_or", invalid[selection], NP(subTable.score.mask != defs.VALID))

    if which is not self.SUM:
        # rows that received no contribution cannot be averaged
        NP("logical_or", invalid, NP(denominator == 0.0), invalid)
        valid = NP("logical_not", invalid)
        scoresData[valid] /= denominator[valid]

    if invalid.any():
        scoresMask = NP(NP("array", invalid, dtype=defs.maskType) * defs.INVALID)
    else:
        scoresMask = None

    scores = DataColumn(self.scoreType, scoresData, scoresMask)

    performanceTable.end(performanceLabel)
    return {None: scores}
def functionMax(self, dataColumn, whereMask, groupSelection, getstate, setstate):
    """Finds the maximum of rows in a DataColumn, possibly with an SQL where mask and groupField.

    The result is a running maximum: each output row holds the largest
    selected value seen up to and including that row (starting from the
    stored state, if any).  Rows before any value has been seen are
    marked INVALID.

    @type dataColumn: DataColumn
    @param dataColumn: The input data column.
    @type whereMask: 1d Numpy array of bool, or None
    @param whereMask: The result of the SQL where selection.
    @type groupSelection: 1d Numpy array of bool, or None.
    @param groupSelection: Rows corresponding to a particular value of the groupField.
    @type getstate: callable function, or None
    @param getstate: Retrieve starting values from the DataTableState.
    @type setstate: callable function, or None
    @param setstate: Store ending values to the DataTableState.
    @rtype: DataColumn
    @return: A column of maximized rows.
    @raise PmmlValidationError: If the input field is neither continuous nor ordinal.
    """

    fieldType = dataColumn.fieldType

    if fieldType.optype not in ("continuous", "ordinal"):
        # the message names this aggregate ("max"), not its sibling "min"
        raise defs.PmmlValidationError("Aggregate function \"max\" requires a continuous or ordinal input field")

    # rows that participate: VALID, and passing both optional selections
    if dataColumn.mask is None:
        selection = NP("ones", len(dataColumn), dtype=NP.dtype(bool))
    else:
        selection = NP(dataColumn.mask == defs.VALID)

    if whereMask is not None:
        NP("logical_and", selection, whereMask, selection)

    if groupSelection is not None:
        NP("logical_and", selection, groupSelection, selection)

    # resume from the stored state, if there is one
    maximum = None
    if getstate is not None:
        startingState = getstate()
        if startingState is not None:
            maximum = startingState

    data = NP("empty", len(dataColumn), dtype=fieldType.dtype)
    mask = NP("zeros", len(dataColumn), dtype=defs.maskType)

    for i, x in enumerate(dataColumn.data):
        if selection[i] and (maximum is None or x > maximum):
            maximum = x

        # every row (selected or not) reports the maximum seen so far
        if maximum is None:
            mask[i] = defs.INVALID
        else:
            data[i] = maximum

    if not mask.any():
        mask = None

    if setstate is not None:
        setstate(maximum)

    return DataColumn(fieldType, data, mask)
def _selectAllMedianMajority(self, dataTable, functionTable, performanceTable, segmentation, which):
    """Combine segment scores by selectAll, median, or (weighted) majority vote.

    Used by C{calculateScore}.  Each Segment whose predicate selects at
    least one row has its sub-model calculated on the selected rows; the
    VALID scores are collected per row of C{dataTable} and then reduced
    according to C{which}.

    @type which: singleton Python object
    @param which: One of C{self.SELECT_ALL}, C{self.MEDIAN}, C{self.MAJORITY_VOTE}, C{self.WEIGHTED_MAJORITY_VOTE}.
    @rtype: dict
    @return: A dictionary mapping None to the combined score DataColumn, plus "segment" (selectAll) or "cardinality" (votes) when C{self.name} is not None.
    @raise PmmlValidationError: If the method is median and a sub-model produces a "string", "boolean", or "object" score.
    """

    if which is self.SELECT_ALL:
        performanceLabel = "Segmentation selectAll"
    elif which is self.MEDIAN:
        performanceLabel = "Segmentation median"
    elif which is self.MAJORITY_VOTE:
        performanceLabel = "Segmentation majorityVote"
    elif which is self.WEIGHTED_MAJORITY_VOTE:
        performanceLabel = "Segmentation weightedMajorityVote"
    performanceTable.begin(performanceLabel)

    numberOfRows = len(dataTable)

    def tupleArray(listOfLists):
        # object array whose entries are the per-row lists frozen into tuples
        frozen = NP("empty", numberOfRows, dtype=NP.dtype(object))
        for position, items in enumerate(listOfLists):
            frozen[position] = tuple(items)
        return frozen

    # one accumulator list per row of dataTable
    scores = [[] for row in xrange(numberOfRows)]
    if which is self.SELECT_ALL:
        segments = [[] for row in xrange(numberOfRows)]

    newOutputData = {}
    for segment in segmentation.childrenOfTag("Segment", iterator=True):
        performanceTable.pause(performanceLabel)
        selection = segment.childOfClass(PmmlPredicate).evaluate(dataTable, functionTable, performanceTable)
        performanceTable.unpause(performanceLabel)
        if not selection.any():
            continue

        segmentName = segment.get("id")
        indexes = NP("nonzero", selection)[0]

        subTable = dataTable.subTable(selection)
        subModel = segment.childOfClass(PmmlModel)
        performanceTable.pause(performanceLabel)
        subModel.calculate(subTable, functionTable, performanceTable)
        performanceTable.unpause(performanceLabel)

        if which is self.MEDIAN and subTable.score.fieldType.dataType in ("string", "boolean", "object"):
            raise defs.PmmlValidationError("Segmentation with multipleModelMethod=\"median\" cannot be applied to models that produce dataType \"%s\"" % subTable.score.fieldType.dataType)

        scoreData = subTable.score.data
        scoreMask = subTable.score.mask

        # (position in the sub-table, row in dataTable) for each VALID score
        if scoreMask is None:
            validPairs = list(enumerate(indexes))
        else:
            validPairs = [(subIndex, index) for subIndex, index in enumerate(indexes) if scoreMask[subIndex] == defs.VALID]

        if which is self.SELECT_ALL:
            for subIndex, index in validPairs:
                scores[index].append(scoreData[subIndex])
                segments[index].append(segmentName)

            # collect the sub-model's output fields alongside the scores
            for fieldName, dataColumn in subTable.output.items():
                if newOutputData.get(fieldName) is None:
                    newOutputData[fieldName] = [[] for row in xrange(numberOfRows)]
                collected = newOutputData[fieldName]

                outputData = dataColumn.data
                outputMask = dataColumn.mask
                for subIndex, index in validPairs:
                    if outputMask is None or outputMask[subIndex] == defs.VALID:
                        collected[index].append(outputData[subIndex])
                    else:
                        collected[index].append(None)

        elif which is self.MEDIAN:
            for subIndex, index in validPairs:
                scores[index].append(scoreData[subIndex])

        elif which in (self.MAJORITY_VOTE, self.WEIGHTED_MAJORITY_VOTE):
            if which is self.MAJORITY_VOTE:
                weight = 1.0
            else:
                weight = float(segment.get("weight", 1.0))

            # each row keeps a list of [value, accumulated weight] tallies
            for subIndex, index in validPairs:
                candidate = scoreData[subIndex]
                tallies = scores[index]
                for tally in tallies:
                    if tally[0] == candidate:
                        tally[1] += weight
                        break
                else:
                    tallies.append([candidate, weight])

    if which is self.SELECT_ALL:
        for fieldName, collected in newOutputData.items():
            dataTable.output[fieldName] = DataColumn(self.scoreType, tupleArray(collected), None)

        finalScores = DataColumn(self.scoreType, tupleArray(scores), None)

        if self.name is None:
            performanceTable.end(performanceLabel)
            return {None: finalScores}

        finalSegmentsData = tupleArray(segments)
        performanceTable.end(performanceLabel)
        return {None: finalScores, "segment": DataColumn(self.scoreTypeSegment, finalSegmentsData, None)}

    elif which is self.MEDIAN:
        finalScoresData = NP("empty", numberOfRows, dtype=NP.dtype(object))
        finalScoresMask = NP("empty", numberOfRows, dtype=defs.maskType)

        for index, collected in enumerate(scores):
            if len(collected) > 0:
                finalScoresData[index] = NP("median", collected)
                finalScoresMask[index] = defs.VALID
            else:
                # no segment contributed a valid score to this row
                finalScoresMask[index] = defs.INVALID

        if not finalScoresMask.any():
            finalScoresMask = None

        finalScores = DataColumn(self.scoreType, finalScoresData, finalScoresMask)
        performanceTable.end(performanceLabel)
        return {None: finalScores}

    elif which in (self.MAJORITY_VOTE, self.WEIGHTED_MAJORITY_VOTE):
        finalScoresData = NP("empty", numberOfRows, dtype=NP.dtype(object))
        finalScoresMask = NP("empty", numberOfRows, dtype=defs.maskType)
        cardinality = NP("empty", numberOfRows, dtype=self.scoreTypeCardinality.dtype)

        for index, tallies in enumerate(scores):
            # the first value to reach the highest weight wins ties
            topWeight = None
            topValue = None
            for value, accumulated in tallies:
                if topWeight is None or accumulated > topWeight:
                    topWeight = accumulated
                    topValue = value

            if topWeight is None:
                finalScoresMask[index] = defs.INVALID
                cardinality[index] = 0
            else:
                finalScoresData[index] = topValue
                finalScoresMask[index] = defs.VALID
                cardinality[index] = topWeight

        if not finalScoresMask.any():
            finalScoresMask = None

        finalScores = DataColumn(self.scoreType, finalScoresData, finalScoresMask)

        if self.name is None:
            performanceTable.end(performanceLabel)
            return {None: finalScores}

        finalCardinality = DataColumn(self.scoreTypeCardinality, cardinality, None)
        performanceTable.end(performanceLabel)
        return {None: finalScores, "cardinality": finalCardinality}
def applyScore(self, dataTable, functionTable, performanceTable, selection,
               score, missingValueStrategy, missingValuePenalty,
               noTrueChildStrategy):
    """Walk through the tree by one Node, splitting the DataTable on the way down and merging it on the way back up.

    A leaf applies its score to the selected rows directly.  A non-leaf
    builds a sub-table and sub-score restricted to the selected rows,
    offers the rows to each child Node in turn (a row is claimed by the
    first child whose predicate is true for it), handles rows whose
    predicate is unknown according to C{missingValueStrategy}, and
    finally writes the sub-score back into C{score} at the selected rows.

    @type dataTable: DataTable
    @param dataTable: A DataTable containing all rows that match this node in the tree and those above it.
    @type functionTable: FunctionTable
    @param functionTable: A table of functions.
    @type performanceTable: PerformanceTable
    @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
    @type selection: 1d Numpy array of bool
    @param selection: The rows in this DataTable that match this Node.
    @type score: dict
    @param score: A dictionary that maps PMML score "features" to DataColumns.  The None key is "predictedValue" and is the only one guaranteed to exist.  Its columns are modified in place.
    @type missingValueStrategy: singleton Python object, defined in the Node class
    @param missingValueStrategy: The tree's global missing value strategy.
    @type missingValuePenalty: number
    @param missingValuePenalty: The tree's global missing value penalty.
    @type noTrueChildStrategy: singleton Python object, defined in the Node class
    @param noTrueChildStrategy: The tree's global no-true-child strategy.
    @raise PmmlValidationError: If the strategy is "defaultChild" and this node has no defaultChild attribute or the named child does not exist.
    @raise NotImplementedError: If the strategy is "weightedConfidence" or "aggregateNodes" and unknowns are encountered.
    """

    # nothing to do when no row reaches this node
    if not selection.any():
        return

    subNodes = self.childrenOfClass(Node)
    if len(subNodes) == 0:
        self.applyScoreLeaf(selection, score, performanceTable)

    else:
        performanceTable.begin("split downward")
        subTable = dataTable.subTable(selection)
        subScore = {}
        for name, field in score.items():
            if field.mask is None:
                subScore[name] = DataColumn(field.fieldType,
                                            field.data[selection], None)
            else:
                subScore[name] = DataColumn(field.fieldType,
                                            field.data[selection],
                                            field.mask[selection])
            # the children write into these columns
            subScore[name]._unlock()

        # rows of the sub-table that no child has claimed yet
        unset = NP("ones", len(subTable), dtype=NP.dtype(bool))
        performanceTable.end("split downward")

        for subNode in subNodes:
            subSelection, subUnknowns, subEncounteredUnknowns = subNode.evaluatePredicate(
                subTable, functionTable, performanceTable, returnUnknowns=True)

            performanceTable.begin("logical_and")
            # restrict everything to still-unclaimed rows; a row with an
            # unknown predicate is not counted as selected
            NP("logical_and", subSelection, unset, subSelection)
            NP("logical_and", subSelection, NP("logical_not", subUnknowns),
               subSelection)
            NP("logical_and", subUnknowns, unset, subUnknowns)
            NP("logical_and", subEncounteredUnknowns, unset,
               subEncounteredUnknowns)
            # rows selected by this child are now claimed
            NP("logical_and", unset, NP("logical_not", subSelection), unset)
            performanceTable.end("logical_and")

            subNode.applyScore(subTable, functionTable, performanceTable,
                               subSelection, subScore, missingValueStrategy,
                               missingValuePenalty, noTrueChildStrategy)

            if "penaltyProduct" in subScore:
                subScore["penaltyProduct"].data[
                    subEncounteredUnknowns] *= missingValuePenalty

            if subUnknowns.any():
                if missingValueStrategy is self.LAST_PREDICTION:
                    # score the unknown rows with this node and claim them
                    self.applyScoreLeaf(subUnknowns, subScore,
                                        performanceTable)
                    NP("logical_and", unset, NP("logical_not", subUnknowns),
                       unset)

                elif missingValueStrategy is self.NULL_PREDICTION:
                    # claim the unknown rows without scoring them
                    NP("logical_and", unset, NP("logical_not", subUnknowns),
                       unset)

                elif missingValueStrategy is self.DEFAULT_CHILD:
                    defaultChild = self.xpath("@defaultChild")
                    if len(defaultChild) == 0:
                        raise defs.PmmlValidationError(
                            "When missingValueStrategy is \"defaultChild\", every non-leaf node must have a defaultChild attribute"
                        )
                    defaultChild = defaultChild[0]

                    defaultNode = self.xpath("pmml:Node[@id='%s']" %
                                             defaultChild)
                    if len(defaultNode) == 0:
                        raise defs.PmmlValidationError(
                            "The defaultChild \"%s\" is not found (no such id at this level)"
                            % defaultChild)
                    defaultNode = defaultNode[0]

                    # claim the unknown rows and send them down the default child
                    NP("logical_and", unset, NP("logical_not", subUnknowns),
                       unset)

                    defaultNode.applyScore(subTable, functionTable,
                                           performanceTable, subUnknowns,
                                           subScore, missingValueStrategy,
                                           missingValuePenalty,
                                           noTrueChildStrategy)

                elif missingValueStrategy is self.WEIGHTED_CONFIDENCE:
                    # this involves evaluating an ensemble of subtrees and choosing among them: too hard
                    raise NotImplementedError(
                        "missingValueStrategy=\"weightedConfidence\"")

                elif missingValueStrategy is self.AGGREGATE_NODES:
                    # this involves evaluating an ensemble of subtrees and aggregating over them: too hard
                    raise NotImplementedError(
                        "missingValueStrategy=\"aggregateNodes\"")

                elif missingValueStrategy is self.NONE:
                    # unknown rows stay unclaimed and are offered to the next child
                    pass

            # stop early once every row has been claimed
            if not unset.any():
                break

        # rows that no child claimed fall back to this node's own score
        if noTrueChildStrategy is self.RETURN_LAST_PREDICTION and unset.any(
        ):
            self.applyScoreLeaf(unset, subScore, performanceTable)

        performanceTable.begin("merge upward")
        for name, field in score.items():
            field.data[selection] = subScore[name].data
            if field.mask is not None:
                field.mask[selection] = subScore[name].mask
        performanceTable.end("merge upward")
def applyScore(self, dataTable, functionTable, performanceTable, selection, score, missingValueStrategy, missingValuePenalty, noTrueChildStrategy):
    """Score the rows reaching this Node, recursing into its child Nodes.

    A leaf scores the selected rows itself.  Otherwise the selected rows
    are split off into a sub-table and sub-score, offered to each child
    in document order, and the sub-score is merged back into C{score}.

    @type dataTable: DataTable
    @param dataTable: A DataTable containing all rows that match this node in the tree and those above it.
    @type functionTable: FunctionTable
    @param functionTable: A table of functions.
    @type performanceTable: PerformanceTable
    @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
    @type selection: 1d Numpy array of bool
    @param selection: The rows in this DataTable that match this Node.
    @type score: dict
    @param score: A dictionary that maps PMML score "features" to DataColumns.  The None key is "predictedValue" and is the only one guaranteed to exist.
    @type missingValueStrategy: singleton Python object, defined in the Node class
    @param missingValueStrategy: The tree's global missing value strategy.
    @type missingValuePenalty: number
    @param missingValuePenalty: The tree's global missing value penalty.
    @type noTrueChildStrategy: singleton Python object, defined in the Node class
    @param noTrueChildStrategy: The tree's global no-true-child strategy.
    """

    if not selection.any():
        return

    children = self.childrenOfClass(Node)
    if len(children) == 0:
        # leaf: no splitting or merging required
        self.applyScoreLeaf(selection, score, performanceTable)
        return

    # --- split downward: restrict table and scores to the selected rows
    performanceTable.begin("split downward")
    subTable = dataTable.subTable(selection)
    subScore = {}
    for name, field in score.items():
        selectedData = field.data[selection]
        if field.mask is None:
            selectedMask = None
        else:
            selectedMask = field.mask[selection]
        column = DataColumn(field.fieldType, selectedData, selectedMask)
        subScore[name] = column
        column._unlock()

    # True for rows that no child has claimed yet
    unset = NP("ones", len(subTable), dtype=NP.dtype(bool))
    performanceTable.end("split downward")

    for child in children:
        matched, unknowns, encountered = child.evaluatePredicate(subTable, functionTable, performanceTable, returnUnknowns=True)

        performanceTable.begin("logical_and")
        # in-place updates; their order matters
        NP("logical_and", matched, unset, matched)
        NP("logical_and", matched, NP("logical_not", unknowns), matched)
        NP("logical_and", unknowns, unset, unknowns)
        NP("logical_and", encountered, unset, encountered)
        NP("logical_and", unset, NP("logical_not", matched), unset)
        performanceTable.end("logical_and")

        child.applyScore(subTable, functionTable, performanceTable, matched, subScore, missingValueStrategy, missingValuePenalty, noTrueChildStrategy)

        if "penaltyProduct" in subScore:
            subScore["penaltyProduct"].data[encountered] *= missingValuePenalty

        if unknowns.any():
            if missingValueStrategy is self.LAST_PREDICTION:
                self.applyScoreLeaf(unknowns, subScore, performanceTable)
                NP("logical_and", unset, NP("logical_not", unknowns), unset)

            elif missingValueStrategy is self.NULL_PREDICTION:
                NP("logical_and", unset, NP("logical_not", unknowns), unset)

            elif missingValueStrategy is self.DEFAULT_CHILD:
                defaultIds = self.xpath("@defaultChild")
                if len(defaultIds) == 0:
                    raise defs.PmmlValidationError("When missingValueStrategy is \"defaultChild\", every non-leaf node must have a defaultChild attribute")
                defaultId = defaultIds[0]

                candidates = self.xpath("pmml:Node[@id='%s']" % defaultId)
                if len(candidates) == 0:
                    raise defs.PmmlValidationError("The defaultChild \"%s\" is not found (no such id at this level)" % defaultId)
                defaultNode = candidates[0]

                NP("logical_and", unset, NP("logical_not", unknowns), unset)
                defaultNode.applyScore(subTable, functionTable, performanceTable, unknowns, subScore, missingValueStrategy, missingValuePenalty, noTrueChildStrategy)

            elif missingValueStrategy is self.WEIGHTED_CONFIDENCE:
                # would require evaluating an ensemble of subtrees and choosing among them
                raise NotImplementedError("missingValueStrategy=\"weightedConfidence\"")

            elif missingValueStrategy is self.AGGREGATE_NODES:
                # would require evaluating an ensemble of subtrees and aggregating over them
                raise NotImplementedError("missingValueStrategy=\"aggregateNodes\"")

            elif missingValueStrategy is self.NONE:
                pass

        if not unset.any():
            break

    if noTrueChildStrategy is self.RETURN_LAST_PREDICTION and unset.any():
        self.applyScoreLeaf(unset, subScore, performanceTable)

    # --- merge upward: copy the sub-score back into the caller's columns
    performanceTable.begin("merge upward")
    for name, field in score.items():
        merged = subScore[name]
        field.data[selection] = merged.data
        if field.mask is not None:
            field.mask[selection] = merged.mask
    performanceTable.end("merge upward")
def format(self, subTable, functionTable, performanceTable, score):
    """Extract or post-process output for the output field of a DataTable.

    Dispatches on this OutputField's C{feature} attribute.  With no
    feature, an existing field of C{subTable} is returned; otherwise the
    output is derived from C{score} or from a child expression and is
    also registered in C{subTable.fields} under C{displayName} (or
    C{name} when there is no C{displayName}).

    @type subTable: DataTable
    @param subTable: The DataTable associated with this local lexical scope.
    @type functionTable: FunctionTable or None
    @param functionTable: A table of functions.
    @type performanceTable: PerformanceTable
    @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.  It is used unconditionally.
    @type score: dict
    @param score: Dictionary mapping PMML score "feature" strings to DataColumns.  This dictionary always contains a None key, which is the basic feature ("predictedValue").
    @rtype: DataColumn
    @return: The output that would go into an output field of a DataTable.
    @raise PmmlValidationError: If a required child (expression or Decisions block) is missing, or if the feature is not produced by the model.
    """

    performanceTable.begin("OutputField")

    feature = self.get("feature")
    if feature is None:
        dataColumn = subTable.fields[self["name"]]

    elif feature == "predictedValue":
        dataColumn = score[None]

    elif feature == "predictedDisplayValue":
        original = score[None]
        toString = original.fieldType.valueToString
        data = NP("empty", len(subTable), dtype=NP.dtype(object))
        for i, x in enumerate(original.data):
            data[i] = toString(x)
        dataColumn = DataColumn(FakeFieldType("string", "continuous"), data, None)

    elif feature == "transformedValue":
        expression = self.childOfClass(PmmlExpression)
        if expression is None:
            raise defs.PmmlValidationError(
                "OutputField with feature \"transformedValue\" requires an EXPRESSION")

        performanceTable.pause("OutputField")
        dataColumn = expression.evaluate(subTable, functionTable, performanceTable)
        performanceTable.unpause("OutputField")

    elif feature == "decision":
        decisions = self.childOfTag("Decisions")
        if decisions is None:
            raise defs.PmmlValidationError(
                "OutputField with feature \"decision\" requires a Decisions block")

        # a missing expression is a document error, reported the same way
        # as in the "transformedValue" branch instead of failing on None
        expression = self.childOfClass(PmmlExpression)
        if expression is None:
            raise defs.PmmlValidationError(
                "OutputField with feature \"decision\" requires an EXPRESSION")

        performanceTable.pause("OutputField")
        dataColumn = expression.evaluate(subTable, functionTable, performanceTable)
        performanceTable.unpause("OutputField")

        if dataColumn.mask is None:
            valid = None
        else:
            valid = NP(dataColumn.mask == defs.VALID)

        fieldType = FakeFieldType("object", "any")
        data = NP("empty", len(subTable), dtype=fieldType.dtype)
        # rows that match no Decision remain MISSING
        mask = NP(NP("ones", len(subTable), dtype=defs.maskType) * defs.MISSING)

        for decision in decisions.childrenOfTag("Decision"):
            value = dataColumn.fieldType.stringToValue(decision["value"])

            selection = NP(dataColumn.data == value)
            if valid is not None:
                NP("logical_and", selection, valid, selection)

            # visit only the matching rows; the Decision object is stored
            # one slot at a time
            for i in NP("nonzero", selection)[0]:
                data[i] = decision

            mask[selection] = defs.VALID

        if not mask.any():
            mask = None

        dataColumn = DataColumn(fieldType, data, mask)

    elif feature in score:
        dataColumn = score[feature]

    else:
        # the model is the grandparent of this OutputField
        model = self.getparent()
        if model is not None:
            model = model.getparent()

        if model is None:
            model = "(orphaned OutputField; no parent model)"
        else:
            model = model.t

        raise defs.PmmlValidationError(
            "Models of type %s do not produce \"%s\" features (or at least, it is not yet implemented by Augustus)"
            % (model, feature))

    dataType = self.get("dataType", dataColumn.fieldType.dataType)
    optype = self.get("optype", dataColumn.fieldType.optype)
    # "predictedDisplayValue" and "decision" outputs are never cast
    if (dataType != dataColumn.fieldType.dataType or optype != dataColumn.fieldType.optype) and feature not in ("predictedDisplayValue", "decision"):
        dataColumn = FieldCastMethods.cast(FakeFieldType(dataType, optype), dataColumn)

    if feature is not None:
        subTable.fields[self.get("displayName", self["name"])] = dataColumn

    performanceTable.end("OutputField")
    return dataColumn
def calculateScore(self, dataTable, functionTable, performanceTable):
    """Calculate the score of this model.

    This method is called by C{calculate} to separate operations that are
    performed by all models (in C{calculate}) from operations that are
    performed by specific models (in C{calculateScore}).

    For every Cluster, the configured metric is accumulated over the
    center ClusteringFields to give one distance per row; each row is
    assigned to the cluster with the smallest distance (the first such
    cluster on ties).  Rows with an INVALID value in any clustering field
    are marked INVALID in every output.

    @type dataTable: DataTable
    @param dataTable: The DataTable representing this model's lexical scope.
    @type functionTable: FunctionTable or None
    @param functionTable: A table of functions.
    @type performanceTable: PerformanceTable
    @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
    @rtype: dict
    @return: A dictionary mapping score "feature" names to DataColumns.  The None key is the predicted cluster identifier; the other keys depend on C{self.subFields}.
    @raise PmmlValidationError: If a fieldWeight is negative, a clustering field is a string, a Cluster has no center array or one of the wrong length, or a distribution-based Cluster lacks a numeric Covariances/Matrix.
    """

    performanceTable.begin("ClusteringModel")

    performanceTable.begin("set up")

    distributionBased = (self["modelClass"] == "distributionBased")
    clusteringFields = self.xpath("pmml:ClusteringField[not(@isCenterField='false')]")
    fieldWeights = [clusteringField.get("fieldWeight", defaultFromXsd=True, convertType=True) for clusteringField in clusteringFields]
    for fieldWeight in fieldWeights:
        if fieldWeight < 0.0:
            raise defs.PmmlValidationError("ClusteringField fieldWeights must all be non-negative (encountered %g)" % fieldWeight)
    clusters = self.xpath("pmml:Cluster")
    comparisonMeasure = self.childOfClass(ComparisonMeasure)
    defaultCompareFunction = comparisonMeasure.get("compareFunction", defaultFromXsd=True)
    metric = comparisonMeasure.childOfClass(PmmlClusteringMetric)
    metrictag = metric.t

    performanceTable.end("set up")

    for clusteringField in clusteringFields:
        dataType = dataTable.fields[clusteringField["field"]].fieldType.dataType
        if dataType == "string":
            raise defs.PmmlValidationError("ClusteringField \"%s\" has dataType \"%s\", which cannot be used for clustering" % (clusteringField["field"], dataType))

    missingValueWeights = self.childOfTag("MissingValueWeights")
    if missingValueWeights is None:
        adjustM = None

    else:
        performanceTable.begin("MissingValueWeights")

        missingWeights = missingValueWeights.childOfClass(PmmlArray).values(convertType=True)

        sumNMqi = NP("zeros", len(dataTable), dtype=NP.dtype(float))
        for clusteringField, missingWeight in zip(clusteringFields, missingWeights):
            clusteringField.addToAdjustM(dataTable, functionTable, performanceTable, sumNMqi, missingWeight)

        # rows whose accumulated weight is zero get a neutral adjustment
        adjustM = NP(sum(missingWeights) / sumNMqi)
        adjustM[NP(sumNMqi == 0.0)] = 1.0

        performanceTable.end("MissingValueWeights")

    anyInvalid = NP("zeros", len(dataTable), dtype=NP.dtype(bool))
    for clusteringField in clusteringFields:
        mask = dataTable.fields[clusteringField["field"]].mask
        if mask is not None:
            NP("logical_or", anyInvalid, NP(mask == defs.INVALID), anyInvalid)

    bestClusterId = None
    bestClusterAffinity = None
    allClusterAffinities = {}

    for index, cluster in enumerate(clusters):
        array = cluster.childOfClass(PmmlArray)
        if array is None:
            raise defs.PmmlValidationError("Cluster must have an array to designate its center")

        centerStrings = array.values(convertType=False)
        if len(centerStrings) != len(clusteringFields):
            raise defs.PmmlValidationError("Cluster array has %d components, but there are %d ClusteringFields with isCenterField=true" % (len(centerStrings), len(clusteringFields)))

        performanceTable.begin(metrictag)

        if distributionBased:
            matrix = cluster.xpath("pmml:Covariances/pmml:Matrix")
            if len(matrix) != 1:
                raise defs.PmmlValidationError("In distribution-based clustering, all clusters must have a Covariances/Matrix")
            try:
                covarianceMatrix = NP("array", matrix[0].values(), dtype=NP.dtype(float))
            except ValueError:
                raise defs.PmmlValidationError("Covariances/Matrix must contain real numbers for distribution-based clustering")

        else:
            covarianceMatrix = None

        state = self._State()
        metric.initialize(state, len(dataTable), len(clusteringFields), distributionBased)

        for clusteringField, centerString, fieldWeight in zip(clusteringFields, centerStrings, fieldWeights):
            if isinstance(metric, PmmlClusteringMetricBinary):
                metric.accumulateBinary(state, dataTable.fields[clusteringField["field"]], centerString, distributionBased)
            else:
                # the field comparison is timed under its own labels
                performanceTable.pause(metrictag)
                cxy = clusteringField.compare(dataTable, functionTable, performanceTable, centerString, defaultCompareFunction, anyInvalid)
                performanceTable.unpause(metrictag)

                metric.accumulate(state, cxy, fieldWeight, distributionBased)

        distance = metric.finalizeDistance(state, adjustM, distributionBased, covarianceMatrix)
        del state

        performanceTable.end(metrictag)

        if index == 0:
            bestClusterId = NP("ones", len(dataTable), dtype=NP.dtype(int))   # 1-based index
            # Copy: the running best is updated in place below, and the
            # first cluster's own distance array is also stored in
            # allClusterAffinities, so the two must not share memory.
            bestClusterAffinity = NP("copy", distance)

        better = NP(distance < bestClusterAffinity)
        bestClusterId[better] = index + 1   # 1-based index
        bestClusterAffinity[better] = distance[better]

        allClusterAffinities[cluster.get("id", "%d" % (index + 1))] = distance

    if not anyInvalid.any():
        scoreMask = None
    else:
        scoreMask = NP(anyInvalid * defs.INVALID)

    performanceTable.begin("set scores")
    score = {}

    performanceTable.begin("predictedValue")
    fieldType = FakeFieldType("string", "categorical")
    clusterIdentifiers = NP("empty", len(dataTable), dtype=fieldType.dtype)
    for index, cluster in enumerate(clusters):
        # a Cluster without an id is identified by its 1-based position
        value = fieldType.stringToValue(cluster.get("id", "%d" % (index + 1)))
        clusterIdentifiers[NP(bestClusterId == (index + 1))] = value
    score[None] = DataColumn(fieldType, clusterIdentifiers, scoreMask)
    performanceTable.end("predictedValue")

    if self.subFields["predictedDisplayValue"]:
        performanceTable.begin("predictedDisplayValue")
        fieldType = FakeFieldType("string", "categorical")
        clusterNames = NP("empty", len(dataTable), dtype=fieldType.dtype)
        for index, cluster in enumerate(clusters):
            value = fieldType.stringToValue(cluster.get("name", ""))
            clusterNames[NP(bestClusterId == (index + 1))] = value
        score["predictedDisplayValue"] = DataColumn(fieldType, clusterNames, scoreMask)
        performanceTable.end("predictedDisplayValue")

    if self.subFields["entity"]:
        performanceTable.begin("entity")
        fieldType = FakeFieldType("object", "any")
        entities = NP("empty", len(dataTable), dtype=fieldType.dtype)
        for index, cluster in enumerate(clusters):
            # NOTE(review): `value` is computed but not used below; kept
            # as in the original pending confirmation that it can go.
            value = fieldType.stringToValue(cluster.get("name", ""))
            indexPlusOne = index + 1
            # store the Cluster element itself, one slot at a time
            for i in xrange(len(entities)):
                if bestClusterId[i] == indexPlusOne:
                    entities[i] = cluster
        score["entity"] = DataColumn(fieldType, entities, scoreMask)
        performanceTable.end("entity")

    if self.subFields["clusterId"]:
        performanceTable.begin("clusterId")
        fieldType = FakeFieldType("integer", "continuous")
        score["clusterId"] = DataColumn(fieldType, bestClusterId, scoreMask)
        performanceTable.end("clusterId")

    if self.subFields["entityId"]:
        performanceTable.begin("entityId")
        fieldType = FakeFieldType("integer", "continuous")
        score["entityId"] = DataColumn(fieldType, bestClusterId, scoreMask)
        performanceTable.end("entityId")

    if self.subFields["clusterAffinity"]:
        performanceTable.begin("clusterAffinity")
        fieldType = FakeFieldType("double", "continuous")
        score["clusterAffinity"] = DataColumn(fieldType, bestClusterAffinity, scoreMask)
        performanceTable.end("clusterAffinity")

    if self.subFields["affinity"]:
        performanceTable.begin("affinity")
        fieldType = FakeFieldType("double", "continuous")
        score["affinity"] = DataColumn(fieldType, bestClusterAffinity, scoreMask)
        performanceTable.end("affinity")

    if self.subFields["all"]:
        performanceTable.begin("all")
        fieldType = FakeFieldType("double", "continuous")
        for identifier, distance in allClusterAffinities.items():
            score["all.%s" % identifier] = DataColumn(fieldType, distance, scoreMask)
        performanceTable.end("all")

    performanceTable.end("set scores")
    performanceTable.end("ClusteringModel")
    return score
def calculateScore(self, dataTable, functionTable, performanceTable):
    """Calculate the score of this model.

    This method is called by C{calculate} to separate operations
    that are performed by all models (in C{calculate}) from
    operations that are performed by specific models (in
    C{calculateScore}).

    For every Cluster, a distance (affinity) to each row of the
    C{dataTable} is computed with the model's ComparisonMeasure
    metric; the cluster with the smallest distance wins (ties go to
    the earlier cluster).  Rows in which any clustering field is
    INVALID are marked INVALID in every returned column.

    @type dataTable: DataTable
    @param dataTable: The DataTable representing this model's lexical scope.
    @type functionTable: FunctionTable or None
    @param functionTable: A table of functions.
    @type performanceTable: PerformanceTable or None
    @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
    @rtype: dict of DataColumn
    @return: A dictionary of score columns: key C{None} holds the predicted cluster identifier, and the keys "predictedDisplayValue", "entity", "clusterId", "entityId", "clusterAffinity", "affinity", and "all.<id>" are filled in when the corresponding entry of C{self.subFields} is set.
    @raise PmmlValidationError: If a fieldWeight is negative, a ClusteringField refers to a string field, a Cluster has no center array or one of the wrong length, or (for distribution-based models) the Covariances/Matrix is missing or non-numeric.
    """

    performanceTable.begin("ClusteringModel")

    performanceTable.begin("set up")

    distributionBased = (self["modelClass"] == "distributionBased")
    clusteringFields = self.xpath("pmml:ClusteringField[not(@isCenterField='false')]")
    fieldWeights = [clusteringField.get("fieldWeight", defaultFromXsd=True, convertType=True)
                    for clusteringField in clusteringFields]
    for fieldWeight in fieldWeights:
        if fieldWeight < 0.0:
            raise defs.PmmlValidationError("ClusteringField fieldWeights must all be non-negative (encountered %g)" % fieldWeight)
    clusters = self.xpath("pmml:Cluster")
    comparisonMeasure = self.childOfClass(ComparisonMeasure)
    defaultCompareFunction = comparisonMeasure.get("compareFunction", defaultFromXsd=True)
    metric = comparisonMeasure.childOfClass(PmmlClusteringMetric)
    metrictag = metric.t

    performanceTable.end("set up")

    for clusteringField in clusteringFields:
        dataType = dataTable.fields[clusteringField["field"]].fieldType.dataType
        if dataType == "string":
            raise defs.PmmlValidationError("ClusteringField \"%s\" has dataType \"%s\", which cannot be used for clustering" % (clusteringField["field"], dataType))

    missingValueWeights = self.childOfTag("MissingValueWeights")
    if missingValueWeights is None:
        adjustM = None
    else:
        performanceTable.begin("MissingValueWeights")

        missingWeights = missingValueWeights.childOfClass(PmmlArray).values(convertType=True)
        sumNMqi = NP("zeros", len(dataTable), dtype=NP.dtype(float))
        for clusteringField, missingWeight in zip(clusteringFields, missingWeights):
            clusteringField.addToAdjustM(dataTable, functionTable, performanceTable, sumNMqi, missingWeight)

        adjustM = NP(sum(missingWeights) / sumNMqi)
        # Rows whose denominator is zero would otherwise carry a
        # non-finite adjustment; they get a neutral factor of 1.
        adjustM[NP(sumNMqi == 0.0)] = 1.0

        performanceTable.end("MissingValueWeights")

    # Accumulate, row by row, whether any clustering field is INVALID.
    anyInvalid = NP("zeros", len(dataTable), dtype=NP.dtype(bool))
    for clusteringField in clusteringFields:
        mask = dataTable.fields[clusteringField["field"]].mask
        if mask is not None:
            NP("logical_or", anyInvalid, NP(mask == defs.INVALID), anyInvalid)

    bestClusterId = None
    bestClusterAffinity = None
    allClusterAffinities = {}

    for index, cluster in enumerate(clusters):
        array = cluster.childOfClass(PmmlArray)
        if array is None:
            raise defs.PmmlValidationError("Cluster must have an array to designate its center")

        centerStrings = array.values(convertType=False)
        if len(centerStrings) != len(clusteringFields):
            raise defs.PmmlValidationError("Cluster array has %d components, but there are %d ClusteringFields with isCenterField=true" % (len(centerStrings), len(clusteringFields)))

        performanceTable.begin(metrictag)

        if distributionBased:
            matrix = cluster.xpath("pmml:Covariances/pmml:Matrix")
            if len(matrix) != 1:
                raise defs.PmmlValidationError("In distribution-based clustering, all clusters must have a Covariances/Matrix")
            try:
                covarianceMatrix = NP("array", matrix[0].values(), dtype=NP.dtype(float))
            except ValueError:
                raise defs.PmmlValidationError("Covariances/Matrix must contain real numbers for distribution-based clustering")
        else:
            covarianceMatrix = None

        state = self._State()
        metric.initialize(state, len(dataTable), len(clusteringFields), distributionBased)

        for clusteringField, centerString, fieldWeight in zip(clusteringFields, centerStrings, fieldWeights):
            if isinstance(metric, PmmlClusteringMetricBinary):
                metric.accumulateBinary(state, dataTable.fields[clusteringField["field"]], centerString, distributionBased)
            else:
                # The comparison is timed separately from the metric.
                performanceTable.pause(metrictag)
                cxy = clusteringField.compare(dataTable, functionTable, performanceTable, centerString, defaultCompareFunction, anyInvalid)
                performanceTable.unpause(metrictag)
                metric.accumulate(state, cxy, fieldWeight, distributionBased)

        distance = metric.finalizeDistance(state, adjustM, distributionBased, covarianceMatrix)
        del state

        performanceTable.end(metrictag)

        if index == 0:
            bestClusterId = NP("ones", len(dataTable), dtype=NP.dtype(int))   # 1-based index
            # Copy rather than alias: bestClusterAffinity is updated in
            # place for later clusters, and the first cluster's own
            # distances (kept in allClusterAffinities) must not be
            # overwritten by those updates.
            bestClusterAffinity = NP("copy", distance)
        else:
            better = NP(distance < bestClusterAffinity)
            bestClusterId[better] = index + 1   # 1-based index
            bestClusterAffinity[better] = distance[better]

        allClusterAffinities[cluster.get("id", "%d" % (index + 1))] = distance

    if not anyInvalid.any():
        scoreMask = None
    else:
        scoreMask = NP(anyInvalid * defs.INVALID)

    performanceTable.begin("set scores")
    score = {}

    performanceTable.begin("predictedValue")
    fieldType = FakeFieldType("string", "categorical")
    clusterIdentifiers = NP("empty", len(dataTable), dtype=fieldType.dtype)
    for index, cluster in enumerate(clusters):
        # Clusters without an explicit id are identified by their 1-based position.
        value = fieldType.stringToValue(cluster.get("id", "%d" % (index + 1)))
        clusterIdentifiers[NP(bestClusterId == (index + 1))] = value
    score[None] = DataColumn(fieldType, clusterIdentifiers, scoreMask)
    performanceTable.end("predictedValue")

    if self.subFields["predictedDisplayValue"]:
        performanceTable.begin("predictedDisplayValue")
        fieldType = FakeFieldType("string", "categorical")
        clusterNames = NP("empty", len(dataTable), dtype=fieldType.dtype)
        for index, cluster in enumerate(clusters):
            value = fieldType.stringToValue(cluster.get("name", ""))
            clusterNames[NP(bestClusterId == (index + 1))] = value
        score["predictedDisplayValue"] = DataColumn(fieldType, clusterNames, scoreMask)
        performanceTable.end("predictedDisplayValue")

    if self.subFields["entity"]:
        performanceTable.begin("entity")
        fieldType = FakeFieldType("object", "any")
        entities = NP("empty", len(dataTable), dtype=fieldType.dtype)
        for index, cluster in enumerate(clusters):
            # NOTE(review): "value" is never used below; the call is kept
            # only so that this block behaves exactly as before.
            value = fieldType.stringToValue(cluster.get("name", ""))
            indexPlusOne = index + 1
            # Element-by-element assignment; presumably chosen so the
            # Cluster object is stored as a single item rather than
            # broadcast as a sequence -- TODO confirm.
            for i in xrange(len(entities)):
                if bestClusterId[i] == indexPlusOne:
                    entities[i] = cluster
        score["entity"] = DataColumn(fieldType, entities, scoreMask)
        performanceTable.end("entity")

    if self.subFields["clusterId"]:
        performanceTable.begin("clusterId")
        fieldType = FakeFieldType("integer", "continuous")
        score["clusterId"] = DataColumn(fieldType, bestClusterId, scoreMask)
        performanceTable.end("clusterId")

    if self.subFields["entityId"]:
        performanceTable.begin("entityId")
        fieldType = FakeFieldType("integer", "continuous")
        score["entityId"] = DataColumn(fieldType, bestClusterId, scoreMask)
        performanceTable.end("entityId")

    if self.subFields["clusterAffinity"]:
        performanceTable.begin("clusterAffinity")
        fieldType = FakeFieldType("double", "continuous")
        score["clusterAffinity"] = DataColumn(fieldType, bestClusterAffinity, scoreMask)
        performanceTable.end("clusterAffinity")

    if self.subFields["affinity"]:
        performanceTable.begin("affinity")
        fieldType = FakeFieldType("double", "continuous")
        score["affinity"] = DataColumn(fieldType, bestClusterAffinity, scoreMask)
        performanceTable.end("affinity")

    if self.subFields["all"]:
        performanceTable.begin("all")
        fieldType = FakeFieldType("double", "continuous")
        for identifier, distance in allClusterAffinities.items():
            score["all.%s" % identifier] = DataColumn(fieldType, distance, scoreMask)
        performanceTable.end("all")

    performanceTable.end("set scores")
    performanceTable.end("ClusteringModel")
    return score
def _selectFirst(self, dataTable, functionTable, performanceTable, segmentation):
    """Used by C{calculateScore}.

    Evaluates the Segments of C{segmentation} in document order and
    scores each row with the model of the first Segment whose
    predicate selects it.  Output fields produced by the sub-models
    are merged into C{dataTable.output}.

    @type dataTable: DataTable
    @param dataTable: The DataTable representing this model's lexical scope.
    @type functionTable: FunctionTable or None
    @param functionTable: A table of functions.
    @type performanceTable: PerformanceTable or None
    @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
    @type segmentation: PmmlBinding
    @param segmentation: The element whose "Segment" children are evaluated.
    @rtype: dict of DataColumn
    @return: A dictionary with the scores under key C{None} and, if C{self.name} is not None, the id of the matching Segment under key "segment".
    """

    performanceTable.begin("Segmentation selectFirst")

    scoresData = NP("empty", len(dataTable), dtype=NP.dtype(object))
    # NOTE(review): rows that no Segment selects keep a zero mask value
    # (presumably defs.VALID) -- confirm that this is intended.
    scoresMask = NP("zeros", len(dataTable), dtype=defs.maskType)
    # Rows that have not yet been claimed by any Segment.
    unfilled = NP("ones", len(dataTable), dtype=NP.dtype(bool))
    segments = NP("empty", len(dataTable), dtype=NP.dtype(object))

    newOutputData = []
    for segment in segmentation.childrenOfTag("Segment", iterator=True):
        performanceTable.pause("Segmentation selectFirst")
        selection = segment.childOfClass(PmmlPredicate).evaluate(dataTable, functionTable, performanceTable)
        performanceTable.unpause("Segmentation selectFirst")

        # Only rows not claimed by an earlier Segment are eligible.
        NP("logical_and", selection, unfilled, selection)
        if not selection.any():
            continue

        subTable = dataTable.subTable(selection)
        subModel = segment.childOfClass(PmmlModel)
        performanceTable.pause("Segmentation selectFirst")
        subModel.calculate(subTable, functionTable, performanceTable)
        performanceTable.unpause("Segmentation selectFirst")

        scoresData[selection] = subTable.score.data
        if subTable.score.mask is not None:
            scoresMask[selection] = subTable.score.mask
        else:
            scoresMask[selection] = defs.VALID

        segmentName = segment.get("id")
        if segmentName is not None:
            segments[selection] = segmentName

        for fieldName, dataColumn in subTable.output.items():
            if fieldName not in dataTable.output:
                # First Segment to produce this field: create a
                # full-length column that is MISSING everywhere except
                # in the selected rows.
                data = NP("empty", len(dataTable), dtype=dataColumn.fieldType.dtype)
                data[selection] = dataColumn.data

                mask = NP(NP("ones", len(dataTable), dtype=defs.maskType) * defs.MISSING)
                if dataColumn.mask is None:
                    mask[selection] = defs.VALID
                else:
                    mask[selection] = dataColumn.mask

                newDataColumn = DataColumn(dataColumn.fieldType, data, mask)
                # Left unlocked so that later Segments can fill in their rows.
                newDataColumn._unlock()
                dataTable.output[fieldName] = newDataColumn
                newOutputData.append(newDataColumn)

            else:
                newDataColumn = dataTable.output[fieldName]

                newDataColumn.data[selection] = dataColumn.data
                if dataColumn.mask is None:
                    newDataColumn.mask[selection] = defs.VALID
                else:
                    newDataColumn.mask[selection] = dataColumn.mask

        # Mark the selected rows as filled.  Boolean arrays cannot be
        # subtracted in current NumPy releases ("unfilled -= selection"
        # raises TypeError), so clear the entries by index instead;
        # selection is already a subset of unfilled.
        unfilled[selection] = False
        if not unfilled.any():
            break

    for newDataColumn in newOutputData:
        if not newDataColumn.mask.any():
            newDataColumn._mask = None
        newDataColumn._lock()

    if not scoresMask.any():
        scoresMask = None

    scores = DataColumn(self.scoreType, scoresData, scoresMask)

    performanceTable.end("Segmentation selectFirst")
    if self.name is None:
        return {None: scores}
    return {None: scores, "segment": DataColumn(self.scoreTypeSegment, segments, None)}