Exemplo n.º 1
0
    def calc(self, inputData, inputMask=None, performanceTable=None):
        """Wrap raw input in a DataTable and cluster it with k-means.

        Convenience entry point for interactive use: the DataTable is
        constructed here from C{inputData} and C{inputMask} (always
        with no input state), handed to C{smallTrials}, and then
        handed to C{optimize} as a one-element list.

        Returns C{self.clusteringModel} as it stands after
        C{optimize} has run.

        @type inputData: dict
        @param inputData: Dictionary from field names to data, as required by the DataTable constructor.
        @type inputMask: dict or None
        @param inputMask: Dictionary from field names to missing value masks, as required by the DataTable constructor.
        @type performanceTable: PerformanceTable or None
        @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation; a FakePerformanceTable is substituted when None.
        @rtype: PmmlBinding
        @return: The PMML model representing the result of the k-means clustering.
        """

        if performanceTable is None:
            performanceTable = FakePerformanceTable()

        # Time only the DataTable construction under its own label.
        timerLabel = "make DataTable"
        performanceTable.begin(timerLabel)
        table = DataTable(self.clusteringModel, inputData, inputMask, None)
        performanceTable.end(timerLabel)

        self.smallTrials(table, performanceTable=performanceTable)
        self.optimize([table], performanceTable=performanceTable)

        return self.clusteringModel
Exemplo n.º 2
0
    def calc(self, inputData, inputMask=None, inputState=None, functionTable=None, performanceTable=None):
        """Build a DataTable from raw input and immediately draw the plot.

        Convenience entry point for interactive use: it constructs the
        DataTable from the given dictionaries and forwards it, with the
        function and performance tables, to C{makePlot}.

        The FunctionTable passed in (or the fresh one created here) is
        handed on to C{makePlot}.

        @type inputData: dict
        @param inputData: Dictionary from field names to data, as required by the DataTable constructor.
        @type inputMask: dict or None
        @param inputMask: Dictionary from field names to missing value masks, as required by the DataTable constructor.
        @type inputState: DataTableState or None
        @param inputState: Calculation state, passed through to the DataTable constructor.
        @type functionTable: FunctionTable or None
        @param functionTable: A table of functions; a new FunctionTable is created when None.
        @type performanceTable: PerformanceTable or None
        @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation; a FakePerformanceTable is substituted when None.
        @rtype: SvgBinding
        @return: Whatever C{makePlot} returns: a complete SVG image representing the fully drawn plot.
        """

        functionTable = FunctionTable() if functionTable is None else functionTable
        performanceTable = FakePerformanceTable() if performanceTable is None else performanceTable

        # Time only the DataTable construction under its own label.
        timerLabel = "make DataTable"
        performanceTable.begin(timerLabel)
        table = DataTable(self, inputData, inputMask, inputState)
        performanceTable.end(timerLabel)

        drawing = self.makePlot(table, functionTable, performanceTable)
        return drawing
Exemplo n.º 3
0
    def makePlot(self, dataTable, functionTable=None, performanceTable=None):
        """Draw every child plot frame and assemble a complete SVG image.

        The root C{svg} element gets a copy of C{self.globalAttrib},
        an optional C{id} (from C{svgId}), a C{viewBox} built from the
        canvas width and height, and the style and font attributes.
        Its first child is a C{defs} element holding the values
        collected in the PlotDefinitions; the framed content follows.

        @type dataTable: DataTable
        @param dataTable: Contains the data to plot.
        @type functionTable: FunctionTable or None
        @param functionTable: Defines functions that may be used to transform data for plotting; a new FunctionTable is created when None.
        @type performanceTable: PerformanceTable or None
        @param performanceTable: Records performance of the drawing process; a FakePerformanceTable is substituted when None.
        @rtype: SvgBinding
        @return: A complete SVG image representing the fully drawn plot.
        """

        if functionTable is None:
            functionTable = FunctionTable()
        if performanceTable is None:
            performanceTable = FakePerformanceTable()

        svg = SvgBinding.elementMaker
        performanceTable.begin("PlotCanvas")

        width = self.get("width", defaultFromXsd=True, convertType=True)
        height = self.get("height", defaultFromXsd=True, convertType=True)
        style = self.get("style", defaultFromXsd=True)

        rootAttrib = self.globalAttrib.copy()
        svgId = self.get("svgId")
        if svgId is not None:
            rootAttrib["id"] = svgId

        rootAttrib["viewBox"] = "0 0 %d %d" % (width, height)
        rootAttrib["style"] = style
        for fontKey in ("font-family", "font-weight"):
            rootAttrib[fontKey] = self.get(fontKey, defaultFromXsd=True)

        plotCoordinates = PlotCoordinates()
        plotContentBox = PlotContentBox(0, 0, width, height)
        plotDefinitions = PlotDefinitions()

        # The canvas timer is paused while the child frames draw.
        performanceTable.pause("PlotCanvas")
        frames = []
        for plotFrame in self.childrenOfClass(PmmlPlotFrame):
            frames.append(plotFrame.frame(dataTable, functionTable, performanceTable,
                                          plotCoordinates, plotContentBox, plotDefinitions))
        performanceTable.unpause("PlotCanvas")

        # The defs element is built only after all frames have been
        # drawn, and is placed ahead of them in the output.
        children = [svg.defs(*plotDefinitions.values())]
        children = children + frames

        performanceTable.end("PlotCanvas")
        return svg.svg(*children, **rootAttrib)
Exemplo n.º 4
0
    def optimize(self,
                 dataTables,
                 numberOfMappers=1,
                 numberOfReducers=1,
                 performanceTable=None):
        """Run the map-reduce job from C{self.mapReduce()} and adopt
        the ClusteringModel it produces (Lloyd's algorithm, k-means).

        Modifies C{self.clusteringModel}: it is replaced by the
        C{"ClusteringModel"} entry of the job's metadata.  If the old
        model has a parent element, the new model also takes the old
        one's position in that parent.  Each C{Extension} named
        C{iterations} on the new model has the number of iterations
        from this run added to its value.

        The job is asked to run in parallel whenever
        C{numberOfMappers} or C{numberOfReducers} is greater than 1.
        It always runs with a fresh PerformanceTable, whose contents
        are absorbed into C{performanceTable} at the end.

        @type dataTables: list of DataTables
        @param dataTables: The input data.
        @type numberOfMappers: int
        @param numberOfMappers: Requested number of mappers.
        @type numberOfReducers: int
        @param numberOfReducers: Requested number of reducers.
        @type performanceTable: PerformanceTable or None
        @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation; a FakePerformanceTable is substituted when None.
        """

        if performanceTable is None:
            performanceTable = FakePerformanceTable()

        mapReduce = self.mapReduce()
        runInParallel = (numberOfMappers > 1 or numberOfReducers > 1)
        runResult = mapReduce.run(
            dataTables,
            parallel=runInParallel,
            frozenClass=False,
            numberOfMappers=numberOfMappers,
            numberOfReducers=numberOfReducers,
            iterationLimit=self.iterationLimit,
            performanceTable=PerformanceTable())
        # Only the iteration count is used; the unpack still insists
        # on exactly three results.
        _outputRecords, _outputKeyValues, numberOfIterations = runResult

        # Swap the new model into the old model's place in its parent.
        previousModel = self.clusteringModel
        parent = previousModel.getparent()
        if parent is not None:
            position = parent.index(previousModel)
            parent[position] = mapReduce.metadata["ClusteringModel"]

        self.clusteringModel = mapReduce.metadata["ClusteringModel"]

        iterationExtensions = self.clusteringModel.xpath(
            "pmml:Extension[@name='iterations']")
        for extension in iterationExtensions:
            runningTotal = int(extension["value"]) + numberOfIterations
            extension["value"] = repr(runningTotal)

        performanceTable.absorb(mapReduce.performanceTable)
Exemplo n.º 5
0
    def calculate(self, dataTable, functionTable=None, performanceTable=None):
        """Score a pre-built DataTable with this model.

        The work is done on C{dataTable.subTable()}: MiningFields are
        replaced, the model's transformations are calculated, and
        C{calculateScore} is evaluated.  The results are then copied
        back onto C{dataTable}: its C{score}, its C{fields} (only when
        the model has a name), and its C{output}.

        If the model's C{isScorable} attribute is false, none of that
        happens; C{dataTable.score} is set to a DataColumn built from
        C{defs.PADDING} data and a C{defs.INVALID} mask instead.

        This method modifies the input DataTable, and passes the
        FunctionTable on to the fields and transformations it calls.

        NOTE(review): the two exits return different things, the whole
        DataTable for a non-scorable model and C{dataTable.score}
        otherwise.  Confirm which one callers rely on.

        @type dataTable: DataTable
        @param dataTable: The pre-built DataTable.
        @type functionTable: FunctionTable or None
        @param functionTable: A table of functions; a new FunctionTable is created when None.
        @type performanceTable: PerformanceTable or None
        @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation; a FakePerformanceTable is substituted when None.
        @rtype: DataTable or DataColumn
        @return: C{dataTable} itself when the model is not scorable, otherwise C{dataTable.score}.
        """

        if functionTable is None:
            functionTable = FunctionTable()
        if performanceTable is None:
            performanceTable = FakePerformanceTable()

        scorable = self.get("isScorable", defaultFromXsd=True, convertType=True)
        if not scorable:
            # One entry per row: padding as the data, INVALID as the mask.
            paddingData = NP(NP("ones", len(dataTable), dtype=self.scoreType.dtype) * defs.PADDING)
            invalidMask = NP(NP("ones", len(dataTable), dtype=defs.maskType) * defs.INVALID)
            dataTable.score = DataColumn(self.scoreType, paddingData, invalidMask)
            return dataTable

        subTable = dataTable.subTable()

        miningFields = self.xpath("pmml:MiningSchema/pmml:MiningField")
        for miningField in miningFields:
            miningField.replaceField(subTable, functionTable, performanceTable)

        for transformation in self.calculableTrans():
            transformation.calculate(subTable, functionTable, performanceTable)

        score = self.calculateScore(subTable, functionTable, performanceTable)
        # The entry keyed by None is the primary score.
        dataTable.score = score[None]

        if self.name is not None:
            # The primary score is stored under the model name, every
            # other entry under "<model name>.<key>".
            for key, column in score.items():
                target = self.name if key is None else "%s.%s" % (self.name, key)
                dataTable.fields[target] = column

        outputFields = self.xpath("pmml:Output/pmml:OutputField")
        for outputField in outputFields:
            label = outputField.get("displayName", outputField["name"])
            formatted = outputField.format(subTable, functionTable, performanceTable, score)
            dataTable.output[label] = formatted

        # Copy every output entry of the sub-table up to the caller's table.
        for outputName in subTable.output:
            dataTable.output[outputName] = subTable.output[outputName]

        return dataTable.score
Exemplo n.º 6
0
    def makePlot(self, dataTable, functionTable=None, performanceTable=None):
        """Render all child plot frames into a single SVG document.

        The root C{svg} element carries a copy of C{self.globalAttrib}
        plus an optional C{id} (from C{svgId}), a C{viewBox} computed
        from the canvas width and height, and the style and font
        attributes.  A C{defs} element holding the PlotDefinitions
        values comes first among its children, followed by one entry
        per child frame.

        @type dataTable: DataTable
        @param dataTable: Contains the data to plot.
        @type functionTable: FunctionTable or None
        @param functionTable: Defines functions that may be used to transform data for plotting; a new FunctionTable is created when None.
        @type performanceTable: PerformanceTable or None
        @param performanceTable: Records performance of the drawing process; a FakePerformanceTable is substituted when None.
        @rtype: SvgBinding
        @return: A complete SVG image representing the fully drawn plot.
        """

        if functionTable is None:
            functionTable = FunctionTable()
        if performanceTable is None:
            performanceTable = FakePerformanceTable()

        svg = SvgBinding.elementMaker
        performanceTable.begin("PlotCanvas")

        width = self.get("width", defaultFromXsd=True, convertType=True)
        height = self.get("height", defaultFromXsd=True, convertType=True)
        style = self.get("style", defaultFromXsd=True)

        svgAttrib = self.globalAttrib.copy()
        svgId = self.get("svgId")
        if svgId is not None:
            svgAttrib["id"] = svgId

        viewBox = "0 0 %d %d" % (width, height)
        svgAttrib["viewBox"] = viewBox
        svgAttrib["style"] = style
        svgAttrib["font-family"] = self.get("font-family", defaultFromXsd=True)
        svgAttrib["font-weight"] = self.get("font-weight", defaultFromXsd=True)

        plotCoordinates = PlotCoordinates()
        plotContentBox = PlotContentBox(0, 0, width, height)
        plotDefinitions = PlotDefinitions()

        # The canvas timer is paused while the child frames draw.
        performanceTable.pause("PlotCanvas")
        drawnFrames = [
            child.frame(dataTable, functionTable, performanceTable,
                        plotCoordinates, plotContentBox, plotDefinitions)
            for child in self.childrenOfClass(PmmlPlotFrame)
        ]
        performanceTable.unpause("PlotCanvas")

        # The defs element is built after the frames, and precedes them
        # in the output.
        definitions = svg.defs(*plotDefinitions.values())
        svgChildren = [definitions] + drawnFrames

        performanceTable.end("PlotCanvas")
        return svg.svg(*svgChildren, **svgAttrib)
Exemplo n.º 7
0
    def calc(self, inputData, inputMask=None, inputState=None, functionTable=None, performanceTable=None):
        """Build a DataTable from raw input, run C{calculate} on it,
        and return the table.

        Convenience entry point for interactive use, so that the
        caller does not have to construct a DataTable by hand.

        The FunctionTable passed in (or the fresh one created here) is
        handed on to C{calculate}; the return value of C{calculate}
        itself is discarded.

        @type inputData: dict
        @param inputData: Dictionary from field names to data, as required by the DataTable constructor.
        @type inputMask: dict or None
        @param inputMask: Dictionary from field names to missing value masks, as required by the DataTable constructor.
        @type inputState: DataTableState or None
        @param inputState: Calculation state, passed through to the DataTable constructor.
        @type functionTable: FunctionTable or None
        @param functionTable: A table of functions; a new FunctionTable is created when None.
        @type performanceTable: PerformanceTable or None
        @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation; a FakePerformanceTable is substituted when None.
        @rtype: DataTable
        @return: The DataTable built here, after C{calculate} has been applied to it.
        """

        functionTable = FunctionTable() if functionTable is None else functionTable
        performanceTable = FakePerformanceTable() if performanceTable is None else performanceTable

        # Time only the DataTable construction under its own label.
        timerLabel = "make DataTable"
        performanceTable.begin(timerLabel)
        table = DataTable(self, inputData, inputMask, inputState)
        performanceTable.end(timerLabel)

        self.calculate(table, functionTable, performanceTable)

        return table
Exemplo n.º 8
0
    def calc(self, inputData, inputMask=None, performanceTable=None):
        """Turn raw input into a DataTable and run k-means clustering
        on it.

        Convenience entry point for interactive use.  The DataTable is
        created from C{inputData} and C{inputMask} with no input
        state, passed through C{smallTrials}, and then given to
        C{optimize} wrapped in a one-element list.

        Returns C{self.clusteringModel} as left by C{optimize}.

        @type inputData: dict
        @param inputData: Dictionary from field names to data, as required by the DataTable constructor.
        @type inputMask: dict or None
        @param inputMask: Dictionary from field names to missing value masks, as required by the DataTable constructor.
        @type performanceTable: PerformanceTable or None
        @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation; a FakePerformanceTable is substituted when None.
        @rtype: PmmlBinding
        @return: The PMML model representing the result of the k-means clustering.
        """

        performanceTable = FakePerformanceTable() if performanceTable is None else performanceTable

        # Time only the DataTable construction under its own label.
        stage = "make DataTable"
        performanceTable.begin(stage)
        clusteringInput = DataTable(self.clusteringModel, inputData, inputMask, None)
        performanceTable.end(stage)

        self.smallTrials(clusteringInput, performanceTable=performanceTable)
        self.optimize([clusteringInput], performanceTable=performanceTable)

        model = self.clusteringModel
        return model
Exemplo n.º 9
0
    def optimize(self, dataTables, numberOfMappers=1, numberOfReducers=1, performanceTable=None):
        """Run the map-reduce job from C{self.mapReduce()} and adopt
        the ClusteringModel it produces (Lloyd's algorithm, k-means).

        Modifies C{self.clusteringModel}: it becomes the
        C{"ClusteringModel"} entry of the job's metadata, which also
        replaces the old model inside its parent element when it has
        one.  Each C{Extension} named C{iterations} on the new model
        has this run's iteration count added to its value.

        The job is asked to run in parallel whenever
        C{numberOfMappers} or C{numberOfReducers} is greater than 1.
        It always runs with a fresh PerformanceTable, whose contents
        are absorbed into C{performanceTable} at the end.

        @type dataTables: list of DataTables
        @param dataTables: The input data.
        @type numberOfMappers: int
        @param numberOfMappers: Requested number of mappers.
        @type numberOfReducers: int
        @param numberOfReducers: Requested number of reducers.
        @type performanceTable: PerformanceTable or None
        @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation; a FakePerformanceTable is substituted when None.
        """

        performanceTable = FakePerformanceTable() if performanceTable is None else performanceTable

        mapReduce = self.mapReduce()
        runOptions = dict(
            parallel=(numberOfMappers > 1 or numberOfReducers > 1),
            frozenClass=False,
            numberOfMappers=numberOfMappers,
            numberOfReducers=numberOfReducers,
            iterationLimit=self.iterationLimit,
            performanceTable=PerformanceTable())
        # Only the iteration count is used; the unpack still insists
        # on exactly three results.
        _records, _keyValues, numberOfIterations = mapReduce.run(dataTables, **runOptions)

        # Swap the new model into the old model's place in its parent.
        parent = self.clusteringModel.getparent()
        if parent is not None:
            slot = parent.index(self.clusteringModel)
            parent[slot] = mapReduce.metadata["ClusteringModel"]

        self.clusteringModel = mapReduce.metadata["ClusteringModel"]

        for extension in self.clusteringModel.xpath("pmml:Extension[@name='iterations']"):
            previousCount = int(extension["value"])
            extension["value"] = repr(previousCount + numberOfIterations)

        performanceTable.absorb(mapReduce.performanceTable)
Exemplo n.º 10
0
    def calculate(self, dataTable, functionTable=None, performanceTable=None):
        """Evaluate this DerivedField's expression and store the
        result in the DataTable.

        The child expression is evaluated first.  If its data type and
        optype agree with this field's C{dataType} and C{optype}
        attributes (a missing attribute counts as agreement) and the
        field has no C{Value} children, the resulting column is stored
        as is.  Otherwise it is cast with C{FieldCastMethods.cast}.
        The cast is timed in the PerformanceTable under a label that
        includes the field name, so that it can be traced back to the
        field that triggered it.

        This method modifies the input DataTable: it sets
        C{dataTable.fields[self.name]}.

        @type dataTable: DataTable
        @param dataTable: The pre-built DataTable.
        @type functionTable: FunctionTable or None
        @param functionTable: A table of functions; a new FunctionTable is created when None.
        @type performanceTable: PerformanceTable or None
        @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation; a FakePerformanceTable is substituted when None.
        @rtype: DataColumn
        @return: The column stored under this field's name, i.e. C{dataTable.fields[self.name]}.
        """

        if functionTable is None:
            functionTable = FunctionTable()
        if performanceTable is None:
            performanceTable = FakePerformanceTable()

        # The expression is evaluated before the "DerivedField" timer starts.
        expression = self.childOfClass(PmmlExpression)
        dataColumn = expression.evaluate(dataTable, functionTable, performanceTable)
        performanceTable.begin("DerivedField")

        evaluatedType = dataColumn.fieldType
        dataType = evaluatedType.dataType
        optype = evaluatedType.optype

        usableAsIs = (self.get("dataType", dataType) == dataType
                      and self.get("optype", optype) == optype
                      and len(self.childrenOfTag("Value")) == 0)

        if not usableAsIs:
            castLabel = "cast (\"%s\")" % self.name
            performanceTable.begin(castLabel)
            dataTable.fields[self.name] = FieldCastMethods.cast(FieldType(self), dataColumn)
            performanceTable.end(castLabel)
        else:
            dataTable.fields[self.name] = dataColumn

        performanceTable.end("DerivedField")

        return dataTable.fields[self.name]
Exemplo n.º 11
0
    def calculate(self, dataTable, functionTable=None, performanceTable=None):
        """Run every transformation and every model on a pre-built
        DataTable.

        All items from C{calculableTrans()} are calculated on
        C{dataTable} first.  What happens to C{dataTable.score}
        afterwards depends on how many models C{calculableModels()}
        yields:

          - none: the score is reset to None;
          - exactly one: that model is calculated directly on
            C{dataTable};
          - several: each model is calculated on its own
            C{dataTable.subTable()} and the score becomes a tuple of
            the sub-table scores, in model order.

        This method modifies the input DataTable, and passes the
        FunctionTable on to everything it calls.

        @type dataTable: DataTable
        @param dataTable: The pre-built DataTable.
        @type functionTable: FunctionTable or None
        @param functionTable: A table of functions; a new FunctionTable is created when None.
        @type performanceTable: PerformanceTable or None
        @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation; a FakePerformanceTable is substituted when None.
        @rtype: None
        @return: Nothing; results are left on C{dataTable}.
        """

        if functionTable is None:
            functionTable = FunctionTable()
        if performanceTable is None:
            performanceTable = FakePerformanceTable()

        for transformation in self.calculableTrans():
            transformation.calculate(dataTable, functionTable, performanceTable)

        models = self.calculableModels()
        modelCount = len(models)

        if modelCount == 0:
            # if there was a model among the transformations, it erroneously set the score; unset it
            dataTable.score = None
            return

        if modelCount == 1:
            # the model is calculated on dataTable itself; no score is assigned here
            models[0].calculate(dataTable, functionTable, performanceTable)
            return

        # several models: score each on its own sub-table and collect
        scores = []
        for model in models:
            modelTable = dataTable.subTable()
            model.calculate(modelTable, functionTable, performanceTable)
            scores.append(modelTable.score)
        dataTable.score = tuple(scores)
Exemplo n.º 12
0
    def calculate(self, dataTable, functionTable=None, performanceTable=None):
        """Draw this canvas from a pre-built DataTable, registering
        and/or saving the result as configured.

        If the C{isPlotable} attribute is false, nothing is drawn and
        C{self.emptyPlot} is returned.  Otherwise the plot is produced
        by C{makePlot}; it is stored in C{dataTable.plots} under the
        C{plotName} attribute when that is set, and written with
        C{xmlFile} to the C{fileName} attribute when that is set.

        This method may therefore modify the input DataTable, and it
        passes the FunctionTable on to C{makePlot}.  The returned
        plot is a convenience for the caller; the entry in
        C{dataTable.plots} is the result that stays with the table.

        @type dataTable: DataTable
        @param dataTable: The pre-built DataTable.
        @type functionTable: FunctionTable or None
        @param functionTable: A table of functions; a new FunctionTable is created when None.
        @type performanceTable: PerformanceTable or None
        @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation; a FakePerformanceTable is substituted when None.
        @rtype: SvgBinding
        @return: The drawn plot, or C{self.emptyPlot} when the canvas is not plotable.
        """

        if functionTable is None:
            functionTable = FunctionTable()
        if performanceTable is None:
            performanceTable = FakePerformanceTable()

        plot = self.emptyPlot
        if not self.get("isPlotable", defaultFromXsd=True, convertType=True):
            return plot

        plot = self.makePlot(dataTable, functionTable, performanceTable)

        plotName = self.get("plotName")
        if plotName is not None:
            dataTable.plots[plotName] = plot

        fileName = self.get("fileName")
        if fileName is not None:
            plot.xmlFile(fileName)

        return plot
Exemplo n.º 13
0
    def calc(self,
             inputData,
             inputMask=None,
             inputState=None,
             functionTable=None,
             performanceTable=None):
        """Make a DataTable from raw input and return the drawn plot.

        Convenience entry point for interactive use, so that the
        caller does not have to construct a DataTable by hand.  The
        table is built from the given dictionaries and passed, with
        the function and performance tables, to C{makePlot}.

        The FunctionTable passed in (or the fresh one created here) is
        handed on to C{makePlot}.

        @type inputData: dict
        @param inputData: Dictionary from field names to data, as required by the DataTable constructor.
        @type inputMask: dict or None
        @param inputMask: Dictionary from field names to missing value masks, as required by the DataTable constructor.
        @type inputState: DataTableState or None
        @param inputState: Calculation state, passed through to the DataTable constructor.
        @type functionTable: FunctionTable or None
        @param functionTable: A table of functions; a new FunctionTable is created when None.
        @type performanceTable: PerformanceTable or None
        @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation; a FakePerformanceTable is substituted when None.
        @rtype: SvgBinding
        @return: Whatever C{makePlot} returns: a complete SVG image representing the fully drawn plot.
        """

        if functionTable is None:
            functionTable = FunctionTable()
        if performanceTable is None:
            performanceTable = FakePerformanceTable()

        # Time only the DataTable construction under its own label.
        stage = "make DataTable"
        performanceTable.begin(stage)
        plotInput = DataTable(self, inputData, inputMask, inputState)
        performanceTable.end(stage)

        return self.makePlot(plotInput, functionTable, performanceTable)
Exemplo n.º 14
0
    def verify(self, showSuccess=False, performanceTable=None):
        """Run the model verification tests defined by this element.

        The output is a list of results (all results or only failures,
        depending on C{showSuccess}), each of which is a dictionary of
        field names to values.  Fields are:

          - "success": was the comparison successful?
          - "expectedMissing", "observedMissing": is the
             expected/observed value missing?
          - "expectedValue", "observedValue": result as an internal
             value.
          - "expectedPythonValue", "observedPythonValue": result as a
             Python value.
          - "expectedDisplayValue", "observedDisplayValue": result as
             a string displayValue.

        Only "success", "expectedMissing", and "observedMissing" appear
        if the "is missing?" comparison was unsuccessful.

        @type showSuccess: bool
        @param showSuccess: If True, emit output even if the tests are successful.
        @type performanceTable: PerformanceTable
        @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
        @rtype: JSON-like list of dicts
        @return: As described above.
        """

        verificationFields = {}
        for verificationField in self.xpath("pmml:VerificationFields/pmml:VerificationField"):
            verificationField.column = verificationField.get("column", verificationField["field"])
            verificationField.precision = verificationField.get("precision", defaultFromXsd=True, convertType=True)
            verificationField.zeroThreshold = verificationField.get("zeroThreshold", defaultFromXsd=True, convertType=True)

            verificationField.data = []
            verificationField.mask = []
            verificationFields[verificationField.column] = verificationField

        inputData = {}
        inputMask = {}
        for index, row in enumerate(self.childOfClass(TableInterface).iterate()):
            for columnName, columnValue in row.items():
                verificationField = verificationFields.get(columnName)

                if verificationField is not None:
                    while len(verificationField.data) < index:
                        verificationField.data.append(defs.PADDING)
                        verificationField.mask.append(True)
                    
                    verificationField.data.append(columnValue)
                    verificationField.mask.append(False)

                else:
                    inputDataField = inputData.get(columnName)
                    if inputDataField is None:
                        inputDataField = []
                        inputData[columnName] = inputDataField
                        inputMask[columnName] = []
                    inputMaskField = inputMask[columnName]

                    while len(inputDataField) < index:
                        inputDataField.append(defs.PADDING)
                        inputMaskField.append(True)

                    inputDataField.append(columnValue)
                    inputMaskField.append(False)

        for verificationField in verificationFields.values():
            while len(verificationField.data) < index:
                verificationField.data.append(defs.PADDING)
                verificationField.mask.append(True)

        for columnName in inputData:
            inputDataField = inputData[columnName]
            inputMaskField = inputMask[columnName]
            while len(inputDataField) < index:
                inputDataField.append(defs.PADDING)
                inputMaskField.append(True)

        for columnName, verificationField in verificationFields.items():
            inputData[columnName] = verificationField.data
            inputMask[columnName] = verificationField.mask

        model = self.getparent()

        if performanceTable is None:
            performanceTable = FakePerformanceTable()

        performanceTable.begin("make DataTable")
        dataTable = DataTable(model, inputData, inputMask, inputState=None)
        performanceTable.end("make DataTable")

        functionTable = FunctionTable()

        for miningField in model.xpath("pmml:MiningSchema/pmml:MiningField"):
            miningField.replaceField(dataTable, functionTable, performanceTable)

        for calculable in model.calculableTrans():
            calculable.calculate(dataTable, functionTable, performanceTable)

        score = model.calculateScore(dataTable, functionTable, performanceTable)
        dataTable.score = score[None]
        if model.name is not None:
            for key, value in score.items():
                if key is None:
                    dataTable.fields[model.name] = value
                else:
                    dataTable.fields["%s.%s" % (model.name, key)] = value

        for outputField in self.xpath("../pmml:Output/pmml:OutputField"):
            displayName = outputField.get("displayName", outputField["name"])
            outputField.format(dataTable, functionTable, performanceTable, score)

        output = []
        for verificationField in verificationFields.values():
            observedOutput = dataTable.fields.get(verificationField["field"])

            if observedOutput is None:
                raise defs.PmmlValidationError("VerificationField references field \"%s\" but it was not produced by the model")
            fieldType = observedOutput.fieldType

            if fieldType.dataType == "object":
                try:
                    newArray = [float(x) for x in observedOutput.data]
                except ValueError:
                    pass
                else:
                    fieldType = FakeFieldType("double", "continuous")
                    observedOutput._data = newArray
                        
            for index in xrange(len(dataTable)):
                record = {"field": verificationField["field"], "index": index}

                record["expectedMissing"] = verificationField.mask[index]
                record["observedMissing"] = (observedOutput.mask is not None and observedOutput.mask[index] != defs.VALID)

                if record["expectedMissing"] != record["observedMissing"]:
                    record["success"] = False
                    output.append(record)

                elif not record["expectedMissing"]:
                    record["expectedValue"] = fieldType.stringToValue(verificationField.data[index])
                    record["observedValue"] = observedOutput.data[index]
                    record["expectedPythonValue"] = fieldType.valueToPython(record["expectedValue"])
                    record["observedPythonValue"] = fieldType.valueToPython(record["observedValue"])
                    record["expectedDisplayValue"] = fieldType.valueToString(record["expectedValue"])
                    record["observedDisplayValue"] = fieldType.valueToString(record["observedValue"])

                    if fieldType.optype == "continuous":
                        if (abs(record["expectedValue"]) <= verificationField.zeroThreshold) and (abs(record["observedValue"]) <= verificationField.zeroThreshold):
                            record["success"] = True
                        else:
                            record["success"] = ((record["expectedValue"] * (1.0 - verificationField.precision)) <= record["observedValue"] <= (record["expectedValue"] * (1.0 + verificationField.precision)))

                        if not record["success"] or showSuccess:
                            output.append(record)
                            
                    else:
                        if record["expectedValue"] != record["observedValue"]:
                            record["success"] = False
                            output.append(record)
                        else:
                            record["success"] = True
                            if showSuccess:
                                output.append(record)

        return output
Exemplo n.º 15
0
    def smallTrials(self,
                    dataTable,
                    numberOfTrials=5,
                    recordsPerTrial=100,
                    performanceTable=None):
        """Improve the initial seed with a few small trials on random subsets of the data.

        Each trial draws rows from C{dataTable} at random (without
        replacement), re-seeds the clusters, runs the k-means
        map-reduce job on that subset, and scores the result by the
        mean squared "smallTrials.affinity" over the valid rows.  The
        cluster vectors of the trial with the smallest score are
        installed with C{self.explicitSeeds}.

        If the table holds fewer than C{recordsPerTrial} rows, every
        row is used in each trial instead of raising an error.

        Modifies C{self.clusteringModel}.

        @type dataTable: DataTable
        @param dataTable: The input data.
        @type numberOfTrials: int
        @param numberOfTrials: The number of independent trials with the same number of C{recordsPerTrial}.  The trial with the smallest sum of in-cluster variances wins.
        @type recordsPerTrial: int
        @param recordsPerTrial: The number of rows to randomly select from the DataTable in each trial (capped at the number of rows in the DataTable).
        @type performanceTable: PerformanceTable or None
        @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
        """

        if performanceTable is None:
            performanceTable = FakePerformanceTable()

        performanceTable.begin("smallTrials")

        mapReduce = self.mapReduce()

        # Rebind the shared application metadata to a private deep copy of
        # the model before the trials start modifying it.
        self.KMeansMapReduceApplication.metadata[
            "ClusteringModel"] = copy.deepcopy(
                self.KMeansMapReduceApplication.metadata["ClusteringModel"])

        # random.sample raises ValueError when asked for more items than the
        # population holds, so never request more rows than the table has.
        numberOfRecords = len(dataTable)
        sampleSize = min(recordsPerTrial, numberOfRecords)

        bestVariance = None
        bestSeed = None
        for trialNumber in xrange(numberOfTrials):
            indexes = random.sample(xrange(numberOfRecords), sampleSize)
            subTable = dataTable.subTable(
                NP("array", indexes, dtype=NP.dtype(int)))

            # NOTE(review): seeds are drawn with the full dataTable rather
            # than subTable -- presumably intentional; confirm.
            self.randomSeeds(dataTable)
            mapReduce.metadata["ClusteringModel"] = self.clusteringModel

            # Only the iteration count is needed from the run's results.
            _records, _keyValues, numberOfIterations = mapReduce.run(
                [subTable],
                parallel=False,
                frozenClass=False,
                numberOfMappers=1,
                numberOfReducers=1,
                iterationLimit=self.iterationLimit)

            # Accumulate the iteration count in the model's bookkeeping
            # Extension, if one is present.
            for extension in self.clusteringModel.xpath(
                    "pmml:Extension[@name='iterations.smallTrials']"):
                extension["value"] = repr(
                    int(extension["value"]) + numberOfIterations)

            # Score the subset under the name "smallTrials" so that the
            # affinity lands in the "smallTrials.affinity" field.
            trialModel = mapReduce.metadata["ClusteringModel"]
            trialModel["modelName"] = "smallTrials"
            trialModel.subFields = dict(trialModel.subFields)
            trialModel.subFields.update({"affinity": True})
            trialModel.calculate(subTable)

            affinity = subTable.fields["smallTrials.affinity"]
            data = affinity.data
            mask = affinity.mask
            if mask is None:
                validData = data
                denom = len(subTable)
            else:
                selection = NP(mask == defs.VALID)
                validData = data[selection]
                denom = NP("count_nonzero", selection)

            # A trial without any valid row has no meaningful variance and
            # must not win (also avoids dividing by zero).
            if denom > 0:
                variance = NP(validData**2).sum() / float(denom)
            else:
                variance = None

            if variance is not None and (bestVariance is None
                                         or variance < bestVariance):
                bestVariance = variance
                bestSeed = mapReduce.metadata["clusterVectors"]

        if bestSeed is not None:
            self.explicitSeeds(bestSeed)

        performanceTable.end("smallTrials")
Exemplo n.º 16
0
    def smallTrials(self, dataTable, numberOfTrials=5, recordsPerTrial=100, performanceTable=None):
        """Improve the initial seed with a few small trials on random subsets of the data.

        Each trial draws rows from C{dataTable} at random (without
        replacement), re-seeds the clusters, runs the k-means
        map-reduce job on that subset, and scores the result by the
        mean squared "smallTrials.affinity" over the valid rows.  The
        cluster vectors of the trial with the smallest score are
        installed with C{self.explicitSeeds}.

        If the table holds fewer than C{recordsPerTrial} rows, every
        row is used in each trial instead of raising an error.

        Modifies C{self.clusteringModel}.

        @type dataTable: DataTable
        @param dataTable: The input data.
        @type numberOfTrials: int
        @param numberOfTrials: The number of independent trials with the same number of C{recordsPerTrial}.  The trial with the smallest sum of in-cluster variances wins.
        @type recordsPerTrial: int
        @param recordsPerTrial: The number of rows to randomly select from the DataTable in each trial (capped at the number of rows in the DataTable).
        @type performanceTable: PerformanceTable or None
        @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
        """

        if performanceTable is None:
            performanceTable = FakePerformanceTable()

        performanceTable.begin("smallTrials")

        mapReduce = self.mapReduce()

        # Rebind the shared application metadata to a private deep copy of
        # the model before the trials start modifying it.
        self.KMeansMapReduceApplication.metadata["ClusteringModel"] = copy.deepcopy(self.KMeansMapReduceApplication.metadata["ClusteringModel"])

        # random.sample raises ValueError when asked for more items than the
        # population holds, so never request more rows than the table has.
        numberOfRecords = len(dataTable)
        sampleSize = min(recordsPerTrial, numberOfRecords)

        bestVariance = None
        bestSeed = None
        for trialNumber in xrange(numberOfTrials):
            indexes = random.sample(xrange(numberOfRecords), sampleSize)
            subTable = dataTable.subTable(NP("array", indexes, dtype=NP.dtype(int)))

            # NOTE(review): seeds are drawn with the full dataTable rather
            # than subTable -- presumably intentional; confirm.
            self.randomSeeds(dataTable)
            mapReduce.metadata["ClusteringModel"] = self.clusteringModel

            # Only the iteration count is needed from the run's results.
            _records, _keyValues, numberOfIterations = mapReduce.run([subTable], parallel=False, frozenClass=False, numberOfMappers=1, numberOfReducers=1, iterationLimit=self.iterationLimit)

            # Accumulate the iteration count in the model's bookkeeping
            # Extension, if one is present.
            for extension in self.clusteringModel.xpath("pmml:Extension[@name='iterations.smallTrials']"):
                extension["value"] = repr(int(extension["value"]) + numberOfIterations)

            # Score the subset under the name "smallTrials" so that the
            # affinity lands in the "smallTrials.affinity" field.
            trialModel = mapReduce.metadata["ClusteringModel"]
            trialModel["modelName"] = "smallTrials"
            trialModel.subFields = dict(trialModel.subFields)
            trialModel.subFields.update({"affinity": True})
            trialModel.calculate(subTable)

            affinity = subTable.fields["smallTrials.affinity"]
            data = affinity.data
            mask = affinity.mask
            if mask is None:
                validData = data
                denom = len(subTable)
            else:
                selection = NP(mask == defs.VALID)
                validData = data[selection]
                denom = NP("count_nonzero", selection)

            # A trial without any valid row has no meaningful variance and
            # must not win (also avoids dividing by zero).
            if denom > 0:
                variance = NP(validData**2).sum() / float(denom)
            else:
                variance = None

            if variance is not None and (bestVariance is None or variance < bestVariance):
                bestVariance = variance
                bestSeed = mapReduce.metadata["clusterVectors"]

        if bestSeed is not None:
            self.explicitSeeds(bestSeed)

        performanceTable.end("smallTrials")
Exemplo n.º 17
0
    def verify(self, showSuccess=False, performanceTable=None):
        """Run the model verification tests defined by this element.

        The embedded verification table is split into model inputs and
        expected outputs (the columns named by VerificationFields),
        the parent model is evaluated on the inputs, and each expected
        value is compared with the value the model produced.

        The output is a list of results (all results or only failures,
        depending on C{showSuccess}), each of which is a dictionary of
        field names to values.  Fields are:

          - "field", "index": the verified field name and the row
             number within the verification table.
          - "success": was the comparison successful?
          - "expectedMissing", "observedMissing": is the
             expected/observed value missing?
          - "expectedValue", "observedValue": result as an internal
             value.
          - "expectedPythonValue", "observedPythonValue": result as a
             Python value.
          - "expectedDisplayValue", "observedDisplayValue": result as
             a string displayValue.

        Only "field", "index", "success", "expectedMissing", and
        "observedMissing" appear if the "is missing?" comparison was
        unsuccessful.  Rows in which both the expected and the
        observed value are missing produce no record at all.

        Continuous fields succeed when both values are within
        C{zeroThreshold} of zero, or when the observed value lies in
        the closed interval spanned by C{expected * (1 - precision)}
        and C{expected * (1 + precision)}.  All other fields must be
        exactly equal.

        @type showSuccess: bool
        @param showSuccess: If True, emit output even if the tests are successful.
        @type performanceTable: PerformanceTable
        @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
        @rtype: JSON-like list of dicts
        @return: As described above.
        @raise PmmlValidationError: If a VerificationField references a field that the model did not produce.
        """

        def padColumn(data, mask, length):
            # Append masked padding until the column holds `length` entries.
            while len(data) < length:
                data.append(defs.PADDING)
                mask.append(True)

        # Index the VerificationFields by the table column that holds their
        # expected values and give each one empty data/mask accumulators.
        verificationFields = {}
        for verificationField in self.xpath(
                "pmml:VerificationFields/pmml:VerificationField"):
            verificationField.column = verificationField.get(
                "column", verificationField["field"])
            verificationField.precision = verificationField.get(
                "precision", defaultFromXsd=True, convertType=True)
            verificationField.zeroThreshold = verificationField.get(
                "zeroThreshold", defaultFromXsd=True, convertType=True)

            verificationField.data = []
            verificationField.mask = []
            verificationFields[verificationField.column] = verificationField

        # Walk the table row by row, routing each cell either to the
        # expected-output column of its VerificationField or to the model
        # inputs.  Cells absent from a row become masked padding.
        inputData = {}
        inputMask = {}
        numberOfRows = 0
        for index, row in enumerate(
                self.childOfClass(TableInterface).iterate()):
            numberOfRows = index + 1
            for columnName, columnValue in row.items():
                verificationField = verificationFields.get(columnName)

                if verificationField is not None:
                    columnData = verificationField.data
                    columnMask = verificationField.mask
                else:
                    if columnName not in inputData:
                        inputData[columnName] = []
                        inputMask[columnName] = []
                    columnData = inputData[columnName]
                    columnMask = inputMask[columnName]

                # Fill in any earlier rows in which this column was absent.
                padColumn(columnData, columnMask, index)
                columnData.append(columnValue)
                columnMask.append(False)

        # Columns absent from the trailing rows must still reach the full
        # row count (not merely the last row's index), so that every column
        # handed to the DataTable has the same length.
        for verificationField in verificationFields.values():
            padColumn(verificationField.data, verificationField.mask,
                      numberOfRows)

        for columnName in inputData:
            padColumn(inputData[columnName], inputMask[columnName],
                      numberOfRows)

        for columnName, verificationField in verificationFields.items():
            inputData[columnName] = verificationField.data
            inputMask[columnName] = verificationField.mask

        model = self.getparent()

        if performanceTable is None:
            performanceTable = FakePerformanceTable()

        performanceTable.begin("make DataTable")
        dataTable = DataTable(model, inputData, inputMask, inputState=None)
        performanceTable.end("make DataTable")

        functionTable = FunctionTable()

        # Evaluate the model: mining schema, transformations, score, and
        # finally the Output fields.
        for miningField in model.xpath("pmml:MiningSchema/pmml:MiningField"):
            miningField.replaceField(dataTable, functionTable,
                                     performanceTable)

        for calculable in model.calculableTrans():
            calculable.calculate(dataTable, functionTable, performanceTable)

        score = model.calculateScore(dataTable, functionTable,
                                     performanceTable)
        dataTable.score = score[None]
        if model.name is not None:
            for key, value in score.items():
                if key is None:
                    dataTable.fields[model.name] = value
                else:
                    dataTable.fields["%s.%s" % (model.name, key)] = value

        for outputField in self.xpath("../pmml:Output/pmml:OutputField"):
            outputField.format(dataTable, functionTable, performanceTable,
                               score)

        # Compare every expected value with what the model produced.
        output = []
        for verificationField in verificationFields.values():
            fieldName = verificationField["field"]
            observedOutput = dataTable.fields.get(fieldName)

            if observedOutput is None:
                raise defs.PmmlValidationError(
                    "VerificationField references field \"%s\" but it was not produced by the model"
                    % fieldName)
            fieldType = observedOutput.fieldType

            # Object-typed output that is entirely numeric is compared as
            # continuous doubles; anything else keeps its declared type.
            if fieldType.dataType == "object":
                try:
                    newArray = [float(x) for x in observedOutput.data]
                except ValueError:
                    pass
                else:
                    fieldType = FakeFieldType("double", "continuous")
                    observedOutput._data = newArray

            precision = verificationField.precision
            zeroThreshold = verificationField.zeroThreshold

            for index in xrange(len(dataTable)):
                record = {"field": fieldName, "index": index}

                record["expectedMissing"] = verificationField.mask[index]
                record["observedMissing"] = (
                    observedOutput.mask is not None
                    and observedOutput.mask[index] != defs.VALID)

                if record["expectedMissing"] != record["observedMissing"]:
                    record["success"] = False
                    output.append(record)

                elif not record["expectedMissing"]:
                    expected = fieldType.stringToValue(
                        verificationField.data[index])
                    observed = observedOutput.data[index]

                    record["expectedValue"] = expected
                    record["observedValue"] = observed
                    record["expectedPythonValue"] = fieldType.valueToPython(
                        expected)
                    record["observedPythonValue"] = fieldType.valueToPython(
                        observed)
                    record["expectedDisplayValue"] = fieldType.valueToString(
                        expected)
                    record["observedDisplayValue"] = fieldType.valueToString(
                        observed)

                    if fieldType.optype == "continuous":
                        if (abs(expected) <= zeroThreshold
                                and abs(observed) <= zeroThreshold):
                            record["success"] = True
                        else:
                            lowerBound = expected * (1.0 - precision)
                            upperBound = expected * (1.0 + precision)
                            # Scaling a negative expected value reverses the
                            # order of the bounds; restore it so the window
                            # is not empty.
                            if expected < 0:
                                lowerBound, upperBound = upperBound, lowerBound
                            record["success"] = (
                                lowerBound <= observed <= upperBound)

                    else:
                        record["success"] = (expected == observed)

                    if not record["success"] or showSuccess:
                        output.append(record)

        return output