示例#1
0
    def __init__(self,
                 pmmlModel,
                 pmmlPredicate,
                 parentPmmlOutput,
                 engine,
                 name=None):
        """Called by Engine when a PMML file is loaded or when a new
        segment is observed.

        Arguments:
            pmmlModel: the PMML model element for this segment; an
                <Extension> holding an X-ODG-ModelMaturity element is
                created inside it if one is not already present.
            pmmlPredicate: the predicate element for this segment, or
                None when there is no predicate to test.
            parentPmmlOutput: the enclosing <Output> element, or None;
                merged with this model's own <Output> if both exist.
            engine: the owning engine; its lockAllSegments setting is
                read to decide whether the maturity record is locked.
            name: handed to segmentNameRegistry.register together with
                id(self).
        """

        self.pmmlModel = pmmlModel
        self.pmmlPredicate = pmmlPredicate

        if pmmlPredicate is None:
            self.predicateMatches = None
            self.expressionTree = None
        else:
            def isSurrogate(node):
                return (isinstance(node, pmml.CompoundPredicate)
                        and node.attrib["booleanOperator"] == "surrogate")

            # the streamlined test is only requested when no surrogate
            # CompoundPredicate appears anywhere below the predicate
            hasSurrogate = pmmlPredicate.exists(isSurrogate, maxdepth=None)
            self.predicateMatches = pmmlPredicate.createTest(not hasSurrogate)
            self.expressionTree = pmmlPredicate.expressionTree()

        # combine this model's <Output> with the one inherited from the parent
        ownOutput = pmmlModel.child(pmml.Output, exception=False)
        if ownOutput is None:
            self.pmmlOutput = parentPmmlOutput
        elif parentPmmlOutput is None:
            self.pmmlOutput = ownOutput
        else:
            # work on a copy so the parent's <Output> is not modified
            self.pmmlOutput = parentPmmlOutput.copy()
            self.pmmlOutput.children.extend(
                ownOutput.matches(pmml.OutputField))
            self.pmmlOutput.validate()

        self.engine = engine
        self.segmentNameRegistry.register(name, id(self))

        # the X-ODG-ModelMaturity element keeps track of how many updates this segment has seen
        existingExtension = self.pmmlModel.child(pmml.Extension,
                                                 exception=False)
        if existingExtension is None:
            # no <Extension> at all: build one and put it first in the model
            newExtension = pmml.Extension()
            self.pmmlModelMaturity = pmml.X_ODG_ModelMaturity(
                numUpdates=0, locked=self.engine.lockAllSegments)
            newExtension.children.append(self.pmmlModelMaturity)
            self.pmmlModel.children.insert(0, newExtension)
            return

        self.pmmlModelMaturity = existingExtension.child(
            pmml.X_ODG_ModelMaturity, exception=False)
        if self.pmmlModelMaturity is None:
            self.pmmlModelMaturity = pmml.X_ODG_ModelMaturity(
                numUpdates=0, locked=self.engine.lockAllSegments)
            existingExtension.children.append(self.pmmlModelMaturity)
        elif self.engine.lockAllSegments:
            # Always lock if the user asked for it in the configuration file
            self.pmmlModelMaturity.attrib["locked"] = True
示例#2
0
    def _updateDistribution_first(self):
        """Create the updators that will produce the baseline distribution
        and, if alternateField is set, the alternate distribution.

        Sets self.baselineUpdator (and self.baselinePartialSums for
        Poisson/Gaussian baselines).  When self.alternateField is not
        None, also sets self.alternate, self.alternateUpdator (and
        self.alternatePartialSums for Poisson/Gaussian alternates);
        otherwise sets self.alternate to None.

        Raises:
            NotImplementedError: a distribution is not Gaussian, Poisson,
                or Uniform.
            RuntimeError: alternateField is set but the PMML has no
                <Alternate/> distribution.
        """
        partialSums, self.baselineUpdator = self._makeDistributionUpdator(
            self.baseline)
        if partialSums is not None:
            # Uniform distributions have no partial sums; leave the attribute untouched for them
            self.baselinePartialSums = partialSums

        if self.alternateField is not None:
            # NOTE(review): testDistributions is not bound anywhere in this
            # method; it must come from an enclosing/module scope that is
            # not visible here -- confirm it is defined, otherwise this
            # branch raises NameError.
            if not testDistributions.exists(pmml.Alternate):
                raise RuntimeError(
                    "alternateField requested but there is no <Alternate/> distribution in the PMML"
                )

            self.alternate = testDistributions.child(pmml.Alternate).child()

            partialSums, self.alternateUpdator = self._makeDistributionUpdator(
                self.alternate)
            if partialSums is not None:
                self.alternatePartialSums = partialSums

        else:
            self.alternate = None

    def _makeDistributionUpdator(self, distribution):
        """Build the producer updator for a single distribution element.

        For Poisson and Gaussian distributions an X-ODG-PartialSums
        element is looked up (or created inside the distribution's
        <Extension>), used to seed the updator when self.updateExisting
        is set, and then overwritten with the updator's current counters.
        For Uniform distributions the updator tracks MIN and MAX and is
        seeded from the "lower"/"upper" attributes when
        self.updateExisting is set.

        Returns:
            (partialSums, updator); partialSums is None for a Uniform
            distribution.

        Raises:
            NotImplementedError: the distribution is of any other type.
        """
        # pair each counter with the attribute name under which it is stored
        if isinstance(distribution, pmml.PoissonDistribution):
            sums = ((SUM1, "SUM1"), (SUMX, "SUMX"))

        elif isinstance(distribution, pmml.GaussianDistribution):
            sums = ((SUM1, "SUM1"), (SUMX, "SUMX"), (SUMXX, "SUMXX"))

        elif isinstance(distribution, pmml.UniformDistribution):
            updator = self.engine.producerUpdateScheme.updator(MIN, MAX)
            if self.updateExisting:
                updator.initialize({
                    MIN: distribution.attrib["lower"],
                    MAX: distribution.attrib["upper"]
                })
            return None, updator

        else:
            raise NotImplementedError(
                "Only production of Gaussian, Poisson, and Uniform distributions has been implemented."
            )

        partialSums = distribution.descendant(pmml.X_ODG_PartialSums,
                                              exception=False,
                                              maxdepth=2)
        if partialSums is None:
            partialSums = pmml.X_ODG_PartialSums()
            if not distribution.exists(pmml.Extension):
                distribution.children.append(pmml.Extension())
            distribution.child(pmml.Extension).children.append(partialSums)

        updator = self.engine.producerUpdateScheme.updator(
            *[counter for counter, attribName in sums])
        if self.updateExisting:
            initial = {COUNT: partialSums.attrib.get("COUNT", 0)}
            for counter, attribName in sums:
                initial[counter] = partialSums.attrib.get(attribName, 0.)
            updator.initialize(initial)

        # write the updator's starting counters back into the PMML element
        if COUNT in updator.counters:
            partialSums.attrib["COUNT"] = updator.counters[COUNT]
        for counter, attribName in sums:
            partialSums.attrib[attribName] = updator.counters[counter]

        return partialSums, updator
示例#3
0
    def initialize(self):
        """Interpret PMML file, set up SegmentRecords list, and
        initialize all algorithms.

        Raises:
            RuntimeError: self.modelName is set but no top-level model
                carries that modelName.
            NotImplementedError: an unsupported aggregate transformation
                or multipleModelMethod is encountered.
        """

        self.firstSegment = True

        # find (or create) the header's Extension, so that our models can be stamped with time and event number
        header = self.pmmlFile.child(pmml.Header)
        if not header.exists(pmml.Extension):
            headerExtension = pmml.Extension()
            header.children.insert(0, headerExtension)
        else:
            headerExtension = header.child(pmml.Extension)

        # replace any previous random seed with the current one
        if headerExtension.exists(pmml.X_ODG_RandomSeed):
            del headerExtension[headerExtension.index(pmml.X_ODG_RandomSeed)]
        headerExtension.children.append(
            pmml.X_ODG_RandomSeed(value=self.augustusRandomSeed))

        # restart the event counter
        if headerExtension.exists(pmml.X_ODG_Eventstamp):
            del headerExtension[headerExtension.index(pmml.X_ODG_Eventstamp)]
        self.eventStamp = pmml.X_ODG_Eventstamp(number=0)
        headerExtension.children.append(self.eventStamp)

        # replace any previous timestamp with the current time
        if header.exists(pmml.Timestamp):
            del header[header.index(pmml.Timestamp)]
        now = datetime.datetime.today().isoformat()
        self.timeStamp = pmml.Timestamp(xmlbase.XMLText(now))
        header.children.append(self.timeStamp)

        # select the first model or select a model by name
        if self.modelName is None:
            self.pmmlModel = self.pmmlFile.topModels[0]
        else:
            self.pmmlModel = None
            for candidate in self.pmmlFile.topModels:
                if "modelName" in candidate.attrib and candidate.attrib[
                        "modelName"] == self.modelName:
                    self.pmmlModel = candidate
                    break
            else:
                raise RuntimeError(
                    "No model named \"%s\" was found in the PMML file" %
                    self.modelName)

        # connect the dataContext to the dataStream, so that events will flow from the input file into the transformations
        self.resetDataStream(self.dataStream)

        # clear the cache of the model DataContexts (initializes some dictionaries)
        self.pmmlModel.dataContext.clear()
        if self.pmmlModel.dataContext.transformationDictionary:
            numTransformations = len(
                self.pmmlModel.dataContext.transformationDictionary.cast)
        else:
            numTransformations = 0
        self.metadata.data[
            "Transformation dictionary elements"] = numTransformations

        self.segmentRecords = []
        self._lookup = NameSpace(tuples={}, fields={}, other=[])
        SegmentRecord.maturityThreshold = self.maturityThreshold
        SegmentRecord.lockingThreshold = self.lockingThreshold

        if self.pmmlFile.exists(pmml.TransformationDictionary):
            for unsupported, complaint in (
                (pmml.Aggregate,
                 "Aggregate transformations in the TransformationDictionary are not supported"
                 ),
                (pmml.X_ODG_AggregateReduce,
                 "X-ODG-AggregateReduce transformations in the TransformationDictionary are not supported"
                 ),
            ):
                if self.pmmlFile.child(pmml.TransformationDictionary).exists(
                        unsupported, maxdepth=None):
                    raise NotImplementedError(complaint)

        if not isinstance(self.pmmlModel, pmml.MiningModel):
            # a single, unsegmented model: exactly one SegmentRecord
            self.multipleModelMethod = SELECTONLY

            segmentRecord = SegmentRecord(self.pmmlModel, None, None, self)

            modelClass = self.pmmlModel.__class__
            algoName = self.producerAlgorithm[
                modelClass.__name__].attrib["algorithm"]
            segmentRecord.consumerAlgorithm = consumerAlgorithmMap[modelClass](
                self, segmentRecord)
            segmentRecord.producerAlgorithm = producerAlgorithmMap[
                modelClass, algoName](self, segmentRecord)
            segmentRecord.producerParameters = self.producerAlgorithm[
                modelClass.__name__].parameters
            self.setProvenance(self.pmmlModel, algoName,
                               segmentRecord.producerAlgorithm,
                               segmentRecord.producerParameters)

            localTransformations = self.pmmlModel.child(
                pmml.LocalTransformations, exception=False)
            if localTransformations is None:
                segmentRecord.aggregates = []
            else:
                segmentRecord.aggregates = localTransformations.matches(
                    pmml.Aggregate, maxdepth=None)
                segmentRecord.aggregates.extend(
                    localTransformations.matches(pmml.X_ODG_AggregateReduce,
                                                 maxdepth=None))
            for aggregate in segmentRecord.aggregates:
                aggregate.initialize(self.consumerUpdateScheme)

            self.segmentRecords.append(segmentRecord)
            self.metadata.data[
                "First segment model type"] = segmentRecord.pmmlModel.tag

        else:
            # MiningModels are special because we handle segmentation at the Engine level
            # Currently no support for MiningModels nested within MiningModels
            self.pmmlOutput = self.pmmlModel.child(pmml.Output,
                                                   exception=False)
            segmentation = self.pmmlModel.child(pmml.Segmentation,
                                                exception=False)
            # for now, assume a MiningModel without any segments will be populated through autosegmentation

            if self.pmmlModel.exists(pmml.LocalTransformations):
                for unsupported, complaint in (
                    (pmml.Aggregate,
                     "Aggregate transformations in the MiningModel's LocalTransformations are not supported"
                     ),
                    (pmml.X_ODG_AggregateReduce,
                     "X-ODG-AggregateReduce transformations in the MiningModel's LocalTransformations are not supported"
                     ),
                ):
                    if self.pmmlModel.child(pmml.LocalTransformations).exists(
                            unsupported, maxdepth=None):
                        raise NotImplementedError(complaint)

            # NOTE(review): child(..., exception=False) appears to return
            # None when the element is absent (see the None test on
            # LocalTransformations in the other branch); if <Segmentation>
            # is missing, the attribute access below would fail -- confirm.
            method = segmentation.attrib["multipleModelMethod"]
            if method == "selectFirst":
                self.multipleModelMethod = SELECTFIRST
            elif method == "selectAll":
                self.multipleModelMethod = SELECTALL
            else:
                raise NotImplementedError(
                    "Only 'selectFirst', 'selectAll', and no segmentation have been implemented."
                )
            self.metadata.data[
                "Match all segments"] = self.multipleModelMethod != SELECTFIRST

            for pmmlSegment in segmentation.matches(pmml.Segment):
                self._makeSegmentRecord(pmmlSegment)

        self.reinitialize()
示例#4
0
    def initialize(self, **params):
        """Initialize a clustering model producer.

        Recognized keyword parameters (each is optional):
            resume: interpreted by pmml.boolCheck; default False.
            numberOfTrials: int, default 10.
            numberToKeep: int, default 3.
            maturityThreshold: int, default 100.
            initialStability: int, default 100.
            overrideSignificance: float, default 5.; a value of 0 is
                stored as None.

        Raises:
            TypeError: any other keyword parameter was supplied (checked
                after all of the set-up work has been done).
        """

        if "resume" in params:
            self.resume = pmml.boolCheck(params.pop("resume"))
        else:
            self.resume = False

        # integer-valued options and their defaults, in the order they are set
        for option, default in (("numberOfTrials", 10),
                                ("numberToKeep", 3),
                                ("maturityThreshold", 100),
                                ("initialStability", 100)):
            if option in params:
                setattr(self, option, int(params.pop(option)))
            else:
                setattr(self, option, default)

        if "overrideSignificance" in params:
            significance = float(params.pop("overrideSignificance"))
            if significance == 0.:
                self.overrideSignificance = None
            else:
                self.overrideSignificance = significance
        else:
            self.overrideSignificance = 5.

        self.model = self.segmentRecord.pmmlModel
        self.dataDistribution = self.engine.producerUpdateScheme.updator(
            COVARIANCE(self.model.numberOfFields))

        comparisonMeasure = self.model.child(pmml.ComparisonMeasure)
        self.distanceMeasure = (comparisonMeasure.attrib["kind"] == "distance")

        # put PartialSums in the model if they're not already there; pick up old values if you're resuming
        extension = self.model.child(pmml.Extension, exception=False)
        if extension is None:
            extension = pmml.Extension()
            self.model.children.append(extension)

        def previousPartialSums(target):
            """Return the existing PartialSums called target when resuming;
            otherwise delete it (if present) and return None."""
            isTarget = lambda x: isinstance(x, pmml.X_ODG_PartialSums) and x.attrib.get("name", None) == target
            if self.resume:
                return extension.child(isTarget, exception=False)
            position = extension.index(isTarget, exception=False)
            if position is not None:
                del extension[position]
            return None

        self.sumOfDistances = previousPartialSums("SumOfDistances")
        if self.sumOfDistances is None:
            self.sumOfDistances = pmml.X_ODG_PartialSums(
                name="SumOfDistances", COUNT=0, SUM1=0., SUMX=0., SUMXX=0.)
            extension.children.append(self.sumOfDistances)

        # one PartialSums element per (cluster id, field) pair
        self.partialSums = {}
        for theid, cluster in zip(self.model.ids, self.model.cluster):
            for i, field in enumerate(self.model.fields):
                fullname = "%s.%s" % (theid, field)

                partialSum = previousPartialSums(fullname)
                if partialSum is None:
                    partialSum = pmml.X_ODG_PartialSums(
                        name=fullname, SUM1=1., SUMX=cluster.value[i])
                    extension.children.append(partialSum)

                self.partialSums[fullname] = partialSum

        # create the first trial using the values constructed above (they come from the PMML file if resume is True)
        trialFromPmml = new.instance(TrialClusterSet)
        trialFromPmml.updator = self.engine.producerUpdateScheme.updator(
            COUNT, SUM1, SUMX, SUMXX)
        distances = self.sumOfDistances.attrib
        trialFromPmml.updator.initialize({
            COUNT: distances["COUNT"],
            SUM1: distances["SUM1"],
            SUMX: distances["SUMX"],
            SUMXX: distances["SUMXX"],
        })

        trialFromPmml.clusters = []
        for theid, cluster in zip(self.model.ids, self.model.cluster):
            trialCluster = new.instance(TrialCluster)
            trialCluster.fields = []
            for field in self.model.fields:
                stored = self.partialSums["%s.%s" % (theid, field)].attrib
                fieldUpdator = self.engine.producerUpdateScheme.updator(
                    SUM1, SUMX)
                fieldUpdator.initialize({
                    SUM1: stored["SUM1"],
                    SUMX: stored["SUMX"]
                })
                trialCluster.fields.append(fieldUpdator)
            trialCluster.initialPosition = list(cluster.value)
            trialFromPmml.clusters.append(trialCluster)

        self.trials = [trialFromPmml]

        # anything still left in params was not consumed above
        if params:
            raise TypeError("Unrecognized parameters %s" % params)
示例#5
0
    def produce(self):
        """Run k-means on the buffered data and write the best cluster set
        into the PMML model.

        The method looks up (or creates) the model's <Extension> and its
        X-ODG-Convergence element, adjusts the number of clusters to the
        amount of data, seeds self.numberOfTrials trial cluster sets
        according to self.seedSource, runs self.iterations() first on the
        quick-converge subsamples and then on the whole buffer, and
        finally copies the best trial (converged if any did, otherwise
        the best unconverged one) into the model's clusters.

        Returns None.  Returns early, without clustering, when the buffer
        is empty or when there are exactly as many records as clusters.

        Raises:
            NotImplementedError: the model's comparison measure is not a
                distance (self.distanceMeasure is false).
        """
        self.resetLoggerLevels()

        extension = self.model.child(pmml.Extension, exception=False)
        if extension is None:
            extension = pmml.Extension()
            self.model.children.append(extension)

        convergence = extension.child(pmml.X_ODG_Convergence, exception=False)
        if convergence is None:
            convergence = pmml.X_ODG_Convergence()
            extension.children.append(convergence)

        numRecords = len(self.buffer[self.SYNCNUMBER])

        if self.logDebug:
            self.logger.debug(
                "KMeansClustering.produce: this segment has %d data records; setting up for cluster production."
                % numRecords)

        if numRecords == 0:
            self.logger.debug(
                "KMeansClustering.produce: no data in this segment, so there are no clusters to produce."
            )
            return

        if self.numberOfClusters is not None:
            if self.numberOfClusters > numRecords:
                # report the requested count that was actually compared
                # (the model's current count may differ from it)
                self.logger.info(
                    "KMeansClustering.produce: number of desired clusters (%d) exceeds number of data records (%d), reducing number of clusters to match."
                    % (self.numberOfClusters, numRecords))
                self.model.changeNumberOfClusters(numRecords)
            elif self.numberOfClusters != self.model.numberOfClusters:
                self.model.changeNumberOfClusters(self.numberOfClusters)

        elif self.model.numberOfClusters > numRecords:
            self.logger.info(
                "KMeansClustering.produce: number of desired clusters (%d) exceeds number of data records (%d), reducing number of clusters to match."
                % (self.model.numberOfClusters, numRecords))
            self.model.changeNumberOfClusters(numRecords)

        # special case that should be easy, but it can cause the standard k-means algorithm to infinite loop:
        if self.model.numberOfClusters == numRecords:
            self.logger.debug(
                "KMeansClustering.produce: number of records equals the number of clusters (%d), so we skip the standard algorithm and just assign data points to clusters"
                % numRecords)
            for i, pmmlCluster in enumerate(self.model.cluster):
                pmmlCluster.value = [
                    self.buffer[field][i] for field in self.model.fields
                ]
                pmmlCluster.attrib["n"] = len(pmmlCluster.value)
            return

        # defaults: identity transformation and zero shift
        self.trans = numpy.matrix(numpy.identity(len(self.model.fields)))
        self.shift = numpy.matrix(numpy.zeros(len(self.model.fields))).T

        if self.distanceMeasure:
            # characterize the data so that you can generate random numbers with the same distribution
            try:
                covariance = self.dataDistribution.covariance()
            except ZeroDivisionError:
                covariance = INVALID

            if covariance is not INVALID:
                self.shift = self.dataDistribution.covmean()
                try:
                    self.trans = numpy.linalg.cholesky(covariance)
                except numpy.linalg.LinAlgError:
                    # deliberately best-effort: keep the identity transformation
                    pass  # FIXME: at least make trans a diagonal matrix with stdev entries (or 1/stdev)!

        else:
            raise NotImplementedError(
                "Currently, only clusters with ComparisonMeasure.kind == 'distance' metrics can be produced."
            )

        # make a new set of trials
        # NOTE(review): there is no fallback branch below; if seedSource
        # matches none of the four options, self.randomization keeps
        # whatever value it had before this call (or is unset) -- confirm
        # that seedSource is validated elsewhere.
        if self.seedSource is ProducerKMeans.RANDOM_DATAPOINTS:
            # pick a random point from the dataset
            def randomization():
                i = random.randint(0, len(self.buffer[self.SYNCNUMBER]) - 1)
                return [
                    self.buffer[field][i] for field in self.model.fields
                    if field is not self.SYNCNUMBER
                ]

            self.randomization = randomization

        elif self.seedSource == ProducerKMeans.RANDOM_DATAWEIGHTED:
            # pick a random point from the dataset, weighted by their weights
            sumOfWeights = numpy.cumsum(self.buffer[self.model.weightField])

            def randomization():
                x = random.uniform(0., sumOfWeights[-1])
                i = numpy.where(sumOfWeights > x)[0][0]
                return [
                    self.buffer[field][i] for field in self.model.fields
                    if field is not self.SYNCNUMBER
                ]

            self.randomization = randomization

        elif self.seedSource == ProducerKMeans.RANDOM_DATACOVARIANCE:
            # generate a random point from a distribution with a covariance like the data
            self.randomization = lambda: ((self.trans * (numpy.matrix(
                numpy.random.randn(len(self.shift))).T)) + self.shift)

        elif self.seedSource == ProducerKMeans.RANDOM_UNITRECT:
            # generate a random point in the unit rectangle
            self.randomization = lambda: [
                random.random() for i in xrange(len(self.shift))
            ]

        self.trials = [
            TrialClusterSet(self.model.numberOfClusters, self.randomization,
                            self.engine.producerUpdateScheme)
            for i in xrange(self.numberOfTrials)
        ]

        # prepare small subsamples to run first to improve convergence when the whole dataset gets used
        allIndices = range(len(self.buffer[self.SYNCNUMBER]))
        quickConvergeSamples = []
        for numEvents in self.quickConvergeSteps:
            if numEvents > len(allIndices):
                numEvents = len(allIndices)
            quickConvergeSamples.append(
                numpy.array(random.sample(allIndices, numEvents)))

        # from here on the buffer columns are numpy arrays
        allIndices = numpy.array(allIndices)
        for key in self.buffer:
            self.buffer[key] = numpy.array(self.buffer[key])

        for i, quickConvergenceSample in enumerate(quickConvergeSamples):
            if self.logDebug:
                self.logger.debug(
                    "KMeansClustering.produce: ===== quickConverge %d: preparing for k-means by clustering a random subset of %d events"
                    % (i + 1, len(quickConvergenceSample)))
            self.iterations(quickConvergenceSample)

        self.logger.debug(
            "KMeansClustering.produce: ===== starting k-means clustering algorithm (whole dataset)"
        )
        convergence.attrib["iterations"] = self.iterations()

        # find the best one: the converged trial with the smallest updator mean
        best = None
        for trial in self.trials:
            if trial.hasConverged:
                if best is None or trial.updator.mean() < best.updator.mean():
                    best = trial

        convergence.attrib["converged"] = (best is not None)

        if best is None:
            # choose the text for the iteration limit first: a conditional
            # expression binds more loosely than %, so writing it inline
            # after the format would replace the whole message with "unset"
            if self.maxIterations is not None:
                iterationLimit = str(self.maxIterations)
            else:
                iterationLimit = "unset"
            self.logger.error(
                "KMeansClustering.produce: no trial cluster-sets converged within the desired number of iterations (%s), using the best UNCONVERGED set instead."
                % iterationLimit)
            for trial in self.trials:
                if best is None or trial.updator.mean() < best.updator.mean():
                    best = trial

        # write it to the PMML file
        for bestCluster, pmmlCluster in zip(best.clusters,
                                            self.model.matches(pmml.Cluster)):
            pmmlCluster.attrib["size"] = bestCluster.count()
            theArray = pmmlCluster.child(pmml.Array)
            theArray.value = bestCluster.initialPosition
            theArray.attrib["n"] = len(theArray.value)