Example #1
class ProducerKMeans(ProducerAlgorithm):
    """The standard k-means clustering algorithm."""

    SYNCNUMBER = Atom("SyncNumber")
    RANDOM_DATAPOINTS = Atom("Random_DataPoints")
    RANDOM_DATAWEIGHTED = Atom("Random_DataWeighted")
    RANDOM_DATACOVARIANCE = Atom("Random_DataCovariance")
    RANDOM_UNITRECT = Atom("Random_UnitRect")

    defaultParams = {
        "updateExisting": "false",
        "quickConvergeSteps": "()",
        "numberOfClusters": "unset",
        "seedSource": "dataPoints",
        "numberOfTrials": "20",
        "numberToConverge": "5",
        "maxIterations": "unset",
        "closeEnough": "0"
    }
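    # All defaults are strings: parameter values arrive as strings (presumably
    # from the configuration file) and are parsed in initialize() below.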

    def initialize(self, **params):
        if "updateExisting" in params:
            self.updateExisting = pmml.boolCheck(params["updateExisting"])
            del params["updateExisting"]
            if self.updateExisting:
                raise NotImplementedError(
                    "Updating from existing ClusterModels using 'kmeans' not implemented; use mode='replaceExisting'"
                )
        else:
            self.updateExisting = pmml.boolCheck(
                self.defaultParams["updateExisting"])

        if "quickConvergeSteps" in params:
            try:
                self.quickConvergeSteps = eval(params["quickConvergeSteps"])
                if not isinstance(self.quickConvergeSteps, tuple):
                    raise RuntimeError
                self.quickConvergeSteps = map(int, self.quickConvergeSteps)
            except Exception:
                raise RuntimeError(
                    "quickConvergeSteps must be a tuple of numbers of events")
            del params["quickConvergeSteps"]
        else:
            self.quickConvergeSteps = eval(
                self.defaultParams["quickConvergeSteps"])

        if "numberOfClusters" in params:
            self.numberOfClusters = params["numberOfClusters"]
            del params["numberOfClusters"]
        else:
            self.numberOfClusters = self.defaultParams["numberOfClusters"]
        try:
            self.numberOfClusters = int(self.numberOfClusters)
            if self.numberOfClusters <= 0: raise ValueError
        except ValueError:
            if self.numberOfClusters == "unset":
                self.numberOfClusters = None
            else:
                raise RuntimeError(
                    "numberOfClusters must be a positive integer or \"unset\", not \"%s\""
                    % self.numberOfClusters)

        if "seedSource" in params:
            self.seedSource = params["seedSource"]
            del params["seedSource"]
        else:
            self.seedSource = self.defaultParams["seedSource"]
        if self.seedSource == "dataPoints":
            self.seedSource = self.RANDOM_DATAPOINTS
        elif self.seedSource == "dataWeighted":
            self.seedSource = self.RANDOM_DATAWEIGHTED
        elif self.seedSource == "dataCovariance":
            self.seedSource = self.RANDOM_DATACOVARIANCE
        elif self.seedSource == "unitRect":
            self.seedSource = self.RANDOM_UNITRECT
        else:
            raise NotImplementedError(
                "The seedSource must be one of 'dataPoints', 'dataWeighted', 'dataCovariance', 'unitRect'"
            )

        if "numberOfTrials" in params:
            self.numberOfTrials = int(params["numberOfTrials"])
            del params["numberOfTrials"]
        else:
            self.numberOfTrials = int(self.defaultParams["numberOfTrials"])

        if "numberToConverge" in params:
            self.numberToConverge = int(params["numberToConverge"])
            del params["numberToConverge"]
        else:
            self.numberToConverge = int(self.defaultParams["numberToConverge"])

        if self.numberToConverge > self.numberOfTrials:
            raise RuntimeError(
                "numberToConverge (%d) must not be greater than numberOfTrials (%d)"
                % (self.numberToConverge, self.numberOfTrials))

        if "maxIterations" in params:
            self.maxIterations = params["maxIterations"]
            del params["maxIterations"]
        else:
            self.maxIterations = self.defaultParams["maxIterations"]
        try:
            self.maxIterations = int(self.maxIterations)
            if self.maxIterations <= 0: raise ValueError
        except ValueError:
            if self.maxIterations == "unset":
                self.maxIterations = None
            else:
                raise RuntimeError(
                    "maxIterations must be a positive integer or \"unset\", not \"%s\""
                    % self.maxIterations)

        if "closeEnough" in params:
            self.closeEnough = float(params["closeEnough"])
            del params["closeEnough"]
        else:
            self.closeEnough = float(self.defaultParams["closeEnough"])

        self.model = self.segmentRecord.pmmlModel
        self.dataDistribution = self.engine.producerUpdateScheme.updator(
            COVARIANCE(self.model.numberOfFields))
        self.distanceMeasure = (self.model.child(
            pmml.ComparisonMeasure).attrib["kind"] == "distance")

        if self.seedSource == self.RANDOM_DATAWEIGHTED and self.model.weightField is None:
            self.seedSource = self.RANDOM_DATAPOINTS

        # get rid of any PartialSums objects, since they would be misleading (this algorithm doesn't use them)
        extension = self.model.child(pmml.Extension, exception=False)
        if extension is not None:
            newChildren = []
            for child in extension.children:
                if not isinstance(child, pmml.X_ODG_PartialSums):
                    newChildren.append(child)
            extension.children = newChildren

        self.buffer = {self.SYNCNUMBER: []}
        for field in self.model.fields:
            self.buffer[field] = []

        if self.model.weightField is not None:
            self.buffer[self.model.weightField] = []

        if len(params) > 0:
            raise TypeError("Unrecognized parameters %s" % params)

    def update(self, syncNumber, get):
        self.resetLoggerLevels()

        vector = [get(field) for field in self.model.fields]
        if INVALID in vector:
            self.logger.debug(
                "KMeansClustering.update: returning False (INVALID data)")
            return False

        if self.model.weightField is not None:
            weight = get(self.model.weightField)
            if weight is INVALID or weight is MISSING:
                self.logger.debug(
                    "KMeansClustering.update: returning False (INVALID or MISSING weight)"
                )
                return False
            self.buffer[self.model.weightField].append(weight)

        self.buffer[self.SYNCNUMBER].append(syncNumber)
        for i, field in enumerate(self.model.fields):
            self.buffer[field].append(vector[i])

        if self.distanceMeasure and MISSING not in vector:
            self.dataDistribution.increment(syncNumber, vector)

        return True

    def produce(self):
        self.resetLoggerLevels()

        extension = self.model.child(pmml.Extension, exception=False)
        if extension is None:
            extension = pmml.Extension()
            self.model.children.append(extension)

        convergence = extension.child(pmml.X_ODG_Convergence, exception=False)
        if convergence is None:
            convergence = pmml.X_ODG_Convergence()
            extension.children.append(convergence)

        numRecords = len(self.buffer[self.SYNCNUMBER])

        if self.logDebug:
            self.logger.debug(
                "KMeansClustering.produce: this segment has %d data records; setting up for cluster production."
                % numRecords)

        if numRecords == 0:
            self.logger.debug(
                "KMeansClustering.produce: no data in this segment, so there are no clusters to produce."
            )
            return

        if self.numberOfClusters is not None:
            if self.numberOfClusters > numRecords:
                self.logger.info(
                    "KMeansClustering.produce: number of desired clusters (%d) exceeds number of data records (%d), reducing number of clusters to match."
                    % (self.numberOfClusters, numRecords))
                self.model.changeNumberOfClusters(numRecords)
            elif self.numberOfClusters != self.model.numberOfClusters:
                self.model.changeNumberOfClusters(self.numberOfClusters)

        elif self.model.numberOfClusters > numRecords:
            self.logger.info(
                "KMeansClustering.produce: number of desired clusters (%d) exceeds number of data records (%d), reducing number of clusters to match."
                % (self.model.numberOfClusters, numRecords))
            self.model.changeNumberOfClusters(numRecords)

        # special case that should be easy, but it can cause the standard k-means algorithm to infinite loop:
        if self.model.numberOfClusters == numRecords:
            self.logger.debug(
                "KMeansClustering.produce: number of records equals the number of clusters (%d), so we skip the standard algorithm and just assign data points to clusters"
                % numRecords)
            for i, pmmlCluster in enumerate(self.model.cluster):
                pmmlCluster.value = [
                    self.buffer[field][i] for field in self.model.fields
                ]
                pmmlCluster.attrib["n"] = len(pmmlCluster.value)
            return

        self.trans = numpy.matrix(numpy.identity(len(self.model.fields)))
        self.shift = numpy.matrix(numpy.zeros(len(self.model.fields))).T

        if self.distanceMeasure:
            # characterize the data so that you can generate random numbers with the same distribution
            try:
                covariance = self.dataDistribution.covariance()
            except ZeroDivisionError:
                covariance = INVALID

            if covariance is not INVALID:
                self.shift = self.dataDistribution.covmean()
                try:
                    self.trans = numpy.linalg.cholesky(covariance)
                except numpy.linalg.LinAlgError:
                    pass  # FIXME: at least make trans a diagonal matrix with stdev entries (or 1/stdev)!

        else:
            raise NotImplementedError(
                "Currently, only clusters with ComparisonMeasure.kind == 'distance' metrics can be produced."
            )

        # make a new set of trials
        if self.seedSource is ProducerKMeans.RANDOM_DATAPOINTS:
            # pick a random point from the dataset
            def randomization():
                i = random.randint(0, len(self.buffer[self.SYNCNUMBER]) - 1)
                return [
                    self.buffer[field][i] for field in self.model.fields
                    if field is not self.SYNCNUMBER
                ]

            self.randomization = randomization

        elif self.seedSource == ProducerKMeans.RANDOM_DATAWEIGHTED:
            # pick a random point from the dataset, weighted by their weights
            sumOfWeights = numpy.cumsum(self.buffer[self.model.weightField])

            def randomization():
                x = random.uniform(0., sumOfWeights[-1])
                i = numpy.where(sumOfWeights > x)[0][0]
                return [
                    self.buffer[field][i] for field in self.model.fields
                    if field is not self.SYNCNUMBER
                ]

            self.randomization = randomization
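
            # The cumulative-sum lookup above is inverse-CDF sampling: a data
            # point with weight w is chosen with probability w / sum(weights).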

        elif self.seedSource == ProducerKMeans.RANDOM_DATACOVARIANCE:
            # generate a random point from a distribution with a covariance like the data
            self.randomization = lambda: ((self.trans * (numpy.matrix(
                numpy.random.randn(len(self.shift))).T)) + self.shift)

        elif self.seedSource == ProducerKMeans.RANDOM_UNITRECT:
            # generate a random point in the unit rectangle
            self.randomization = lambda: [
                random.random() for i in xrange(len(self.shift))
            ]

        self.trials = [
            TrialClusterSet(self.model.numberOfClusters, self.randomization,
                            self.engine.producerUpdateScheme)
            for i in xrange(self.numberOfTrials)
        ]

        # prepare small subsamples to run first to improve convergence when the whole dataset gets used
        allIndices = range(len(self.buffer[self.SYNCNUMBER]))
        quickConvergeSamples = []
        for numEvents in self.quickConvergeSteps:
            if numEvents > len(allIndices):
                numEvents = len(allIndices)
            quickConvergeSamples.append(
                numpy.array(random.sample(allIndices, numEvents)))

        allIndices = numpy.array(allIndices)
        for key in self.buffer:
            self.buffer[key] = numpy.array(self.buffer[key])

        for i, quickConvergenceSample in enumerate(quickConvergeSamples):
            if self.logDebug:
                self.logger.debug(
                    "KMeansClustering.produce: ===== quickConverge %d: preparing for k-means by clustering a random subset of %d events"
                    % (i + 1, len(quickConvergenceSample)))
            self.iterations(quickConvergenceSample)

        self.logger.debug(
            "KMeansClustering.produce: ===== starting k-means clustering algorithm (whole dataset)"
        )
        convergence.attrib["iterations"] = self.iterations()

        # find the best one
        best = None
        for trial in self.trials:
            if trial.hasConverged:
                if best is None or trial.updator.mean() < best.updator.mean():
                    best = trial

        convergence.attrib["converged"] = (best is not None)

        if best is None:
            self.logger.error(
                "KMeansClustering.produce: no trial cluster-sets converged within the desired number of iterations (%s), using the best UNCONVERGED set instead."
                % (str(self.maxIterations)
                   if self.maxIterations is not None else "unset"))
            for trial in self.trials:
                if best is None or trial.updator.mean() < best.updator.mean():
                    best = trial

        # write it to the PMML file
        for bestCluster, pmmlCluster in zip(best.clusters,
                                            self.model.matches(pmml.Cluster)):
            pmmlCluster.attrib["size"] = bestCluster.count()
            theArray = pmmlCluster.child(pmml.Array)
            theArray.value = bestCluster.initialPosition
            theArray.attrib["n"] = len(theArray.value)

    def iterations(self, indices=None):
        if indices is None:
            dataset = self.buffer
        else:
            dataset = {}
            for key, value in self.buffer.items():
                dataset[key] = value[indices]

        # loop over the data many times until a subset of trials converge
        iteration = 0
        while True:
            iteration += 1

            # set "initialPosition" to the mean within each cluster
            for trial in self.trials:
                trial.reset()

            if self.logDebug:
                self.logger.debug("KMeansClustering.produce: iteration %d" %
                                  iteration)

            # loop over data (pre-calculated, including all derived fields)
            for i in xrange(len(dataset[self.SYNCNUMBER])):
                if self.logDebug and i % 10000 == 0:
                    self.logger.debug("    event %d/%d = %g%%" %
                                      (i, len(dataset[self.SYNCNUMBER]), 100. *
                                       i / len(dataset[self.SYNCNUMBER])))

                syncNumber = dataset[self.SYNCNUMBER][i]
                vector = [dataset[field][i] for field in self.model.fields]

                weight = None
                if self.model.weightField is not None:
                    weight = dataset[self.model.weightField][i]

                for trial in self.trials:
                    trial.update(syncNumber, vector, self.model, False, weight)

            if self.logDebug:
                self.logger.debug("    event %d/%d = 100%%" % (len(
                    dataset[self.SYNCNUMBER]), len(dataset[self.SYNCNUMBER])))

            self.logger.debug(
                "KMeansClustering.produce: about to sort the trials")

            self.trials.sort(
                lambda a, b: -cmp(a.updator.mean(), b.updator.mean()))
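            # (the negated cmp sorts trials in descending order of updator.mean())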

            self.logger.debug(
                "KMeansClustering.produce: about to check convergence of the trials"
            )

            numConverged = 0
            for trial in self.trials:
                if trial.converged(self.closeEnough):
                    trial.hasConverged = True

                    numConverged += 1

                else:
                    trial.hasConverged = False

            if self.logDebug:
                self.logger.debug(
                    "KMeansClustering.produce: iteration %d has %d converged cluster-set trials"
                    % (iteration, numConverged))

                best = None
                for trial in self.trials:
                    if trial.hasConverged:
                        if best is None or trial.updator.mean(
                        ) < best.updator.mean():
                            best = trial

                if best is not None:
                    self.logger.debug("    best CONVERGED so far: %s" %
                                      " ".join(map(repr, best.clusters)))
                else:
                    best = None
                    for trial in self.trials:
                        if best is None or trial.updator.mean(
                        ) < best.updator.mean():
                            best = trial

                    if best is not None:
                        self.logger.debug("    best so far: %s" %
                                          " ".join(map(repr, best.clusters)))

            for trial in self.trials:
                # self.logger.debug("    show all: %s%s" % (" ".join(map(repr, trial.clusters)), " (converged)" if trial.hasConverged else ""))
                trial.rethrowInvalid(self.randomization,
                                     self.engine.producerUpdateScheme)

            if numConverged >= self.numberToConverge:
                return iteration

            if self.maxIterations is not None and iteration >= self.maxIterations:
                return iteration
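
The dataCovariance seeding above relies on the Cholesky factor of the data's covariance: if cov == L * L.T, then multiplying unit-normal draws by L and adding the mean produces points distributed like the data. A minimal self-contained sketch of the same trick (plain numpy; the variable names are illustrative, not part of Augustus):

import numpy

cov = numpy.array([[4.0, 1.0], [1.0, 2.0]])  # target covariance
mean = numpy.array([[10.0], [0.0]])          # target mean (column vector)
L = numpy.linalg.cholesky(cov)               # cov == L.dot(L.T)
z = numpy.random.randn(2, 100000)            # unit-normal draws
samples = L.dot(z) + mean                    # covariance ~ cov, mean ~ mean
print(numpy.cov(samples))                    # approximately [[4, 1], [1, 2]]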
Example #2
class ProducerKMeans(ProducerAlgorithm):
    """The standard k-means clustering algorithm."""

    SYNCNUMBER = Atom("SyncNumber")
    RANDOM_DATAPOINTS = Atom("Random_DataPoints")
    RANDOM_DATACOVARIANCE = Atom("Random_DataCovariance")
    RANDOM_UNITRECT = Atom("Random_UnitRect")

    defaultParams = {
        "updateExisting": "false",
        "quickConvergeSteps": "()",
        "seedSource": "dataPoints",
        "numberOfTrials": "20",
        "numberToConverge": "5"
    }

    def initialize(self, **params):
        if "updateExisting" in params:
            self.updateExisting = pmml.boolCheck(params["updateExisting"])
            del params["updateExisting"]
            if self.updateExisting:
                raise NotImplementedError, "Updating from existing ClusterModels using 'kmeans' not implemented; use mode='replaceExisting'"
        else:
            self.updateExisting = pmml.boolCheck(
                self.defaultParams["updateExisting"])

        if "quickConvergeSteps" in params:
            try:
                self.quickConvergeSteps = eval(params["quickConvergeSteps"])
                if not isinstance(self.quickConvergeSteps, tuple):
                    raise RuntimeError
                self.quickConvergeSteps = map(int, self.quickConvergeSteps)
            except Exception:
                raise RuntimeError(
                    "quickConvergeSteps must be a tuple of numbers of events")
            del params["quickConvergeSteps"]
        else:
            self.quickConvergeSteps = eval(
                self.defaultParams["quickConvergeSteps"])

        if "seedSource" in params:
            self.seedSource = params["seedSource"]
            del params["seedSource"]
        else:
            self.seedSource = self.defaultParams["seedSource"]
        if self.seedSource == "dataPoints":
            self.seedSource = self.RANDOM_DATAPOINTS
        elif self.seedSource == "dataCovariance":
            self.seedSource = self.RANDOM_DATACOVARIANCE
        elif self.seedSource == "unitRect":
            self.seedSource = self.RANDOM_UNITRECT
        else:
            raise NotImplementedError, "The seedSource must be one of 'dataPoints', 'dataCovariance', 'unitRect'"

        if "numberOfTrials" in params:
            self.numberOfTrials = int(params["numberOfTrials"])
            del params["numberOfTrials"]
        else:
            self.numberOfTrials = int(self.defaultParams["numberOfTrials"])

        if "numberToConverge" in params:
            self.numberToConverge = int(params["numberToConverge"])
            del params["numberToConverge"]
        else:
            self.numberToConverge = int(self.defaultParams["numberToConverge"])

        self.model = self.segmentRecord.pmmlModel
        self.dataDistribution = self.engine.producerUpdateScheme.updator(
            COVARIANCE(self.model.numberOfFields))
        self.distanceMeasure = (self.model.child(
            pmml.ComparisonMeasure).attrib["kind"] == "distance")

        # get rid of any PartialSums objects, since they would be misleading (this algorithm doesn't use them)
        extension = self.model.child(pmml.Extension, exception=False)
        if extension is not None:
            newChildren = []
            for child in extension.children:
                if not isinstance(child, pmml.X_ODG_PartialSums):
                    newChildren.append(child)
            extension.children = newChildren

        self.buffer = {self.SYNCNUMBER: []}
        for field in self.model.fields:
            self.buffer[field] = []

        if len(params) > 0:
            raise TypeError, "Unrecognized parameters %s" % params

    def update(self, syncNumber, get):
        self.resetLoggerLevels()

        vector = [get(field) for field in self.model.fields]
        if INVALID in vector:
            self.logger.debug(
                "KMeansClustering.update: returning False (INVALID data)")
            return False

        self.buffer[self.SYNCNUMBER].append(syncNumber)
        for i, field in enumerate(self.model.fields):
            self.buffer[field].append(vector[i])

        if self.distanceMeasure and MISSING not in vector:
            self.dataDistribution.increment(syncNumber, vector)

        return True

    def produce(self):
        self.resetLoggerLevels()

        if self.logDebug:
            self.logger.debug(
                "KMeansClustering.produce: this segment has %d data records; setting up for cluster production."
                % len(self.buffer[self.SYNCNUMBER]))

        if len(self.buffer[self.SYNCNUMBER]) == 0:
            self.logger.debug(
                "KMeansClustering.produce: no data in this segment, so there are no clusters to produce."
            )
            return

        self.trans = numpy.matrix(numpy.identity(len(self.model.fields)))
        self.shift = numpy.matrix(numpy.zeros(len(self.model.fields))).T

        if self.distanceMeasure:
            # characterize the data so that you can generate random numbers with the same distribution
            try:
                covariance = self.dataDistribution.covariance()
            except ZeroDivisionError:
                covariance = INVALID

            if covariance is not INVALID:
                self.shift = self.dataDistribution.covmean()
                try:
                    self.trans = numpy.linalg.cholesky(covariance)
                except numpy.linalg.LinAlgError:
                    pass  # FIXME: at least make trans a diagonal matrix with stdev entries (or 1/stdev)!

        else:
            raise NotImplementedError, "Currently, only clusters with ComparisonMeasure.kind == 'distance' metrics can be produced."

        # make a new set of trials
        if self.seedSource is ProducerKMeans.RANDOM_DATAPOINTS:
            # pick a random point from the dataset
            def randomization():
                i = random.randint(0, len(self.buffer[self.SYNCNUMBER]) - 1)
                return [
                    self.buffer[field][i] for field in self.model.fields
                    if field is not self.SYNCNUMBER
                ]

            self.randomization = randomization

        elif self.seedSource == ProducerKMeans.RANDOM_DATACOVARIANCE:
            # generate a random point from a distribution with a covariance like the data
            self.randomization = lambda: ((self.trans * (numpy.matrix(
                numpy.random.randn(len(self.shift))).T)) + self.shift)

        elif self.seedSource == ProducerKMeans.RANDOM_UNITRECT:
            # generate a random point in the unit rectangle
            self.randomization = lambda: [
                random.random() for i in xrange(len(self.shift))
            ]

        self.trials = [
            TrialClusterSet(self.model.numberOfClusters, self.randomization,
                            self.engine.producerUpdateScheme)
            for i in xrange(self.numberOfTrials)
        ]

        # prepare small subsamples to run first to improve convergence when the whole dataset gets used
        allIndices = range(len(self.buffer[self.SYNCNUMBER]))
        quickConvergeSamples = []
        for numEvents in self.quickConvergeSteps:
            if numEvents > len(allIndices):
                numEvents = len(allIndices)
            quickConvergeSamples.append(
                numpy.array(random.sample(allIndices, numEvents)))

        allIndices = numpy.array(allIndices)
        for key in self.buffer:
            self.buffer[key] = numpy.array(self.buffer[key])

        for i, quickConvergenceSample in enumerate(quickConvergeSamples):
            if self.logDebug:
                self.logger.debug(
                    "KMeansClustering.produce: ===== quickConverge %d: preparing for k-means by clustering a random subset of %d events"
                    % (i + 1, len(quickConvergenceSample)))
            self.iterations(quickConvergenceSample)

        self.logger.debug(
            "KMeansClustering.produce: ===== starting k-means clustering algorithm (whole dataset)"
        )
        self.iterations()

        # find the best one
        best = None
        for trial in self.trials:
            if trial.hasConverged:
                if best is None or trial.updator.mean() < best.updator.mean():
                    best = trial

        # write it to the PMML file
        for bestCluster, pmmlCluster in zip(best.clusters, self.model.cluster):
            pmmlCluster.value = bestCluster.initialPosition

    def iterations(self, indices=None):
        if indices is None:
            dataset = self.buffer
        else:
            dataset = {}
            for key, value in self.buffer.items():
                dataset[key] = value[indices]

        # loop over the data many times until a subset of trials converge
        iteration = 0
        while True:
            # FIXME: TODO: the number of iterations and some facts about the
            # number of equivalent trials should go into metadata somewhere
            iteration += 1

            # set "initialPosition" to the mean within each cluster
            for trial in self.trials:
                trial.reset()

            if self.logDebug:
                self.logger.debug("KMeansClustering.produce: iteration %d" %
                                  iteration)

            # loop over data (pre-calculated, including all derived fields)
            for i in xrange(len(dataset[self.SYNCNUMBER])):
                if self.logDebug and i % 10000 == 0:
                    self.logger.debug("    event %d/%d = %g%%" %
                                      (i, len(dataset[self.SYNCNUMBER]), 100. *
                                       i / len(dataset[self.SYNCNUMBER])))

                syncNumber = dataset[self.SYNCNUMBER][i]
                vector = [dataset[field][i] for field in self.model.fields]

                for trial in self.trials:
                    trial.update(syncNumber, vector, self.model, False)

            if self.logDebug:
                self.logger.debug("    event %d/%d = 100%%" % (len(
                    dataset[self.SYNCNUMBER]), len(dataset[self.SYNCNUMBER])))

            self.logger.debug(
                "KMeansClustering.produce: about to sort the trials")

            self.trials.sort(
                lambda a, b: -cmp(a.updator.mean(), b.updator.mean()))

            self.logger.debug(
                "KMeansClustering.produce: about to check convergence of the trials"
            )

            numConverged = 0
            for trial in self.trials:
                if trial.converged():
                    trial.hasConverged = True

                    numConverged += 1

                else:
                    trial.hasConverged = False

            if self.logDebug:
                self.logger.debug(
                    "KMeansClustering.produce: iteration %d has %d converged cluster-set trials"
                    % (iteration, numConverged))

                best = None
                for trial in self.trials:
                    if trial.hasConverged:
                        if best is None or trial.updator.mean(
                        ) < best.updator.mean():
                            best = trial

                if best is not None:
                    self.logger.debug("    best CONVERGED so far: %s" %
                                      " ".join(map(repr, best.clusters)))
                else:
                    best = None
                    for trial in self.trials:
                        if best is None or trial.updator.mean(
                        ) < best.updator.mean():
                            best = trial

                    if best is not None:
                        self.logger.debug("    best so far: %s" %
                                          " ".join(map(repr, best.clusters)))

            for trial in self.trials:
                trial.rethrowInvalid(self.randomization,
                                     self.engine.producerUpdateScheme)

            if numConverged >= self.numberToConverge:
                return
Example #3
import math

import augustus.core.pmml41 as pmml
from augustus.core.defs import Atom


def sigfigs(num, n):
    """Round a number to n significant figures and return the result as a string."""
    # stolen from Cassius:
    if num == 0.:
        level = 1
    else:
        level = n - int(math.ceil(math.log10(abs(num))))
    num = round(num, level)
    fmt = "%." + str(max(level, 0)) + "f"
    return fmt % num
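
# A quick check of sigfigs (hypothetical interactive session):
#     >>> sigfigs(123.456, 2)
#     '120'
#     >>> sigfigs(0.001234, 2)
#     '0.0012'
#     >>> sigfigs(0., 3)
#     '0.0'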


BROKEN = Atom("Broken")
NOTFOUND = Atom("NotFound")

# def _show(index, pmmlFile, index_width=20):
#     if index is None:
#         return "%s %s" % (("%%-%ds" % index_width) % "index", repr(pmmlFile))
#     if index is BROKEN:
#         return "%s %s" % (("%%-%ds" % index_width) % "???", "???")
#     return "%s %s%s" % (("%%-%ds" % index_width) % repr(index), ". . " * len(index), repr(pmmlFile[index]))

# def _showUpTo(i, index1, file1, index2, file2):
#     output = []
#     for j, (j1, j2) in enumerate(zip(index1, index2)):
#         if j < i:
#             output.append("%-70s     versus     %-70s" % (_show(j1, file1)[:70], _show(j2, file2)[:70]))
#         elif j == i:
Example #4
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Defines the way all consumer algorithms store their states, and how
events are weighted or blended.  Can be expanded to handle model
production in a parallelized system."""

import numpy
import numpy.linalg

from augustus.core.defs import Atom, INVALID
from augustus.core.extramath import MINFLOAT

########################################################### Atoms

COUNT = Atom("Count")
SUM1 = Atom("Sum1")
SUMX = Atom("SumX")
SUMXX = Atom("SumXX")
RUNMEAN = Atom("RunMean")
RUNSN = Atom("RunSN")
MIN = Atom("Min")
MAX = Atom("Max")
CUSUM = Atom("CUSUM")
GLR = Atom("GLR")


class COVARIANCE(Atom):
    """Atom (isotope?) for covariance calculations.  The dimension of
    this object depends on the data, and is given at initialization."""
    def __init__(self, dimension):
        self.name = "Covariance"
        self.dimension = dimension
Example #5
class ProducerKMeans(ProducerAlgorithm):
    """The standard k-means clustering algorithm."""

    SYNCNUMBER = Atom("SyncNumber")

    defaultParams = {
        "updateExisting": "false",
        "numberOfTrials": "20",
        "numberToConverge": "5"
    }

    def initialize(self, **params):
        if "updateExisting" in params:
            self.updateExisting = pmml.boolCheck(params["updateExisting"])
            del params["updateExisting"]
            if self.updateExisting:
                raise NotImplementedError, "Updating from existing ClusterModels using 'kmeans' not implemented; use mode='replaceExisting'"
        else:
            self.updateExisting = pmml.boolCheck(
                self.defaultParams["updateExisting"])

        if "numberOfTrials" in params:
            self.numberOfTrials = int(params["numberOfTrials"])
            del params["numberOfTrials"]
        else:
            self.numberOfTrials = int(self.defaultParams["numberOfTrials"])

        if "numberToConverge" in params:
            self.numberToConverge = int(params["numberToConverge"])
            del params["numberToConverge"]
        else:
            self.numberToConverge = int(self.defaultParams["numberToConverge"])

        self.model = self.segmentRecord.pmmlModel
        self.dataDistribution = self.engine.producerUpdateScheme.updator(
            COVARIANCE(self.model.numberOfFields))
        self.distanceMeasure = (self.model.child(
            pmml.ComparisonMeasure).attrib["kind"] == "distance")

        # get rid of any PartialSums objects, since they would be misleading (this algorithm doesn't use them)
        extension = self.model.child(pmml.Extension, exception=False)
        if extension is not None:
            newChildren = []
            for child in extension.children:
                if not isinstance(child, pmml.X_ODG_PartialSums):
                    newChildren.append(child)
            extension.children = newChildren

        self.buffer = {self.SYNCNUMBER: []}
        for field in self.model.fields:
            self.buffer[field] = []

        if len(params) > 0:
            raise TypeError, "Unrecognized parameters %s" % params

    def update(self, syncNumber, get):
        vector = [get(field) for field in self.model.fields]
        if INVALID in vector: return False

        self.buffer[self.SYNCNUMBER].append(syncNumber)
        for i, field in enumerate(self.model.fields):
            self.buffer[field].append(vector[i])

        if self.distanceMeasure and MISSING not in vector:
            self.dataDistribution.increment(syncNumber, vector)

        return True

    def produce(self):
        trans = numpy.matrix(numpy.identity(len(self.model.fields)))
        shift = numpy.matrix(numpy.zeros(len(self.model.fields))).T

        if self.distanceMeasure:
            # characterize the data so that you can generate random numbers with the same distribution
            try:
                covariance = self.dataDistribution.covariance()
            except ZeroDivisionError:
                covariance = INVALID

            if covariance is not INVALID:
                shift = self.dataDistribution.covmean()
                try:
                    trans = numpy.linalg.cholesky(covariance)
                except numpy.linalg.LinAlgError:
                    pass  # FIXME: at least make trans a diagonal matrix with stdev entries (or 1/stdev)!

        else:
            raise NotImplementedError, "Currently, only clusters with ComparisonMeasure.kind == 'distance' metrics can be produced."

        # make a new set of trials (randomly seeded with the same covariance as data)
        trials = [
            TrialClusterSet(self.model.numberOfClusters, trans, shift,
                            self.engine.producerUpdateScheme)
            for i in xrange(self.numberOfTrials)
        ]

        # loop over the data many times until a subset of trials converge
        iteration = 0
        while True:
            # FIXME: TODO: the number of iterations and some facts about the
            # number of equivalent trials should go into metadata somewhere
            iteration += 1

            # set "initialPosition" to the mean within each cluster
            for trial in trials:
                trial.reset()

            # loop over data (pre-calculated, including all derived fields)
            for i in xrange(len(self.buffer[self.SYNCNUMBER])):
                syncNumber = self.buffer[self.SYNCNUMBER][i]
                vector = [self.buffer[field][i] for field in self.model.fields]

                for trial in trials:
                    trial.update(syncNumber, vector, self.model, False)

            trials.sort(lambda a, b: -cmp(a.updator.mean(), b.updator.mean()))

            numConverged = 0
            for trial in trials:
                if trial.converged():
                    trial.hasConverged = True

                    numConverged += 1

                else:
                    trial.hasConverged = False

            if numConverged >= self.numberToConverge:
                break

        # find the best one
        best = None
        for trial in trials:
            if trial.hasConverged:
                if best is None or trial.updator.mean() < best.updator.mean():
                    best = trial

        # write it to the PMML file
        for bestCluster, pmmlCluster in zip(best.clusters, self.model.cluster):
            pmmlCluster.value = bestCluster.initialPosition
Example #6
class ConsumerBaselineModel(ConsumerAlgorithm):
    CHISQUAREDISTRIBUTION = Atom("chiSquareDistribution")
    SCALARPRODUCT = Atom("scalarProduct")
    INDEPENDENT = Atom("Independent")
    SIZEWEIGHTED = Atom("SizeWeighted")

    def initialize(self, **params):
        """Initialize a baseline consumer.

        Unlike other consumers, this creates the score function
        dynamically, depending on the type of testStatistic.
        """

        testDistributions = self.segmentRecord.pmmlModel.child(
            pmml.TestDistributions)
        self.field = testDistributions.attrib["field"]
        testStatistic = testDistributions.attrib["testStatistic"]

        # updating can be configured in the Augustus configuration file and in the "windowSize" attribute in this segment
        # I will assume that the "windowSize" attribute can override CUSUM and GLR only

        # (the only other baseline consumer that has an intermediate state is chiSquareDistribution, which only makes
        # sense as UpdateScheme("synchronized"), and that one depends on the configuration of syncNumber, not in PMML)

        # the general case:
        self.updateScheme = self.engine.consumerUpdateScheme
        # the special case:
        if testStatistic in ("CUSUM", "GLR"):
            if "windowSize" in testDistributions.attrib and testDistributions.attrib[
                    "windowSize"] != 0:
                self.updateScheme = UpdateScheme(
                    "window",
                    windowSize=testDistributions.attrib["windowSize"],
                    windowLag=0)

        if testStatistic == "CUSUM":
            self.baseline = testDistributions.child(pmml.Baseline).child()
            self.alternate = testDistributions.child(pmml.Alternate).child()
            self.updator = self.updateScheme.updator(CUSUM)
            self.updator.resetValue = testDistributions.attrib["resetValue"]
            self.score = self.scoreCUSUM

            extension = testDistributions.child(pmml.Extension,
                                                exception=False)
            if extension is not None:
                init = extension.child(pmml.X_ODG_CUSUMInitialization,
                                       exception=False)
                if init is not None:
                    self.updator.initialize({CUSUM: [init.attrib["value"]]})

            self.pseudoField = self.field
            self.pseudoOutputAll = True

        elif testStatistic == "zValue":
            self.baseline = testDistributions.child(pmml.Baseline).child()
            if isinstance(self.baseline, pmml.GaussianDistribution):
                self.score = self.scoreZValueGaussian
            else:
                self.score = self.scoreZValue

            self.pseudoField = self.field
            self.pseudoOutputAll = True

        elif testStatistic in ("chiSquareDistribution", "scalarProduct"):
            self.updators = {}
            self.countTable = testDistributions.child(pmml.Baseline).child()

            if "weightField" in testDistributions.attrib:
                self.weightField = testDistributions.attrib["weightField"]
            else:
                self.weightField = None

            if "normalizationScheme" not in testDistributions.attrib:
                self.normalizationScheme = None
            elif testDistributions.attrib[
                    "normalizationScheme"] == "Independent":
                self.normalizationScheme = self.INDEPENDENT
            elif testDistributions.attrib[
                    "normalizationScheme"] == "SizeWeighted":
                self.normalizationScheme = self.SIZEWEIGHTED

            self.testStatistic = {
                "chiSquareDistribution": self.CHISQUAREDISTRIBUTION,
                "scalarProduct": self.SCALARPRODUCT,
            }[testStatistic]

            self.score = self.scoreHistogram

            self.pseudoField = (self.field, self.weightField)
            self.pseudoOutputAll = False

            # ODG extensions
            self.binsOfInterest = testDistributions.descendant(
                pmml.X_ODG_BinsOfInterest, exception=False)

        elif testStatistic == "chiSquareIndependence":
            self.baseline = testDistributions.child(pmml.Baseline)
            self.fields = None
            self.countTable = None
            self.score = self.scoreChiSquareIndependence

            self.pseudoField = None
            self.pseudoOutputAll = True

        # ODG extensions
        elif testStatistic == "GLR":
            self.baseline = testDistributions.child(pmml.Baseline).child()
            if not isinstance(
                    self.baseline,
                (pmml.GaussianDistribution, pmml.PoissonDistribution)):
                raise NotImplementedError, "GLR has only been implemented for Gaussian and Poisson distributions"

            self.updator = self.updateScheme.updator(GLR)
            self.score = self.scoreGLR

            self.pseudoField = self.field
            self.pseudoOutputAll = False

        if len(params) > 0:
            raise TypeError, "Unrecognized parameters %s" % params

    ######################################## CUSUM

    def scoreCUSUM(self, syncNumber, get):
        """Score one event with a CUSUM testStatistic."""

        value = get(self.field)
        if value is INVALID or value is MISSING:
            return INVALID

        self.updator.increment(
            syncNumber,
            self.alternate.logpdf(value) - self.baseline.logpdf(value))
        return {SCORE_predictedValue: self.updator.cusum()}
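
    # The increment above is the log-likelihood ratio log(alt(x)/baseline(x));
    # accumulating it (with the usual reset at zero) yields the CUSUM statistic.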

    ######################################## zValue

    def scoreZValueGaussian(self, syncNumber, get):
        """Score one event with a zValue testStatistic (Gaussian)."""

        value = get(self.field)
        if value is INVALID or value is MISSING:
            return INVALID

        if self.baseline.attrib["variance"] == 0.:
            return {SCORE_predictedValue: float("inf"), SCORE_pValue: 0.}

        elif self.baseline.attrib["variance"] < 0.:
            return INVALID

        zValue = (value - self.baseline.attrib["mean"]) / math.sqrt(
            self.baseline.attrib["variance"])
        probability = self.baseline.cdf(value)
        pValue = 1. - 2. * abs(probability - 0.5)

        return {SCORE_predictedValue: zValue, SCORE_pValue: pValue}

    def scoreZValue(self, syncNumber, get):
        """Score one event with a zValue testStatistic (non-Gaussian)."""

        value = get(self.field)
        if value is INVALID or value is MISSING:
            return INVALID

        probability = self.baseline.cdf(value)
        if probability <= 1e-16:
            zValue = -10.
        elif probability >= 1. - 1e-16:
            zValue = 10.
        else:
            zValue = math.sqrt(2.) * erfinv(2. * probability - 1.)
        pValue = 1. - 2. * abs(probability - 0.5)

        return {SCORE_predictedValue: zValue, SCORE_pValue: pValue}
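
    # In both zValue scorers above, pValue = 1 - 2*|CDF(x) - 0.5| is the
    # two-sided tail probability; e.g. CDF(x) = 0.975 gives pValue = 0.05.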

    ######################################## chiSquareDistribution and scalarProduct

    def scoreHistogram(self, syncNumber, get):
        """Score one event with a chiSquareDistribution or scalarProduct."""

        value = get(self.field)
        if self.weightField is None:
            weight = 1.
        else:
            weight = get(self.weightField)

        # we can still calculate the consistency of the *accumulated* distribution, even if this datapoint is invalid
        if value is INVALID or value is MISSING or weight is INVALID or weight is MISSING:
            pass
        else:
            # for histograms, increment all bins, but only the correct bin gets a non-zero value
            found = False
            for bin, updator in self.updators.items():
                if bin == value:
                    updator.increment(syncNumber, weight)
                    found = True
                else:
                    updator.increment(syncNumber, 0.)

            # this might be a new bin
            if not found:
                updator = self.updateScheme.updator(SUMX)
                updator.increment(syncNumber, weight)
                self.updators[value] = updator

        fieldValueCounts = self.countTable.matches(pmml.FieldValueCount,
                                                   maxdepth=None)

        # chiSquareDistribution
        if self.testStatistic == self.CHISQUAREDISTRIBUTION:
            expectedTotal = 0.
            expectedValues = {}
            for fieldValueCount in fieldValueCounts:
                bin = fieldValueCount.attrib["value"]
                count = fieldValueCount.attrib["count"]
                expectedTotal += count
                expectedValues[bin] = count

            observedTotal = 0.
            for bin, updator in self.updators.items():
                observedTotal += updator.sum()

            if expectedTotal <= 0. or observedTotal <= 0. or (
                    isinstance(self.countTable, pmml.NormalizedCountTable)
                    and self.countTable.attrib["sample"] <= 0.):
                return INVALID

            chi2 = 0.
            if self.binsOfInterest is None:
                ndf = -1  # normalization removes one degree of freedom
            else:
                ndf = 0

            for bin in set(expectedValues.keys()).union(
                    set(self.updators.keys())):
                if self.binsOfInterest is not None:
                    if bin not in self.binsOfInterest:
                        continue

                expected = expectedValues.get(bin, 0.)
                updator = self.updators.get(bin, None)
                if updator is not None:
                    observed = updator.sum()
                else:
                    observed = 0.

                if expected > 0. or observed > 0.:
                    if isinstance(self.countTable, pmml.CountTable):
                        chi2 += (expected / expectedTotal -
                                 observed / observedTotal)**2 / (
                                     expected / expectedTotal**2 +
                                     observed / observedTotal**2)

                    elif isinstance(self.countTable,
                                    pmml.NormalizedCountTable):
                        sample = self.countTable.attrib["sample"]
                        chi2 += (expected / expectedTotal -
                                 observed / observedTotal)**2 / (
                                     expected / expectedTotal / sample +
                                     observed / observedTotal**2)

                    ndf += 1

            if ndf > 0:
                probability = chiSquare_cdf(chi2, ndf)
                pValue = 1. - probability
                return {
                    SCORE_predictedValue: probability,
                    SCORE_pValue: pValue,
                    SCORE_chiSquare: chi2,
                    SCORE_degreesOfFreedom: ndf
                }
            else:
                return INVALID

        # scalarProduct
        elif self.testStatistic == self.SCALARPRODUCT:
            expectedNorm2 = 0.
            dotProduct = 0.
            for fieldValueCount in fieldValueCounts:
                expected = fieldValueCount.attrib["count"]
                expectedNorm2 += expected**2

                bin = fieldValueCount.attrib["value"]
                if expected > 0. and bin in self.updators:
                    observed = self.updators[bin].sum()
                    dotProduct += expected * observed

            observedNorm2 = 0.
            for updator in self.updators.values():
                observed = updator.sum()
                observedNorm2 += observed**2

            if expectedNorm2 > 0. and observedNorm2 > 0.:
                if self.normalizationScheme is None:
                    return {SCORE_predictedValue: dotProduct}

                elif self.normalizationScheme is self.INDEPENDENT:
                    if expectedNorm2 <= 0. or observedNorm2 <= 0.:
                        return INVALID
                    return {
                        SCORE_predictedValue:
                        dotProduct / math.sqrt(expectedNorm2) /
                        math.sqrt(observedNorm2)
                    }

                elif self.normalizationScheme is self.SIZEWEIGHTED:
                    if expectedNorm2 + observedNorm2 <= 0.: return INVALID
                    return {
                        SCORE_predictedValue:
                        2. * dotProduct / (expectedNorm2 + observedNorm2)
                    }

            else:
                return INVALID

    ######################################## chiSquareIndependence

    def _chiSquareIndependence_add(self, pmmlNode, fieldValues, totals):
        if isinstance(
                pmmlNode,
            (pmml.CountTable, pmml.NormalizedCountTable, pmml.FieldValue)):
            for child in pmmlNode:
                self._chiSquareIndependence_add(
                    child, fieldValues + [child.attrib["value"]], totals)

        elif isinstance(pmmlNode, pmml.FieldValueCount):
            count = pmmlNode.attrib["count"]

            totals[None] += count
            for f, v in zip(self.fields, fieldValues):
                if v not in totals[f]:
                    totals[f][v] = 0.
                totals[f][v] += count

    def _chiSquareIndependence_chi2(self, pmmlNode, fieldValues, totals):
        if isinstance(
                pmmlNode,
            (pmml.CountTable, pmml.NormalizedCountTable, pmml.FieldValue)):
            output = 0.
            for child in pmmlNode:
                subchi2 = self._chiSquareIndependence_chi2(
                    child, fieldValues + [child.attrib["value"]], totals)
                if subchi2 is None: return None
                output += subchi2
            return output

        elif isinstance(pmmlNode, pmml.FieldValueCount):
            observed = pmmlNode.attrib["count"]

            if totals[None] == 0:
                return None
            else:
                if isinstance(self.countTable, pmml.NormalizedCountTable):
                    scale = self.countTable.attrib["sample"] / totals[None]
                else:
                    scale = 1.

                expected = 1. / (totals[None] * scale)**(len(self.fields) - 1)
                for f, v in zip(self.fields, fieldValues):
                    expected *= (totals[f][v] * scale)

                if expected == 0.:
                    return None
                else:
                    return (expected - (observed * scale))**2 / expected
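
    # Under independence, the expected count in a cell is the product of its
    # margin totals divided by grandTotal**(numFields - 1), which is what the
    # expression above computes (with "scale" adjusting normalized tables).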

    def scoreChiSquareIndependence(self, syncNumber, get):
        """Score one event with a chiSquareIndependence testStatistic.

        This reads from the multi-dimensional CountTable in PMML and
        ignores the data!  Data are only used to make the CountTable,
        so be sure to be running the producer if you want
        chiSquareIndependence.
        """

        # expect a CountTable (if it doesn't exist, the producer will make it)
        self.countTable = self.baseline.child()
        if not isinstance(self.countTable,
                          (pmml.CountTable, pmml.NormalizedCountTable)):
            return INVALID  # the "first" time doesn't happen until we see a count table

        self.fields = []
        dimension = self.countTable.child(pmml.nonExtension)
        while True:
            self.fields.append(dimension.attrib["field"])
            if isinstance(dimension, pmml.FieldValueCount): break
            dimension = dimension.child(pmml.nonExtension)

        totals = {None: 0.}
        for f in self.fields:
            totals[f] = {}

        # every time: add up the n-field margins (which are "rows and columns" in 2-field case)
        self._chiSquareIndependence_add(self.countTable, [], totals)
        chi2 = self._chiSquareIndependence_chi2(self.countTable, [], totals)

        ndf = 1
        for f, tot in totals.items():
            if f is not None:
                ndf *= (len(tot) - 1)

        if chi2 is not None and ndf > 0:
            probability = chiSquare_cdf(chi2, ndf)
            pValue = 1. - probability
            return {
                SCORE_predictedValue: probability,
                SCORE_pValue: pValue,
                SCORE_chiSquare: chi2,
                SCORE_degreesOfFreedom: ndf
            }
        else:
            return INVALID

    ######################################## ODG-extension: GLR

    def _scoreGLR_GaussianDistribution(self, s, N):
        return (s - N * self.baseline.attrib["mean"])**2 / N

    def _scoreGLR_PoissonDistribution(self, s, N):
        if s > 0.:
            return -math.log(self.baseline.attrib["mean"]) * s + math.log(
                s / N) * s + N * self.baseline.attrib["mean"] - s
        else:
            return -math.log(self.baseline.attrib["mean"]
                             ) * s + N * self.baseline.attrib["mean"] - s

    def scoreGLR(self, syncNumber, get):
        """Score one event with a GLR testStatistic.

        Output is the *current* best-guess of the turn-around time (as
        the corresponding syncNumber) and its log-likelihood ratio.
        """

        # Eq. 2.4.40 in Basseville and Nikiforov: http://www.irisa.fr/sisthem/kniga/ (partly in eventweighting.py)

        value = get(self.field)
        if value is not INVALID and value is not MISSING:
            self.updator.increment(syncNumber, value)

        if isinstance(self.baseline, pmml.GaussianDistribution):
            maximum_syncNumber, maximum = self.updator.glr(
                self._scoreGLR_GaussianDistribution)

            if maximum is None or self.baseline.attrib["variance"] < 0.:
                return INVALID
            elif self.baseline.attrib["variance"] == 0.:
                return {
                    SCORE_predictedValue: float("inf"),
                    SCORE_thresholdTime: maximum_syncNumber
                }
            else:
                return {
                    SCORE_predictedValue:
                    maximum / 2. / self.baseline.attrib["variance"],
                    SCORE_thresholdTime:
                    maximum_syncNumber
                }

        elif isinstance(self.baseline, pmml.PoissonDistribution):
            maximum_syncNumber, maximum = self.updator.glr(
                self._scoreGLR_PoissonDistribution)

            if maximum is None:
                return INVALID
            else:
                return {
                    SCORE_predictedValue: maximum,
                    SCORE_thresholdTime: maximum_syncNumber
                }
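
The scoreHistogram chi-square above compares two normalized histograms bin by bin. A minimal standalone version of the same computation (plain Python; the function name and dict-based interface are illustrative, not the Augustus API):

def histogram_chi2(expected, observed):
    """Chi-square between two histograms given as dicts of bin -> count.

    Assumes both totals are positive, as the original code guards."""
    expTotal = float(sum(expected.values()))
    obsTotal = float(sum(observed.values()))
    chi2, ndf = 0., -1  # normalization removes one degree of freedom
    for b in set(expected) | set(observed):
        e, o = expected.get(b, 0.), observed.get(b, 0.)
        if e > 0. or o > 0.:
            chi2 += (e / expTotal - o / obsTotal)**2 / (
                e / expTotal**2 + o / obsTotal**2)
            ndf += 1
    return chi2, ndf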
Example #7

"""Defines the way all consumer algorithms store their states, and how
events are weighted or blended.  Can be expanded to handle model
production in a parallelized system."""

import numpy

from augustus.core.defs import Atom, INVALID
from augustus.core.extramath import MINFLOAT

COUNT = Atom("Count")
SUM1 = Atom("Sum1")
SUMX = Atom("SumX")
SUMXX = Atom("SumXX")
MIN = Atom("Min")
MAX = Atom("Max")
CUSUM = Atom("CUSUM")
GLR = Atom("GLR")

class COVARIANCE(Atom):
    """Atom (isotope?) for covariance calculations.  The dimension of
    this object depends on the data, and is given at initialization."""

    def __init__(self, dimension):
        self.name = "Covariance"
        self.dimension = dimension
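
# A minimal sketch (hypothetical; not the real UpdateScheme API) of why the
# SUM1/SUMX/SUMXX atoms suffice for running statistics: those three sums
# recover the weighted mean and variance of a stream at any point.
def _runningStatsSketch(stream):
    """Compute the weighted mean and variance of (weight, value) pairs."""
    sum1 = sumx = sumxx = 0.
    for weight, value in stream:
        sum1 += weight
        sumx += weight * value
        sumxx += weight * value**2
    mean = sumx / sum1
    return mean, sumxx / sum1 - mean**2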
Example no. 8
# Imports this excerpt relies on (assumed; the original module's import block
# is not shown):
import datetime
import os
from xml.sax.saxutils import quoteattr

from augustus.core.defs import Atom
class OutputWriter:
    """Writes all scoring output.

    Opened at the beginning of a job, written to with each
    event/pseudoevent, and closed at the end of a job.
    """

    XML = Atom("xml")
    JSON = Atom("json")

    def __init__(self,
                 fileName,
                 mode="xml",
                 reportName=None,
                 pmmlFileName=None,
                 eventName="Event",
                 pseudoEventName="pseudoEvent",
                 segmentName="Segment",
                 segmentExpressionName="SegmentExpression",
                 outputName="Output",
                 matchingSegmentsName="MatchingSegments"):
        """Create an OutputWriter with specified tag names."""

        self.fileName = fileName
        if mode == "xml":
            self.mode = self.XML
        elif mode == "json":
            self.mode = self.JSON
        else:
            raise NotImplementedError(
                "Only 'xml' and 'json' output modes have been implemented")

        self.pmmlFileName = pmmlFileName
        self.reportName = reportName
        self.eventName = eventName
        self.pseudoEventName = pseudoEventName
        self.segmentName = segmentName
        self.segmentExpressionName = segmentExpressionName
        self.outputName = outputName
        self.matchingSegmentsName = matchingSegmentsName

    def open(self, append=True):
        """Open an output file for writing.

        If a reportName is given, open the outermost XML or JSON
        object.
        """

        if isinstance(self.fileName, basestring):
            self.ostream = open(self.fileName, "a" if append else "w")
        else:
            # fileName may instead be an already-open, file-like object
            self.ostream = self.fileName
            self.fileName = self.ostream.name

        if self.reportName is not None:
            if self.mode is self.XML:
                label = dict(timestamp=datetime.datetime.now())
                if self.pmmlFileName is not None:
                    label["model"] = self.pmmlFileName
                label = " ".join([
                    "%s=%s" % (k, quoteattr(str(v)))
                    for k, v in label.iteritems()
                ])
                self.ostream.write("<%s %s>%s" %
                                   (self.reportName, label, os.linesep))
            elif self.mode is self.JSON:
                self.ostream.write("{\"%s\": [%s" %
                                   (self.reportName, os.linesep))
                self.needsComma = False

    def write(self,
              outputRecord,
              eventTags=None,
              eventName=None,
              pseudoEventName=None,
              segmentName=None,
              segmentExpressionName=None,
              outputName=None,
              matchingSegmentsName=None):
        """Write one record to the output file."""

        if self.mode is self.XML:
            self.ostream.write(
                outputRecord.xml(
                    eventTags,
                    self.eventName if eventName is None else eventName,
                    self.pseudoEventName
                    if pseudoEventName is None else pseudoEventName,
                    self.segmentName if segmentName is None else segmentName,
                    self.segmentExpressionName if segmentExpressionName is None
                    else segmentExpressionName,
                    self.outputName if outputName is None else outputName,
                    self.matchingSegmentsName
                    if matchingSegmentsName is None else matchingSegmentsName))
            self.ostream.write(os.linesep)

        elif self.mode is self.JSON:
            if self.reportName is not None:
                if self.needsComma:
                    self.ostream.write(",")
                    self.ostream.write(os.linesep)

            self.ostream.write(
                outputRecord.json(
                    eventTags,
                    self.eventName if eventName is None else eventName,
                    self.pseudoEventName
                    if pseudoEventName is None else pseudoEventName,
                    self.segmentName if segmentName is None else segmentName,
                    self.segmentExpressionName if segmentExpressionName is None
                    else segmentExpressionName,
                    self.outputName if outputName is None else outputName,
                    self.matchingSegmentsName
                    if matchingSegmentsName is None else matchingSegmentsName))

            if self.reportName is not None:
                self.needsComma = True
            else:
                self.ostream.write(os.linesep)

    def close(self):
        """Close the output record.

        If a reportName is given, close the outermost XML or JSON
        object.
        """

        if self.reportName is not None:
            if self.mode is self.XML:
                self.ostream.write("</%s>%s" % (self.reportName, os.linesep))
            elif self.mode is self.JSON:
                self.ostream.write("]}")
                self.ostream.write(os.linesep)
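
# Hypothetical usage sketch: OutputWriter only requires that the record passed
# to write() provide the xml()/json() methods it calls, so a stub suffices to
# show the life cycle (the real output record class is not part of this excerpt).
class _StubRecord:
    def xml(self, eventTags, eventName, pseudoEventName, segmentName,
            segmentExpressionName, outputName, matchingSegmentsName):
        return "<%s score=\"0.5\"/>" % eventName

_writer = OutputWriter("scores.xml", mode="xml", reportName="Report")
_writer.open(append=False)    # writes the opening <Report timestamp="...">
_writer.write(_StubRecord())  # writes one <Event score="0.5"/> line
_writer.close()               # writes the closing </Report>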
Example no. 9
}
producerAlgorithmMap = {
    pmml.BaselineModel: augustus.algorithms.baseline.ProducerBaselineModel,
    pmml.ClusteringModel: augustus.algorithms.clustering.ProducerClusteringModel,
    pmml.TreeModel: augustus.algorithms.trees.ProducerTreeModel,
    # ProducerTreeModel recognizes <RuleSetModel> elements and fills them appropriately
    pmml.RuleSetModel: augustus.algorithms.trees.ProducerTreeModel,
}
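
# The map above is a type-keyed dispatch table: given a PMML model element,
# the engine looks up which producer class should drive it, along the lines
# of (modelElement is hypothetical here):
#
#     ProducerClass = producerAlgorithmMap[modelElement.__class__]
#     producer = ProducerClass(...)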

########################################################### for faster segment lookup

MATCHRANGES = Atom("MatchRanges")


def __matchesPartition(matcher, partition):
    """Return True if 'matcher' satisfies every non-None (bound, comparator) pair."""
    for bound, comparator in partition:
        if bound is not None and not comparator(matcher, bound):
            return False
    return True
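
# Illustration (hypothetical values; real partitions are built from segment
# predicates): the pair list below encodes the interval 5 < x <= 10.
#
#     partition = [(5, _segmentHelpers.greaterThan), (10, _segmentHelpers.lessOrEqual)]
#     __matchesPartition(7, partition)    # -> True
#     __matchesPartition(12, partition)   # -> False (fails the upper bound)
#     __matchesPartition(3, partition)    # -> False (fails the lower bound)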

_segmentHelpers = NameSpace(
    lessThan=lambda x, val: x < val,
    lessOrEqual=lambda x, val: x <= val,
    greaterThan=lambda x, val: x > val,
    greaterOrEqual=lambda x, val: x >= val,
    isCompoundAnd=lambda x:
        isinstance(x, pmml.CompoundPredicate) and
Example no. 10
"""Represents a segment, maintains the producer and consumer algorithms for this segment, and has pointers to everything relevant.  Maintained by Engine."""

from augustus.algorithms.eventweighting import COUNT
import augustus.core.pmml41 as pmml
from augustus.core.defs import Atom, IMMATURE, MATURE, LOCKED, UNINITIALIZED

SELECTFIRST = Atom("SelectFirst")
SELECTALL = Atom("SelectAll")
SELECTONLY = Atom("SelectOnly")

########################################################### SegmentRecord


class SegmentNameRegistry:
    """Gives new segments IDs if they don't already have one and
    enforces uniqueness of IDs.

    Only one object of this class should exist; it maintains a
    double-referenced lookup table (dict <-> dict).
    """
    def __init__(self):
        """Called by SegmentRecord (the class, not an instance) when Python starts up."""
Example no. 11
# Imports assumed for this excerpt (the original module's import block is not
# shown; SplitEqual and SplitGreaterThan are defined elsewhere in that module):
import math
import random

from augustus.algorithms.eventweighting import SUM1, SUMX, SUMXX
from augustus.core.defs import Atom, INVALID, MISSING
class Feature:
    CATEGORICAL = Atom("Categorical")
    CONTINUOUS = Atom("Continuous")
    ORDINALSTRING = Atom("OrdinalString")

    STRING = Atom("String")
    INTEGER = Atom("Integer")
    FLOAT = Atom("Float")

    def __init__(self, name, optype, dataType, producerUpdateScheme):
        self.name = name

        if optype == "categorical":
            self.values = set()
            self.optype = self.CATEGORICAL
            self.dataType = self.STRING if dataType == "string" else dataType
            self.mature = False
            self.maturityCounter = 0

        elif optype == "continuous":
            self.updator = producerUpdateScheme.updator(SUM1, SUMX, SUMXX)
            self.optype = self.CONTINUOUS
            self.dataType = {
                "integer": self.INTEGER,
                "float": self.FLOAT,
                "double": self.FLOAT
            }.get(dataType, dataType)
            self.mature = False
            self.maturityCounter = 0

        else:
            # For ordinal strings, 'optype' arrives as the cast object supplied
            # by the data context; its .values attribute lists the ordered
            # categories, and applying the cast normalizes each one.
            self.values = map(optype, optype.values)
            self.optype = self.ORDINALSTRING
            self.dataType = self.STRING if dataType == "string" else dataType
            self.mature = True

    def increment(self, syncValue, get):
        # Note: maturityThreshold is assigned externally by the producer that
        # owns this feature, after construction.
        value = get(self.name)
        if value is not INVALID and value is not MISSING:
            if self.optype is self.CATEGORICAL:
                self.values.add(value)

                if self.maturityCounter < self.maturityThreshold:
                    self.maturityCounter += 1
                else:
                    self.mature = True

            elif self.optype is self.CONTINUOUS:
                self.updator.increment(syncValue, value)

                if self.maturityCounter < self.maturityThreshold:
                    self.maturityCounter += 1
                else:
                    self.mature = True

    def randomSplit(self):
        if self.optype is self.CATEGORICAL:
            return SplitEqual(self.name, random.choice(tuple(self.values)))

        elif self.optype is self.CONTINUOUS:
            try:
                stdev = math.sqrt(self.updator.variance())
            except ValueError:
                stdev = 0.

            if self.dataType is self.INTEGER:
                return SplitGreaterThan(
                    self.name,
                    int(round(random.gauss(self.updator.mean(), stdev))))
            else:
                return SplitGreaterThan(
                    self.name, random.gauss(self.updator.mean(), stdev))

        elif self.optype is self.ORDINALSTRING:
            return SplitGreaterThan(self.name,
                                    random.choice(tuple(self.values)))
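
# Hypothetical life-cycle sketch: a continuous Feature absorbs values until it
# crosses its externally assigned maturityThreshold, after which randomSplit()
# proposes thresholds drawn from the observed mean and standard deviation.
#
#     feature = Feature("x", "continuous", "double", producerUpdateScheme)
#     feature.maturityThreshold = 10
#     for i, x in enumerate(stream):
#         feature.increment(i, lambda name: x)
#     if feature.mature:
#         split = feature.randomSplit()   # e.g. SplitGreaterThan("x", 3.7)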
Example no. 12
class ProducerTreeModel(ProducerAlgorithm):
    TREEMODEL = Atom("TreeModel")
    RULESETMODEL = Atom("RuleSetModel")

    defaultParams = {
        "updateExisting": "false",
        "featureMaturityThreshold": "10",
        "splitMaturityThreshold": "30",
        "trialsToKeep": "50",
        "worldsToSplit": "3",
        "treeDepth": "3",
        "classification": ""
    }

    def initialize(self, **params):
        """An event-based tree-producing algorithm.

        Although it does not iterate over the data as the standard
        CART algorithm does, it converges to an approximate tree by
        keeping alternate hypotheses in mind and collecting data for
        all active hypotheses.
        """

        if "updateExisting" in params:
            self.updateExisting = pmml.boolCheck(params["updateExisting"])
            del params["updateExisting"]
        else:
            self.updateExisting = pmml.boolCheck(
                self.defaultParams["updateExisting"])

        if self.updateExisting:
            raise NotImplementedError(
                "Updating from existing TreeModels/RuleSetModels not implemented; use mode='replaceExisting'"
            )

        if "featureMaturityThreshold" in params:
            self.featureMaturityThreshold = int(
                params["featureMaturityThreshold"])
            del params["featureMaturityThreshold"]
        else:
            self.featureMaturityThreshold = int(
                self.defaultParams["featureMaturityThreshold"])

        if "splitMaturityThreshold" in params:
            self.splitMaturityThreshold = int(params["splitMaturityThreshold"])
            del params["splitMaturityThreshold"]
        else:
            self.splitMaturityThreshold = int(
                self.defaultParams["splitMaturityThreshold"])

        if "trialsToKeep" in params:
            self.trialsToKeep = int(params["trialsToKeep"])
            del params["trialsToKeep"]
        else:
            self.trialsToKeep = int(self.defaultParams["trialsToKeep"])

        if "worldsToSplit" in params:
            self.worldsToSplit = int(params["worldsToSplit"])
            del params["worldsToSplit"]
        else:
            self.worldsToSplit = int(self.defaultParams["worldsToSplit"])

        if "treeDepth" in params:
            self.treeDepth = int(params["treeDepth"])
            del params["treeDepth"]
        else:
            self.treeDepth = int(self.defaultParams["treeDepth"])

        if "classification" in params:
            self.classification = params["classification"]
            del params["classification"]
        else:
            self.classification = self.defaultParams["classification"]
        if self.classification == "": self.classification = None

        self.model = self.segmentRecord.pmmlModel

        if isinstance(self.model, pmml.TreeModel):
            self.modelType = self.TREEMODEL
            self.nodeIndex = self.model.index(pmml.Node)

        elif isinstance(self.model, pmml.RuleSetModel):
            self.ruleSet = self.model.child(pmml.RuleSet)
            self.modelType = self.RULESETMODEL
            self.nodeIndex = self.ruleSet.index(
                lambda x: isinstance(x, (pmml.SimpleRule, pmml.CompoundRule)),
                exception=False)
            if self.nodeIndex is None:
                self.nodeIndex = len(self.ruleSet.children)
                self.ruleSet.children.append(None)

        self.features = []
        self.predicted = []
        for miningField in self.model.child(pmml.MiningSchema).matches(
                pmml.MiningField):
            name = miningField.attrib["name"]
            usageType = miningField.attrib.get("usageType", "active")
            if usageType == "active":
                dataType = self.model.dataContext.dataType[name]
                optype = self.model.dataContext.optype[name]
                if optype == "ordinal" and dataType == "string":
                    optype = self.model.dataContext.cast[name]

                feature = Feature(name, optype, dataType,
                                  self.engine.producerUpdateScheme)
                feature.maturityThreshold = self.featureMaturityThreshold
                self.features.append(feature)

            if usageType == "predicted":
                self.predicted.append(name)

        if len(self.predicted) == 0:
            self.classification = INVALID

        else:
            if self.classification is None:
                # by default, take the first 'predicted' feature
                self.classification = self.predicted[0]
            else:
                if self.classification not in self.predicted:
                    raise RuntimeError, "Classification feature not found among the 'predicted' features in the decision tree's MiningSchema%s" % self.model.child(
                        pmml.MiningSchema).fileAndLine()

        self.topWorld = World(0, None)
        self.counts = {}

        if len(params) > 0:
            raise TypeError, "Unrecognized parameters %s" % params

    def update(self, syncNumber, get):
        if self.classification is INVALID:
            raise RuntimeError, "Cannot produce a decision tree with no 'predicted' features in the MiningSchema%s" % self.model.child(
                pmml.MiningSchema).fileAndLine()

        values = [get(feature.name) for feature in self.features]
        if INVALID in values or MISSING in values: return False

        classification = get(self.classification)
        if classification is INVALID or classification is MISSING: return False

        if classification not in self.counts:
            self.counts[classification] = 0
        self.counts[classification] += 1

        bestClassification = None
        bestCount = None
        for c, count in self.counts.items():
            if bestClassification is None or count > bestCount:
                bestClassification = c
                bestCount = count

        matureFeatures = []
        for feature in self.features:
            feature.increment(syncNumber, get)
            if feature.mature:
                matureFeatures.append(feature)

        if len(matureFeatures) > 0:
            self.topWorld.increment(syncNumber, get, classification,
                                    matureFeatures, self)

        if self.modelType is self.TREEMODEL:
            self.topWorld.bestTree(self.model, bestClassification, self)
        elif self.modelType is self.RULESETMODEL:
            self.topWorld.bestRule(self.ruleSet, bestClassification, self)

        return True
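
    # Sketch-level summary of update(): each event votes for its class label
    # (self.counts), bestClassification is the running majority vote, and only
    # features past their maturity threshold feed the alternate split
    # hypotheses in topWorld before the best tree or rule set is written back
    # into the PMML model.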
Example no. 13
class PythonFunction(ScoresAwk):
    xsd = load_xsdElement(
        ScoresAwk, """
  <xs:element name="PythonFunction">
    <xs:complexType>
      <xs:sequence>
          <xs:element minOccurs="0" maxOccurs="unbounded" ref="Context"/>
      </xs:sequence>
      <xs:attribute name="condition" type="xs:string" use="optional"/>
      <xs:attribute name="action" type="xs:string" use="required"/>
    </xs:complexType>
  </xs:element>
  """)

    BEGIN = Atom("Begin")
    EVENT = Atom("Event")
    END = Atom("End")

    def post_validate(self):
        context = {"g": globalVariables}
        for c in self.matches(Context):
            context.update(c.context)

        cdatas = [i for i in self.children if isinstance(i, xmlbase.XMLCDATA)]
        if len(cdatas) != 1:
            raise XMLValidationError, "A PythonFunction object must contain exactly one CDATA"

        theCode = "".join(cdatas[0].text).lstrip().rstrip()

        ## CAREFUL: evaluates whatever you give it!
        try:
            exec theCode in context
        except SyntaxError as err:
            raise XMLValidationError, "PythonFunction could not be evaluated: %s" % str(
                err)

        if "condition" in self.attrib:
            if self["condition"] == "BEGIN":
                self.condition = self.BEGIN

            elif self["condition"] == "END":
                self.condition = self.END

            else:
                try:
                    self.condition = context[self["condition"]]
                    if not callable(self.condition):
                        raise KeyError
                except KeyError:
                    raise XMLValidationError, "PythonFunction does not contain a condition function called \"%s\"" % self[
                        "condition"]

        else:
            self.condition = self.EVENT

        try:
            self.action = context[self["action"]]
            if not callable(self.action):
                raise KeyError
        except KeyError:
            raise XMLValidationError, "PythonFunction does not contain an action function called \"%s\"" % self[
                "action"]

    def begin(self):
        if self.condition is self.BEGIN:
            return self.action()

    def evaluate(self, event):
        if self.condition is self.EVENT:
            result = True
        else:
            result = self.condition(event)

        if result is True:
            return self.action(event)

        elif result is False:
            return None

        else:
            if not isinstance(result, (list, tuple)):
                try:
                    result = list(result)
                except TypeError:
                    raise RuntimeError, "A PythonFunction's condition must return True, False, or a list of objects to act upon; result of %s is %s" % (
                        self.condition, result)

            output = []
            for r in result:
                output.append(self.action(r))
            return output

    def end(self):
        if self.condition is self.END:
            return self.action()
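
# A hypothetical ScoresAwk configuration fragment: the CDATA body is exec'd
# once at validation time, and the names given by the condition/action
# attributes are looked up in the resulting namespace (isAnomaly/report and
# the event fields are invented for illustration).
#
#     <PythonFunction condition="isAnomaly" action="report">
#       <![CDATA[
#     def isAnomaly(event):
#         return event["score"] > 3.0
#
#     def report(event):
#         return event
#       ]]>
#     </PythonFunction>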