Пример #1
0
    def __init__(self,
                 errorCatcher=errorCatcher,
                 fromHTTP=False,
                 interactive=False,
                 isXML=True,
                 runForever=False,
                 filename=None,
                 **kwargs):
        """Record the run options and resolve the list of input files.

        Arguments:

            errorCatcher:
            Stored as-is on the instance; the default is whatever the
            name ``errorCatcher`` is bound to in the enclosing scope
            when this method is defined.

            fromHTTP (boolean; default False):
            Recorded in the run options.  No file list is built in
            this mode.

            interactive (boolean; default False):
            If True, the reader is set to None and no file list is
            built.  Takes precedence over fromHTTP.

            isXML (boolean; default True):
            Recorded in the run options.

            runForever (boolean; default False):
            Recorded in the run options.

            filename (string; default None):
            Either '-' (kept literally as the only entry of the file
            list) or a glob pattern.  Only consulted when neither
            interactive nor fromHTTP is set.

            **kwargs:
            Accepted but not used by the code shown here.

        Raises:

            RuntimeError: if the glob pattern matches no files.
        """
        self._errorCatcher = errorCatcher
        self._runOptions = NameSpace(fromHTTP=fromHTTP,
                                     interactive=interactive,
                                     isXML=isXML,
                                     runForever=runForever)
        self._fileList = filename  # None or else will become a list
        self.currentFileNumber = 0
        self._logger = logging.getLogger()

        # Cache which log levels are enabled on the root logger at
        # construction time.
        effectiveLevel = self._logger.getEffectiveLevel()
        self._logLevels = NameSpace(
            DEBUG=effectiveLevel <= logging.DEBUG,
            INFO=effectiveLevel <= logging.INFO,
            WARNING=effectiveLevel <= logging.WARNING,
            ERROR=effectiveLevel <= logging.ERROR)
        self._metadata = logging.getLogger('metadata')
        self._thread = None
        self._values = None
        self._tables = collections.deque()
        self._buffers = collections.deque()

        if interactive:
            self._reader = None
        elif fromHTTP:
            pass
        else:
            if filename == '-':
                self._fileList = ['-']
            else:
                import glob
                # Keep the matches in descending order (presumably so
                # that popping from the end yields files in ascending
                # order -- TODO confirm against the consumer of
                # _fileList).
                self._fileList = sorted(glob.glob(filename), reverse=True)
                if not self._fileList:
                    raise RuntimeError(
                        "No Data Input files matched %s" % filename)
Пример #2
0
    def initialize(self,
                   existingSegment=False,
                   customProcessing=None,
                   setModelMaturity=False):
        """Set up the maturity counter, both algorithms, the constants
        and the user-facing view of this segment.

        Arguments:

            existingSegment (boolean; default False):
            If False, the segment starts unlocked and the "locked"
            attribute of pmmlModelMaturity is reset to False.  If True,
            the lock state is read from that attribute instead.

            customProcessing (default None):
            If given, its persistentStorage.db receives a NameSpace
            entry keyed by this segment's name (created on first use),
            exposed as userFriendly.db.

            setModelMaturity (boolean; default False):
            For an existing segment, seed the update counter from the
            "numUpdates" attribute.  The same happens when
            producerParameters["updateExisting"] is True.
        """

        # the maturity counter follows the producer's UpdateScheme
        self.updator = self.engine.producerUpdateScheme.updator(COUNT)

        if existingSegment:
            resumeCount = setModelMaturity or (
                "updateExisting" in self.producerParameters
                and self.producerParameters["updateExisting"] is True)
            if resumeCount:
                self.updator.initialize(
                    {COUNT: self.pmmlModelMaturity.attrib["numUpdates"]})
            self.lock = self.pmmlModelMaturity.attrib["locked"]
        else:
            self.lock = False
            self.pmmlModelMaturity.attrib["locked"] = False

        self.consumerAlgorithm.initialize()
        self.producerAlgorithm.initialize(**self.producerParameters)

        # Constants come from Extension/X-ODG-CustomProcessingConstants;
        # if either level is missing, fall back to an empty read-only
        # NameSpace.
        constantsNode = None
        extension = self.pmmlModel.child(pmml.Extension, exception=False)
        if extension is not None:
            constantsNode = extension.child(
                pmml.X_ODG_CustomProcessingConstants, exception=False)
        if constantsNode is None:
            self.constants = NameSpaceReadOnly()
        else:
            self.constants = constantsNode.nameSpace

        # Reuse an existing userFriendly object if there is one;
        # otherwise use a bare Segment instance.
        bareSegment = new.instance(Segment)
        view = getattr(self, "userFriendly", bareSegment)
        self.userFriendly = view
        view.name = self.name()
        view.pmmlPredicate = self.pmmlPredicate
        view.expression = self.expressionTree
        view.evaluate = self.predicateMatches
        view.pmmlModel = self.pmmlModel
        view.consumer = self.consumerAlgorithm
        view.producer = self.producerAlgorithm
        view.const = self.constants
        view.state = self.state

        if customProcessing is None:
            return

        storage = customProcessing.persistentStorage.db
        if self.userFriendly.name not in storage:
            storage[self.userFriendly.name] = NameSpace()
        self.userFriendly.db = storage[self.userFriendly.name]
Пример #3
0
    def __init__(
        self,
        baseName,
        serialization=None,
        timeformat="%Y-%m-%d_%H-%M-%S",
        indent="",
        linesep="",
        pickle=False):
        """
        Store the output settings and zero the write statistics.

        serialization, when not None, is a dictionary of the form
        {'byEventNumber':True/False, 'rollover':integer_value}
        where 'rollover' is counted in events or in seconds.  An empty
        dictionary selects an hourly (3600) rollover.  A non-empty
        dictionary must contain 'rollover'.
        """
        self.baseName = baseName
        self.timeformat = timeformat
        self.indent = indent
        self.linesep = linesep
        self.pickle = pickle
        self._logging = logging.getLogger()
        self._metadata = logging.getLogger('metadata')
        self.thread = None

        self.serialization = None
        if serialization is not None:
            settings = NameSpace(byEventNumber=False)
            self.serialization = settings
            if len(serialization) == 0:
                settings['rollover'] = 3600  # hourly
            else:
                # NOTE(review): only the presence of the key is tested,
                # so {'byEventNumber': False} also turns event-number
                # mode on -- confirm this is intended.
                if 'byEventNumber' in serialization:
                    settings.byEventNumber = True
                else:
                    # time-based rollover: record the start time
                    settings['start'] = int(time.time())
                settings['rollover'] = serialization['rollover']

        self.nameCollisions = {}

        # statistics on the writing process, all starting at zero
        for counterName in ("Models written",
                            "Model write collisions",
                            "Time writing models",
                            "Time copying models",
                            "Time waiting for write thread to unblock"):
            self._metadata.data[counterName] = 0
Пример #4
0
    def initialize(self):
        """Interpret PMML file, set up SegmentRecords list, and
        initialize all algorithms.

        Steps, in order:
          1. Stamp the PMML header with the random seed, an event
             stamp (starting at 0) and a fresh timestamp, replacing
             any such elements that are already present.
          2. Select the model: the first top-level model, or the one
             whose "modelName" attribute equals self.modelName.
          3. Attach the data stream and clear the model's DataContext.
          4. Build self.segmentRecords: one record per Segment for a
             MiningModel, otherwise a single record for the model.
          5. Call self.reinitialize().

        Raises:
            RuntimeError: if self.modelName is set but no top-level
                model carries that name.
            NotImplementedError: if Aggregate or X-ODG-AggregateReduce
                transformations appear in the TransformationDictionary
                or in a MiningModel's LocalTransformations, or if the
                segmentation's multipleModelMethod is neither
                "selectFirst" nor "selectAll".
        """

        self.firstSegment = True

        # set up the header, so that our models can be stamped with time and event number
        header = self.pmmlFile.child(pmml.Header)
        if header.exists(pmml.Extension):
            headerExtension = header.child(pmml.Extension)
        else:
            # no Extension yet: create one as the header's first child
            headerExtension = pmml.Extension()
            header.children.insert(0, headerExtension)

        # replace any existing random-seed element with the current seed
        if headerExtension.exists(pmml.X_ODG_RandomSeed):
            del headerExtension[headerExtension.index(pmml.X_ODG_RandomSeed)]
        augustusRandomSeed = pmml.X_ODG_RandomSeed(
            value=self.augustusRandomSeed)
        headerExtension.children.append(augustusRandomSeed)

        # replace any existing event stamp with one starting at event 0
        if headerExtension.exists(pmml.X_ODG_Eventstamp):
            del headerExtension[headerExtension.index(pmml.X_ODG_Eventstamp)]
        self.eventStamp = pmml.X_ODG_Eventstamp(number=0)
        headerExtension.children.append(self.eventStamp)

        # replace any existing timestamp with the current local time
        if header.exists(pmml.Timestamp):
            del header[header.index(pmml.Timestamp)]
        self.timeStamp = pmml.Timestamp(
            xmlbase.XMLText(datetime.datetime.today().isoformat()))
        header.children.append(self.timeStamp)

        # select the first model or select a model by name
        if self.modelName is None:
            self.pmmlModel = self.pmmlFile.topModels[0]
        else:
            self.pmmlModel = None
            for model in self.pmmlFile.topModels:
                if "modelName" in model.attrib and model.attrib[
                        "modelName"] == self.modelName:
                    self.pmmlModel = model
                    break
            if self.pmmlModel is None:
                raise RuntimeError(
                    "No model named \"%s\" was found in the PMML file"
                    % self.modelName)

        # connect the dataContext to the dataStream, so that events will flow from the input file into the transformations
        self.resetDataStream(self.dataStream)

        # clear the cache of the model's DataContext (initializes some dictionaries)
        self.pmmlModel.dataContext.clear()
        if self.pmmlModel.dataContext.transformationDictionary:
            self.metadata.data["Transformation dictionary elements"] = len(
                self.pmmlModel.dataContext.transformationDictionary.cast)
        else:
            self.metadata.data["Transformation dictionary elements"] = 0

        self.segmentRecords = []
        self._lookup = NameSpace(tuples={}, fields={}, other=[])
        # thresholds are set on the SegmentRecord class itself, so they
        # apply to every record
        SegmentRecord.maturityThreshold = self.maturityThreshold
        SegmentRecord.lockingThreshold = self.lockingThreshold

        # aggregates are rejected anywhere inside the TransformationDictionary
        if self.pmmlFile.exists(pmml.TransformationDictionary):
            if self.pmmlFile.child(pmml.TransformationDictionary).exists(
                    pmml.Aggregate, maxdepth=None):
                raise NotImplementedError(
                    "Aggregate transformations in the TransformationDictionary are not supported")
            if self.pmmlFile.child(pmml.TransformationDictionary).exists(
                    pmml.X_ODG_AggregateReduce, maxdepth=None):
                raise NotImplementedError(
                    "X-ODG-AggregateReduce transformations in the TransformationDictionary are not supported")

        # MiningModels are special because we handle segmentation at the Engine level
        # Currently no support for MiningModels nested within MiningModels
        if isinstance(self.pmmlModel, pmml.MiningModel):
            self.pmmlOutput = self.pmmlModel.child(pmml.Output,
                                                   exception=False)
            segmentation = self.pmmlModel.child(pmml.Segmentation,
                                                exception=False)
            # for now, assume a MiningModel without any segments will be populated through autosegmentation

            if self.pmmlModel.exists(pmml.LocalTransformations):
                if self.pmmlModel.child(pmml.LocalTransformations).exists(
                        pmml.Aggregate, maxdepth=None):
                    raise NotImplementedError(
                        "Aggregate transformations in the MiningModel's LocalTransformations are not supported")
                if self.pmmlModel.child(pmml.LocalTransformations).exists(
                        pmml.X_ODG_AggregateReduce, maxdepth=None):
                    raise NotImplementedError(
                        "X-ODG-AggregateReduce transformations in the MiningModel's LocalTransformations are not supported")

            # NOTE(review): segmentation was fetched with exception=False,
            # so it is presumably None when the MiningModel has no
            # Segmentation element; the attribute access below would then
            # fail -- confirm how the "no segmentation" case should be
            # handled.
            if segmentation.attrib["multipleModelMethod"] == "selectFirst":
                self.multipleModelMethod = SELECTFIRST
            elif segmentation.attrib["multipleModelMethod"] == "selectAll":
                self.multipleModelMethod = SELECTALL
            else:
                raise NotImplementedError(
                    "Only 'selectFirst', 'selectAll', and no segmentation have been implemented.")
            self.metadata.data[
                "Match all segments"] = self.multipleModelMethod != SELECTFIRST

            for pmmlSegment in segmentation.matches(pmml.Segment):
                self._makeSegmentRecord(pmmlSegment)

        else:
            # any other model type becomes a single segment record
            self.multipleModelMethod = SELECTONLY

            segmentRecord = SegmentRecord(self.pmmlModel, None, None, self)

            # algorithms are looked up by the model's class; the producer
            # additionally by the configured algorithm name
            modelClass = self.pmmlModel.__class__
            algoName = self.producerAlgorithm[
                modelClass.__name__].attrib["algorithm"]
            segmentRecord.consumerAlgorithm = consumerAlgorithmMap[modelClass](
                self, segmentRecord)
            segmentRecord.producerAlgorithm = producerAlgorithmMap[
                modelClass, algoName](self, segmentRecord)
            segmentRecord.producerParameters = self.producerAlgorithm[
                modelClass.__name__].parameters
            self.setProvenance(self.pmmlModel, algoName,
                               segmentRecord.producerAlgorithm,
                               segmentRecord.producerParameters)

            # collect and initialize the aggregates found in the model's
            # LocalTransformations
            localTransformations = self.pmmlModel.child(
                pmml.LocalTransformations, exception=False)
            if localTransformations is not None:
                segmentRecord.aggregates = localTransformations.matches(
                    pmml.Aggregate, maxdepth=None)
                segmentRecord.aggregates.extend(
                    localTransformations.matches(pmml.X_ODG_AggregateReduce,
                                                 maxdepth=None))
            else:
                segmentRecord.aggregates = []
            for aggregate in segmentRecord.aggregates:
                aggregate.initialize(self.consumerUpdateScheme)

            self.segmentRecords.append(segmentRecord)
            self.metadata.data[
                "First segment model type"] = segmentRecord.pmmlModel.tag

        self.reinitialize()
Пример #5
0

def __matchesPartition(matcher, partition):
    """Return True if matcher satisfies every bounded entry of partition.

    partition is an iterable of (bound, comparator) pairs.  Entries
    whose bound is None are skipped; for the others,
    comparator(matcher, bound) must be truthy.  Evaluation stops at the
    first failing entry.  An empty partition matches.
    """
    return all(bound is None or comparator(matcher, bound)
               for bound, comparator in partition)

# Small helpers grouped under one NameSpace: four two-argument
# ordering comparisons, three predicate-type tests, and the partition
# matcher.
_segmentHelpers = NameSpace(
    # ordering comparisons
    lessThan=(lambda x, val: x < val),
    lessOrEqual=(lambda x, val: x <= val),
    greaterThan=(lambda x, val: x > val),
    greaterOrEqual=(lambda x, val: x >= val),
    # CompoundPredicate whose booleanOperator is "and"
    isCompoundAnd=(
        lambda x: (isinstance(x, pmml.CompoundPredicate)
                   and x.attrib['booleanOperator'] == "and")),
    # SimplePredicate whose operator is "equal"
    isSimpleEqual=(
        lambda x: (isinstance(x, pmml.SimplePredicate)
                   and x.attrib['operator'] == "equal")),
    # SimplePredicate whose operator starts with 'l' or 'g':
    # less|greater + Than|OrEqual
    isComparator=(
        lambda x: (isinstance(x, pmml.SimplePredicate)
                   and x.attrib['operator'][0] in ('l', 'g'))),
    matchesPartition=__matchesPartition,
)

########################################################### Engine


class Engine:
    """Object called by Augustus main event loop to process one event/pseudoevent."""
    def __init__(self, pmmlFile, dataStream, producerUpdateScheme,
                 consumerUpdateScheme, segmentationScheme, producerAlgorithm,
Пример #6
0
    def __init__(self,
                 fromHTTP=False,
                 interactive=False,
                 isXML=True,
                 isCSV=False,
                 runForever=False,
                 maxsize=0,
                 filename=None,
                 **kwargs):
        """Set up the reading function and queue for the DataStreamer.

        DataStreamer's constructor is typically invoked by
        calling getDataStreamer(config_options), defined below.
        Error checking for appropriate configuration settings,
        and for sufficient contents in **kwargs is presumed to be
        done during XSD validation.  The reason this initialization
        function is separate is to allow an advanced user to call
        the streamer from a script and bypass having to make an
        XML object containing configuration settings.

        Arguments:

            fromHTTP (boolean; default False):
            If True, the reader will be an HTTPInterfaceServer.
            Requires kwargs['port'] and kwargs['url'].

            interactive (boolean; default False):
            If True, the reader will be None and the user will push
            data to the queue to score using self.enqueue(self, dictionary)
            in which dictionary is a dictionary or a UniRecord; a row in a
            UniTable.  Takes precedence over fromHTTP.

            isXML (boolean; default True):
            If True, the reader will process the input stream as XML;
            otherwise the input is read as a UniTable.

            isCSV (boolean; default False):
            Stored on the reader as its isCSV attribute.

            runForever (boolean; default False):
            If True, run forever. Otherwise read all data and then exit.

            maxsize (integer; default 0):
            The maximum number of objects allowed in self.queue.
            If zero, the Queue can be arbitrarily long.

            filename (string; default None):
            Either '-' (kept literally as the only entry of the file
            list) or a glob pattern.  Only used when neither
            interactive nor fromHTTP is set.

            **kwargs (arguments for the Reader)

        Raises:

            RuntimeError: if the glob pattern matches no files.
        """
        self._runOptions =\
            NameSpace(
                fromHTTP=fromHTTP,
                interactive=interactive,
                isXML=isXML,
                runForever=runForever)
        self._fileList = filename  # None or else will become a list...
        self.currentFileNumber = 0
        self._logger = logging.getLogger()
        self._metadata = logging.getLogger('metadata')
        self._thread = None
        self._values = None
        self._queue = Queue.Queue(maxsize)
        callback = self._xmlCallback if isXML else self._unitableCallback

        if interactive:
            self._reader = None
        elif fromHTTP:

            def http_callback(data):
                """Feed one HTTP payload through a fresh Reader."""
                wrapper = StringIO.StringIO(data)
                rdr =\
                    Reader(callback,
                        source=wrapper,
                        logger=self._logger,
                        magicheader=False,
                        unitable=not isXML,
                        wholeUniTable=not isXML)
                pipe = rdr.new_pipe()
                try:
                    result = rdr.feed_pipe(None, pipe)
                except Exception:
                    # Report read failures as IOError, but let
                    # KeyboardInterrupt and SystemExit propagate.
                    raise IOError("Problem reading data over HTTP.")
                return result

            self._reader =\
                HTTPInterfaceServer(
                    ('', kwargs['port']), logger=logging.getLogger(''))
            self._reader.register_callback(kwargs['url'], http_callback)
            self._reader.isCSV = isCSV

        else:
            if filename == '-':
                self._fileList = ['-']
            else:
                import glob
                # Descending order, so that pop() below takes files
                # from the list in ascending order.
                self._fileList = sorted(glob.glob(filename), reverse=True)
            if not self._fileList:
                raise RuntimeError(
                    "No Data Input files matched %s" % filename)

            self._reader = Reader(callback,
                                  unitable=not isXML,
                                  wholeUniTable=not isXML,
                                  **kwargs)
            self._reader.source = self._fileList.pop()
            self._reader.isCSV = isCSV
Пример #7
0
# See the License for the specific language governing permissions and
# limitations under the License.

import sys
import os
import new
import string
import re
import math

import augustus.core.xmlbase as xmlbase
import augustus.core.pmml41 as pmml
from augustus.core.xmlbase import XMLValidationError, load_xsdType, load_xsdGroup, load_xsdElement
from augustus.core.defs import Atom, NameSpace

# Module-level NameSpace instance, created empty at import time
# (presumably filled in by code outside this excerpt -- TODO confirm).
globalVariables = NameSpace()

class PmmlSed(xmlbase.XML):
    topTag = "PmmlSed"
    xsdType = {}
    xsdGroup = {}
    classMap = {}

    def __init__(self, *children, **attrib):
        """Construct the element under the tag name registered for this class.

        The tag name is the first key of classMap whose value equals
        this instance's class; children and attrib are passed through
        to xmlbase.XML.__init__ unchanged.

        Raises:
            Exception: if this class has no entry in classMap.
        """
        # reverse-lookup the classMap
        for pmmlName, pythonObj in self.classMap.items():
            if pythonObj == self.__class__:
                break
        else:
            # loop finished without a match
            raise Exception(
                "PmmlSed class is missing from the classMap (programmer error)")
        xmlbase.XML.__init__(self, pmmlName, *children, **attrib)