示例#1
0
    def __init__(self, context, inputData, inputMask=None, inputState=None):
        """Create a DataTable from a type-context, input data,
        possible input masks, and possible input states.

        For maximum flexibility, very few assumptions are made about
        the format of C{inputData}.  It need only have a structure
        that is equivalent to a dictionary mapping strings (field
        names) to lists of values (data columns).  Numpy
        U{record arrays<http://docs.scipy.org/doc/numpy/user/basics.rec.html>},
        U{NpzFiles <http://docs.scipy.org/doc/numpy/reference/generated/numpy.savez.html>},
        and U{Pandas data frames<http://pandas.pydata.org/>}
        effectively present their data in this format because::

            inputData[fieldName]

        yields a column of values.  Regardless of the input type,
        these values are then interpreted by the C{context} to set
        their PMML type.

        The length of the resulting DataTable is equal to the length
        of the shortest DataColumn.  Generally, one should use
        equal-length arrays to build a DataTable.

        @type context: PmmlBinding, FieldType, string, dict, or None
        @param context: If a rooted PmmlBinding, use the PMML's DataDictionary to interpret C{inputData}.  If a FieldType, use that FieldType to interpret all fields.  If a string, use that dataType (e.g. "integer", "dateDaysSince[1960]") to interpret all fields.  If a dictionary from field names to FieldTypes or dataType strings, use them on a per-field basis.  Otherwise, assume a FieldType from the Numpy C{dtype}.  The last option only works if all C{inputData} columns are Numpy arrays.
        @type inputData: any dict-like mapping from strings to lists
        @param inputData: Maps field names (strings) to columns of data (lists or Numpy arrays) that are interpreted by C{context}.
        @type inputMask: dict-like mapping from strings to lists of bool, or None
        @param inputMask: If None, missing data are identified by C{NaN} values in the C{inputData} (Pandas convention).  Otherwise, C{NaN} or a True value in the corresponding {inputMask} would label a data item as MISSING.
        @type inputState: DataTableState or None
        @param inputState: Initial state of the DataTable.  To continue a previous calculation, use the C{dataTable.state} from the previous calculation.
        @raise TypeError: If the C{inputData} columns are not Numpy arrays and a C{context} is not given, this method raises an error.
        """

        if isinstance(context, PmmlBinding) and len(context.xpath("ancestor-or-self::pmml:PMML")) != 0:
            # get types from PMML
            dataColumns = OrderedDict()
            for fieldName, fieldDefinition in context.fieldContext().items():
                fieldType = FieldType(fieldDefinition)

                try:
                    dataField = inputData[fieldName]
                except KeyError:
                    dataField = None
                else:
                    try:
                        maskField = inputMask[fieldName]
                    except (KeyError, TypeError):
                        maskField = None

                if dataField is not None:
                    dataColumns[fieldName] = fieldType.toDataColumn(dataField, maskField)

        else:
            if not isinstance(context, dict):
                context = dict((x, context) for x in inputData)

            if all(isinstance(x, FieldType) for x in context.values()):
                # FieldTypes provided explicitly
                dataColumns = OrderedDict()
                for fieldName in sorted(context.keys()):
                    data = inputData[fieldName]
                    if inputMask is None:
                        mask = None
                    else:
                        mask = inputMask[fieldName]

                    dataColumns[fieldName] = context[fieldName].toDataColumn(data, mask)

            elif all(isinstance(x, basestring) for x in context.values()):
                # FieldTypes provided by dataType name
                dataColumns = OrderedDict()
                for fieldName in sorted(context.keys()):
                    data = inputData[fieldName]
                    if inputMask is None:
                        mask = None
                    else:
                        mask = inputMask[fieldName]

                    if context[fieldName] == "string":
                        fieldType = FakeFieldType(context[fieldName], "categorical")
                    else:
                        fieldType = FakeFieldType(context[fieldName], "continuous")
                    dataColumns[fieldName] = fieldType.toDataColumn(data, mask)

            elif all(isinstance(inputData[x], NP.ndarray) for x in inputData.keys()):
                # FieldTypes provided by NumPy types
                dataColumns = OrderedDict()
                for fieldName in sorted(context.keys()):
                    data = inputData[fieldName]
                    if inputMask is None:
                        mask = None
                    else:
                        mask = inputMask[fieldName]

                    if data.dtype in (NP.object, NP.object0, NP.object_, NP.str, NP.str_, NP.string0, NP.string_) or re.match("\|S[0-9]+", str(data.dtype)) is not None:
                        fieldType = FakeFieldType("string", "categorical")
                    elif data.dtype in (NP.int, NP.int0, NP.int8, NP.int16, NP.int32, NP.int64, NP.int_, NP.integer):
                        fieldType = FakeFieldType("integer", "continuous")
                    elif data.dtype in (NP.float, NP.__getattr__("float16", noneIfMissing=True), NP.float32):
                        fieldType = FakeFieldType("float", "continuous")
                    elif data.dtype in (NP.float64, NP.float128, NP.float_, NP.double):
                        fieldType = FakeFieldType("double", "continuous")
                    elif data.dtype in (NP.bool, NP.bool8, NP.bool_):
                        fieldType = FakeFieldType("boolean", "continuous")
                    else:
                        raise TypeError("Unrecognized NumPy dtype: %r" % data.dtype)

                    dataColumns[fieldName] = fieldType.toDataColumn(data, mask)

            else:
                raise TypeError("Context must be PMML (anchored by a <PMML> ancestor), a dictionary of FieldType objects, dataType strings, or inputData must consist entirely of NumPy arrays")

        self._configure(dataColumns, inputState)
示例#2
0
    def __init__(self, context, inputData, inputMask=None, inputState=None):
        """Create a DataTable from a type-context, input data,
        possible input masks, and possible input states.

        For maximum flexibility, very few assumptions are made about
        the format of C{inputData}.  It need only have a structure
        that is equivalent to a dictionary mapping strings (field
        names) to lists of values (data columns).  Numpy
        U{record arrays<http://docs.scipy.org/doc/numpy/user/basics.rec.html>},
        U{NpzFiles <http://docs.scipy.org/doc/numpy/reference/generated/numpy.savez.html>},
        and U{Pandas data frames<http://pandas.pydata.org/>}
        effectively present their data in this format because::

            inputData[fieldName]

        yields a column of values.  Regardless of the input type,
        these values are then interpreted by the C{context} to set
        their PMML type.

        The length of the resulting DataTable is equal to the length
        of the shortest DataColumn.  Generally, one should use
        equal-length arrays to build a DataTable.

        @type context: PmmlBinding, FieldType, string, dict, or None
        @param context: If a rooted PmmlBinding, use the PMML's DataDictionary to interpret C{inputData}.  If a FieldType, use that FieldType to interpret all fields.  If a string, use that dataType (e.g. "integer", "dateDaysSince[1960]") to interpret all fields.  If a dictionary from field names to FieldTypes or dataType strings, use them on a per-field basis.  Otherwise, assume a FieldType from the Numpy C{dtype}.  The last option only works if all C{inputData} columns are Numpy arrays.
        @type inputData: any dict-like mapping from strings to lists
        @param inputData: Maps field names (strings) to columns of data (lists or Numpy arrays) that are interpreted by C{context}.
        @type inputMask: dict-like mapping from strings to lists of bool, or None
        @param inputMask: If None, missing data are identified by C{NaN} values in the C{inputData} (Pandas convention).  Otherwise, C{NaN} or a True value in the corresponding {inputMask} would label a data item as MISSING.
        @type inputState: DataTableState or None
        @param inputState: Initial state of the DataTable.  To continue a previous calculation, use the C{dataTable.state} from the previous calculation.
        @raise TypeError: If the C{inputData} columns are not Numpy arrays and a C{context} is not given, this method raises an error.
        """

        if isinstance(context, PmmlBinding) and len(
                context.xpath("ancestor-or-self::pmml:PMML")) != 0:
            # get types from PMML
            dataColumns = OrderedDict()
            for fieldName, fieldDefinition in context.fieldContext().items():
                fieldType = FieldType(fieldDefinition)

                try:
                    dataField = inputData[fieldName]
                except KeyError:
                    dataField = None
                else:
                    try:
                        maskField = inputMask[fieldName]
                    except (KeyError, TypeError):
                        maskField = None

                if dataField is not None:
                    dataColumns[fieldName] = fieldType.toDataColumn(
                        dataField, maskField)

        else:
            if not isinstance(context, dict):
                context = dict((x, context) for x in inputData)

            if all(isinstance(x, FieldType) for x in context.values()):
                # FieldTypes provided explicitly
                dataColumns = OrderedDict()
                for fieldName in sorted(context.keys()):
                    data = inputData[fieldName]
                    if inputMask is None:
                        mask = None
                    else:
                        mask = inputMask[fieldName]

                    dataColumns[fieldName] = context[fieldName].toDataColumn(
                        data, mask)

            elif all(isinstance(x, basestring) for x in context.values()):
                # FieldTypes provided by dataType name
                dataColumns = OrderedDict()
                for fieldName in sorted(context.keys()):
                    data = inputData[fieldName]
                    if inputMask is None:
                        mask = None
                    else:
                        mask = inputMask[fieldName]

                    if context[fieldName] == "string":
                        fieldType = FakeFieldType(context[fieldName],
                                                  "categorical")
                    else:
                        fieldType = FakeFieldType(context[fieldName],
                                                  "continuous")
                    dataColumns[fieldName] = fieldType.toDataColumn(data, mask)

            elif all(
                    isinstance(inputData[x], NP.ndarray)
                    for x in inputData.keys()):
                # FieldTypes provided by NumPy types
                dataColumns = OrderedDict()
                for fieldName in sorted(context.keys()):
                    data = inputData[fieldName]
                    if inputMask is None:
                        mask = None
                    else:
                        mask = inputMask[fieldName]

                    if data.dtype in (NP.object, NP.object0, NP.object_,
                                      NP.str, NP.str_, NP.string0,
                                      NP.string_) or re.match(
                                          "\|S[0-9]+", str(
                                              data.dtype)) is not None:
                        fieldType = FakeFieldType("string", "categorical")
                    elif data.dtype in (NP.int, NP.int0, NP.int8, NP.int16,
                                        NP.int32, NP.int64, NP.int_,
                                        NP.integer):
                        fieldType = FakeFieldType("integer", "continuous")
                    elif data.dtype in (NP.float,
                                        NP.__getattr__("float16",
                                                       noneIfMissing=True),
                                        NP.float32):
                        fieldType = FakeFieldType("float", "continuous")
                    elif data.dtype in (NP.float64, NP.float128, NP.float_,
                                        NP.double):
                        fieldType = FakeFieldType("double", "continuous")
                    elif data.dtype in (NP.bool, NP.bool8, NP.bool_):
                        fieldType = FakeFieldType("boolean", "continuous")
                    else:
                        raise TypeError("Unrecognized NumPy dtype: %r" %
                                        data.dtype)

                    dataColumns[fieldName] = fieldType.toDataColumn(data, mask)

            else:
                raise TypeError(
                    "Context must be PMML (anchored by a <PMML> ancestor), a dictionary of FieldType objects, dataType strings, or inputData must consist entirely of NumPy arrays"
                )

        self._configure(dataColumns, inputState)