Example #1
 def field_not_in_list(field, values):
     mv = pmml.MapValues(outputColumn='output', defaultValue=1)
     mv.append(pmml.FieldColumnPair(field=field, column='input'))
     it = pmml.InlineTable()
     for v in values:
         it.append(pmml_row(input=v, output=0))
     mv.append(it)
     return {
         DerivedFeatureTransformations.TRANSFORMATION:
         mv,
         DerivedFeatureTransformations.FUNCTION:
         lambda df: reduce(np.logical_and, [df[field] != _ for _ in values])
     }
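For context, a minimal sketch of what the FUNCTION entry above evaluates to when applied to a pandas DataFrame; the column name and sample data are assumptions made up for illustration, not part of the original example.

import numpy as np
import pandas as pd
from functools import reduce

# Hypothetical frame; 'color' stands in for `field`.
df = pd.DataFrame({'color': ['red', 'green', 'blue', 'red']})
values = ['red', 'blue']

# Same expression as the lambda above: True only where the field value
# matches none of the listed values.
mask = reduce(np.logical_and, [df['color'] != v for v in values])
print(mask.tolist())  # [False, True, False, False]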
Example #2
    def transformation_dictionary(self):
        """
        Build a transformation dictionary and return a TransformationDictionary element
        """
        td = pmml.TransformationDictionary()
        # define a schema with all variables available for a model
        encoded_schema = []
        self.context.schemas[Schema.NUMERIC] = encoded_schema
        idx = {}

        # First, populate transformation dictionary for _all_ derived fields, because they can be requested later
        for f in self.context.schemas[Schema.DERIVED]:
            ef = RealNumericFeature(name=f.name)
            df = pmml.DerivedField(name=ef.full_name,
                                   optype=ef.optype.value,
                                   dataType=ef.data_type.value)
            df.append(f.transformation)
            td.append(df)
            assert f.name not in idx, 'Duplicate field definition: {}'.format(
                f.name)
            idx[f.name] = ef

        # second, define the numeric transformations for the categorical variables
        for f in self.context.schemas[Schema.INPUT]:
            assert f.name not in idx, 'Duplicate field definition: {}'.format(
                f.name)
            if isinstance(f, CategoricalFeature):
                ef = RealNumericFeature(name=f.name,
                                        namespace=Schema.NUMERIC.namespace)
                # create a record in transformation dictionary with mapping from raw values into numbers
                df = pmml.DerivedField(name=ef.full_name,
                                       optype=ef.optype.value,
                                       dataType=ef.data_type.value)
                mv = pmml.MapValues(outputColumn='output',
                                    dataType=ef.data_type.value)
                mv.append(
                    pmml.FieldColumnPair(field=f.full_name, column='input'))
                it = pmml.InlineTable()
                for i, v in enumerate(f.value_list):
                    it.append(pmml_row(input=v, output=i))
                td.append(df.append(mv.append(it)))
                idx[f.name] = ef
            else:
                idx[f.name] = f

        # now we can build a mirror of model schema into the numeric schema
        self.context.schemas[Schema.NUMERIC] = [
            idx[f.name] for f in self.context.schemas[Schema.MODEL]
        ]

        return td
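As a rough illustration of the loop over f.value_list above (the value list here is hypothetical, not taken from the project), the InlineTable simply pairs each raw categorical value with its position in the value list:

# Hypothetical ordered value list of a categorical feature.
value_list = ['zero', 'one', 'two']

# Each pmml_row(input=v, output=i) call above adds one such pair,
# so the derived numeric field is the value's index in value_list.
rows = [{'input': v, 'output': i} for i, v in enumerate(value_list)]
print(rows)
# [{'input': 'zero', 'output': 0}, {'input': 'one', 'output': 1},
#  {'input': 'two', 'output': 2}]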
Example #3
 def map_values(field, value_map, default_value):
     mv = pmml.MapValues(outputColumn='output', defaultValue=default_value)
     mv.append(pmml.FieldColumnPair(field=field, column='input'))
     it = pmml.InlineTable()
     for k, v in value_map.items():
         it.append(pmml_row(input=k, output=v))
     mv.append(it)
     return {
         DerivedFeatureTransformations.TRANSFORMATION:
         mv,
         DerivedFeatureTransformations.FUNCTION:
         lambda df: np.vectorize(
             lambda x: value_map.get(x, default_value))(df[field])
     }
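Again a minimal sketch, assuming a pandas DataFrame input, of what the FUNCTION entry above computes: an element-wise dictionary lookup with a fallback default. The column name and mapping are illustrative only.

import numpy as np
import pandas as pd

df = pd.DataFrame({'size': ['S', 'M', 'XL']})  # hypothetical input column
value_map = {'S': 1, 'M': 2, 'L': 3}
default_value = 0

# Element-wise lookup with a fallback, matching the vectorized lambda above.
result = np.vectorize(lambda x: value_map.get(x, default_value))(df['size'])
print(result)  # [1 2 0]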
Example #4
File: model.py Project: kod3r/sklearn-pmml
    def model_verification(self, verification_data):
        """
        Build a model verification dataset
        :param verification_data: list of dictionaries
        :return: ModelVerification element
        """
        verification_data = pd.DataFrame(verification_data)
        fields = self.context.schemas[
            self.SCHEMA_INPUT] + self.context.schemas[self.SCHEMA_OUTPUT]
        assert len(verification_data) > 0, 'Verification data cannot be empty'
        assert len(verification_data.columns) == len(fields), \
            'Number of fields in verification data must match the number of input and output schema fields'
        mv = pmml.ModelVerification(recordCount=len(verification_data),
                                    fieldCount=len(verification_data.columns))

        # step one: build verification schema
        verification_fields = pmml.VerificationFields()
        for f in fields:
            if isinstance(f, NumericFeature):
                vf = pmml.VerificationField(field=f.name,
                                            column=f.name,
                                            precision=self.EPSILON)
            else:
                vf = pmml.VerificationField(field=f.name, column=f.name)
            verification_fields.append(vf)
        mv.append(verification_fields)

        # step two: build data table
        it = pmml.InlineTable()
        for data in verification_data.iterrows():
            data = data[1]
            row = pmml.row()
            row_empty = True
            for key in verification_data.columns:
                if verification_data[key].dtype == object or not np.isnan(
                        data[key]):
                    col = bds().createChildElement(key)
                    bds().appendTextChild(data[key], col)
                    row.append(col)
                    row_empty = False
            if not row_empty:
                it.append(row)
        mv.append(it)

        return mv
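The data-table loop above emits one child element per non-missing cell. A small standalone sketch of that skipping rule, with made-up data: numeric NaN cells are dropped from a row, while object-typed columns are always written.

import numpy as np
import pandas as pd

verification_data = pd.DataFrame({
    'x1': [1.0, np.nan],      # numeric column: NaN cells are skipped
    'label': ['pos', 'neg'],  # object column: always written
})

for _, data in verification_data.iterrows():
    cells = {}
    for key in verification_data.columns:
        if verification_data[key].dtype == object or not np.isnan(data[key]):
            cells[key] = data[key]
    print(cells)
# {'x1': 1.0, 'label': 'pos'}
# {'label': 'neg'}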
Example #5
 def test_transform_with_derived_field(self):
     self.est = DecisionTreeClassifier(max_depth=2)
     self.est.fit([
         [0, 0, 0],
         [0, 1, 0],
         [1, 0, 0],
         [1, 1, 1],
     ], [0, 1, 1, 1])
     mapping = pmml.MapValues(dataType="double", outputColumn="output")
     mapping.append(pmml.FieldColumnPair(column="x1", field="x1"))
     mapping.append(pmml.FieldColumnPair(column="x2", field="x2"))
     it = pmml.InlineTable()
     it.append(pmml_row(x1=0, x2='zero', output=0))
     it.append(pmml_row(x1=0, x2='one', output=0))
     it.append(pmml_row(x1=1, x2='zero', output=0))
     it.append(pmml_row(x1=1, x2='one', output=1))
     mapping.append(it)
     self.ctx = TransformationContext({
         Schema.INPUT: [
             IntegerNumericFeature('x1'),
             StringCategoricalFeature('x2', ['zero', 'one'])
         ],
         Schema.DERIVED: [
             DerivedFeature(feature=RealNumericFeature(name='x3'),
                            transformation=mapping)
         ],
         Schema.MODEL: [
             IntegerNumericFeature('x1'),
             StringCategoricalFeature('x2', ['zero', 'one']),
             RealNumericFeature(name='x3')
         ],
         Schema.OUTPUT:
         [IntegerCategoricalFeature('output', ['neg', 'pos'])]
     })
     self.converter = DecisionTreeConverter(estimator=self.est,
                                            context=self.ctx,
                                            mode=ModelMode.CLASSIFICATION)
     self.converter.pmml().toxml()
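For reference, the MapValues table constructed in this test derives x3 from (x1, x2); expressed as a plain Python lookup (an equivalent restatement, not library code) it is:

# Same (x1, x2) -> x3 mapping as the four pmml_row calls above.
x3_lookup = {
    (0, 'zero'): 0,
    (0, 'one'): 0,
    (1, 'zero'): 0,
    (1, 'one'): 1,
}
print(x3_lookup[(1, 'one')])  # 1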
Example #6
    def model_verification(self, verification_data):
        """
        Use the input verification_data, apply the transformations, evaluate the model response and produce the
        ModelVerification element
        :param verification_data: list of dictionaries or data frame
        :type verification_data: list[dict[str, object]]|pd.DataFrame
        :return: ModelVerification element
        """
        verification_data = pd.DataFrame(verification_data)
        assert len(verification_data) > 0, 'Verification data cannot be empty'

        verification_input = pd.DataFrame(index=verification_data.index)
        verification_model_input = pd.DataFrame(index=verification_data.index)
        for key in self.context.schemas[Schema.INPUT]:
            # all input features MUST be present in the verification_data
            assert key.full_name in verification_data.columns, 'Missing input field "{}"'.format(
                key.full_name)
            verification_input[Schema.INPUT.extract_feature_name(
                key)] = verification_data[key.full_name]
            if isinstance(key, CategoricalFeature):
                verification_model_input[Schema.INPUT.extract_feature_name(
                    key)] = np.vectorize(key.to_number)(
                        verification_data[key.full_name])
            else:
                verification_model_input[Schema.INPUT.extract_feature_name(
                    key)] = verification_data[key.full_name]

        for key in self.context.schemas[Schema.DERIVED]:
            assert isinstance(
                key, DerivedFeature
            ), 'Only DerivedFeatures are allowed in the DERIVED schema'
            verification_model_input[key.full_name] = key.apply(
                verification_input)

        # at this point we can check that MODEL schema contains only known features
        for key in self.context.schemas[Schema.MODEL]:
            assert Schema.MODEL.extract_feature_name(key) in verification_model_input.columns, \
                'Unknown feature "{}" in the MODEL schema'.format(key.full_name)

        # TODO: we can actually support multiple columns, but need to figure out the way to extract the data
        # TODO: from the estimator properly
        # building model results
        assert len(self.context.schemas[
            Schema.OUTPUT]) == 1, 'Only one output is currently supported'
        key = self.context.schemas[Schema.OUTPUT][0]
        model_input = verification_model_input[list(
            map(Schema.MODEL.extract_feature_name,
                self.context.schemas[Schema.MODEL]))].values
        model_results = np.vectorize(key.from_number)(
            self.estimator.predict(X=model_input))
        if key.full_name in verification_data:
            # make sure that if results are provided, the expected and actual values are equal
            assert_equal(key, model_results,
                         verification_data[key.full_name].values)
        verification_input[Schema.OUTPUT.extract_feature_name(
            key)] = model_results

        if isinstance(key, CategoricalFeature):
            probabilities = self.estimator.predict_proba(X=model_input)
            for i, key in enumerate(self.context.schemas[Schema.CATEGORIES]):
                verification_input[Schema.CATEGORIES.extract_feature_name(
                    key)] = probabilities[:, i]

        fields = []
        field_names = []
        for s in [Schema.INPUT, Schema.OUTPUT, Schema.CATEGORIES]:
            fields += self.context.schemas[s]
            field_names += list(
                map(s.extract_feature_name, self.context.schemas[s]))

        mv = pmml.ModelVerification(recordCount=len(verification_input),
                                    fieldCount=len(fields))

        # step one: build verification schema
        verification_fields = pmml.VerificationFields()
        for key in fields:
            if isinstance(key, NumericFeature):
                vf = pmml.VerificationField(field=key.name,
                                            column=key.name,
                                            precision=self.EPSILON)
            else:
                vf = pmml.VerificationField(field=key.name, column=key.name)
            verification_fields.append(vf)
        mv.append(verification_fields)

        # step two: build data table
        it = pmml.InlineTable()
        for data in verification_input.iterrows():
            data = data[1]
            row = pmml.row()
            row_empty = True
            for key in field_names:
                if verification_input[key].dtype == object or not np.isnan(
                        data[key]):
                    col = bds().createChildElement(key)
                    bds().appendTextChild(data[key], col)
                    row.append(col)
                    row_empty = False
            if not row_empty:
                it.append(row)
        mv.append(it)

        return mv
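When the output is categorical, the loop over Schema.CATEGORIES above adds one probability column per class from predict_proba. A rough sketch of that step with a stand-in classifier; the category names and training data are assumptions for illustration.

import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier

model_input = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
est = DecisionTreeClassifier(max_depth=2).fit(model_input, [0, 0, 0, 1])

verification_input = pd.DataFrame(index=range(len(model_input)))
probabilities = est.predict_proba(model_input)

# One verification column per output category, mirroring the
# Schema.CATEGORIES loop above.
for i, name in enumerate(['neg', 'pos']):  # hypothetical category names
    verification_input[name] = probabilities[:, i]
print(verification_input)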