def field_not_in_list(field, values):
    mv = pmml.MapValues(outputColumn='output', defaultValue=1)
    mv.append(pmml.FieldColumnPair(field=field, column='input'))
    it = pmml.InlineTable()
    for v in values:
        it.append(pmml_row(input=v, output=0))
    mv.append(it)
    return {
        DerivedFeatureTransformations.TRANSFORMATION: mv,
        DerivedFeatureTransformations.FUNCTION:
            lambda df: reduce(np.logical_and, [df[field] != _ for _ in values])
    }
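# Usage sketch (hedged, not part of this module): wiring the transformation into a derived feature.
# This assumes DerivedFeatureTransformations.TRANSFORMATION / FUNCTION match the keyword arguments
# accepted by DerivedFeature; the feature name and value list are illustrative.
# not_foo_bar = DerivedFeature(
#     feature=RealNumericFeature(name='not_foo_bar'),
#     **DerivedFeatureTransformations.field_not_in_list('input', ['foo', 'bar'])
# )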
def transformation_dictionary(self):
    """
    Build a transformation dictionary and return a TransformationDictionary element
    """
    td = pmml.TransformationDictionary()
    # define a schema with all variables available for a model
    encoded_schema = []
    self.context.schemas[Schema.NUMERIC] = encoded_schema

    idx = {}

    # First, populate the transformation dictionary for _all_ derived fields, because they can be requested later
    for f in self.context.schemas[Schema.DERIVED]:
        ef = RealNumericFeature(name=f.name)
        df = pmml.DerivedField(
            name=ef.full_name,
            optype=ef.optype.value,
            dataType=ef.data_type.value
        )
        df.append(f.transformation)
        td.append(df)
        assert f.name not in idx, 'Duplicate field definition: {}'.format(f.name)
        idx[f.name] = ef

    # Second, define the numeric transformations for the categorical variables
    for f in self.context.schemas[Schema.INPUT]:
        assert f.name not in idx, 'Duplicate field definition: {}'.format(f.name)
        if isinstance(f, CategoricalFeature):
            ef = RealNumericFeature(name=f.name, namespace=Schema.NUMERIC.namespace)
            # create a record in the transformation dictionary with a mapping from raw values into numbers
            df = pmml.DerivedField(
                name=ef.full_name,
                optype=ef.optype.value,
                dataType=ef.data_type.value
            )
            mv = pmml.MapValues(outputColumn='output', dataType=ef.data_type.value)
            mv.append(pmml.FieldColumnPair(field=f.full_name, column='input'))
            it = pmml.InlineTable()
            for i, v in enumerate(f.value_list):
                it.append(pmml_row(input=v, output=i))
            td.append(df.append(mv.append(it)))
            idx[f.name] = ef
        else:
            idx[f.name] = f

    # now we can build a mirror of the model schema into the numeric schema
    self.context.schemas[Schema.NUMERIC] = [
        idx[f.name] for f in self.context.schemas[Schema.MODEL]
    ]

    return td
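# Illustrative sketch of the result (assumed shape, not verbatim output): for a categorical input
# 'x2' with value_list ['zero', 'one'], the dictionary gains an entry roughly like
#   <DerivedField name="numeric.x2" optype="continuous" dataType="double">
#     <MapValues outputColumn="output" dataType="double">
#       <FieldColumnPair field="x2" column="input"/>
#       <InlineTable>
#         <row><input>zero</input><output>0</output></row>
#         <row><input>one</input><output>1</output></row>
#       </InlineTable>
#     </MapValues>
#   </DerivedField>
# where the derived-field name depends on Schema.NUMERIC.namespace and Feature.full_name.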
def map_values(field, value_map, default_value):
    # MapValues takes the camelCase PMML attribute defaultValue (as in field_not_in_list above)
    mv = pmml.MapValues(outputColumn='output', defaultValue=default_value)
    mv.append(pmml.FieldColumnPair(field=field, column='input'))
    it = pmml.InlineTable()
    for k, v in value_map.items():
        it.append(pmml_row(input=k, output=v))
    mv.append(it)
    return {
        DerivedFeatureTransformations.TRANSFORMATION: mv,
        DerivedFeatureTransformations.FUNCTION:
            # look each value up in the map, falling back to default_value for unknown inputs
            lambda df: np.vectorize(lambda v: value_map.get(v, default_value))(df[field])
    }
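# Usage sketch (hedged): recode a raw string column into numbers, with -1 for unseen values.
# As above, the DerivedFeature wiring and field names are illustrative assumptions.
# size_code = DerivedFeature(
#     feature=RealNumericFeature(name='size_code'),
#     **DerivedFeatureTransformations.map_values('size', {'S': 0, 'M': 1, 'L': 2}, default_value=-1)
# )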
def model_verification(self, verification_data):
    """
    Build a model verification dataset
    :param verification_data: list of dictionaries
    :return: ModelVerification element
    """
    verification_data = pd.DataFrame(verification_data)
    fields = self.context.schemas[self.SCHEMA_INPUT] + self.context.schemas[self.SCHEMA_OUTPUT]
    assert len(verification_data) > 0, 'Verification data can not be empty'
    assert len(verification_data.columns) == len(fields), \
        'Number of fields in verification data should match the input and output schema fields'
    mv = pmml.ModelVerification(recordCount=len(verification_data), fieldCount=len(verification_data.columns))

    # step one: build verification schema
    verification_fields = pmml.VerificationFields()
    for f in fields:
        if isinstance(f, NumericFeature):
            vf = pmml.VerificationField(field=f.name, column=f.name, precision=self.EPSILON)
        else:
            vf = pmml.VerificationField(field=f.name, column=f.name)
        verification_fields.append(vf)
    mv.append(verification_fields)

    # step two: build data table
    it = pmml.InlineTable()
    for _, data in verification_data.iterrows():
        row = pmml.row()
        row_empty = True
        for key in verification_data.columns:
            # a missing (NaN) value is encoded by omitting the column element
            if verification_data[key].dtype == object or not np.isnan(data[key]):
                col = bds().createChildElement(key)
                bds().appendTextChild(data[key], col)
                row.append(col)
                row_empty = False
        if not row_empty:
            it.append(row)
    mv.append(it)
    return mv
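# Usage sketch (hedged): this variant expects every input *and* output column to be present in
# the verification records; the converter instance and field names are illustrative.
# mv = converter.model_verification([
#     {'x1': 0, 'x2': 'zero', 'output': 'neg'},
#     {'x1': 1, 'x2': 'one', 'output': 'pos'},
# ])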
def test_transform_with_derived_field(self):
    self.est = DecisionTreeClassifier(max_depth=2)
    self.est.fit([
        [0, 0, 0],
        [0, 1, 0],
        [1, 0, 0],
        [1, 1, 1],
    ], [0, 1, 1, 1])
    mapping = pmml.MapValues(dataType="double", outputColumn="output")
    mapping.append(pmml.FieldColumnPair(column="x1", field="x1"))
    mapping.append(pmml.FieldColumnPair(column="x2", field="x2"))
    it = pmml.InlineTable()
    it.append(pmml_row(x1=0, x2='zero', output=0))
    it.append(pmml_row(x1=0, x2='one', output=0))
    it.append(pmml_row(x1=1, x2='zero', output=0))
    it.append(pmml_row(x1=1, x2='one', output=1))
    mapping.append(it)
    self.ctx = TransformationContext({
        Schema.INPUT: [
            IntegerNumericFeature('x1'),
            StringCategoricalFeature('x2', ['zero', 'one'])
        ],
        Schema.DERIVED: [
            DerivedFeature(feature=RealNumericFeature(name='x3'), transformation=mapping)
        ],
        Schema.MODEL: [
            IntegerNumericFeature('x1'),
            StringCategoricalFeature('x2', ['zero', 'one']),
            RealNumericFeature(name='x3')
        ],
        Schema.OUTPUT: [
            IntegerCategoricalFeature('output', ['neg', 'pos'])
        ]
    })
    self.converter = DecisionTreeConverter(
        estimator=self.est,
        context=self.ctx,
        mode=ModelMode.CLASSIFICATION
    )
    self.converter.pmml().toxml()
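# Note: the inline table above encodes x3 as "x1 AND (x2 == 'one')", which matches the third
# column of the training matrix, so the derived field reproduces the feature the tree was fitted on.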
def model_verification(self, verification_data):
    """
    Use the input verification_data, apply the transformations, evaluate the model response
    and produce the ModelVerification element
    :param verification_data: list of dictionaries or data frame
    :type verification_data: dict[str, object]|pd.DataFrame
    :return: ModelVerification element
    """
    verification_data = pd.DataFrame(verification_data)
    assert len(verification_data) > 0, 'Verification data can not be empty'
    verification_input = pd.DataFrame(index=verification_data.index)
    verification_model_input = pd.DataFrame(index=verification_data.index)

    for key in self.context.schemas[Schema.INPUT]:
        # all input features MUST be present in the verification_data
        assert key.full_name in verification_data.columns, 'Missing input field "{}"'.format(key.full_name)
        verification_input[Schema.INPUT.extract_feature_name(key)] = verification_data[key.full_name]
        if isinstance(key, CategoricalFeature):
            verification_model_input[Schema.INPUT.extract_feature_name(key)] = \
                np.vectorize(key.to_number)(verification_data[key.full_name])
        else:
            verification_model_input[Schema.INPUT.extract_feature_name(key)] = \
                verification_data[key.full_name]

    for key in self.context.schemas[Schema.DERIVED]:
        assert isinstance(key, DerivedFeature), 'Only DerivedFeatures are allowed in the DERIVED schema'
        verification_model_input[key.full_name] = key.apply(verification_input)

    # at this point we can check that the MODEL schema contains only known features
    for key in self.context.schemas[Schema.MODEL]:
        assert Schema.MODEL.extract_feature_name(key) in verification_model_input.columns, \
            'Unknown feature "{}" in the MODEL schema'.format(key.full_name)

    # TODO: we can actually support multiple columns, but need to figure out the way to extract the data
    # TODO: from the estimator properly

    # building model results
    assert len(self.context.schemas[Schema.OUTPUT]) == 1, 'Only one output is currently supported'
    key = self.context.schemas[Schema.OUTPUT][0]
    model_input = verification_model_input[
        list(map(Schema.MODEL.extract_feature_name, self.context.schemas[Schema.MODEL]))
    ].values
    model_results = np.vectorize(key.from_number)(self.estimator.predict(X=model_input))
    if key.full_name in verification_data:
        # make sure that if results are provided, the expected and actual values are equal
        assert_equal(key, model_results, verification_data[key.full_name].values)
    verification_input[Schema.OUTPUT.extract_feature_name(key)] = model_results

    if isinstance(key, CategoricalFeature):
        probabilities = self.estimator.predict_proba(X=model_input)
        for i, key in enumerate(self.context.schemas[Schema.CATEGORIES]):
            verification_input[Schema.CATEGORIES.extract_feature_name(key)] = probabilities[:, i]

    fields = []
    field_names = []
    for s in [Schema.INPUT, Schema.OUTPUT, Schema.CATEGORIES]:
        fields += self.context.schemas[s]
        field_names += list(map(s.extract_feature_name, self.context.schemas[s]))

    mv = pmml.ModelVerification(recordCount=len(verification_input), fieldCount=len(fields))

    # step one: build verification schema
    verification_fields = pmml.VerificationFields()
    for key in fields:
        if isinstance(key, NumericFeature):
            vf = pmml.VerificationField(field=key.name, column=key.name, precision=self.EPSILON)
        else:
            vf = pmml.VerificationField(field=key.name, column=key.name)
        verification_fields.append(vf)
    mv.append(verification_fields)

    # step two: build data table
    it = pmml.InlineTable()
    for _, data in verification_input.iterrows():
        row = pmml.row()
        row_empty = True
        for key in field_names:
            # a missing (NaN) value is encoded by omitting the column element
            if verification_input[key].dtype == object or not np.isnan(data[key]):
                col = bds().createChildElement(key)
                bds().appendTextChild(data[key], col)
                row.append(col)
                row_empty = False
        if not row_empty:
            it.append(row)
    mv.append(it)
    return mv
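# Usage sketch (hedged): unlike the earlier variant, only the raw INPUT columns are required here;
# the output (and, for categorical targets, the per-class probabilities) is computed from the
# wrapped estimator, and an expected output column is checked against the prediction if supplied.
# The converter instance and field names are illustrative.
# mv = converter.model_verification([
#     {'x1': 0, 'x2': 'zero'},
#     {'x1': 1, 'x2': 'one'},
# ])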