def pmml_row(**columns):
    """
    Assemble a pmml.row element from keyword arguments

    Every keyword becomes one child element named after the key; the matching value is attached to that child
    through bds().appendTextChild. Children are appended in the iteration order of the keyword dictionary.

    :param columns: column name -> cell value pairs
    :return: the populated pmml.row element
    """
    row = pmml.row()
    for column_name in columns:
        cell = bds().createChildElement(column_name)
        bds().appendTextChild(columns[column_name], cell)
        row.append(cell)
    return row
def model_verification(self, verification_data):
    """
    Assemble a ModelVerification element from ready-made verification records

    The element holds a VerificationFields section (one VerificationField per feature of the input and output
    schemas; numeric features additionally get precision=self.EPSILON) followed by an InlineTable with one row
    per record. A cell is left out when its column is not of object dtype and the value is NaN; a record that
    ends up with no cells at all is not added to the table. recordCount is always the number of records in the
    data, regardless of how many rows were actually written.

    :param verification_data: records convertible with pd.DataFrame (e.g. list of dictionaries)
    :return: ModelVerification element
    :raises AssertionError: if there are no records, or the number of columns differs from the number of
        input plus output schema fields
    """
    frame = pd.DataFrame(verification_data)
    input_fields = self.context.schemas[self.SCHEMA_INPUT]
    output_fields = self.context.schemas[self.SCHEMA_OUTPUT]
    fields = input_fields + output_fields

    record_count = len(frame)
    assert record_count > 0, 'Verification data can not be empty'
    column_names = frame.columns
    column_count = len(column_names)
    assert column_count == len(fields), \
        'Number of fields in validation data should match to input and output schema fields'

    verification = pmml.ModelVerification(recordCount=record_count, fieldCount=column_count)

    # part one: describe every schema field
    schema_section = pmml.VerificationFields()
    for feature in fields:
        attributes = {'field': feature.name, 'column': feature.name}
        if isinstance(feature, NumericFeature):
            # only numeric fields carry a precision attribute
            attributes['precision'] = self.EPSILON
        schema_section.append(pmml.VerificationField(**attributes))
    verification.append(schema_section)

    # part two: one table row per record
    table = pmml.InlineTable()
    for _, record in frame.iterrows():
        row = pmml.row()
        has_cells = False
        for name in column_names:
            is_object_column = frame[name].dtype == object
            # np.isnan is only evaluated for non-object columns
            if not is_object_column and np.isnan(record[name]):
                continue
            cell = bds().createChildElement(name)
            bds().appendTextChild(record[name], cell)
            row.append(cell)
            has_cells = True
        if has_cells:
            table.append(row)
    verification.append(table)
    return verification
def model_verification(self, verification_data):
    """
    Turn pre-computed verification records into a ModelVerification element

    Output layout: first a VerificationFields block listing every feature of the input schema followed by every
    feature of the output schema (numeric features are written with precision=self.EPSILON), then an InlineTable
    with the record values. NaN values of non-object columns produce no cell, and records without any cell are
    dropped from the table; the recordCount attribute still reflects the total number of records passed in.

    :param verification_data: list of dictionaries (or anything else pd.DataFrame accepts)
    :return: ModelVerification element
    :raises AssertionError: when the data is empty or its column count does not equal the number of input and
        output schema fields
    """
    records = pd.DataFrame(verification_data)
    schema_inputs = self.context.schemas[self.SCHEMA_INPUT]
    schema_outputs = self.context.schemas[self.SCHEMA_OUTPUT]
    fields = schema_inputs + schema_outputs

    total_records = len(records)
    assert total_records > 0, "Verification data can not be empty"
    total_columns = len(records.columns)
    assert total_columns == len(
        fields
    ), "Number of fields in validation data should match to input and output schema fields"

    result = pmml.ModelVerification(recordCount=total_records, fieldCount=total_columns)

    # verification schema: one entry per schema feature
    field_list = pmml.VerificationFields()
    for feature in fields:
        if not isinstance(feature, NumericFeature):
            entry = pmml.VerificationField(field=feature.name, column=feature.name)
        else:
            entry = pmml.VerificationField(field=feature.name, column=feature.name, precision=self.EPSILON)
        field_list.append(entry)
    result.append(field_list)

    # data table: one row per record, skipping NaN cells and cell-less rows
    inline_table = pmml.InlineTable()
    for _index, values in records.iterrows():
        table_row = pmml.row()
        written = 0
        for column in records.columns:
            # object columns are always written; np.isnan is consulted for the rest only
            keep = records[column].dtype == object or not np.isnan(values[column])
            if keep:
                cell = bds().createChildElement(column)
                bds().appendTextChild(values[column], cell)
                table_row.append(cell)
                written += 1
        if written:
            inline_table.append(table_row)
    result.append(inline_table)
    return result
def model_verification(self, verification_data):
    """
    Run the verification samples through the feature transformations and the estimator and wrap the inputs
    together with the computed outputs into a ModelVerification element

    Steps performed:
      1. every INPUT schema feature is copied from the samples (categorical ones are additionally converted with
         their to_number for the estimator input)
      2. every DERIVED schema feature is computed with its apply() from the raw input values
      3. the MODEL schema is checked to reference only columns produced by steps 1 and 2
      4. the estimator predicts on the MODEL columns and the result is mapped through from_number of the single
         OUTPUT feature; if the samples contain the expected output it is compared with assert_equal
      5. for a categorical output, predict_proba columns are stored for the CATEGORIES features by position
      6. the VerificationFields and InlineTable sections are written for INPUT, OUTPUT and CATEGORIES

    :param verification_data: list of dictionaries or data frame
    :type verification_data: dict[str, object]|pd.DataFrame
    :return: ModelVerification element
    :raises AssertionError: on empty data, a missing input column, a non-DerivedFeature in the DERIVED schema,
        an unknown MODEL feature, or an OUTPUT schema that does not hold exactly one feature
    """
    samples = pd.DataFrame(verification_data)
    assert len(samples) > 0, 'Verification data can not be empty'

    schemas = self.context.schemas
    # reported_frame: values that end up in the verification table
    # estimator_frame: values that are fed to the estimator
    reported_frame = pd.DataFrame(index=samples.index)
    estimator_frame = pd.DataFrame(index=samples.index)

    for feature in schemas[Schema.INPUT]:
        source_name = feature.full_name
        # all input features MUST be present in the verification data
        assert source_name in samples.columns, 'Missing input field "{}"'.format(source_name)
        target_name = Schema.INPUT.extract_feature_name(feature)
        reported_frame[target_name] = samples[source_name]
        if isinstance(feature, CategoricalFeature):
            estimator_values = np.vectorize(feature.to_number)(samples[source_name])
        else:
            estimator_values = samples[source_name]
        estimator_frame[target_name] = estimator_values

    for feature in schemas[Schema.DERIVED]:
        assert isinstance(feature, DerivedFeature), 'Only DerivedFeatures are allowed in the DERIVED schema'
        estimator_frame[feature.full_name] = feature.apply(reported_frame)

    # at this point we can check that MODEL schema contains only known features
    for feature in schemas[Schema.MODEL]:
        assert Schema.MODEL.extract_feature_name(feature) in estimator_frame.columns, \
            'Unknown feature "{}" in the MODEL schema'.format(feature.full_name)

    # TODO: we can actually support multiple columns, but need to figure out the way to extract the data
    # TODO: from the estimator properly
    # building model results
    assert len(schemas[Schema.OUTPUT]) == 1, 'Only one output is currently supported'
    output_feature = schemas[Schema.OUTPUT][0]
    model_column_name = Schema.MODEL.extract_feature_name
    model_columns = [model_column_name(feature) for feature in schemas[Schema.MODEL]]
    model_input = estimator_frame[model_columns].values
    decode_output = np.vectorize(output_feature.from_number)
    model_results = decode_output(self.estimator.predict(X=model_input))
    if output_feature.full_name in samples:
        # make sure that if results are provided, the expected and actual values are equal
        assert_equal(output_feature, model_results, samples[output_feature.full_name].values)
    reported_frame[Schema.OUTPUT.extract_feature_name(output_feature)] = model_results

    if isinstance(output_feature, CategoricalFeature):
        probabilities = self.estimator.predict_proba(X=model_input)
        for position, category in enumerate(schemas[Schema.CATEGORIES]):
            reported_frame[Schema.CATEGORIES.extract_feature_name(category)] = probabilities[:, position]

    fields = []
    field_names = []
    for schema in (Schema.INPUT, Schema.OUTPUT, Schema.CATEGORIES):
        members = schemas[schema]
        fields.extend(members)
        field_names.extend(map(schema.extract_feature_name, members))

    verification = pmml.ModelVerification(recordCount=len(reported_frame), fieldCount=len(fields))

    # step one: build verification schema
    schema_section = pmml.VerificationFields()
    for feature in fields:
        attributes = {'field': feature.name, 'column': feature.name}
        if isinstance(feature, NumericFeature):
            # only numeric fields carry a precision attribute
            attributes['precision'] = self.EPSILON
        schema_section.append(pmml.VerificationField(**attributes))
    verification.append(schema_section)

    # step two: build data table
    table = pmml.InlineTable()
    for _, record in reported_frame.iterrows():
        row = pmml.row()
        has_cells = False
        for name in field_names:
            is_object_column = reported_frame[name].dtype == object
            # NaN values of non-object columns produce no cell
            if not is_object_column and np.isnan(record[name]):
                continue
            cell = bds().createChildElement(name)
            bds().appendTextChild(record[name], cell)
            row.append(cell)
            has_cells = True
        if has_cells:
            table.append(row)
    verification.append(table)
    return verification
def model_verification(self, verification_data):
    """
    Evaluate the estimator on the given verification samples and produce the ModelVerification element that
    records the inputs next to the outputs the model actually returned

    The samples must provide a column (named by full_name) for every INPUT schema feature. DERIVED features are
    computed from those inputs, the estimator is called on the columns listed in the MODEL schema, and its
    prediction is converted with from_number of the single OUTPUT feature. When the samples also carry the
    expected output column, it is checked against the prediction with assert_equal. For a categorical output the
    predict_proba columns are stored for the CATEGORIES features in schema order. The resulting table contains
    the INPUT, OUTPUT and CATEGORIES columns; NaN values of non-object columns are omitted.

    :param verification_data: list of dictionaries or data frame
    :type verification_data: dict[str, object]|pd.DataFrame
    :return: ModelVerification element
    :raises AssertionError: if the data is empty, an input column is missing, the DERIVED schema holds something
        other than a DerivedFeature, the MODEL schema names an unknown feature, or there is not exactly one output
    """
    data_frame = pd.DataFrame(verification_data)
    assert len(data_frame) > 0, 'Verification data can not be empty'

    raw_inputs = pd.DataFrame(index=data_frame.index)
    encoded_inputs = pd.DataFrame(index=data_frame.index)
    for input_feature in self.context.schemas[Schema.INPUT]:
        full_name = input_feature.full_name
        # all input features MUST be present in the verification data
        assert full_name in data_frame.columns, 'Missing input field "{}"'.format(
            full_name)
        short_name = Schema.INPUT.extract_feature_name(input_feature)
        raw_inputs[short_name] = data_frame[full_name]
        # categorical values are converted with to_number before they are handed to the estimator
        encoded_inputs[short_name] = (
            np.vectorize(input_feature.to_number)(data_frame[full_name])
            if isinstance(input_feature, CategoricalFeature)
            else data_frame[full_name]
        )

    for derived_feature in self.context.schemas[Schema.DERIVED]:
        assert isinstance(
            derived_feature, DerivedFeature
        ), 'Only DerivedFeatures are allowed in the DERIVED schema'
        encoded_inputs[derived_feature.full_name] = derived_feature.apply(raw_inputs)

    # at this point we can check that MODEL schema contains only known features
    for model_feature in self.context.schemas[Schema.MODEL]:
        assert Schema.MODEL.extract_feature_name(model_feature) in encoded_inputs.columns, \
            'Unknown feature "{}" in the MODEL schema'.format(model_feature.full_name)

    # TODO: we can actually support multiple columns, but need to figure out the way to extract the data
    # TODO: from the estimator properly
    # building model results
    assert len(self.context.schemas[
        Schema.OUTPUT]) == 1, 'Only one output is currently supported'
    target = self.context.schemas[Schema.OUTPUT][0]
    estimator_columns = list(
        map(Schema.MODEL.extract_feature_name, self.context.schemas[Schema.MODEL]))
    model_input = encoded_inputs[estimator_columns].values
    to_output_value = np.vectorize(target.from_number)
    model_results = to_output_value(self.estimator.predict(X=model_input))
    if target.full_name in data_frame:
        # make sure that if results are provided, the expected and actual values are equal
        expected_results = data_frame[target.full_name].values
        assert_equal(target, model_results, expected_results)
    raw_inputs[Schema.OUTPUT.extract_feature_name(target)] = model_results

    if isinstance(target, CategoricalFeature):
        probabilities = self.estimator.predict_proba(X=model_input)
        for column_index, category_feature in enumerate(self.context.schemas[Schema.CATEGORIES]):
            category_name = Schema.CATEGORIES.extract_feature_name(category_feature)
            raw_inputs[category_name] = probabilities[:, column_index]

    fields = []
    field_names = []
    for schema in [Schema.INPUT, Schema.OUTPUT, Schema.CATEGORIES]:
        fields.extend(self.context.schemas[schema])
        field_names.extend(
            map(schema.extract_feature_name, self.context.schemas[schema]))

    result = pmml.ModelVerification(recordCount=len(raw_inputs),
                                    fieldCount=len(fields))

    # step one: build verification schema
    field_list = pmml.VerificationFields()
    for feature in fields:
        if not isinstance(feature, NumericFeature):
            entry = pmml.VerificationField(field=feature.name, column=feature.name)
        else:
            entry = pmml.VerificationField(field=feature.name,
                                           column=feature.name,
                                           precision=self.EPSILON)
        field_list.append(entry)
    result.append(field_list)

    # step two: build data table
    inline_table = pmml.InlineTable()
    for _index, values in raw_inputs.iterrows():
        table_row = pmml.row()
        written = 0
        for column in field_names:
            # object columns are always written; np.isnan is consulted for the rest only
            keep = raw_inputs[column].dtype == object or not np.isnan(values[column])
            if keep:
                cell = bds().createChildElement(column)
                bds().appendTextChild(values[column], cell)
                table_row.append(cell)
                written += 1
        if written:
            inline_table.append(table_row)
    result.append(inline_table)
    return result