def convert(model, features, target):
    """Convert a LinearSVR model to the protobuf spec.

    Parameters
    ----------
    model: LinearSVR
        A trained LinearSVR model.

    features: [str]
        Name of the input columns.

    target: str
        Name of the output column.

    Returns
    -------
    model_spec: _MLModel
        The result of ``_linear_regression._convert`` wrapped in ``_MLModel``.

    Raises
    ------
    RuntimeError
        If scikit-learn is not available (``_HAS_SKLEARN`` is falsy).
    """
    if not _HAS_SKLEARN:
        raise RuntimeError(
            'scikit-learn not found. scikit-learn conversion API is disabled.')

    def _has_coefficients(m):
        # The model counts as fitted once it exposes `coef_`.
        return hasattr(m, 'coef_')

    # Validate the estimator type first, then that it has been fitted.
    _sklearn_util.check_expected_type(model, _LinearSVR)
    _sklearn_util.check_fitted(model, _has_coefficients)

    # The conversion itself is delegated to the linear regression converter.
    linear_spec = _linear_regression._convert(model, features, target)
    return _MLModel(linear_spec)
def convert(model, feature_names, target):
    """Convert a decision tree regressor to protobuf format.

    Parameters
    ----------
    model: DecisionTreeRegressor
        A trained scikit-learn tree model.

    feature_names: [str]
        Name of the input columns.

    target: str
        Name of the output column.

    Returns
    -------
    model_spec: _MLModel
        The result of ``_convert_tree_ensemble`` wrapped in ``_MLModel``.

    Raises
    ------
    RuntimeError
        If scikit-learn is not available (``_HAS_SKLEARN`` is falsy).
    """
    if not _HAS_SKLEARN:
        raise RuntimeError(
            'scikit-learn not found. scikit-learn conversion API is disabled.')

    _sklearn_util.check_expected_type(model, _tree.DecisionTreeRegressor)

    # The predicate inspects the object it is handed (`m`) rather than the
    # enclosing `model`, so both halves of the test look at the same object.
    _sklearn_util.check_fitted(
        model, lambda m: hasattr(m, 'tree_') and m.tree_ is not None)

    return _MLModel(_convert_tree_ensemble(model, feature_names, target))
def convert(model, input_name, output_features):
    """Convert a decision tree classifier to protobuf format.

    Parameters
    ----------
    model: DecisionTreeClassifier
        A trained scikit-learn tree model.

    input_name: str
        Name of the input columns.

    output_features: str
        Name of the output columns.

    Returns
    -------
    model_spec: _MLModel
        The result of ``convert_tree_ensemble`` (called in ``'classifier'``
        mode with ``model.classes_`` as class labels) wrapped in ``_MLModel``.

    Raises
    ------
    RuntimeError
        If scikit-learn is not available (``HAS_SKLEARN`` is falsy).
    """
    if not HAS_SKLEARN:
        raise RuntimeError(
            'scikit-learn not found. scikit-learn conversion API is disabled.')

    _sklearn_util.check_expected_type(model, _tree.DecisionTreeClassifier)

    # The predicate inspects the object it is handed (`m`) rather than the
    # enclosing `model`, so both halves of the test look at the same object.
    _sklearn_util.check_fitted(
        model, lambda m: hasattr(m, 'tree_') and m.tree_ is not None)

    return _MLModel(
        convert_tree_ensemble(model, input_name, output_features,
                              mode='classifier',
                              class_labels=model.classes_))
def get_input_dimension(model):
    """Return the input dimension of a fitted model exposing ``statistics_``.

    Parameters
    ----------
    model:
        A fitted model with a ``statistics_`` attribute (presumably a
        scikit-learn Imputer -- confirm against the caller).

    Returns
    -------
    int
        ``len(model.statistics_)``.

    Raises
    ------
    RuntimeError
        If scikit-learn is not available (``_HAS_SKLEARN`` is falsy).
    """
    if not _HAS_SKLEARN:
        raise RuntimeError(
            'scikit-learn not found. scikit-learn conversion API is disabled.')

    def _has_statistics(m):
        return hasattr(m, 'statistics_')

    _sklearn_util.check_fitted(model, _has_statistics)

    # One entry in `statistics_` is counted as one input column.
    n_inputs = len(model.statistics_)
    return n_inputs
def update_dimension(model, input_dimension):
    """Return the output dimension for a model fed ``input_dimension`` columns.

    The model must expose ``mean_`` and ``scale_``; the dimension is passed
    through unchanged.

    Parameters
    ----------
    model:
        A fitted model with ``mean_`` and ``scale_`` attributes.

    input_dimension:
        Dimension of the incoming array.

    Returns
    -------
    The ``input_dimension`` argument, unmodified.

    Raises
    ------
    RuntimeError
        If scikit-learn is not available (``_HAS_SKLEARN`` is falsy).
    """
    if not _HAS_SKLEARN:
        raise RuntimeError('scikit-learn not found. scikit-learn conversion API is disabled.')

    # Check the fitted attributes one at a time, in the same order as before.
    for fitted_attr in ('mean_', 'scale_'):
        _sklearn_util.check_fitted(
            model, lambda m, name=fitted_attr: hasattr(m, name))

    # Nothing to do for this model
    return input_dimension
def convert(model, input_features, output_features):
    """Convert an Imputer model to the protobuf spec.

    Parameters
    ----------
    model: Imputer
        A fitted Imputer model.

    input_features:
        Description of the input column. Must contain exactly one entry
        whose second element is a ``datatypes.Array``.

    output_features:
        Description of the output column.

    Returns
    -------
    model_spec: _MLModel
        The populated protobuf spec wrapped in ``_MLModel``.

    Raises
    ------
    RuntimeError
        If scikit-learn is not available (``_HAS_SKLEARN`` is falsy).
    ValueError
        If ``model.axis`` is not 0, or if ``model.missing_values`` cannot be
        converted to a float.
    """
    if not _HAS_SKLEARN:
        raise RuntimeError(
            'scikit-learn not found. scikit-learn conversion API is disabled.')

    # Start from an empty spec stamped with the specification version.
    spec = _Model_pb2.Model()
    spec.specificationVersion = SPECIFICATION_VERSION

    # Exactly one input column is accepted and it has to be an Array.
    assert len(input_features) == 1
    assert isinstance(input_features[0][1], datatypes.Array)

    # Set the interface params.
    spec = set_transform_interface_params(spec, input_features, output_features)

    # Test the scikit-learn model
    _sklearn_util.check_expected_type(model, Imputer)
    _sklearn_util.check_fitted(model, lambda m: hasattr(m, 'statistics_'))

    if model.axis != 0:
        raise ValueError("Imputation is only supported along axis = 0.")

    # Copy the per-column fill values into the imputer spec, one by one.
    imputer_spec = spec.imputer
    for fill_value in model.statistics_:
        imputer_spec.imputedDoubleArray.vector.append(fill_value)

    # The value that marks a missing entry must be expressible as a float.
    try:
        imputer_spec.replaceDoubleValue = float(model.missing_values)
    except ValueError:
        raise ValueError("Only scalar values or NAN as missing_values "
                         "in _imputer are supported.")

    return _MLModel(spec)
def convert(model, feature_names, target):
    """Convert a boosted tree classifier to protobuf format.

    Parameters
    ----------
    model: GradientBoostingClassifier
        A trained scikit-learn tree model.

    feature_names: [str]
        Name of the input columns.

    target: str
        Name of the output column.

    Returns
    -------
    model_spec: _MLModel
        The result of ``_convert_tree_ensemble`` (called in ``'classifier'``
        mode) wrapped in ``_MLModel``.

    Raises
    ------
    RuntimeError
        If scikit-learn is not available (``_HAS_SKLEARN`` is falsy).
    """
    if not _HAS_SKLEARN:
        raise RuntimeError(
            'scikit-learn not found. scikit-learn conversion API is disabled.')

    _sklearn_util.check_expected_type(model, _ensemble.GradientBoostingClassifier)

    def is_gbr_model(m):
        # Look the attribute up defensively *before* measuring it, so a
        # model without `estimators_` is reported as unfitted rather than
        # raising AttributeError from inside the predicate.
        estimators = getattr(m, 'estimators_', None)
        # Explicit length test: `estimators_` is flattened below, so it is
        # presumably a NumPy array, whose truth value would be ambiguous.
        if estimators is None or len(estimators) == 0:
            return False
        # Every individual estimator must carry a built tree.
        return all(getattr(t, 'tree_', None) is not None
                   for t in estimators.flatten())

    _sklearn_util.check_fitted(model, is_gbr_model)

    # Two classes use a single prior with a logistic transform; otherwise
    # the per-class priors are used with a soft-max transform.
    if model.n_classes_ == 2:
        base_prediction = [model.init_.prior]
        post_evaluation_transform = 'Regression_Logistic'
    else:
        base_prediction = list(model.init_.priors)
        post_evaluation_transform = 'Classification_SoftMax'

    return _MLModel(
        _convert_tree_ensemble(
            model, feature_names, target,
            mode='classifier',
            base_prediction=base_prediction,
            class_labels=model.classes_,
            post_evaluation_transform=post_evaluation_transform))
def get_input_dimension(model):
    """Return the input dimension of a fitted one-hot-encoder-like model.

    Parameters
    ----------
    model:
        A fitted model exposing ``active_features_``, ``n_values_``,
        ``categorical_features`` and ``feature_indices_``.

    Returns
    -------
    int or None
        ``len(model.feature_indices_) - 1`` when ``categorical_features`` is
        ``'all'``; otherwise ``None``.

    Raises
    ------
    RuntimeError
        If scikit-learn is not available (``_HAS_SKLEARN`` is falsy).
    """
    if not _HAS_SKLEARN:
        raise RuntimeError(
            'scikit-learn not found. scikit-learn conversion API is disabled.')

    # Check the fitted attributes one at a time, in the same order as before.
    for fitted_attr in ('active_features_', 'n_values_'):
        _sklearn_util.check_fitted(
            model, lambda m, name=fitted_attr: hasattr(m, name))

    if model.categorical_features == 'all':
        return len(model.feature_indices_) - 1

    # This can't actually be determined from the model as indices after the
    # rest of the categorical values don't seem to be tracked
    return None
def convert(model, input_features, output_features):
    """Convert a boosted tree regressor to protobuf format.

    Parameters
    ----------
    model: GradientBoostingRegressor
        A trained scikit-learn tree model.

    input_features: [str]
        Name of the input columns.

    output_features: str
        Name of the output column.

    Returns
    -------
    model_spec: _MLModel
        The result of ``_convert_tree_ensemble`` (called with
        ``model.init_.mean`` as the base prediction) wrapped in ``_MLModel``.

    Raises
    ------
    RuntimeError
        If scikit-learn is not available (``_HAS_SKLEARN`` is falsy).
    """
    if not _HAS_SKLEARN:
        raise RuntimeError(
            'scikit-learn not found. scikit-learn conversion API is disabled.')

    _sklearn_util.check_expected_type(model, _ensemble.GradientBoostingRegressor)

    def is_gbr_model(m):
        # Look the attribute up defensively *before* measuring it, so a
        # model without `estimators_` is reported as unfitted rather than
        # raising AttributeError from inside the predicate.
        estimators = getattr(m, 'estimators_', None)
        # Explicit length test: `estimators_` is flattened below, so it is
        # presumably a NumPy array, whose truth value would be ambiguous.
        if estimators is None or len(estimators) == 0:
            return False
        # Every individual estimator must carry a built tree.
        return all(getattr(t, 'tree_', None) is not None
                   for t in estimators.flatten())

    _sklearn_util.check_fitted(model, is_gbr_model)

    base_prediction = model.init_.mean

    return _MLModel(
        _convert_tree_ensemble(model, input_features, output_features,
                               base_prediction=base_prediction))
def convert(model, input_features, output_features):
    """Convert a normalizer model to the protobuf spec.

    Parameters
    ----------
    model: Normalizer
        A Normalizer.

    input_features: str
        Name of the input column.

    output_features: str
        Name of the output column.

    Returns
    -------
    model_spec: _MLModel
        The populated protobuf spec wrapped in ``_MLModel``.

    Raises
    ------
    RuntimeError
        If scikit-learn is not available (``_HAS_SKLEARN`` is falsy).
    """
    if not _HAS_SKLEARN:
        raise RuntimeError(
            'scikit-learn not found. scikit-learn conversion API is disabled.')

    # Test the scikit-learn model
    _sklearn_util.check_expected_type(model, Normalizer)
    _sklearn_util.check_fitted(model, lambda m: hasattr(m, 'norm'))

    # Set the interface params.
    spec = _Model_pb2.Model()
    spec.specificationVersion = SPECIFICATION_VERSION
    spec = _set_transform_interface_params(spec, input_features, output_features)

    # Set the normalizer parameters: translate the scikit-learn norm name to
    # the matching protobuf constant. An unrecognized norm leaves `normType`
    # untouched, exactly as the plain if/elif chain would.
    normalizer_spec = spec.normalizer
    norm_to_proto_attr = (('l1', 'L1'), ('l2', 'L2'), ('max', 'LMax'))
    for sklearn_norm, proto_attr in norm_to_proto_attr:
        if model.norm == sklearn_norm:
            normalizer_spec.normType = getattr(_proto__normalizer, proto_attr)
            break

    return _MLModel(spec)
def update_dimension(model, input_dimension):
    """
    Given a model that takes an array of dimension input_dimension, returns
    the output dimension.

    When ``model.categorical_features`` is ``'all'`` the result is
    ``len(model.active_features_)``. Otherwise the result is
    ``len(model.active_features_)`` plus
    ``input_dimension - len(model.n_values_)``.

    Raises
    ------
    RuntimeError
        If scikit-learn is not available (``_HAS_SKLEARN`` is falsy).
    """
    if not _HAS_SKLEARN:
        raise RuntimeError(
            'scikit-learn not found. scikit-learn conversion API is disabled.')

    # Check the fitted attributes one at a time, in the same order as before.
    for fitted_attr in ('active_features_', 'n_values_'):
        _sklearn_util.check_fitted(
            model, lambda m, name=fitted_attr: hasattr(m, name))

    if model.categorical_features == 'all':
        return len(model.active_features_)

    encoded_width = len(model.active_features_)
    remaining_width = input_dimension - len(model.n_values_)
    return encoded_width + remaining_width
def convert(model, feature_names, target):
    """Convert a random forest classifier to protobuf format.

    Parameters
    ----------
    model: RandomForestClassifier
        A trained scikit-learn tree model.

    feature_names: [str]
        Name of the input columns.

    target: str
        Name of the output column.

    Returns
    -------
    model_spec: _MLModel
        The result of ``_convert_tree_ensemble`` (called in ``'classifier'``
        mode with ``model.classes_`` as class labels) wrapped in ``_MLModel``.

    Raises
    ------
    RuntimeError
        If scikit-learn is not available (``_HAS_SKLEARN`` is falsy).
    """
    if not _HAS_SKLEARN:
        raise RuntimeError(
            'scikit-learn not found. scikit-learn conversion API is disabled.')

    _sklearn_util.check_expected_type(model, _ensemble.RandomForestClassifier)

    def is_rf_model(m):
        # Look the attribute up defensively *before* measuring it, so a
        # model without `estimators_` is reported as unfitted rather than
        # raising AttributeError from inside the predicate.
        estimators = getattr(m, 'estimators_', None)
        if estimators is None or len(estimators) == 0:
            return False
        # Every individual estimator must carry a built tree.
        return all(getattr(t, 'tree_', None) is not None
                   for t in estimators)

    _sklearn_util.check_fitted(model, is_rf_model)

    return _MLModel(
        _convert_tree_ensemble(model, feature_names, target,
                               mode='classifier',
                               class_labels=model.classes_))
def convert(model, input_features, output_features):
    """Convert a StandardScaler model to the protobuf spec.

    Parameters
    ----------
    model: StandardScaler
        A trained StandardScaler model.

    input_features: str
        Name of the input column.

    output_features: str
        Name of the output column.

    Returns
    -------
    model_spec: _MLModel
        The populated protobuf spec wrapped in ``_MLModel``.

    Raises
    ------
    RuntimeError
        If scikit-learn is not available (``_HAS_SKLEARN`` is falsy).
    """
    if not _HAS_SKLEARN:
        raise RuntimeError('scikit-learn not found. scikit-learn conversion API is disabled.')

    # Test the scikit-learn model
    _sklearn_util.check_expected_type(model, StandardScaler)
    for fitted_attr in ('mean_', 'scale_'):
        _sklearn_util.check_fitted(
            model, lambda m, name=fitted_attr: hasattr(m, name))

    # Set the interface params.
    spec = _Model_pb2.Model()
    spec.specificationVersion = SPECIFICATION_VERSION
    spec = _set_transform_interface_params(spec, input_features, output_features)

    # Set the parameters: the spec stores the negated mean as the shift and
    # the reciprocal of the scale as the multiplier.
    scaler_spec = spec.scaler
    for column_mean in model.mean_:
        scaler_spec.shiftValue.append(-column_mean)

    for column_scale in model.scale_:
        scaler_spec.scaleValue.append(1.0 / column_scale)

    return _MLModel(spec)
def convert(model, input_features, output_features):
    """Convert a one-hot-encoder model to the protobuf spec.

    The result is a pipeline: for every one-hot-encoded input column an
    array feature extractor followed by a per-column one-hot encoder, then
    (if any) a single array feature extractor for the remaining columns,
    and finally a feature vectorizer that packs everything into the output.

    Parameters
    ----------
    model: OneHotEncoder
        A trained one-hot encoder model.

    input_features:
        Description of the input column; the first entry is indexed as a
        ``(name, datatype)`` pair whose datatype exposes ``num_elements``.

    output_features:
        Description of the output column; the first entry is indexed as a
        ``(name, datatype)`` pair whose datatype exposes ``num_elements``.

    Returns
    -------
    model_spec: _MLModel
        The pipeline spec wrapped in ``_MLModel``.

    Raises
    ------
    RuntimeError
        If scikit-learn is not available (``_HAS_SKLEARN`` is falsy).
    """
    if not _HAS_SKLEARN:
        raise RuntimeError(
            'scikit-learn not found. scikit-learn conversion API is disabled.')

    # Make sure the model is fitted.
    _sklearn_util.check_expected_type(model, OneHotEncoder)
    _sklearn_util.check_fitted(model, lambda m: hasattr(m, 'active_features_'))
    _sklearn_util.check_fitted(model, lambda m: hasattr(m, 'n_values_'))

    input_dimension = get_input_dimension(model)

    if input_dimension is not None:
        # Make sure that our starting dimensions are correctly managed.
        assert len(input_features) == 1
        assert input_features[0][1] == datatypes.Array(input_dimension)

    # From here on, the declared input type decides the dimension.
    input_dimension = input_features[0][1].num_elements

    expected_output_dimension = update_dimension(model, input_dimension)
    assert output_features[0][1] == datatypes.Array(expected_output_dimension)

    # Create a pipeline that can do all of the subsequent feature extraction.
    feature_vectorizer_input_features = []
    feature_vectorizer_size_map = {}

    # Work out which input columns are encoded and, for each of them, its
    # position among the encoded columns.
    if model.categorical_features == 'all':
        _categorical_features = set(xrange(input_dimension))
        _cat_feature_idx_mapping = {i: i for i in xrange(input_dimension)}
    else:
        _categorical_features = set(model.categorical_features)
        _cat_feature_idx_mapping = {
            column: position
            for position, column
            in enumerate(sorted(model.categorical_features))}

    pline = Pipeline(input_features, output_features)

    def _lower_bound(values, target):
        # Binary search for the first position whose value is not less than
        # `target` (assumes `values` is sorted ascending).
        lo, remaining = 0, len(values)
        while remaining > 0:
            half = remaining // 2
            mid = lo + half
            if values[mid] < target:
                lo = mid + 1
                remaining -= half + 1
            else:
                remaining = half
        return lo

    # Track the overall packing index, which determines the output ordering.
    pack_idx = 0

    # First, go through all the columns that are encoded. The sklearn OHE puts
    # all of these first, regardless of their original ordering.
    for idx in xrange(input_dimension):
        f_name = "__OHE_%d__" % pack_idx

        if idx not in _categorical_features:
            continue

        # This input column is one hot encoded
        feature_extractor_spec = create_array_feature_extractor(
            input_features, f_name, idx, output_type='Int64')
        pline.add_model(feature_extractor_spec)

        _cat_feature_idx = _cat_feature_idx_mapping[idx]

        ohe_input_features = [(f_name, datatypes.Int64())]
        ohe_output_features = [(f_name, datatypes.Dictionary('Int64'))]

        # Create a one hot encoder per column
        o_spec = _Model_pb2.Model()
        o_spec.specificationVersion = SPECIFICATION_VERSION
        o_spec = set_transform_interface_params(
            o_spec, ohe_input_features, ohe_output_features)

        ohe_spec = o_spec.oneHotEncoder
        ohe_spec.outputSparse = True

        unknown_policy = ('ErrorOnUnknown'
                          if model.handle_unknown == 'error'
                          else 'IgnoreUnknown')
        ohe_spec.handleUnknown = _OHE_pb2.OneHotEncoder.HandleUnknown.Value(
            unknown_policy)

        # Here are the indices we are looking for: the range of encoded
        # feature indices that belongs to this column.
        f_idx_bottom = model.feature_indices_[_cat_feature_idx]
        f_idx_top = model.feature_indices_[_cat_feature_idx + 1]

        # Now find where in the active features list we should look.
        cat_feat_idx_bottom = _lower_bound(model.active_features_, f_idx_bottom)
        cat_feat_idx_top = _lower_bound(model.active_features_, f_idx_top)
        n_cat_values = cat_feat_idx_top - cat_feat_idx_bottom

        for i in range(cat_feat_idx_bottom, cat_feat_idx_top):
            # The actual categorical value is stored as an offset in the
            # active_features list.
            cat_idx = model.active_features_[i] - f_idx_bottom
            ohe_spec.int64Categories.vector.append(cat_idx)

        # Add the ohe to the pipeline
        pline.add_model(o_spec)

        # Add the result to the feature_vectorizer at the end.
        feature_vectorizer_input_features.append(
            (f_name, datatypes.Dictionary('Int64')))
        feature_vectorizer_size_map[f_name] = n_cat_values

        pack_idx += 1

    # Now go through all the columns that are not encoded as the sklearn OHE
    # puts these after the encoded ones. For speed, we can put these all in a
    # single ArrayFeatureExtractor
    pass_through_features = [
        column for column in xrange(input_dimension)
        if column not in _categorical_features]

    if pass_through_features:
        f_name = "__OHE_pass_through__"

        # These input columns are not one hot encoded
        feature_extractor_spec = create_array_feature_extractor(
            input_features, f_name, pass_through_features)
        pline.add_model(feature_extractor_spec)
        feature_vectorizer_input_features.append(
            (f_name, datatypes.Array(len(pass_through_features))))

    # Finally, add the feature vectorizer to the pipeline.
    output_feature_name = output_features[0][0]
    output_feature_dimension = output_features[0][1].num_elements

    fvec, _num_out_dim = create_feature_vectorizer(
        feature_vectorizer_input_features,
        output_feature_name,
        feature_vectorizer_size_map)

    # Make sure that the feature vectorizer output size matches the declared
    # output dimension.
    assert _num_out_dim == output_feature_dimension

    pline.add_model(fvec)

    return _MLModel(pline.spec)