def transform_ct(x_in, model, encoding):
    """
    Apply a ColumnTransformer (or a list-based encoding) to a raw dataset.

    A ColumnTransformer horizontally stacks the outputs of its transformers,
    so the resulting columns are identified by position rather than by their
    original names. The output columns are therefore either renamed
    ``col_<position>`` or named after the features extracted from the model,
    depending on the model family.

    Parameters
    ----------
    x_in : pandas.DataFrame
        Raw dataset to apply preprocessing to.
    model : model object
        Model whose type selects how the output columns are named.
    encoding : ColumnTransformer or list
        Fitted ColumnTransformer, or a list that is handed over to
        ``transform_ordinal``.

    Returns
    -------
    pandas.DataFrame
        The preprocessed data, indexed like ``x_in`` in the
        ColumnTransformer case.

    Raises
    ------
    ValueError
        If ``encoding`` is a ColumnTransformer but the model type is in
        neither ``sklearn_model`` nor ``other_model``.
    Exception
        If ``encoding`` is neither a ColumnTransformer nor a list.
    """
    encoding_type = str(type(encoding))

    if encoding_type == columntransformer:
        model_type = str(type(model))

        if model_type in sklearn_model:
            # Positional naming: the transformer output carries no usable column names.
            transformed = pd.DataFrame(encoding.transform(x_in), index=x_in.index)
            transformed.columns = [f"col_{position!s}" for position in transformed.columns]
            return transformed

        if model_type in other_model:
            # Transform first, then ask the model for its feature names.
            encoded_values = encoding.transform(x_in)
            model_columns = extract_features_model(model, dict_model_feature[model_type])
            return pd.DataFrame(encoded_values, columns=model_columns, index=x_in.index)

        raise ValueError("Model specified isn't supported by Shapash.")

    if encoding_type == "<class 'list'>":
        return transform_ordinal(x_in, encoding)

    raise Exception(f"{encoding.__class__.__name__} not supported, no preprocessing done.")
def check_consistency_model_features(features_dict, model, columns_dict, features_types,
                                     mask_params=None, preprocessing=None, postprocessing=None,
                                     list_preprocessing=None, features_groups=None):
    """
    Check that the feature-related attributes agree with each other and with the model.

    Parameters
    ----------
    features_dict: dict
        Dictionary mapping technical feature names to domain names. May be None,
        in which case it is not checked.
    model: model object
        Model whose expected features are obtained through ``extract_features_model``.
    columns_dict: dict
        Dictionary mapping integer column number (in the same order of the trained dataset)
        to technical feature names.
    features_types: dict
        Dictionary mapping features with the right types needed.
    mask_params: dict (optional)
        Dictionary allowing the user to define a filter to summarize the local
        explainability. Its ``'features_to_hide'`` entry is checked when not None.
    preprocessing: category_encoders, ColumnTransformer, list or dict (optional)
        The processing applied to the original data.
    postprocessing : dict (optional)
        Dictionary of postprocessing that needs to be checked.
    list_preprocessing: list (optional)
        List containing all preprocessing; forwarded to ``get_list_features_names``.
    features_groups: dict (optional)
        Groups of features, keyed by group name. Group names found in
        ``features_dict`` are excluded from the checks.

    Raises
    ------
    ValueError
        If any of the consistency checks fails.
    """
    # Features dict can include additional entries for groups of features.
    # We don't want to check them here as they may not be in the other dicts.
    # The filtering happens inside the None guard (features_dict may be None
    # even when features_groups is given) and never mutates the caller's dict.
    if features_dict is not None:
        group_names = features_groups if features_groups is not None else ()
        features_to_check = [feat for feat in features_dict if feat not in group_names]
        if not all(feat in features_types for feat in features_to_check):
            raise ValueError("All features of features_dict must be in features_types")

    column_names = set(columns_dict.values())
    typed_features = set(features_types)
    if typed_features != column_names:
        raise ValueError("features of features_types and columns_dict must be the same")

    if mask_params is not None:
        hidden_features = mask_params['features_to_hide']
        if hidden_features is not None:
            if not all(feature in typed_features for feature in hidden_features):
                raise ValueError("All features of mask_params must be in model")

    preprocessing_type = str(type(preprocessing))
    is_category_encoder = preprocessing_type in supported_category_encoder

    if preprocessing is not None and is_category_encoder:
        if not all(feature in column_names for feature in set(preprocessing.cols)):
            raise ValueError("All features of preprocessing must be in columns_dict")

    # extract_features_model yields either a list of feature names or a single
    # value that is compared against feature counts below.
    model_features = extract_features_model(model, dict_model_feature[str(type(model))])
    if isinstance(model_features, list):
        feature_expected_model = model_features
        model_expected = len(set(model_features))
    else:
        feature_expected_model = None
        model_expected = model_features

    if preprocessing is None:
        if isinstance(feature_expected_model, list):
            if column_names != set(feature_expected_model):
                # Retry with stringified names: the model may report its
                # features as strings while columns_dict holds other types.
                columns_dict_feature = {str(feature) for feature in columns_dict.values()}
                if columns_dict_feature != set(feature_expected_model):
                    raise ValueError("Features of columns_dict and model must be the same.")
        else:
            if len(column_names) != model_expected:
                raise ValueError("Features of columns_dict and model must have the same length")

    if is_category_encoder and isinstance(feature_expected_model, list):
        if set(preprocessing.feature_names) != set(feature_expected_model):
            raise ValueError("""
            One of features returned by the Category_Encoders preprocessing doesn't
            match the model's expected features.
            """)
    elif preprocessing is not None:
        feature_encoded = get_list_features_names(list_preprocessing, columns_dict)
        if model_expected != len(feature_encoded):
            raise ValueError("""
            Number of features returned by the preprocessing step doesn't
            match the model's expected features.
            """)

    if postprocessing:
        if not isinstance(postprocessing, dict):
            raise ValueError("Postprocessing parameter must be a dictionnary")
        for feature in postprocessing:
            if feature not in features_types:
                raise ValueError("Postprocessing and features_types must have the same features names.")
            if feature not in column_names:
                raise ValueError("Postprocessing and columns_dict must have the same features names.")
        check_postprocessing(features_types, postprocessing)
def _encoder_type_in(type_name, *groups):
    """Return True if ``type_name`` equals a string group or belongs to a collection group."""
    for group in groups:
        if isinstance(group, str):
            if type_name == group:
                return True
        elif type_name in group:
            return True
    return False


def check_consistency_model_features(features_dict, model, columns_dict, features_types,
                                     mask_params=None, preprocessing=None, postprocessing=None):
    """
    Check that the feature-related attributes agree with each other and with the model.

    Parameters
    ----------
    features_dict: dict
        Dictionary mapping technical feature names to domain names. May be None,
        in which case it is not checked.
    model: model object
        Model whose expected features are obtained through ``extract_features_model``.
    columns_dict: dict
        Dictionary mapping integer column number (in the same order of the trained dataset)
        to technical feature names.
    features_types: dict
        Dictionary mapping features with the right types needed.
    mask_params: dict (optional)
        Dictionary allowing the user to define a filter to summarize the local
        explainability. Its ``'features_to_hide'`` entry is checked when not None.
    preprocessing: category_encoders, ColumnTransformer, list or dict (optional)
        The processing applied to the original data.
    postprocessing : dict (optional)
        Dictionary of postprocessing that needs to be checked.

    Raises
    ------
    ValueError
        If any of the consistency checks fails, or if the preprocessing type
        is not one of the supported encoder families.
    """
    if features_dict is not None:
        if not all(feat in features_types for feat in features_dict):
            raise ValueError("All features of features_dict must be in features_types")

    column_names = set(columns_dict.values())
    typed_features = set(features_types)
    if typed_features != column_names:
        raise ValueError("features of features_types and model must be the same")

    if mask_params is not None:
        hidden_features = mask_params['features_to_hide']
        if hidden_features is not None:
            if not all(feature in typed_features for feature in hidden_features):
                raise ValueError("All features of mask_params must be in model")

    preprocessing_type = str(type(preprocessing))

    # The encoder groups are collections of type strings, so membership has to
    # be tested against each group, not against a tuple of the groups.
    if preprocessing is not None and _encoder_type_in(
            preprocessing_type, supported_category_encoder, supported_sklearn):
        # Not every supported encoder is guaranteed to expose ``cols``;
        # only validate it when present.
        encoder_cols = getattr(preprocessing, "cols", None)
        if encoder_cols is not None:
            if not all(feature in column_names for feature in set(encoder_cols)):
                raise ValueError("All features of preprocessing must be in columns_dict")

    model_features = extract_features_model(model, dict_model_feature[str(type(model))])

    if isinstance(model_features, list):
        if _encoder_type_in(preprocessing_type, no_dummies_category_encoder):
            if column_names != set(model_features):
                raise ValueError("features of columns_dict and model must be the same")
        elif _encoder_type_in(preprocessing_type, no_dummies_sklearn, columntransformer):
            if len(column_names) != len(set(model_features)):
                raise ValueError("length of features of columns_dict and model must be the same")
        elif preprocessing is not None:
            # Any other non-None preprocessing belongs to no supported family.
            raise ValueError("this type of encoder is not supported in SmartPredictor")
    else:
        # model_features is not a list here; it is compared to the column count.
        if len(column_names) != model_features:
            raise ValueError("features of columns_dict and model must have the same length")

    if postprocessing:
        if not isinstance(postprocessing, dict):
            raise ValueError("Postprocessing parameter must be a dictionnary")
        for feature in postprocessing:
            if feature not in features_types:
                raise ValueError("Postprocessing and features_types must have the same features names.")
            if feature not in column_names:
                raise ValueError("Postprocessing and columns_dict must have the same features names.")
        check_postprocessing(features_types, postprocessing)