def test_check_input_parameter_variables():
    """Check which `variables` inputs the helper returns and which it rejects."""
    # Lists of names or of integers (duplicates included) must come back equal.
    assert _check_input_parameter_variables(["var1", "var2", "var1"]) == [
        "var1",
        "var2",
        "var1",
    ]
    assert _check_input_parameter_variables([0, 1, 2, 3]) == [0, 1, 2, 3]

    # None and single str / int values must come back as they were given.
    assert _check_input_parameter_variables(None) is None
    assert _check_input_parameter_variables("var1") == "var1"
    assert _check_input_parameter_variables(0) == 0

    # Tuples, sets and dicts are expected to raise a ValueError.
    rejected_inputs = (
        ("var1", "var2"),
        {"var1", "var2"},
        {"var1": 1, "var2": 2},
    )
    for rejected in rejected_inputs:
        with pytest.raises(ValueError):
            assert _check_input_parameter_variables(rejected)
def __init__(
    self,
    estimator,
    scoring: str = "roc_auc",
    cv=3,
    threshold: Union[int, float, None] = None,
    variables: Variables = None,
    confirm_variables: bool = False,
):
    """
    Validate the threshold against the scoring metric and store the parameters.

    Parameters
    ----------
    estimator :
        Stored unchanged as ``self.estimator``.
    scoring : str, default="roc_auc"
        Metric name. Only "roc_auc" and "r2" trigger a range check on
        ``threshold``.
    cv : default=3
        Stored unchanged as ``self.cv``.
    threshold : int, float or None, default=None
        When given, must be numeric, within [0.5, 1] for "roc_auc" and within
        [0, 1] for "r2".
    variables :
        Passed through ``_check_input_parameter_variables`` before storing.
    confirm_variables : bool, default=False
        Forwarded to the parent class initialiser.

    Raises
    ------
    ValueError
        If ``threshold`` is not numeric or lies outside the range allowed for
        the chosen ``scoring``.
    """
    # Compare with None instead of relying on truthiness: a threshold of 0 is
    # falsy and would otherwise skip validation entirely.
    if threshold is not None:
        if not isinstance(threshold, (int, float)):
            raise ValueError("threshold can only be integer, float or None")

        if scoring == "roc_auc" and (threshold < 0.5 or threshold > 1):
            raise ValueError(
                "roc-auc score should vary between 0.5 and 1. Pick a "
                "threshold within this interval."
            )

        if scoring == "r2" and (threshold < 0 or threshold > 1):
            raise ValueError(
                "r2 takes values between -1 and 1. To select features the "
                "transformer considers the absolute value. Pick a threshold within "
                "0 and 1."
            )

    super().__init__(confirm_variables)

    self.variables = _check_input_parameter_variables(variables)
    self.estimator = estimator
    self.scoring = scoring
    self.threshold = threshold
    self.cv = cv
def __init__(
    self,
    estimator=RandomForestClassifier(),
    scoring: str = "roc_auc",
    cv: int = 3,
    threshold: Union[int, float, None] = None,
    variables: Variables = None,
):
    """
    Validate ``cv`` and ``threshold`` and store the parameters.

    Parameters
    ----------
    estimator : default=RandomForestClassifier()
        Stored unchanged as ``self.estimator``.
    scoring : str, default="roc_auc"
        Metric name. Only "roc_auc" and "r2" trigger a range check on
        ``threshold``.
    cv : int, default=3
        Must be an integer not smaller than 1.
    threshold : int, float or None, default=None
        When given, must be numeric, within [0.5, 1] for "roc_auc" and within
        [0, 1] for "r2".
    variables :
        Passed through ``_check_input_parameter_variables`` before storing.

    Raises
    ------
    ValueError
        If ``cv`` or ``threshold`` fail the checks described above.
    """
    # NOTE(review): the default estimator object is created once, when the
    # function is defined, and is shared by every instance relying on the
    # default.
    # NOTE(review): the message says "bigger than 1" but cv=1 passes this
    # check -- confirm which of the two is intended.
    if not isinstance(cv, int) or cv < 1:
        raise ValueError("cv can only take positive integers bigger than 1")

    # Compare with None instead of relying on truthiness: a threshold of 0 is
    # falsy and would otherwise skip validation entirely.
    if threshold is not None:
        if not isinstance(threshold, (int, float)):
            raise ValueError("threshold can only be integer, float or None")

        if scoring == "roc_auc" and (threshold < 0.5 or threshold > 1):
            raise ValueError(
                "roc-auc score should vary between 0.5 and 1. Pick a "
                "threshold within this interval."
            )

        if scoring == "r2" and (threshold < 0 or threshold > 1):
            raise ValueError(
                "r2 takes values between -1 and 1. To select features the "
                "transformer considers the absolute value. Pick a threshold within "
                "0 and 1."
            )

    self.variables = _check_input_parameter_variables(variables)
    self.estimator = estimator
    self.scoring = scoring
    self.threshold = threshold
    self.cv = cv
def __init__(
    self,
    tol: float = 0.05,
    n_categories: int = 10,
    max_n_categories: Optional[int] = None,
    variables: Union[None, int, str, List[Union[str, int]]] = None,
    replace_with: str = "Rare",
) -> None:
    """
    Validate and store the encoder's parameters.

    Parameters
    ----------
    tol : float, default=0.05
        Must lie between 0 and 1 (both included).
    n_categories : int, default=10
        Must be a non-negative integer.
    max_n_categories : int or None, default=None
        When given, must be a non-negative integer.
    variables :
        Passed through ``_check_input_parameter_variables`` before storing.
    replace_with : str, default="Rare"
        Must be a string.

    Raises
    ------
    ValueError
        If any parameter fails the checks described above.
    """
    if tol < 0 or tol > 1:
        raise ValueError("tol takes values between 0 and 1")

    # Check the type before comparing: comparing a non-numeric value with 0
    # first would raise TypeError instead of the intended ValueError.
    if not isinstance(n_categories, int) or n_categories < 0:
        raise ValueError("n_categories takes only positive integer numbers")

    if max_n_categories is not None:
        if not isinstance(max_n_categories, int) or max_n_categories < 0:
            raise ValueError("max_n_categories takes only positive integer numbers")

    if not isinstance(replace_with, str):
        raise ValueError("replace_with takes only strings as values.")

    self.tol = tol
    self.n_categories = n_categories
    self.max_n_categories = max_n_categories
    self.variables = _check_input_parameter_variables(variables)
    self.replace_with = replace_with
def __init__(
    self,
    estimator=RandomForestClassifier(),
    scoring: str = "roc_auc",
    cv: int = 3,
    threshold: Union[int, float] = 0.5,
    variables: Variables = None,
):
    """
    Validate ``cv`` and ``threshold`` and store the parameters.

    Raises
    ------
    ValueError
        If ``cv`` is not an integer of at least 1, if ``threshold`` is not an
        int or float, or if ``scoring`` is "roc_auc" and ``threshold`` falls
        outside [0.5, 1].
    """
    cv_is_valid = isinstance(cv, int) and cv >= 1
    if not cv_is_valid:
        raise ValueError("cv can only take positive integers bigger than 1")

    threshold_is_numeric = isinstance(threshold, (int, float))
    if not threshold_is_numeric:
        raise ValueError("threshold can only be integer or float")

    # The range restriction only applies to the roc-auc metric.
    if scoring == "roc_auc":
        if threshold > 1 or threshold < 0.5:
            raise ValueError(
                "roc-auc score should vary between 0.5 and 1. Pick a "
                "threshold within this interval."
            )

    checked_variables = _check_input_parameter_variables(variables)
    self.variables = checked_variables
    self.estimator = estimator
    self.scoring = scoring
    self.threshold = threshold
    self.cv = cv
def __init__(
    self,
    capping_method: str = "gaussian",
    tail: str = "right",
    fold: Union[int, float] = 3,
    variables: Union[None, int, str, List[Union[str, int]]] = None,
    missing_values: str = "raise",
) -> None:
    """
    Validate and store the capping parameters.

    Raises
    ------
    ValueError
        If ``capping_method``, ``tail`` or ``missing_values`` is not one of
        the accepted strings, if ``fold`` is not positive, or if ``fold``
        exceeds 0.2 while ``capping_method`` is "quantiles".
    """
    allowed_methods = ("gaussian", "iqr", "quantiles")
    allowed_tails = ("right", "left", "both")
    allowed_missing = ("raise", "ignore")

    if capping_method not in allowed_methods:
        raise ValueError(
            "capping_method takes only values 'gaussian', 'iqr' or 'quantiles'"
        )

    if tail not in allowed_tails:
        raise ValueError("tail takes only values 'right', 'left' or 'both'")

    if fold <= 0:
        raise ValueError("fold takes only positive numbers")

    # With quantiles, fold is bounded above as well.
    if capping_method == "quantiles":
        if fold > 0.2:
            raise ValueError(
                "with capping_method ='quantiles', fold takes values between 0 and "
                "0.20 only."
            )

    if missing_values not in allowed_missing:
        raise ValueError("missing_values takes only values 'raise' or 'ignore'")

    self.capping_method = capping_method
    self.tail = tail
    self.fold = fold
    checked_variables = _check_input_parameter_variables(variables)
    self.variables = checked_variables
    self.missing_values = missing_values
def __init__(
    self,
    tol: float = 0.05,
    n_categories: int = 10,
    max_n_categories: Optional[int] = None,
    replace_with: Union[str, int, float] = "Rare",
    variables: Union[None, int, str, List[Union[str, int]]] = None,
    ignore_format: bool = False,
) -> None:
    """
    Validate and store the encoder's parameters.

    Parameters
    ----------
    tol : float, default=0.05
        Must lie between 0 and 1 (both included).
    n_categories : int, default=10
        Must be a non-negative integer.
    max_n_categories : int or None, default=None
        When given, must be a non-negative integer.
    replace_with : str, int or float, default="Rare"
        Stored unchanged; not validated here.
    variables :
        Passed through ``_check_input_parameter_variables`` before storing.
    ignore_format : bool, default=False
        Must be a boolean.

    Raises
    ------
    ValueError
        If any parameter fails the checks described above.
    """
    if tol < 0 or tol > 1:
        raise ValueError("tol takes values between 0 and 1")

    # Check the type before comparing: comparing a non-numeric value with 0
    # first would raise TypeError instead of the intended ValueError.
    if not isinstance(n_categories, int) or n_categories < 0:
        raise ValueError("n_categories takes only positive integer numbers")

    if max_n_categories is not None:
        if not isinstance(max_n_categories, int) or max_n_categories < 0:
            raise ValueError("max_n_categories takes only positive integer numbers")

    if not isinstance(ignore_format, bool):
        raise ValueError("ignore_format takes only booleans True and False")

    self.tol = tol
    self.n_categories = n_categories
    self.max_n_categories = max_n_categories
    self.replace_with = replace_with
    self.variables = _check_input_parameter_variables(variables)
    self.ignore_format = ignore_format
def __init__(
    self,
    variables: Union[None, int, str, List[Union[str, int]]] = None,
    random_state: Union[None, int, str, List[Union[str, int]]] = None,
    seed: str = "general",
    seeding_method: str = "add",
) -> None:
    """
    Validate the seeding configuration and store the parameters.

    Raises
    ------
    ValueError
        If ``seed`` or ``seeding_method`` is not an accepted string, if
        ``seed`` is "general" and a truthy ``random_state`` is not an integer,
        or if ``seed`` is "observation" and ``random_state`` is falsy.
    """
    if seed not in ("general", "observation"):
        raise ValueError("seed takes only values 'general' or 'observation'")

    if seeding_method not in ("add", "multiply"):
        raise ValueError("seeding_method takes only values 'add' or 'multiply'")

    # A general seed needs an integer, but only when a random state was given.
    if seed == "general":
        if random_state and not isinstance(random_state, int):
            raise ValueError(
                "if seed == 'general' then random_state must take an integer"
            )

    # An observation-level seed needs something in random_state.
    if seed == "observation":
        if not random_state:
            raise ValueError(
                "if seed == 'observation' the random state must take the name of one "
                "or more variables which will be used to seed the imputer"
            )

    checked_variables = _check_input_parameter_variables(variables)
    self.variables = checked_variables
    self.random_state = random_state
    self.seed = seed
    self.seeding_method = seeding_method
def __init__(
    self,
    top_categories: Optional[int] = None,
    drop_last: bool = False,
    drop_last_binary: bool = False,
    variables: Union[None, int, str, List[Union[str, int]]] = None,
    ignore_format: bool = False,
) -> None:
    """
    Validate and store the encoder's parameters.

    Raises
    ------
    ValueError
        If a truthy ``top_categories`` is not an integer, or if ``drop_last``,
        ``drop_last_binary`` or ``ignore_format`` is not a boolean.
    """
    # Falsy values (None, 0) skip the integer check.
    if top_categories:
        if not isinstance(top_categories, int):
            raise ValueError(
                "top_categories takes only integer numbers, 1, 2, 3, etc."
            )

    drop_last_is_bool = isinstance(drop_last, bool)
    if not drop_last_is_bool:
        raise ValueError("drop_last takes only True or False")

    drop_last_binary_is_bool = isinstance(drop_last_binary, bool)
    if not drop_last_binary_is_bool:
        raise ValueError("drop_last_binary takes only True or False")

    ignore_format_is_bool = isinstance(ignore_format, bool)
    if not ignore_format_is_bool:
        raise ValueError("ignore_format takes only booleans True and False")

    self.top_categories = top_categories
    self.drop_last = drop_last
    self.drop_last_binary = drop_last_binary
    checked_variables = _check_input_parameter_variables(variables)
    self.variables = checked_variables
    self.ignore_format = ignore_format
def __init__(
    self,
    cv: int = 3,
    scoring: str = "neg_mean_squared_error",
    variables: Union[None, int, str, List[Union[str, int]]] = None,
    param_grid: Optional[Dict[str, Union[str, int, float, List[int]]]] = None,
    regression: bool = True,
    random_state: Optional[int] = None,
) -> None:
    """
    Validate ``cv`` and ``regression`` and store the parameters.

    When ``param_grid`` is None, a grid over ``max_depth`` values 1 to 4 is
    stored instead.

    Raises
    ------
    ValueError
        If ``cv`` is not a non-negative integer or ``regression`` is not a
        boolean.
    """
    cv_is_valid = isinstance(cv, int) and cv >= 0
    if not cv_is_valid:
        raise ValueError("cv can only take only positive integers")

    if not isinstance(regression, bool):
        raise ValueError("regression can only take True or False")

    # Fall back to the default search grid when none was supplied.
    grid = {"max_depth": [1, 2, 3, 4]} if param_grid is None else param_grid

    self.cv = cv
    self.scoring = scoring
    self.regression = regression
    checked_variables = _check_input_parameter_variables(variables)
    self.variables = checked_variables
    self.param_grid = grid
    self.random_state = random_state
def __init__(
    self,
    variables: Union[None, int, str, List[Union[str, int]]] = None,
    percent_threshold=0.02,
    other_val="_OTHER_",
):
    """
    Store the parameters.

    ``variables`` is passed through ``_check_input_parameter_variables``;
    ``percent_threshold`` and ``other_val`` are stored unchanged.
    """
    checked_variables = _check_input_parameter_variables(variables)
    self.variables = checked_variables
    self.percent_threshold = percent_threshold
    self.other_val = other_val
def __init__(
    self,
    variables: Union[None, int, str, List[Union[str, int]]] = None,
    max_levels=20,
    other_val="_OTHER_",
):
    """
    Store the parameters.

    ``variables`` is passed through ``_check_input_parameter_variables``;
    ``max_levels`` and ``other_val`` are stored unchanged.
    """
    checked_variables = _check_input_parameter_variables(variables)
    self.variables = checked_variables
    self.max_levels = max_levels
    self.other_val = other_val
def __init__(
    self,
    variables: Union[None, int, str, List[Union[str, int]]] = None,
    cum_percent=0.95,
    other_val="_OTHER_",
):
    """
    Store the parameters.

    ``variables`` is passed through ``_check_input_parameter_variables``;
    ``cum_percent`` and ``other_val`` are stored unchanged.
    """
    checked_variables = _check_input_parameter_variables(variables)
    self.variables = checked_variables
    self.cum_percent = cum_percent
    self.other_val = other_val
def __init__(
    self,
    variables: Variables = None,
    method: str = "pearson",
    threshold: float = 0.8,
    missing_values: str = "ignore",
    selection_method: str = "missing_values",
    estimator=None,
    scoring: str = "roc_auc",
    cv: int = 3,
):
    """
    Validate the correlation and selection settings and store the parameters.

    Raises
    ------
    ValueError
        If ``method``, ``missing_values`` or ``selection_method`` is not an
        accepted string; if ``threshold`` is not a float within [0, 1]; if
        ``cv`` is not an integer of at least 1; if ``selection_method`` is
        "model_performance" without an estimator; or if ``selection_method``
        is "missing_values" while ``missing_values`` is "raise".
    """
    allowed_methods = ("pearson", "spearman", "kendall")
    allowed_missing = ("raise", "ignore")
    allowed_selection = (
        "missing_values",
        "cardinality",
        "variance",
        "model_performance",
    )

    if method not in allowed_methods:
        raise ValueError(
            "correlation method takes only values 'pearson', 'spearman', 'kendall'"
        )

    threshold_is_valid = isinstance(threshold, float) and not (
        threshold < 0 or threshold > 1
    )
    if not threshold_is_valid:
        raise ValueError("threshold must be a float between 0 and 1")

    if missing_values not in allowed_missing:
        raise ValueError("missing_values takes only values 'raise' or 'ignore'.")

    if selection_method not in allowed_selection:
        raise ValueError(
            "selection_method takes only values 'missing_values', 'cardinality', "
            "'variance' or 'model_performance'."
        )

    cv_is_valid = isinstance(cv, int) and cv >= 1
    if not cv_is_valid:
        raise ValueError("cv can only take positive integers bigger than 1")

    # Selecting by model performance is impossible without a model.
    if selection_method == "model_performance":
        if estimator is None:
            raise ValueError(
                "Please provide an estimator, e.g., "
                "RandomForestClassifier or select another "
                "selection_method"
            )

    # Selecting by missing values requires tolerating NaN in the data.
    if selection_method == "missing_values":
        if missing_values == "raise":
            raise ValueError(
                "To select the variables with least missing values, we "
                "need to allow this transformer to contemplate variables "
                "with NaN by setting missing_values to 'ignore."
            )

    checked_variables = _check_input_parameter_variables(variables)
    self.variables = checked_variables
    self.method = method
    self.threshold = threshold
    self.missing_values = missing_values
    self.selection_method = selection_method
    self.estimator = estimator
    self.scoring = scoring
    self.cv = cv
def __init__(
    self,
    variables: Variables = None,
    scoring: str = "roc_auc_score",
    threshold: float = 0.5,
    bins: int = 5,
    strategy: str = "equal_width",
    cv: int = 3,
    random_state: int = None,
):
    """
    Validate the selector's settings and store the parameters.

    Raises
    ------
    ValueError
        If ``scoring`` or ``strategy`` is not an accepted string, if
        ``threshold`` is not numeric or is outside the range allowed for
        ``scoring``, or if ``cv`` is not an integer greater than 1.
    TypeError
        If ``bins`` is not an integer, or a truthy ``random_state`` is not an
        integer.
    """
    if scoring not in ("roc_auc_score", "r2_score"):
        raise ValueError(
            "At the moment, the selector can evaluate only the "
            "roc_auc and r2 scores. Please enter either "
            "'roc_auc_score' or 'r2_score' for the parameter "
            "'scoring'"
        )

    threshold_is_numeric = isinstance(threshold, (int, float))
    if not threshold_is_numeric:
        raise ValueError("threshold can only take integer or float")

    # Each metric has its own admissible threshold interval.
    if scoring == "roc_auc_score":
        if threshold < 0.5 or threshold > 1:
            raise ValueError(
                "roc-auc score should vary between 0.5 and 1. Pick a "
                "threshold within this interval."
            )

    if scoring == "r2_score":
        if threshold < 0 or threshold > 1:
            raise ValueError(
                "r2 score should vary between 0 and 1. Pick a "
                "threshold within this interval."
            )

    if not isinstance(bins, int):
        raise TypeError("'bins' takes only integers")

    if strategy not in ("equal_width", "equal_frequency"):
        raise ValueError(
            "'strategy' takes boolean values 'equal_width' and "
            "'equal_frequency'."
        )

    cv_is_valid = isinstance(cv, int) and cv > 1
    if not cv_is_valid:
        raise ValueError("cv takes integers bigger than 1")

    # Falsy random states (None, 0) skip the type check.
    if random_state:
        if not isinstance(random_state, int):
            raise TypeError("'random_state' takes only integers")

    checked_variables = _check_input_parameter_variables(variables)
    self.variables = checked_variables
    self.scoring = scoring
    self.threshold = threshold
    self.bins = bins
    self.strategy = strategy
    self.cv = cv
    self.random_state = random_state
def __init__(
    self,
    variables: Variables = None,
    missing_values: str = "ignore",
):
    """
    Validate ``missing_values`` and store the parameters.

    Raises
    ------
    ValueError
        If ``missing_values`` is neither "raise" nor "ignore".
    """
    allowed_missing = ("raise", "ignore")
    if missing_values not in allowed_missing:
        raise ValueError("missing_values takes only values 'raise' or 'ignore'.")

    checked_variables = _check_input_parameter_variables(variables)
    self.variables = checked_variables
    self.missing_values = missing_values
def __init__(
    self,
    variables: Union[None, int, str, List[Union[str, int]]] = None,
    exp: Union[float, int] = 0.5,
):
    """
    Validate ``exp`` and store the parameters.

    Raises
    ------
    ValueError
        If ``exp`` is neither a float nor an int.
    """
    exp_is_numeric = isinstance(exp, (float, int))
    if not exp_is_numeric:
        raise ValueError("exp must be a float or an int")

    self.exp = exp
    checked_variables = _check_input_parameter_variables(variables)
    self.variables = checked_variables
def __init__(
    self,
    variables: Union[None, int, str, List[Union[str, int]]] = None,
    ignore_format: bool = False,
) -> None:
    """
    Validate ``ignore_format`` and store the parameters.

    Raises
    ------
    ValueError
        If ``ignore_format`` is not a boolean.
    """
    ignore_format_is_bool = isinstance(ignore_format, bool)
    if not ignore_format_is_bool:
        raise ValueError("ignore_format takes only booleans True and False")

    checked_variables = _check_input_parameter_variables(variables)
    self.variables = checked_variables
    self.ignore_format = ignore_format
def __init__(
    self,
    imputation_method: str = "median",
    variables: Union[None, int, str, List[Union[str, int]]] = None,
) -> None:
    """
    Validate ``imputation_method`` and store the parameters.

    Raises
    ------
    ValueError
        If ``imputation_method`` is neither "median" nor "mean".
    """
    allowed_methods = ("median", "mean")
    if imputation_method not in allowed_methods:
        raise ValueError("imputation_method takes only values 'median' or 'mean'")

    self.imputation_method = imputation_method
    checked_variables = _check_input_parameter_variables(variables)
    self.variables = checked_variables
def __init__(
    self,
    missing_only: bool = True,
    variables: Union[None, int, str, List[Union[str, int]]] = None,
) -> None:
    """
    Validate ``missing_only`` and store the parameters.

    Raises
    ------
    ValueError
        If ``missing_only`` is not a boolean.
    """
    missing_only_is_bool = isinstance(missing_only, bool)
    if not missing_only_is_bool:
        raise ValueError("missing_only takes values True or False")

    checked_variables = _check_input_parameter_variables(variables)
    self.variables = checked_variables
    self.missing_only = missing_only
def __init__(
    self,
    variables: Union[None, int, str, List[Union[str, int]]] = None,
    base: str = "e",
) -> None:
    """
    Validate ``base`` and store the parameters.

    Raises
    ------
    ValueError
        If ``base`` is neither "e" nor "10".
    """
    allowed_bases = ("e", "10")
    if base not in allowed_bases:
        raise ValueError("base can take only '10' or 'e' as values")

    checked_variables = _check_input_parameter_variables(variables)
    self.variables = checked_variables
    self.base = base
def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
    """
    Makes a copy of the train set. Only stores a copy of the variables to impute.
    This copy is then used to randomly extract the values to fill the missing data
    during transform.

    Parameters
    ----------
    X : pandas dataframe of shape = [n_samples, n_features]
        The training dataset. Only a copy of the indicated variables will be stored
        in the transformer.

    y : None
        y is not needed in this imputation. You can pass None or y.

    Raises
    ------
    TypeError
        If the input is not a Pandas DataFrame
    ValueError
        If seed is "observation" and any variable given as random state is not
        a column of X.

    Returns
    -------
    self
    """

    # check input dataframe
    X = _is_dataframe(X)

    # find variables to impute: default to every column when none were given
    if not self.variables:
        self.variables = list(X.columns)

    # take a copy of the selected variables
    self.X_ = X[self.variables].copy()

    # check the variables assigned to the random state
    if self.seed == "observation":
        self.random_state = _check_input_parameter_variables(self.random_state)

        # a single variable is wrapped in a list so it can be iterated below
        if isinstance(self.random_state, (int, str)):
            self.random_state = [self.random_state]

        # Test membership directly: evaluating the truthiness of the missing
        # name itself would overlook falsy names such as 0 or "".
        if self.random_state and any(
            var not in X.columns for var in self.random_state
        ):
            raise ValueError(
                "There are variables assigned as random state which are not part "
                "of the training dataframe."
            )

    self.input_shape_ = X.shape

    return self
def __init__(
    self,
    encoding_method: str = "ordered",
    variables: Union[None, int, str, List[Union[str, int]]] = None,
) -> None:
    """
    Validate ``encoding_method`` and store the parameters.

    Raises
    ------
    ValueError
        If ``encoding_method`` is neither "ordered" nor "arbitrary".
    """
    allowed_methods = ("ordered", "arbitrary")
    if encoding_method not in allowed_methods:
        raise ValueError(
            "encoding_method takes only values 'ordered' and 'arbitrary'"
        )

    self.encoding_method = encoding_method
    checked_variables = _check_input_parameter_variables(variables)
    self.variables = checked_variables
def __init__(
    self,
    transformer,
    variables: Union[None, int, str, List[Union[str, int]]] = None,
) -> None:
    """
    Check that ``transformer`` derives from ``BaseEstimator`` and store the
    parameters.

    Raises
    ------
    TypeError
        If the class of ``transformer`` is not a subclass of ``BaseEstimator``.
    """
    transformer_class = transformer.__class__
    derives_from_base = issubclass(transformer_class, BaseEstimator)
    if not derives_from_base:
        raise TypeError(
            "transformer expected a Scikit-learn transformer, "
            f"got {transformer} instead."
        )

    self.transformer = transformer
    checked_variables = _check_input_parameter_variables(variables)
    self.variables = checked_variables
def __init__(
    self,
    encoding_method: str = "count",
    variables: Union[None, int, str, List[Union[str, int]]] = None,
) -> None:
    """
    Validate ``encoding_method`` and store the parameters.

    Raises
    ------
    ValueError
        If ``encoding_method`` is neither "count" nor "frequency".
    """
    allowed_methods = ("count", "frequency")
    if encoding_method not in allowed_methods:
        raise ValueError(
            "encoding_method takes only values 'count' and 'frequency'"
        )

    self.encoding_method = encoding_method
    checked_variables = _check_input_parameter_variables(variables)
    self.variables = checked_variables
def __init__(
    self,
    variables: Union[None, int, str, List[Union[str, int]]] = None,
    transformer=None,
) -> None:
    """
    Store the parameters and reject a ``OneHotEncoder`` with a truthy
    ``sparse`` attribute.

    Raises
    ------
    AttributeError
        If ``transformer`` is a ``OneHotEncoder`` whose ``sparse`` attribute
        is truthy.
    """
    checked_variables = _check_input_parameter_variables(variables)
    self.variables = checked_variables
    self.transformer = transformer

    # NOTE(review): this reads the encoder's `sparse` attribute; confirm the
    # attribute name against the scikit-learn version in use.
    if isinstance(self.transformer, OneHotEncoder):
        if self.transformer.sparse:
            raise AttributeError(
                "The SklearnTransformerWrapper can only wrap the OneHotEncoder if you "
                "set its sparse attribute to False"
            )
def __init__(
    self,
    variables: Union[None, int, str, List[Union[str, int]]] = None,
    features_to_extract: Union[None, str, List[str]] = None,
    drop_original: bool = True,
    missing_values: str = "raise",
    dayfirst: bool = False,
    yearfirst: bool = False,
    utc: Union[None, bool] = None,
) -> None:
    """
    Validate the extraction settings and store the parameters.

    ``dayfirst`` and ``yearfirst`` are stored without validation.

    Raises
    ------
    ValueError
        If a truthy ``features_to_extract`` is neither a list nor "all", or is
        a list with an entry missing from ``FEATURES_SUPPORTED``; if
        ``drop_original`` is not a boolean; if ``missing_values`` is neither
        "raise" nor "ignore"; or if ``utc`` is neither None nor a boolean.
    """
    # Falsy values (None, empty list, empty string) skip the feature checks.
    if features_to_extract:
        given_as_list = isinstance(features_to_extract, list)

        if not (given_as_list or features_to_extract == "all"):
            raise ValueError(
                "features_to_extract must be a list of strings or 'all'. "
                f"Got {features_to_extract} instead."
            )

        if given_as_list and any(
            feat not in FEATURES_SUPPORTED for feat in features_to_extract
        ):
            raise ValueError(
                "Some of the requested features are not supported. "
                "Supported features are {}.".format(", ".join(FEATURES_SUPPORTED))
            )

    drop_original_is_bool = isinstance(drop_original, bool)
    if not drop_original_is_bool:
        raise ValueError(
            "drop_original takes only booleans True or False. "
            f"Got {drop_original} instead."
        )

    if missing_values not in ("raise", "ignore"):
        raise ValueError(
            "missing_values takes only values 'raise' or 'ignore'. "
            f"Got {missing_values} instead."
        )

    if utc is not None:
        if not isinstance(utc, bool):
            raise ValueError(
                "utc takes only booleans or None. " f"Got {utc} instead."
            )

    checked_variables = _check_input_parameter_variables(variables)
    self.variables = checked_variables
    self.drop_original = drop_original
    self.missing_values = missing_values
    self.dayfirst = dayfirst
    self.yearfirst = yearfirst
    self.utc = utc
    self.features_to_extract = features_to_extract
def __init__(
    self,
    arbitrary_number: Union[int, float] = 999,
    variables: Union[None, int, str, List[Union[str, int]]] = None,
    imputer_dict: Optional[dict] = None,
) -> None:
    """
    Validate ``arbitrary_number`` and store the parameters.

    ``variables`` is passed through ``_check_input_parameter_variables`` and
    ``imputer_dict`` through ``_define_numerical_dict`` before storing.

    Raises
    ------
    ValueError
        If ``arbitrary_number`` is neither an int nor a float.
    """
    # Guard clause: reject non-numeric values before anything is stored.
    if not isinstance(arbitrary_number, (int, float)):
        raise ValueError("arbitrary_number must be numeric of type int or float")

    self.arbitrary_number = arbitrary_number

    checked_variables = _check_input_parameter_variables(variables)
    self.variables = checked_variables

    numerical_dict = _define_numerical_dict(imputer_dict)
    self.imputer_dict = numerical_dict
def __init__(
    self,
    variables: Union[None, int, str, List[Union[str, int]]] = None,
    bins: int = 10,
    return_object: bool = False,
    return_boundaries: bool = False,
) -> None:
    """
    Validate ``bins``, initialise the parent class and store the parameters.

    ``return_object`` and ``return_boundaries`` are forwarded to the parent
    class initialiser.

    Raises
    ------
    ValueError
        If ``bins`` is not an integer.
    """
    bins_is_int = isinstance(bins, int)
    if not bins_is_int:
        raise ValueError(f"bins must be an integer. Got {bins} instead.")

    super().__init__(return_object, return_boundaries)

    self.bins = bins
    checked_variables = _check_input_parameter_variables(variables)
    self.variables = checked_variables
def __init__(
    self,
    variables: Variables = None,
    bins: int = 5,
    strategy: str = "equal_width",
    scoring: str = "roc_auc",
    cv=3,
    threshold: Union[int, float, None] = None,
    regression: bool = False,
    confirm_variables: bool = False,
):
    """
    Validate the selector's settings and store the parameters.

    Parameters
    ----------
    variables :
        Passed through ``_check_input_parameter_variables`` before storing.
    bins : int, default=5
        Must be an integer.
    strategy : str, default="equal_width"
        Either "equal_width" or "equal_frequency".
    scoring : str, default="roc_auc"
        Must belong to ``_REGRESSION_METRICS`` when ``regression`` is True and
        to ``_CLASSIFICATION_METRICS`` when it is False.
    cv : default=3
        Stored unchanged as ``self.cv``.
    threshold : int, float or None, default=None
        When given, must be an int or float.
    regression : bool, default=False
        Selects which metric collection ``scoring`` is checked against.
    confirm_variables : bool, default=False
        Forwarded to the parent class initialiser.

    Raises
    ------
    ValueError
        If any parameter fails the checks described above.
    """
    if not isinstance(bins, int):
        raise ValueError(f"bins must be an integer. Got {bins} instead.")

    if strategy not in ["equal_width", "equal_frequency"]:
        raise ValueError(
            "strategy takes only values 'equal_width' or 'equal_frequency'. "
            f"Got {strategy} instead."
        )

    if threshold is not None and not isinstance(threshold, (int, float)):
        raise ValueError(
            "threshold can only take integer or float. "
            f"Got {threshold} instead."
        )

    if regression is True and scoring not in _REGRESSION_METRICS:
        raise ValueError(
            f"The metric {scoring} is not suitable for regression. Set the "
            "parameter regression to False or choose a different performance "
            "metric."
        )

    if regression is False and scoring not in _CLASSIFICATION_METRICS:
        # The trailing space after "Set the" keeps the adjacent literals from
        # running together as "theparameter".
        raise ValueError(
            f"The metric {scoring} is not suitable for classification. Set the "
            "parameter regression to True or choose a different performance "
            "metric."
        )

    super().__init__(confirm_variables)

    self.variables = _check_input_parameter_variables(variables)
    self.bins = bins
    self.strategy = strategy
    self.scoring = scoring
    self.cv = cv
    self.threshold = threshold
    self.regression = regression