def test_define_variables():
    """Check ``_define_variables`` on a list, on ``None`` and on a string."""
    as_list = ['var1', 'var2', 'var1']
    as_none = None
    as_string = 'var1'

    # a list input must compare equal to what comes back
    assert _define_variables(as_list) == as_list
    # None must compare equal to what comes back
    assert _define_variables(as_none) == as_none
    # a bare string must come back wrapped in a one-element list
    assert _define_variables(as_string) == [as_string]
def __init__(self, distribution='gaussian', tail='right', fold=3,
             variables=None, missing_values='raise'):
    """Validate the constructor arguments and store them on the instance.

    Parameters
    ----------
    distribution : str, default='gaussian'
        One of 'gaussian', 'skewed' or 'quantiles'.
    tail : str, default='right'
        One of 'right', 'left' or 'both'.
    fold : number, default=3
        Must be greater than 0. When ``distribution='quantiles'`` it must
        also not exceed 0.2.
    variables : default=None
        Stored after being passed through ``_define_variables``.
    missing_values : str, default='raise'
        Either 'raise' or 'ignore'.

    Raises
    ------
    ValueError
        If any of the arguments above is outside its accepted values.
    """
    valid_distributions = ('gaussian', 'skewed', 'quantiles')
    valid_tails = ('right', 'left', 'both')
    valid_missing = ('raise', 'ignore')

    if distribution not in valid_distributions:
        raise ValueError(
            "distribution takes only values 'gaussian', 'skewed' or 'quantiles'"
        )

    if tail not in valid_tails:
        raise ValueError(
            "tail takes only values 'right', 'left' or 'both'")

    if fold <= 0:
        raise ValueError("fold takes only positive numbers")

    # with quantiles, fold is additionally capped at 0.2
    quantile_mode = distribution == 'quantiles'
    if quantile_mode and fold > 0.2:
        raise ValueError(
            "with distribution='quantiles', fold takes values between 0 and 0.20 only."
        )

    if missing_values not in valid_missing:
        raise ValueError(
            "missing_values takes only values 'raise' or 'ignore'")

    self.distribution = distribution
    self.tail = tail
    self.fold = fold
    self.variables = _define_variables(variables)
    self.missing_values = missing_values
def __init__(self, how='missing_only', variables=None):
    """Validate ``how`` and store both arguments on the instance.

    Parameters
    ----------
    how : str, default='missing_only'
        Either 'missing_only' or 'all'.
    variables : default=None
        Stored after being passed through ``_define_variables``.

    Raises
    ------
    ValueError
        If ``how`` is not one of the two accepted values.
    """
    accepted_modes = ('missing_only', 'all')
    if how not in accepted_modes:
        raise ValueError("how takes only values 'missing_only' or 'all'")

    self.variables = _define_variables(variables)
    self.how = how
def __init__(self, tol=0.05, n_categories=10, max_n_categories=None,
             variables=None, replace_with='Rare'):
    """Validate the constructor arguments and store them on the instance.

    Parameters
    ----------
    tol : number, default=0.05
        Must lie between 0 and 1 (both inclusive).
    n_categories : int, default=10
        Must be an ``int`` that is not negative.
    max_n_categories : int or None, default=None
        When given, must be an ``int`` that is not negative.
    variables : default=None
        Stored after being passed through ``_define_variables``.
    replace_with : str, default='Rare'
        Must be a string.

    Raises
    ------
    ValueError
        If any argument fails the checks above.
    """
    if tol < 0 or tol > 1:
        raise ValueError("tol takes values between 0 and 1")

    # The type check comes first: comparing e.g. a string with 0 would
    # raise TypeError before the intended ValueError could be produced.
    if not isinstance(n_categories, int) or n_categories < 0:
        raise ValueError(
            "n_categories takes only positive integer numbers")

    if max_n_categories is not None:
        # same ordering as above, for the same reason
        if not isinstance(max_n_categories, int) or max_n_categories < 0:
            raise ValueError(
                "max_n_categories takes only positive integer numbers")

    if not isinstance(replace_with, str):
        raise ValueError("replace_with takes only strings as values.")

    self.tol = tol
    self.n_categories = n_categories
    self.max_n_categories = max_n_categories
    self.variables = _define_variables(variables)
    self.replace_with = replace_with
def __init__(self, variables=None, random_state=None, seed='general',
             seeding_method='add'):
    """Validate the seeding configuration and store the arguments.

    Parameters
    ----------
    variables : default=None
        Stored after being passed through ``_define_variables``.
    random_state : default=None
        With ``seed='general'`` a truthy value must be an ``int``.
        With ``seed='observation'`` a truthy value is required.
    seed : str, default='general'
        Either 'general' or 'observation'.
    seeding_method : str, default='add'
        Either 'add' or 'multiply'.

    Raises
    ------
    ValueError
        If ``seed`` or ``seeding_method`` is not an accepted value, or if
        ``random_state`` does not fit the chosen ``seed``.
    """
    seed_options = ('general', 'observation')
    method_options = ('add', 'multiply')

    if seed not in seed_options:
        raise ValueError(
            "seed takes only values 'general' or 'observation'")

    if seeding_method not in method_options:
        raise ValueError(
            "seeding_method takes only values 'add' or 'multiply'")

    # a falsy random_state (None, 0) is accepted as is for seed='general'
    if seed == 'general' and random_state and not isinstance(random_state, int):
        raise ValueError(
            "if seed == 'general' the random state must take an integer"
        )

    if seed == 'observation' and not random_state:
        raise ValueError(
            "if seed == 'observation' the random state must take the name of one or more variables "
            "which will be used to seed the imputer")

    self.variables = _define_variables(variables)
    self.random_state = random_state
    self.seed = seed
    self.seeding_method = seeding_method
def __init__(self, exp=0.5, variables=None):
    """Validate ``exp`` and store both arguments on the instance.

    Parameters
    ----------
    exp : float or int, default=0.5
        Must be an instance of ``float`` or ``int``.
    variables : default=None
        Stored after being passed through ``_define_variables``.

    Raises
    ------
    ValueError
        If ``exp`` is neither a float nor an int.
    """
    is_number = isinstance(exp, (float, int))
    if not is_number:
        raise ValueError('exp must be a float or an int')

    self.exp = exp
    self.variables = _define_variables(variables)
def __init__(self, base='e', variables=None):
    """Validate ``base`` and store both arguments on the instance.

    Parameters
    ----------
    base : str, default='e'
        Either the string 'e' or the string '10'.
    variables : default=None
        Stored after being passed through ``_define_variables``.

    Raises
    ------
    ValueError
        If ``base`` is not one of the two accepted strings.
    """
    accepted_bases = ('e', '10')
    if base not in accepted_bases:
        raise ValueError("base can take only '10' or 'e' as values")

    self.variables = _define_variables(variables)
    self.base = base
def __init__(self, features_to_drop=None):
    """Store the features to drop, rejecting a missing or empty selection.

    Parameters
    ----------
    features_to_drop : default=None
        Stored after being passed through ``_define_variables``.

    Raises
    ------
    ValueError
        If the stored value is ``None`` or has length 0.
    """
    self.features_to_drop = _define_variables(features_to_drop)

    # ``_define_variables`` hands ``None`` back unchanged (see
    # ``test_define_variables``), so calling ``len`` on the default argument
    # would raise TypeError; guard it to raise the intended ValueError.
    if self.features_to_drop is None or len(self.features_to_drop) == 0:
        raise ValueError(
            'List of features to drop cannot be empty. Please pass at least 1 variable to drop'
        )
def __init__(self, imputation_method='median', variables=None):
    """Validate ``imputation_method`` and store both arguments.

    Parameters
    ----------
    imputation_method : str, default='median'
        Either 'median' or 'mean'.
    variables : default=None
        Stored after being passed through ``_define_variables``.

    Raises
    ------
    ValueError
        If ``imputation_method`` is not one of the two accepted values.
    """
    supported_methods = ('median', 'mean')
    if imputation_method not in supported_methods:
        raise ValueError(
            "imputation_method takes only values 'median' or 'mean'")

    self.imputation_method = imputation_method
    self.variables = _define_variables(variables)
def __init__(self, encoding_method='ordered', variables=None):
    """Validate ``encoding_method`` and store both arguments.

    Parameters
    ----------
    encoding_method : str, default='ordered'
        Either 'ordered' or 'arbitrary'.
    variables : default=None
        Stored after being passed through ``_define_variables``.

    Raises
    ------
    ValueError
        If ``encoding_method`` is not one of the two accepted values.
    """
    supported_methods = ('ordered', 'arbitrary')
    if encoding_method not in supported_methods:
        raise ValueError(
            "encoding_method takes only values 'ordered' and 'arbitrary'")

    self.encoding_method = encoding_method
    self.variables = _define_variables(variables)
def __init__(self, encoding_method='count', variables=None):
    """Validate ``encoding_method`` and store both arguments.

    Parameters
    ----------
    encoding_method : str, default='count'
        Either 'count' or 'frequency'.
    variables : default=None
        Stored after being passed through ``_define_variables``.

    Raises
    ------
    ValueError
        If ``encoding_method`` is not one of the two accepted values.
    """
    supported_methods = ('count', 'frequency')
    if encoding_method not in supported_methods:
        raise ValueError(
            "encoding_method takes only values 'count' and 'frequency'")

    self.encoding_method = encoding_method
    self.variables = _define_variables(variables)
def __init__(self, encoding_method='woe', variables=None):
    """Validate ``encoding_method`` and store both arguments.

    Parameters
    ----------
    encoding_method : str, default='woe'
        One of 'woe', 'ratio' or 'log_ratio'.
    variables : default=None
        Stored after being passed through ``_define_variables``.

    Raises
    ------
    ValueError
        If ``encoding_method`` is not one of the three accepted values.
    """
    supported_methods = ('woe', 'ratio', 'log_ratio')
    if encoding_method not in supported_methods:
        raise ValueError(
            "encoding_method takes only values 'woe', 'ratio' and 'log_ratio'"
        )

    self.encoding_method = encoding_method
    self.variables = _define_variables(variables)
def __init__(self, top_categories=None, variables=None, drop_last=False):
    """Validate the constructor arguments and store them on the instance.

    Parameters
    ----------
    top_categories : int or None, default=None
        A truthy value must be an ``int``; falsy values are not checked.
    variables : default=None
        Stored after being passed through ``_define_variables``.
    drop_last : bool, default=False
        Must compare equal to ``True`` or ``False``.

    Raises
    ------
    ValueError
        If ``top_categories`` is truthy but not an int, or if ``drop_last``
        is not accepted.
    """
    # falsy values (None, 0) skip the integer check entirely
    if top_categories and not isinstance(top_categories, int):
        raise ValueError(
            "top_categories takes only integer numbers, 1, 2, 3, etc.")

    boolean_choices = (True, False)
    if drop_last not in boolean_choices:
        raise ValueError("drop_last takes only True or False")

    self.top_categories = top_categories
    self.drop_last = drop_last
    self.variables = _define_variables(variables)
def __init__(self, q=10, variables=None, return_object=False,
             return_boundaries=False):
    """Type-check the constructor arguments and store them.

    Parameters
    ----------
    q : int, default=10
        Must be an ``int``.
    variables : default=None
        Stored after being passed through ``_define_variables``.
    return_object : bool, default=False
        Must be a ``bool``.
    return_boundaries : bool, default=False
        Must be a ``bool``.

    Raises
    ------
    ValueError
        If ``q`` is not an int or either flag is not a bool.
    """
    q_is_int = isinstance(q, int)
    if not q_is_int:
        raise ValueError('q must be an integer')

    object_flag_ok = isinstance(return_object, bool)
    if not object_flag_ok:
        raise ValueError('return_object must be True or False')

    boundaries_flag_ok = isinstance(return_boundaries, bool)
    if not boundaries_flag_ok:
        raise ValueError('return_boundaries must be True or False')

    self.q = q
    self.variables = _define_variables(variables)
    self.return_object = return_object
    self.return_boundaries = return_boundaries
def __init__(self, arbitrary_number=999, variables=None, imputer_dict=None):
    """Validate ``arbitrary_number`` and store the three arguments.

    Parameters
    ----------
    arbitrary_number : int or float, default=999
        Must be an instance of ``int`` or ``float``.
    variables : default=None
        Stored after being passed through ``_define_variables``.
    imputer_dict : default=None
        Stored after being passed through ``_define_numerical_dict``.

    Raises
    ------
    ValueError
        If ``arbitrary_number`` is neither an int nor a float.
    """
    # guard clause: reject anything that is not int/float up front
    if not isinstance(arbitrary_number, (int, float)):
        raise ValueError(
            'arbitrary_number must be numeric of type int or float')

    self.arbitrary_number = arbitrary_number
    self.variables = _define_variables(variables)
    self.imputer_dict = _define_numerical_dict(imputer_dict)
def __init__(self, cv=3, scoring='neg_mean_squared_error', variables=None,
             param_grid={'max_depth': [1, 2, 3, 4]}, regression=True,
             random_state=None):
    """Validate ``cv`` and ``regression`` and store all arguments.

    Parameters
    ----------
    cv : int, default=3
        Must be an ``int`` that is not negative.
    scoring : default='neg_mean_squared_error'
        Stored as given.
    variables : default=None
        Stored after being passed through ``_define_variables``.
    param_grid : dict, default={'max_depth': [1, 2, 3, 4]}
        Stored as given.
    regression : bool, default=True
        Must be a ``bool``.
    random_state : default=None
        Stored as given.

    Raises
    ------
    ValueError
        If ``cv`` or ``regression`` fails the checks above.
    """
    # NOTE(review): the default ``param_grid`` is a dict literal in the
    # signature, so every instance created without an explicit
    # ``param_grid`` holds the same dict object; avoid mutating it in place.
    cv_is_valid = isinstance(cv, int) and cv >= 0
    if not cv_is_valid:
        raise ValueError('cv can only take only positive integers')

    if not isinstance(regression, bool):
        raise ValueError('regression can only take True or False')

    self.cv = cv
    self.scoring = scoring
    self.regression = regression
    self.variables = _define_variables(variables)
    self.param_grid = param_grid
    self.random_state = random_state
def __init__(self, encoding_method='arbitrary', cv=3,
             scoring='neg_mean_squared_error',
             param_grid={'max_depth': [1, 2, 3, 4]}, regression=True,
             random_state=None, variables=None):
    """Store the constructor arguments on the instance without validation.

    Parameters
    ----------
    encoding_method : default='arbitrary'
        Stored as given.
    cv : default=3
        Stored as given.
    scoring : default='neg_mean_squared_error'
        Stored as given.
    param_grid : dict, default={'max_depth': [1, 2, 3, 4]}
        Stored as given.
    regression : default=True
        Stored as given.
    random_state : default=None
        Stored as given.
    variables : default=None
        Stored after being passed through ``_define_variables``.
    """
    # NOTE(review): the default ``param_grid`` is a dict literal in the
    # signature, so every instance created without an explicit
    # ``param_grid`` holds the same dict object; avoid mutating it in place.
    self.encoding_method = encoding_method

    # settings stored verbatim
    self.cv = cv
    self.scoring = scoring
    self.regression = regression
    self.param_grid = param_grid
    self.random_state = random_state

    selected_variables = _define_variables(variables)
    self.variables = selected_variables
def __init__(self, imputation_method='missing', fill_value='Missing',
             variables=None, return_object=False):
    """Validate the imputation settings and store all arguments.

    Parameters
    ----------
    imputation_method : str, default='missing'
        Either 'missing' or 'frequent'.
    fill_value : str, default='Missing'
        Must be a string.
    variables : default=None
        Stored after being passed through ``_define_variables``.
    return_object : default=False
        Stored as given; it is not validated here.

    Raises
    ------
    ValueError
        If ``imputation_method`` is not accepted or ``fill_value`` is not
        a string.
    """
    supported_methods = ('missing', 'frequent')
    if imputation_method not in supported_methods:
        raise ValueError(
            "imputation_method takes only values 'missing' or 'frequent'")

    fill_is_text = isinstance(fill_value, str)
    if not fill_is_text:
        raise ValueError("parameter 'fill_value' should be string")

    self.imputation_method = imputation_method
    self.fill_value = fill_value
    self.variables = _define_variables(variables)
    self.return_object = return_object
def fit(self, X, y=None):
    """
    Keeps a copy of the columns to impute from the training dataframe, so
    that values can be randomly drawn from it to fill missing data during
    transform.

    Parameters
    ----------

    X : pandas dataframe of shape = [n_samples, n_features]
        The training input samples. It may be the whole dataframe and not
        only the variables to impute.

    y : None
        Not used by this method. You can pass None or y.

    Attributes
    ----------

    X_ : dataframe.
        Copy of the selected columns of the training dataframe, from which
        the random samples are extracted.

    input_shape_ : tuple.
        Shape of the training dataframe.
    """
    # run the input through the dataframe check
    X = _is_dataframe(X)

    # with no variables selected, fall back to every column of X
    if not self.variables:
        self.variables = list(X.columns)

    # keep an independent copy of the selected columns
    self.X_ = X[self.variables].copy()

    # when seeding per observation, random_state names columns of X
    if self.seed == 'observation':
        self.random_state = _define_variables(self.random_state)
        has_unknown = any(
            col not in X.columns for col in self.random_state)
        if has_unknown:
            raise ValueError(
                "There are variables assigned as random state which are not part of the training "
                "dataframe.")

    self.input_shape_ = X.shape

    return self
def __init__(self, distribution='gaussian', tail='right', fold=3,
             variables=None):
    """Validate the constructor arguments and store them on the instance.

    Parameters
    ----------
    distribution : str, default='gaussian'
        One of 'gaussian', 'skewed' or 'max'.
    tail : str, default='right'
        Either 'right' or 'left'.
    fold : number, default=3
        Must be greater than 0.
    variables : default=None
        Stored after being passed through ``_define_variables``.

    Raises
    ------
    ValueError
        If any of the arguments above is outside its accepted values.
    """
    valid_distributions = ('gaussian', 'skewed', 'max')
    valid_tails = ('right', 'left')

    if distribution not in valid_distributions:
        raise ValueError(
            "distribution takes only values 'gaussian', 'skewed' or 'max'")

    if tail not in valid_tails:
        raise ValueError("tail takes only values 'right' or 'left'")

    if fold <= 0:
        raise ValueError("fold takes only positive numbers")

    self.distribution = distribution
    self.tail = tail
    self.fold = fold
    self.variables = _define_variables(variables)
def __init__(self, variables=None, transformer=None):
    """Store the arguments and reject a ``OneHotEncoder`` with truthy ``sparse``.

    Parameters
    ----------
    variables : default=None
        Stored after being passed through ``_define_variables``.
    transformer : default=None
        Stored as given.

    Raises
    ------
    AttributeError
        If ``transformer`` is a ``OneHotEncoder`` instance whose ``sparse``
        attribute is truthy.
    """
    self.variables = _define_variables(variables)
    self.transformer = transformer

    # ``sparse`` is only read when the transformer is a OneHotEncoder
    is_sparse_encoder = (
        isinstance(transformer, OneHotEncoder) and transformer.sparse
    )
    if is_sparse_encoder:
        raise AttributeError('The SklearnTransformerWrapper can only wrap the OneHotEncoder if you '
                             'set its sparse attribute to False')
def __init__(self, variables=None):
    """Store ``variables`` after passing it through ``_define_variables``.

    Parameters
    ----------
    variables : default=None
        The value handed to ``_define_variables``; its result is kept in
        ``self.variables``.
    """
    selected_variables = _define_variables(variables)
    self.variables = selected_variables