def _find_or_check_datetime_variables(X: pd.DataFrame, variables: Variables = None ) -> List[Union[str, int]]: """ Checks that variables provided by the user are of type datetime. If None, finds all datetime variables in the DataFrame. Parameters ---------- X : pandas DataFrame variables : variable or list of variables. Defaults to None. Returns ------- variables : List of datetime variables. """ if variables is None: variables = [ column for column in X.select_dtypes(exclude="number").columns if is_datetime(X[column]) or _is_categorical_and_is_datetime(X[column]) ] if len(variables) == 0: raise ValueError("No datetime variables found in this dataframe.") elif isinstance(variables, (str, int)): if is_datetime(X[variables]) or (not is_numeric(X[variables]) and _is_categorical_and_is_datetime( X[variables])): variables = [variables] else: raise TypeError("The variable entered is not datetime.") else: if len(variables) == 0: raise ValueError("The indicated list of variables is empty.") # check that the variables entered by the user are datetime else: vars_non_dt = [ column for column in X[variables].select_dtypes(exclude="datetime") if is_numeric(X[column]) or not _is_categorical_and_is_datetime(X[column]) ] if len(vars_non_dt) > 0: raise TypeError("Some of the variables are not datetime.") return variables
def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
    """
    Validate the datetime inputs; no parameter is learned.

    Either checks that the dataframe index is datetime (when
    ``self.variables == "index"``) or finds / validates the datetime
    variables, then resolves which features to extract and records the
    input column names and count.

    Parameters
    ----------
    X: pandas dataframe of shape = [n_samples, n_features]
        The training input samples. Can be the entire dataframe, not just the
        variables to transform.

    y: pandas Series, default=None
        It is not needed in this transformer. You can pass y or None.

    Returns
    -------
    self
    """
    # check input dataframe
    X = check_X(X)

    if self.variables == "index":
        # special case: the datetime information lives in the index
        index = X.index
        index_is_datetime = is_datetime(index) or (
            not is_numeric(index) and _is_categorical_and_is_datetime(index)
        )
        if not index_is_datetime:
            raise TypeError("The dataframe index is not datetime.")

        if self.missing_values == "raise":
            self._check_index_contains_na(X.index)

        self.variables_ = None
    else:
        # find or check for datetime variables
        self.variables_ = _find_or_check_datetime_variables(X, self.variables)

        # check if datetime variables contain na
        if self.missing_values == "raise":
            _check_contains_na(X, self.variables_)

    # None -> defaults, any string -> all supported, otherwise the user's value
    if self.features_to_extract is None:
        resolved_features = FEATURES_DEFAULT
    elif isinstance(self.features_to_extract, str):
        resolved_features = FEATURES_SUPPORTED
    else:
        resolved_features = self.features_to_extract
    self.features_to_extract_ = resolved_features

    # save input features
    self.feature_names_in_ = X.columns.tolist()

    # save train set shape
    self.n_features_in_ = X.shape[1]

    return self
def _find_or_check_numerical_variables(X: pd.DataFrame, variables: Variables = None ) -> List[Union[str, int]]: """ Checks that variables provided by the user are of type numerical. If None, finds all the numerical variables in the DataFrame. Parameters ---------- X : Pandas DataFrame. variables : variable or list of variables. Defaults to None. Raises ------ ValueError If there are no numerical variables in the df or the df is empty. TypeError If any of the user provided variables are not numerical. Returns ------- variables: List of numerical variables. """ if variables is None: # find numerical variables in dataset variables = list(X.select_dtypes(include="number").columns) if len(variables) == 0: raise ValueError( "No numerical variables found in this dataframe. Please check " "variable format with pandas dtypes.") elif isinstance(variables, (str, int)): if is_numeric(X[variables]): variables = [variables] else: raise TypeError("The variable entered is not numeric.") else: if len(variables) == 0: raise ValueError("The list of variables is empty.") # check that user entered variables are of type numerical else: if len(X[variables].select_dtypes(exclude="number").columns) > 0: raise TypeError( "Some of the variables are not numerical. Please cast them as " "numerical before using this transformer.") return variables
def check_column_dtype(self, data, column_name, required_dtype):
    '''
    Assert that a column satisfies the dtype required for its role.

    :param data            : Data, first validated with ``self.check_data``
    :type data             : object exposing ``keys().values``
                             (presumably a pd.DataFrame -- confirm with caller)
    :param column_name     : column name
    :type column_name      : str, int or dt.date
    :param required_dtype  : one of ``self.params["REQUIREDCOLUMNS"]["TYPE1"]``;
                             handled values are
                             {"Indicator","Date","Country","Value"}
    :type required_dtype   : str

    :Functionality: asserts that ``self.data[column_name]`` passes
                  : "Indicator" -> ``is_string``
                  : "Date"      -> ``is_datetime``
                  : "Country"   -> ``is_string``
                                   # tbd - check if they are actual country names
                  : "Value"     -> ``is_numeric``

    :raises AssertionError      : when any validation fails
    :raises NotImplementedError : when ``required_dtype`` is allowed by the
                                  params but has no check defined here
    '''
    self.check_data(data)

    # Argument validation.
    name_type_ok = (
        isinstance(column_name, str)
        or isinstance(column_name, int)
        or isinstance(column_name, dt.date)
    )
    assert name_type_ok
    assert column_name in data.keys().values
    assert required_dtype in self.params["REQUIREDCOLUMNS"]["TYPE1"]

    # NOTE(review): membership is checked on the ``data`` argument, but the
    # dtype checks below read ``self.data`` -- confirm this is intended.
    if required_dtype in ("Indicator", "Country"):
        # both roles require a string column
        # tbd - for "Country", check if they are actual country names
        assert is_string(self.data[column_name])
    elif required_dtype == "Date":
        assert is_datetime(self.data[column_name])
    elif required_dtype == "Value":
        assert is_numeric(self.data[column_name])
    else:
        raise NotImplementedError
def contains_op(cls, series: pd.Series) -> bool:
    """Return the result of ``pdt.is_numeric`` applied to ``series``."""
    series_is_numeric = pdt.is_numeric(series)
    return series_is_numeric
def _is_convertible_to_num(column: pd.Series) -> bool: return is_numeric(pd.to_numeric(column, errors="ignore"))
def _find_categorical_and_numerical_variables( X: pd.DataFrame, variables: Variables = None ) -> Tuple[List[Union[str, int]], List[Union[str, int]]]: """ Find numerical and categorical variables. Parameters ---------- X : pandas DataFrame variables : List of variables. Defaults to None. Returns ------- variables : Tuple with List of numerical and list of categorical variables. """ # If the user passes just 1 variable outside a list. if isinstance(variables, (str, int)): if is_categorical(X[variables]) or is_object(X[variables]): variables_cat = [variables] variables_num = [] elif is_numeric(X[variables]): variables_num = [variables] variables_cat = [] else: raise TypeError( "The variable entered is neither numerical nor categorical.") # If user leaves default None parameter. elif variables is None: # find categorical variables if variables is None: variables_cat = [ column for column in X.select_dtypes( include=["O", "category"]).columns if _is_categorical_and_is_not_datetime(X[column]) ] # find numerical variables in dataset variables_num = list(X.select_dtypes(include="number").columns) if len(variables_num) == 0 and len(variables_cat) == 0: raise TypeError( "There are no numerical or categorical variables in the dataframe" ) # If user passes variable list. else: if len(variables) == 0: raise ValueError("The list of variables is empty.") # find categorical variables variables_cat = [ var for var in X[variables].select_dtypes( include=["O", "category"]).columns ] # find numerical variables variables_num = list( X[variables].select_dtypes(include="number").columns) if any( [v for v in variables if v not in variables_cat + variables_num]): raise TypeError( "Some of the variables are neither numerical nor categorical.") return variables_cat, variables_num
def _is_categories_num(column: pd.Series) -> bool: return is_numeric(column.dtype.categories)