예제 #1
def _find_or_check_datetime_variables(X: pd.DataFrame,
                                      variables: Variables = None
                                      ) -> List[Union[str, int]]:
    """
    Checks that variables provided by the user are of type datetime.
    If None, finds all datetime variables in the DataFrame.

    Parameters
    ----------
    X : pandas DataFrame
    variables : variable or list of variables. Defaults to None.

    Returns
    -------
    variables : List of datetime variables.

    Raises
    ------
    ValueError
        If no datetime variable is found when ``variables`` is None, or if
        the list of variables passed by the user is empty.
    TypeError
        If any of the variables passed by the user is not datetime.
    """
    if variables is None:
        # Only non-numeric columns are inspected; a column qualifies when
        # either of the two datetime checks accepts it.
        variables = [
            column for column in X.select_dtypes(exclude="number").columns
            if is_datetime(X[column])
            or _is_categorical_and_is_datetime(X[column])
        ]

        if len(variables) == 0:
            raise ValueError("No datetime variables found in this dataframe.")

    elif isinstance(variables, (str, int)):
        # A single column label: validate it and wrap it in a list.
        if is_datetime(X[variables]) or (
            not is_numeric(X[variables])
            and _is_categorical_and_is_datetime(X[variables])
        ):
            variables = [variables]
        else:
            raise TypeError("The variable entered is not datetime.")

    else:
        if len(variables) == 0:
            raise ValueError("The indicated list of variables is empty.")

        # check that the variables entered by the user are datetime
        vars_non_dt = [
            column
            for column in X[variables].select_dtypes(exclude="datetime")
            if is_numeric(X[column])
            or not _is_categorical_and_is_datetime(X[column])
        ]

        if len(vars_non_dt) > 0:
            raise TypeError("Some of the variables are not datetime.")

    return variables
예제 #2
    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
        """
        This transformer does not learn any parameter.

        Finds datetime variables or checks that the variables selected by the user
        can be converted to datetime.

        Parameters
        ----------
        X: pandas dataframe of shape = [n_samples, n_features]
            The training input samples. Can be the entire dataframe, not just the
            variables to transform.

        y: pandas Series, default=None
            It is not needed in this transformer. You can pass y or None.

        Returns
        -------
        self
            The instance, with ``variables_``, ``features_to_extract_``,
            ``feature_names_in_`` and ``n_features_in_`` set.

        Raises
        ------
        TypeError
            If ``variables`` is "index" and the dataframe index is not datetime.
        ValueError
            If ``variables`` is "index", ``missing_values`` is "raise" and the
            index contains missing values.
        """
        # check input dataframe
        X = check_X(X)

        # special case index
        if self.variables == "index":

            index_is_datetime = is_datetime(X.index) or (
                not is_numeric(X.index) and _is_categorical_and_is_datetime(X.index)
            )
            if not index_is_datetime:
                raise TypeError("The dataframe index is not datetime.")

            if self.missing_values == "raise":
                # NOTE(review): the statement originally guarded by this
                # condition was missing from the source; the check below
                # applies the "raise on missing values" policy to the index.
                # Confirm the exact message against the rest of the class.
                if X.index.isnull().any():
                    raise ValueError(
                        "The dataframe index contains NaN. Check and remove "
                        "those before using this transformer."
                    )

            # no column is selected when the features come from the index
            self.variables_ = None

        else:
            # find or check for datetime variables
            self.variables_ = _find_or_check_datetime_variables(X, self.variables)

            # check if datetime variables contains na
            if self.missing_values == "raise":
                _check_contains_na(X, self.variables_)

        if self.features_to_extract is None:
            self.features_to_extract_ = FEATURES_DEFAULT
        elif isinstance(self.features_to_extract, str):
            # any string value selects every supported feature
            self.features_to_extract_ = FEATURES_SUPPORTED
        else:
            self.features_to_extract_ = self.features_to_extract

        # save input features
        self.feature_names_in_ = X.columns.tolist()

        # save train set shape
        self.n_features_in_ = X.shape[1]

        return self
예제 #3
def _find_or_check_numerical_variables(X: pd.DataFrame,
                                       variables: Variables = None
                                       ) -> List[Union[str, int]]:
    """
    Checks that variables provided by the user are of type numerical. If None, finds
    all the numerical variables in the DataFrame.

    Parameters
    ----------
    X : Pandas DataFrame.
    variables : variable or list of variables. Defaults to None.

    Raises
    ------
    ValueError
        If there are no numerical variables in the df or the df is empty.
        If the list of variables provided by the user is empty.
    TypeError
        If any of the user provided variables are not numerical.

    Returns
    -------
    variables: List of numerical variables.
    """
    if variables is None:
        # find numerical variables in dataset
        variables = list(X.select_dtypes(include="number").columns)
        if len(variables) == 0:
            raise ValueError(
                "No numerical variables found in this dataframe. Please check "
                "variable format with pandas dtypes.")

    elif isinstance(variables, (str, int)):
        # A single column label: validate it and wrap it in a list.
        if is_numeric(X[variables]):
            variables = [variables]
        else:
            raise TypeError("The variable entered is not numeric.")

    else:
        if len(variables) == 0:
            raise ValueError("The list of variables is empty.")

        # check that user entered variables are of type numerical
        if len(X[variables].select_dtypes(exclude="number").columns) > 0:
            raise TypeError(
                "Some of the variables are not numerical. Please cast them as "
                "numerical before using this transformer.")

    return variables
예제 #4
	def check_column_dtype(self,data,column_name,required_dtype):
		"""
		:param data : Data
		:type data  : pd.DataFrame,
		:			  str - filename (csv/xls/xlsx)
		:param column_name : column name
		:type column_name  : str
		:param required_dtype : one among {"Indicator","Date","Country","Value"}
		:type required_dtype  : str
		:Functionality: checks if values in corresponding column("column_name") of data
		:               has the property of required_dtype
		:               For "Indicator" - all values of col should be str
		:               For "Date" - all values of col should be datetime64
		:               For "Country" - all values of col should be str # tbd - check if they are actual country names
		:               For "Value" - all values of col should be numeric
		:raises AssertionError: if an argument is invalid or the column does not
		:               have the required dtype
		:raises NotImplementedError: if required_dtype is accepted by the
		:               configuration but has no check implemented here
		"""
		assert isinstance(column_name,(str,int,dt.date))
		assert column_name in data.keys().values
		assert required_dtype in self.params["REQUIREDCOLUMNS"]["TYPE1"]
		# NOTE(review): the column's presence is validated on `data`, but the
		# dtype checks below read `self.data` - confirm both refer to the same
		# frame.
		if required_dtype=="Indicator":
			# check if each value in the data.column_name column is str type
			assert is_string(self.data[column_name])
		elif required_dtype=="Date":
			# check if each value in the data.column_name column is datetime64 type
			assert is_datetime(self.data[column_name])
		elif required_dtype=="Country":
			# check if each value in the data.column_name column is str type
			assert is_string(self.data[column_name])
			# tbd - check if they are actual country names
		elif required_dtype=="Value":
			# check if each value in the data.column_name column is numeric
			assert is_numeric(self.data[column_name])
		else:
			# configured dtype with no check implemented yet
			raise NotImplementedError
예제 #5
 def contains_op(cls, series: pd.Series) -> bool:
     """Report whether ``pdt.is_numeric`` accepts ``series``.

     NOTE(review): ``pdt`` is defined outside this block. If it is
     ``pandas.api.types``, the public function is named
     ``is_numeric_dtype`` - confirm that ``pdt.is_numeric`` resolves.
     """
     series_is_numeric = pdt.is_numeric(series)
     return series_is_numeric
예제 #6
def _is_convertible_to_num(column: pd.Series) -> bool:
    """
    Check whether a column is numeric after casting it with ``pd.to_numeric``.

    Parameters
    ----------
    column : pandas Series
        The column to test.

    Returns
    -------
    bool
        The result of ``is_numeric`` on the converted column, or on the
        column itself when the conversion fails.
    """
    # ``errors="ignore"`` is deprecated in pandas; it returned the input
    # unchanged when parsing failed. Catching the failure explicitly gives
    # the same result without the deprecated option.
    try:
        converted = pd.to_numeric(column)
    except (ValueError, TypeError):
        converted = column
    return is_numeric(converted)
예제 #7
def _find_categorical_and_numerical_variables(
    X: pd.DataFrame,
    variables: Variables = None
) -> Tuple[List[Union[str, int]], List[Union[str, int]]]:
    """
    Find numerical and categorical variables.

    Parameters
    ----------
    X :  pandas DataFrame

    variables : List of variables. Defaults to None.

    Returns
    -------
    variables : Tuple with the list of categorical variables followed by the
        list of numerical variables.

    Raises
    ------
    TypeError
        If a variable is neither numerical nor categorical, or if the
        dataframe has no numerical or categorical variables.
    ValueError
        If the list of variables is empty.
    """
    # If the user passes just 1 variable outside a list.
    if isinstance(variables, (str, int)):

        if is_categorical(X[variables]) or is_object(X[variables]):
            variables_cat = [variables]
            variables_num = []
        elif is_numeric(X[variables]):
            variables_num = [variables]
            variables_cat = []
        else:
            raise TypeError(
                "The variable entered is neither numerical nor categorical.")

    # If user leaves default None parameter.
    elif variables is None:
        # find categorical variables
        variables_cat = [
            column for column in X.select_dtypes(
                include=["O", "category"]).columns
            if _is_categorical_and_is_not_datetime(X[column])
        ]
        # find numerical variables in dataset
        variables_num = list(X.select_dtypes(include="number").columns)

        if len(variables_num) == 0 and len(variables_cat) == 0:
            raise TypeError(
                "There are no numerical or categorical variables in the dataframe"
            )

    # If user passes variable list.
    else:
        if len(variables) == 0:
            raise ValueError("The list of variables is empty.")

        # find categorical variables
        variables_cat = list(
            X[variables].select_dtypes(include=["O", "category"]).columns)

        # find numerical variables
        variables_num = list(
            X[variables].select_dtypes(include="number").columns)

        # Test membership, not the truthiness of the leftover labels:
        # a column named 0 or "" is falsy and would otherwise go unreported.
        classified = variables_cat + variables_num
        if any(v not in classified for v in variables):
            raise TypeError(
                "Some of the variables are neither numerical nor categorical.")

    return variables_cat, variables_num
예제 #8
def _is_categories_num(column: pd.Series) -> bool:
    """
    Check whether the categories of a categorical column are numeric.

    Parameters
    ----------
    column : pandas Series
        The column to test; its dtype must expose ``categories``.

    Returns
    -------
    bool
        The result of ``is_numeric`` applied to ``column.dtype.categories``.
    """
    category_labels = column.dtype.categories
    return is_numeric(category_labels)