예제 #1
0
def _find_or_check_datetime_variables(X: pd.DataFrame,
                                      variables: Variables = None
                                      ) -> List[Union[str, int]]:
    """
    Return the datetime variables of a DataFrame, either discovered or validated.

    When ``variables`` is None, every non-numeric column is inspected and the
    ones recognised as datetime are collected. Otherwise the variable(s) passed
    by the user are verified to be datetime.

    Parameters
    ----------
    X : pandas DataFrame
    variables : variable or list of variables. Defaults to None.

    Raises
    ------
    ValueError
        If no datetime variable is found, or if an empty list is passed.
    TypeError
        If any of the variables passed by the user is not datetime.

    Returns
    -------
    variables : List of datetime variables.
    """

    # Nothing specified: scan the non-numeric columns of the DataFrame.
    if variables is None:
        detected = []
        for name in X.select_dtypes(exclude="number").columns:
            if is_datetime(X[name]) or _is_categorical_and_is_datetime(X[name]):
                detected.append(name)

        if not detected:
            raise ValueError("No datetime variables found in this dataframe.")

        return detected

    # A single variable passed outside of a list.
    if isinstance(variables, (str, int)):
        series = X[variables]

        if not is_datetime(series):
            # numeric columns are rejected before the categorical/object check
            if is_numeric(series) or not _is_categorical_and_is_datetime(series):
                raise TypeError("The variable entered is not datetime.")

        return [variables]

    # A collection of variables.
    if len(variables) == 0:
        raise ValueError("The indicated list of variables is empty.")

    # Only columns whose dtype is not already datetime need a closer look.
    offenders = []
    for name in X[variables].select_dtypes(exclude="datetime"):
        if is_numeric(X[name]) or not _is_categorical_and_is_datetime(X[name]):
            offenders.append(name)

    if offenders:
        raise TypeError("Some of the variables are not datetime.")

    return variables
예제 #2
0
    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
        """
        Validate the input and record the variables and features to work with.

        No parameter is learned from the data. Either the dataframe index is
        verified to be datetime (when ``variables`` is ``"index"``), or the
        datetime variables are found / checked among the columns.

        Parameters
        ----------
        X: pandas dataframe of shape = [n_samples, n_features]
            The training input samples. Can be the entire dataframe, not just the
            variables to transform.

        y: pandas Series, default=None
            It is not needed in this transformer. You can pass y or None.
        """
        # validate the input dataframe
        X = check_X(X)

        if self.variables == "index":
            # the features will be derived from the index, not from columns
            idx = X.index

            if not is_datetime(idx):
                if is_numeric(idx) or not _is_categorical_and_is_datetime(idx):
                    raise TypeError("The dataframe index is not datetime.")

            if self.missing_values == "raise":
                self._check_index_contains_na(idx)

            self.variables_ = None

        else:
            # locate the datetime columns, or validate the user's selection
            self.variables_ = _find_or_check_datetime_variables(X, self.variables)

            # optionally refuse missing data in those columns
            if self.missing_values == "raise":
                _check_contains_na(X, self.variables_)

        # resolve which features will be extracted
        requested = self.features_to_extract
        if requested is None:
            resolved = FEATURES_DEFAULT
        elif isinstance(requested, str):
            resolved = FEATURES_SUPPORTED
        else:
            resolved = requested
        self.features_to_extract_ = resolved

        # remember the names and the number of the input features
        self.feature_names_in_ = X.columns.tolist()
        self.n_features_in_ = X.shape[1]

        return self
예제 #3
0
def _find_or_check_numerical_variables(X: pd.DataFrame,
                                       variables: Variables = None
                                       ) -> List[Union[str, int]]:
    """
    Return the numerical variables of a DataFrame, either discovered or validated.

    When ``variables`` is None, all columns of numerical dtype are collected.
    Otherwise the variable(s) passed by the user are verified to be numerical.

    Parameters
    ----------
    X : Pandas DataFrame.
    variables : variable or list of variables. Defaults to None.

    Raises
    ------
    ValueError
        If no numerical variable is found in the DataFrame, or if an empty list
        of variables is passed.
    TypeError
        If any of the user provided variables are not numerical.

    Returns
    -------
    variables: List of numerical variables.
    """

    # Nothing specified: collect every column of numerical dtype.
    if variables is None:
        numeric_columns = X.select_dtypes(include="number").columns
        found = list(numeric_columns)
        if not found:
            raise ValueError(
                "No numerical variables found in this dataframe. Please check "
                "variable format with pandas dtypes.")
        return found

    # A single variable passed outside of a list.
    if isinstance(variables, (str, int)):
        if not is_numeric(X[variables]):
            raise TypeError("The variable entered is not numeric.")
        return [variables]

    # A collection of variables.
    if len(variables) == 0:
        raise ValueError("The list of variables is empty.")

    # any selected column left after excluding numbers is an offender
    non_numeric = X[variables].select_dtypes(exclude="number")
    if non_numeric.shape[1] > 0:
        raise TypeError(
            "Some of the variables are not numerical. Please cast them as "
            "numerical before using this transformer.")

    return variables
예제 #4
0
	def check_column_dtype(self,data,column_name,required_dtype):
		'''
		Assert that a column satisfies the dtype check tied to ``required_dtype``.

		:param data : Data
		:type data  : pd.DataFrame,
		:			  str - filename (csv/xls/xlsx)
		:param column_name : column name
		:type column_name  : str, int or dt.date
		:param required_dtype : one among {"Indicator","Date","Country","Value"}
		:type required_dtype  : str
		:Functionality: after validating ``data`` with ``self.check_data``, asserts
		:               that ``column_name`` is one of the keys of ``data`` and that
		:               ``required_dtype`` is listed in
		:               ``self.params["REQUIREDCOLUMNS"]["TYPE1"]``, then applies:
		:               For "Indicator" - ``is_string`` on the column
		:               For "Date" - ``is_datetime`` on the column
		:               For "Country" - ``is_string`` on the column
		:               For "Value" - ``is_numeric`` on the column
		:raises AssertionError: when any of the checks above fails
		:raises NotImplementedError: for any other ``required_dtype``
		'''

		self.check_data(data)
		assert isinstance(column_name,(str,int,dt.date))
		assert column_name in data.keys().values
		assert required_dtype in self.params["REQUIREDCOLUMNS"]["TYPE1"]

		# NOTE(review): the dtype checks below read ``self.data`` while the
		# membership check above uses the ``data`` argument - confirm that
		# both refer to the same frame.
		if required_dtype in ("Indicator","Country"):
			# both kinds only require string values
			# TODO: for "Country", check that the values are actual country names
			assert is_string(self.data[column_name])
		elif required_dtype=="Date":
			assert is_datetime(self.data[column_name])
		elif required_dtype=="Value":
			assert is_numeric(self.data[column_name])
		else:
			raise NotImplementedError
예제 #5
0
 def contains_op(cls, series: pd.Series) -> bool:
     """Return the result of ``pdt.is_numeric`` applied to ``series``."""
     series_is_numeric = pdt.is_numeric(series)
     return series_is_numeric
예제 #6
0
def _is_convertible_to_num(column: pd.Series) -> bool:
    """
    Tell whether a column is numeric once ``pd.to_numeric`` has been tried on it.

    Parameters
    ----------
    column : pandas Series

    Returns
    -------
    bool : True if the column, after the conversion attempt, has a numeric dtype.
    """
    # ``errors="ignore"`` is deprecated in pandas; catching the conversion
    # failure explicitly keeps the same fallback (use the column unchanged)
    # without the FutureWarning.
    try:
        converted = pd.to_numeric(column)
    except (ValueError, TypeError):
        converted = column
    return is_numeric(converted)
예제 #7
0
def _find_categorical_and_numerical_variables(
    X: pd.DataFrame,
    variables: Variables = None
) -> Tuple[List[Union[str, int]], List[Union[str, int]]]:
    """
    Find categorical and numerical variables.

    Parameters
    ----------
    X :  pandas DataFrame

    variables : variable or list of variables. Defaults to None, in which case
        the variables are searched for among all the columns of X.

    Raises
    ------
    TypeError
        If the variable entered, or any variable of the list, is neither
        numerical nor categorical, or if no such variable is found in X when
        ``variables`` is None.
    ValueError
        If an empty list of variables is passed.

    Returns
    -------
    variables : Tuple with the list of categorical variables first and the list
        of numerical variables second.
    """

    # If the user passes just 1 variable outside a list.
    if isinstance(variables, (str, int)):

        if is_categorical(X[variables]) or is_object(X[variables]):
            variables_cat = [variables]
            variables_num = []
        elif is_numeric(X[variables]):
            variables_num = [variables]
            variables_cat = []
        else:
            raise TypeError(
                "The variable entered is neither numerical nor categorical.")

    # If user leaves default None parameter.
    elif variables is None:
        # find categorical variables, leaving out those that hold datetime
        variables_cat = [
            column for column in X.select_dtypes(
                include=["O", "category"]).columns
            if _is_categorical_and_is_not_datetime(X[column])
        ]
        # find numerical variables in dataset
        variables_num = list(X.select_dtypes(include="number").columns)

        if len(variables_num) == 0 and len(variables_cat) == 0:
            raise TypeError(
                "There are no numerical or categorical variables in the dataframe"
            )

    # If user passes variable list.
    else:
        if len(variables) == 0:
            raise ValueError("The list of variables is empty.")

        selected = X[variables]

        # find categorical variables
        variables_cat = list(
            selected.select_dtypes(include=["O", "category"]).columns)

        # find numerical variables
        variables_num = list(selected.select_dtypes(include="number").columns)

        # Test membership itself rather than the truthiness of the names, so
        # that falsy column labels such as 0 or "" are not silently accepted.
        classified = variables_cat + variables_num
        if any(var not in classified for var in variables):
            raise TypeError(
                "Some of the variables are neither numerical nor categorical.")

    return variables_cat, variables_num
예제 #8
0
def _is_categories_num(column: pd.Series) -> bool:
    """Tell whether the categories of the column's dtype are numeric."""
    categories = column.dtype.categories
    return is_numeric(categories)