def test_guess_type_of_variable_boolean(sparse):
    """Boolean series must be guessed as categorical, with or without a missing value.

    Each series is passed through ``_convert_sparse(s, sparse)`` before the guess.
    NOTE(review): ``sparse`` is presumably supplied by a pytest fixture or
    parametrization defined elsewhere -- confirm.
    """
    boolean_patterns = (
        [True, False, True, None],  # booleans mixed with a missing value
        [True, False, True],  # pure booleans
    )
    for pattern in boolean_patterns:
        serie = pd.Series(pattern * 10)
        serie = _convert_sparse(serie, sparse)
        assert guess_type_of_variable(serie) == TypeOfVariables.CAT
def test_ColumnsSelector__get_list_of_columns():
    """Check how ``ColumnsSelector._get_list_of_columns`` resolves each kind of 'columns' value.

    Covers: the "all" keyword, explicit lists (names or integers), ``None``,
    a dtype string, the variable types listed in ``TypeOfVariables.alls``,
    and the error cases.
    """
    X = pd.DataFrame({"a": [0, 1, 2], "b": ["AAA", "BBB", "CCC"], "c": ["xx", "yy", "zz"], "d": [0.1, 0.2, 0.3]})

    # "all" is signalled by None, explicit lists are returned unchanged, None means no column
    assert ColumnsSelector._get_list_of_columns("all", X, regex_match=False) is None
    assert ColumnsSelector._get_list_of_columns(["a"], X, regex_match=False) == ["a"]
    assert ColumnsSelector._get_list_of_columns(["a", "b"], X, regex_match=False) == ["a", "b"]
    assert ColumnsSelector._get_list_of_columns([0, 1, 2], X, regex_match=False) == [0, 1, 2]
    assert ColumnsSelector._get_list_of_columns(None, X, regex_match=False) == []

    # a dtype string selects the columns having that dtype
    assert ColumnsSelector._get_list_of_columns("object", X, regex_match=False) == ["b", "c"]

    with pytest.raises(TypeError):
        # error : a string 'columns' needs a DataFrame, not a numpy array.
        # The arguments are in (columns, X) order so the DataFrame check is the one being hit
        # (with the arguments swapped, the TypeError came from the 2-dimensional array check).
        ColumnsSelector._get_list_of_columns("object", X.values, regex_match=False)

    with pytest.raises(TypeError):
        # error : a dict is not an accepted 'columns' specification
        ColumnsSelector._get_list_of_columns({"type": "not recognized"}, X, regex_match=False)

    with pytest.raises(ValueError):
        # error : regex_match is meaningless when 'columns' designates a type
        ColumnsSelector._get_list_of_columns("object", X, regex_match=True)

    # each variable type keeps exactly the columns guessed as being of that type
    for columns in TypeOfVariables.alls:
        assert ColumnsSelector._get_list_of_columns(columns, X) == [
            c for c in X.columns if guess_type_of_variable(X[c]) == columns
        ]
def guess_columns_to_encode(X):
    """Guess which columns of X should be encoded.

    Returns the list of the columns of ``X`` (in column order) for which
    ``guess_type_of_variable`` answers ``TypeOfVariables.CAT``.
    """
    return [name for name in X.columns if guess_type_of_variable(X[name]) == TypeOfVariables.CAT]
def test_guess_type_of_variable():
    """Check the guessed type of each column of the sample DataFrame.

    Also checks that turning the categorical column into a pandas 'category'
    dtype changes neither the guessed types nor the underlying values.
    """
    df = get_sample_df(100)
    # the first 3 characters of the text column are used as a categorical column
    df["cat_col_1"] = df["text_col"].apply(lambda text: text[0:3])

    expected_types = (
        ("float_col", "NUM"),
        ("int_col", "NUM"),
        ("text_col", "TEXT"),
        ("cat_col_1", "CAT"),
    )
    for column, expected in expected_types:
        assert guess_type_of_variable(df[column]) == expected

    df_with_cat = df.copy()
    df_with_cat["cat_col_1"] = df_with_cat["cat_col_1"].astype("category")

    same_guess = [
        guess_type_of_variable(df[col]) == guess_type_of_variable(df_with_cat[col]) for col in df.columns
    ]
    assert np.all(same_guess)

    assert (df.values == df_with_cat.values).all()
def _get_list_of_columns(columns, X, regex_match=False): """ retrieve the corresponding list of columns from the specified 'columns' attribute given by the user """ if columns is None: list_columns = [] elif isinstance(columns, str) and columns == "all": # 'columns' == 'all' ==> we keep all the columns list_columns = None elif isinstance(columns, str): if not isinstance(X, pd.DataFrame): raise TypeError( "X should be a DataFrame when 'columns_to_use' or 'columns_to_drop' is a string : %s" % columns ) if regex_match: raise ValueError( "regex_match is True doesn't mean anything when 'columns' is a type : %s" % str(columns) ) if columns in TypeOfVariables.alls: list_columns = [c for c in X.columns if guess_type_of_variable(X[c]) == columns] else: list_columns = list(X.select_dtypes(include=columns).columns) elif isinstance(columns, list): list_columns = columns elif isinstance(columns, np.ndarray): if columns.ndim != 1: raise TypeError("'columns_to_use' or 'columns_to_drop' should be a 1 dimensional array") list_columns = columns else: raise TypeError( "'columns_to_use' or 'columns_to_drop' should be either a string or a list and not %s" % str(type(columns)) ) return list_columns
def test_guess_type_of_variable_boolean():
    """A series of booleans is guessed as categorical, whether or not it contains a None."""
    with_missing = pd.Series([True, False, True, None] * 10)
    without_missing = pd.Series([True, False, True] * 10)

    assert guess_type_of_variable(with_missing) == TypeOfVariables.CAT
    assert guess_type_of_variable(without_missing) == TypeOfVariables.CAT