def test_checknull_old(self):
    """
    Check ``libmissing.checknull`` with ``inf_as_na=True``.

    Values from ``na_vals``, ``sometimes_na_vals`` and ``inf_vals`` must be
    reported as null; values from ``int_na_vals`` and ``never_na_vals``
    must not.
    """
    # Each entry pairs a group of values with the verdict expected for it.
    cases = (
        (na_vals + sometimes_na_vals, True),
        (inf_vals, True),
        (int_na_vals, False),
        (never_na_vals, False),
    )
    for values, is_null in cases:
        for value in values:
            assert bool(libmissing.checknull(value, inf_as_na=True)) is is_null
def test_checknull(self):
    """
    Check ``libmissing.checknull`` with its default arguments.

    Only the values in ``na_vals`` must be reported as null; every other
    value group must be reported as not null.
    """
    assert all(libmissing.checknull(value) for value in na_vals)

    not_null_groups = (inf_vals, int_na_vals, sometimes_na_vals, never_na_vals)
    for group in not_null_groups:
        assert not any(libmissing.checknull(value) for value in group)
def test_checknull(self):
    """
    Verify the default behaviour of ``libmissing.checknull``.

    ``na_vals`` are the only values expected to be flagged as null; the
    infinite, integer-NA, sometimes-NA and never-NA groups are all expected
    to be flagged as not null.
    """
    expectations = [
        (na_vals, True),
        (inf_vals, False),
        (int_na_vals, False),
        (sometimes_na_vals, False),
        (never_na_vals, False),
    ]
    for values, should_be_null in expectations:
        for value in values:
            assert bool(libmissing.checknull(value)) is should_be_null
def _isna_new(obj):
    """
    Detect missing values in ``obj``, dispatching on its type.

    Scalars are checked with ``libmissing.checknull``.  Series, ndarrays,
    indexes and extension/datetime/timedelta arrays are handed to
    ``_isna_ndarraylike``; lists and objects exposing ``__array__`` are
    converted with ``np.asarray`` first.  Other ``ABCGeneric`` objects are
    rebuilt through ``obj._constructor`` from ``obj._data.isna(func=isna)``.
    Anything else is null only when it is ``None``.

    Raises
    ------
    NotImplementedError
        If ``obj`` is a MultiIndex.
    """
    if is_scalar(obj):
        return libmissing.checknull(obj)

    # Temporary workaround: a MultiIndex registers as an ndarray, so it has
    # to be rejected before the array-like check below.
    if isinstance(obj, ABCMultiIndex):
        raise NotImplementedError("isna is not defined for MultiIndex")

    array_like_types = (
        ABCSeries,
        np.ndarray,
        ABCIndexClass,
        ABCExtensionArray,
        ABCDatetimeArray,
        ABCTimedeltaArray,
    )
    if isinstance(obj, array_like_types):
        return _isna_ndarraylike(obj)

    if isinstance(obj, ABCGeneric):
        return obj._constructor(obj._data.isna(func=isna))

    if isinstance(obj, list):
        # Lists are converted as object dtype before the check.
        return _isna_ndarraylike(np.asarray(obj, dtype=object))

    if hasattr(obj, "__array__"):
        return _isna_ndarraylike(np.asarray(obj))

    return obj is None
def _to_str_if_not_na(obj: Any) -> Any: """ This function transforms an obj to str if it is not NA. The check for NA is similar to pd.isna, but will treat a list obj as a scalar and return a single boolean, rather than a list of booleans. Otherwise when a cell is tuple or list it will throw an error. """ return obj if libmissing.checknull(obj) else str(obj)
def _isna_new(obj):
    """
    Detect missing values in ``obj``, dispatching on its type.

    Scalars go through ``libmissing.checknull``.  Series, ndarrays and
    indexes are passed to ``_isna_ndarraylike``; lists and objects exposing
    ``__array__`` are converted with ``np.asarray`` first.  Other
    ``ABCGeneric`` objects are rebuilt through ``obj._constructor`` from
    ``obj._data.isna(func=isna)``.  Anything else is null only when it is
    ``None``.

    Raises
    ------
    NotImplementedError
        If ``obj`` is a MultiIndex.
    """
    if is_scalar(obj):
        return libmissing.checknull(obj)

    # Temporary workaround: a MultiIndex registers as an ndarray, so it has
    # to be rejected before the array-like check below.
    if isinstance(obj, ABCMultiIndex):
        raise NotImplementedError("isna is not defined for MultiIndex")

    if isinstance(obj, (ABCSeries, np.ndarray, ABCIndexClass)):
        return _isna_ndarraylike(obj)

    if isinstance(obj, ABCGeneric):
        return obj._constructor(obj._data.isna(func=isna))

    if isinstance(obj, list) or hasattr(obj, "__array__"):
        return _isna_ndarraylike(np.asarray(obj))

    return obj is None
def _isna_new(obj):
    """
    Report which values of ``obj`` are missing.

    The check used depends on the type of ``obj``: ``libmissing.checknull``
    for scalars, ``_isna_ndarraylike`` for Series/ndarray/index objects (and
    for lists or ``__array__`` providers after ``np.asarray``), and a
    reconstruction through ``obj._constructor`` for other ``ABCGeneric``
    objects.  Any remaining object counts as missing only if it is ``None``.

    Raises
    ------
    NotImplementedError
        If ``obj`` is a MultiIndex.
    """
    if is_scalar(obj):
        result = libmissing.checknull(obj)
    elif isinstance(obj, ABCMultiIndex):
        # Workaround (for now): a MultiIndex registers as an ndarray, so it
        # must be caught ahead of the ndarray branch.
        raise NotImplementedError("isna is not defined for MultiIndex")
    elif isinstance(obj, (ABCSeries, np.ndarray, ABCIndexClass)):
        result = _isna_ndarraylike(obj)
    elif isinstance(obj, ABCGeneric):
        blocks_mask = obj._data.isna(func=isna)
        result = obj._constructor(blocks_mask)
    elif isinstance(obj, list) or hasattr(obj, "__array__"):
        as_array = np.asarray(obj)
        result = _isna_ndarraylike(as_array)
    else:
        result = obj is None
    return result
def _isna(obj, inf_as_na: bool = False):
    """
    Detect missing values, treating None, NaN or NA as null.

    When ``inf_as_na`` is True, infinite values are treated as null too.

    Parameters
    ----------
    obj : ndarray or object value
        Input array or scalar value.
    inf_as_na : bool
        Whether to treat infinity as null.

    Returns
    -------
    boolean ndarray or boolean

    Raises
    ------
    NotImplementedError
        If ``obj`` is a MultiIndex.
    """
    if is_scalar(obj):
        scalar_check = libmissing.checknull_old if inf_as_na else libmissing.checknull
        return scalar_check(obj)

    # Workaround (for now): a MultiIndex registers as an ndarray, so reject it
    # before the array branches.
    if isinstance(obj, ABCMultiIndex):
        raise NotImplementedError("isna is not defined for MultiIndex")

    if isinstance(obj, type):
        return False

    if isinstance(obj, (np.ndarray, ABCExtensionArray)):
        return _isna_array(obj, inf_as_na=inf_as_na)

    if isinstance(obj, ABCIndex):
        # Try to use cached isna, which also short-circuits for integer dtypes
        # and avoids materializing RangeIndex._values
        if obj._can_hold_na:
            return _isna_array(obj._values, inf_as_na=inf_as_na)
        return obj.isna()

    if isinstance(obj, ABCSeries):
        mask = _isna_array(obj._values, inf_as_na=inf_as_na)
        # Box the mask back into the same container type, keeping index/name.
        return obj._constructor(mask, index=obj.index, name=obj.name, copy=False)

    if isinstance(obj, ABCDataFrame):
        return obj.isna()

    if isinstance(obj, list):
        return _isna_array(np.asarray(obj, dtype=object), inf_as_na=inf_as_na)

    if hasattr(obj, "__array__"):
        return _isna_array(np.asarray(obj), inf_as_na=inf_as_na)

    return False
def _isna(obj, inf_as_na: bool = False):
    """
    Detect missing values, treating None, NaN or NA as null.

    When ``inf_as_na`` is True, infinite values are treated as null too.

    Parameters
    ----------
    obj : ndarray or object value
        Input array or scalar value.
    inf_as_na : bool
        Whether to treat infinity as null.

    Returns
    -------
    boolean ndarray or boolean

    Raises
    ------
    NotImplementedError
        If ``obj`` is a MultiIndex.
    """
    if is_scalar(obj):
        scalar_check = libmissing.checknull_old if inf_as_na else libmissing.checknull
        return scalar_check(obj)

    # Workaround (for now): a MultiIndex registers as an ndarray, so reject it
    # before the array branches.
    if isinstance(obj, ABCMultiIndex):
        raise NotImplementedError("isna is not defined for MultiIndex")

    if isinstance(obj, type):
        return False

    if isinstance(obj, (np.ndarray, ABCExtensionArray)):
        # error: Value of type variable "ArrayLike" of "_isna_array" cannot be
        # "Union[ndarray, ExtensionArray]"
        return _isna_array(obj, inf_as_na=inf_as_na)  # type: ignore[type-var]

    if isinstance(obj, (ABCSeries, ABCIndex)):
        # error: Value of type variable "ArrayLike" of "_isna_array" cannot be
        # "Union[Any, ExtensionArray, ndarray]"
        mask = _isna_array(obj._values, inf_as_na=inf_as_na)  # type: ignore[type-var]
        if not isinstance(obj, ABCSeries):
            # Index input: the raw mask is returned without boxing.
            return mask
        # Series input: box the mask, keeping index and name.
        return obj._constructor(mask, index=obj.index, name=obj.name, copy=False)

    if isinstance(obj, ABCDataFrame):
        return obj.isna()

    if isinstance(obj, list):
        return _isna_array(np.asarray(obj, dtype=object), inf_as_na=inf_as_na)

    if hasattr(obj, "__array__"):
        return _isna_array(np.asarray(obj), inf_as_na=inf_as_na)

    return False
def _isna_new(obj):
    """
    Detect missing values in ``obj``, dispatching on its type.

    Scalars go through ``libmissing.checknull``; classes (``type`` instances)
    are never null.  Series, ndarrays, indexes and extension arrays are
    passed to ``_isna_ndarraylike``, as are lists and ``__array__``
    providers after ``np.asarray``.  DataFrames use their own ``isna``
    method.  Any other object is reported as not null.

    Raises
    ------
    NotImplementedError
        If ``obj`` is a MultiIndex.
    """
    if is_scalar(obj):
        return libmissing.checknull(obj)

    # Workaround (for now): a MultiIndex registers as an ndarray, so reject it
    # before the array-like check.
    if isinstance(obj, ABCMultiIndex):
        raise NotImplementedError("isna is not defined for MultiIndex")

    if isinstance(obj, type):
        return False

    array_like_types = (ABCSeries, np.ndarray, ABCIndexClass, ABCExtensionArray)
    if isinstance(obj, array_like_types):
        return _isna_ndarraylike(obj)

    if isinstance(obj, ABCDataFrame):
        return obj.isna()

    if isinstance(obj, list):
        # Lists are converted as object dtype before the check.
        return _isna_ndarraylike(np.asarray(obj, dtype=object))

    if hasattr(obj, "__array__"):
        return _isna_ndarraylike(np.asarray(obj))

    return False
def _isna(obj, inf_as_na: bool = False):
    """
    Detect missing values, treating None, NaN or NA as null.

    When ``inf_as_na`` is True, infinite values are treated as null too.

    Parameters
    ----------
    obj : ndarray or object value
        Input array or scalar value.
    inf_as_na : bool
        Whether to treat infinity as null.

    Returns
    -------
    boolean ndarray or boolean

    Raises
    ------
    NotImplementedError
        If ``obj`` is a MultiIndex.
    """

    def _array_mask(values):
        # Forward to the array-like implementation with the caller's setting.
        return _isna_ndarraylike(values, inf_as_na=inf_as_na)

    if is_scalar(obj):
        if inf_as_na:
            return libmissing.checknull_old(obj)
        return libmissing.checknull(obj)

    # Workaround (for now): a MultiIndex registers as an ndarray, so reject it
    # before the array-like check.
    if isinstance(obj, ABCMultiIndex):
        raise NotImplementedError("isna is not defined for MultiIndex")

    if isinstance(obj, type):
        return False

    if isinstance(obj, (ABCSeries, np.ndarray, ABCIndexClass, ABCExtensionArray)):
        return _array_mask(obj)

    if isinstance(obj, ABCDataFrame):
        return obj.isna()

    if isinstance(obj, list):
        return _array_mask(np.asarray(obj, dtype=object))

    if hasattr(obj, "__array__"):
        return _array_mask(np.asarray(obj))

    return False
def preprocess_dataframe(
    org_df: Union[pd.DataFrame, dd.DataFrame],
    used_columns: Optional[Union[List[str], List[object]]] = None,
    excluded_columns: Optional[Union[List[str], List[object]]] = None,
    detect_small_distinct: bool = True,
) -> dd.DataFrame:
    """
    Make a dask dataframe with only used_columns.

    This function will do the following:
        1. keep only used_columns (in the order they appear in org_df).
        2. transform column name to string (avoid object column name) and
           rename duplicate column names in form of {col}_{id}.
        3. reset index
        4. transform object column to string column (note that obj column
           can contain cells from different type).
        5. transform to dask dataframe if input is pandas dataframe.

    Parameters
    ----------
    org_df: dataframe
        the original dataframe
    used_columns: optional list[str], default None
        used columns in org_df
    excluded_columns: optional list[str], default None
        excluded columns from used_columns, mainly used for geo point data
        processing.
    detect_small_distinct: bool, default True
        whether to detect numerical columns with small distinct values as
        categorical column.

    Returns
    -------
    dd.DataFrame
        The preprocessed dataframe as returned by ``to_dask``, with string
        column names and a fresh index.
    """
    if used_columns is None:
        df = org_df.copy()
    else:
        # Process the case when used_columns are string column name,
        # but org_df column name is object.
        used_columns_set = set(used_columns)
        # Select with a list, not a set: a set loses the original column order
        # and is rejected as an indexer by pandas >= 2.0.  dict.fromkeys
        # de-duplicates labels (as the set did) while preserving order, so a
        # duplicated label is not selected twice.
        used_cols_obj = [
            col
            for col in dict.fromkeys(org_df.columns)
            if str(col) in used_columns_set or col in used_columns_set
        ]
        df = org_df[used_cols_obj]

    columns = list(df.columns)

    # Resolve duplicate names in columns.
    # Duplicate names will be renamed as col_{id}.
    column_count = Counter(columns)
    current_id: Dict[Any, int] = dict()
    for i, col in enumerate(columns):
        if column_count[col] > 1:
            current_id[col] = current_id.get(col, 0) + 1
            new_col_name = f"{col}_{current_id[col]}"
        else:
            new_col_name = f"{col}"
        columns[i] = new_col_name

    df.columns = columns
    df = df.reset_index(drop=True)
    df = to_dask(df)

    def _notna2str(obj: Any) -> Any:
        """
        Transform ``obj`` to str if it is not NA.

        The check for NA is similar to pd.isna, but will treat a list obj as
        a scalar and return a single boolean, rather than a list of booleans.
        Otherwise when a cell is tuple or list it will throw an error.
        """
        return obj if libmissing.checknull(obj) else str(obj)

    # Since an object column could contain multiple types in different
    # cells, transform non-NA values in nominal columns to string.
    for col in df.columns:
        col_dtype = detect_dtype(df[col], detect_small_distinct=detect_small_distinct)
        if (is_dtype(col_dtype, Nominal())) and (
            (excluded_columns is None) or (col not in excluded_columns)
        ):
            df[col] = df[col].apply(_notna2str, meta=(col, "object"))
    return df
def test_checknull_old_never_na_vals(self, value):
    """``value`` must not be reported as null, even with ``inf_as_na=True``."""
    is_null = libmissing.checknull(value, inf_as_na=True)
    assert not is_null