예제 #1
0
    def test_checknull_old(self):
        """Check ``libmissing.checknull`` with ``inf_as_na=True``.

        Values from ``na_vals``, ``sometimes_na_vals`` and ``inf_vals`` must
        be reported as null; values from ``int_na_vals`` and
        ``never_na_vals`` must not.
        """
        expected_null = (na_vals + sometimes_na_vals, inf_vals)
        for group in expected_null:
            for candidate in group:
                assert libmissing.checknull(candidate, inf_as_na=True)

        expected_not_null = (int_na_vals, never_na_vals)
        for group in expected_not_null:
            for candidate in group:
                assert not libmissing.checknull(candidate, inf_as_na=True)
    def test_checknull(self):
        """Check ``libmissing.checknull`` with its default arguments.

        Only values from ``na_vals`` must be reported as null; values from
        ``inf_vals``, ``int_na_vals``, ``sometimes_na_vals`` and
        ``never_na_vals`` must not.
        """
        for candidate in na_vals:
            assert libmissing.checknull(candidate)

        expected_not_null = (inf_vals, int_na_vals, sometimes_na_vals, never_na_vals)
        for group in expected_not_null:
            for candidate in group:
                assert not libmissing.checknull(candidate)
예제 #3
0
    def test_checknull(self):
        """Check ``libmissing.checknull`` with its default arguments.

        Only values from ``na_vals`` must be reported as null; values from
        ``inf_vals``, ``int_na_vals``, ``sometimes_na_vals`` and
        ``never_na_vals`` must not.
        """
        for candidate in na_vals:
            assert libmissing.checknull(candidate)

        expected_not_null = (inf_vals, int_na_vals, sometimes_na_vals, never_na_vals)
        for group in expected_not_null:
            for candidate in group:
                assert not libmissing.checknull(candidate)
예제 #4
0
def _isna_new(obj):
    """
    Detect missing values in ``obj``.

    Dispatch, in order:

    * scalars are checked with ``libmissing.checknull``;
    * Series, ndarrays, indexes and extension/datetime/timedelta arrays are
      passed to ``_isna_ndarraylike``;
    * other ``ABCGeneric`` objects are rebuilt through ``obj._constructor``
      from ``obj._data.isna(func=isna)``;
    * lists are converted to object-dtype ndarrays, and objects exposing
      ``__array__`` to ndarrays, before going to ``_isna_ndarraylike``;
    * for anything else the result is ``obj is None``.

    Raises
    ------
    NotImplementedError
        If ``obj`` is a MultiIndex.
    """
    if is_scalar(obj):
        return libmissing.checknull(obj)

    # hack (for now) because MI registers as ndarray
    if isinstance(obj, ABCMultiIndex):
        raise NotImplementedError("isna is not defined for MultiIndex")

    ndarray_like = (
        ABCSeries,
        np.ndarray,
        ABCIndexClass,
        ABCExtensionArray,
        ABCDatetimeArray,
        ABCTimedeltaArray,
    )
    if isinstance(obj, ndarray_like):
        return _isna_ndarraylike(obj)

    if isinstance(obj, ABCGeneric):
        rebuild = obj._constructor
        return rebuild(obj._data.isna(func=isna))

    if isinstance(obj, list):
        return _isna_ndarraylike(np.asarray(obj, dtype=object))

    if hasattr(obj, "__array__"):
        return _isna_ndarraylike(np.asarray(obj))

    return obj is None
예제 #5
0
def _to_str_if_not_na(obj: Any) -> Any:
    """
    Return ``obj`` unchanged when it is NA, otherwise return ``str(obj)``.

    The NA check uses ``libmissing.checknull``. It is similar to ``pd.isna``
    but treats a list (or tuple) cell as a single scalar and yields one
    boolean instead of a list of booleans, so such cells do not raise.
    """
    if libmissing.checknull(obj):
        return obj
    return str(obj)
예제 #6
0
def _isna_new(obj):
    """
    Detect missing values in ``obj``.

    Scalars are checked with ``libmissing.checknull``. Series, ndarrays and
    indexes are passed to ``_isna_ndarraylike``. Other ``ABCGeneric``
    objects are rebuilt through ``obj._constructor`` from
    ``obj._data.isna(func=isna)``. Lists and objects exposing ``__array__``
    are converted to ndarrays first. For anything else the result is
    ``obj is None``.

    Raises
    ------
    NotImplementedError
        If ``obj`` is a MultiIndex.
    """
    if is_scalar(obj):
        return libmissing.checknull(obj)
    # hack (for now) because MI registers as ndarray
    elif isinstance(obj, ABCMultiIndex):
        raise NotImplementedError("isna is not defined for MultiIndex")
    elif isinstance(obj, (ABCSeries, np.ndarray, ABCIndexClass)):
        return _isna_ndarraylike(obj)
    elif isinstance(obj, ABCGeneric):
        return obj._constructor(obj._data.isna(func=isna))
    elif isinstance(obj, list):
        # Force object dtype: letting numpy infer the dtype turns a list
        # such as ['a', np.nan] into a string array where the NaN has become
        # the text 'nan', so the missing value could no longer be detected.
        return _isna_ndarraylike(np.asarray(obj, dtype=object))
    elif hasattr(obj, '__array__'):
        return _isna_ndarraylike(np.asarray(obj))
    else:
        return obj is None
예제 #7
0
파일: missing.py 프로젝트: Xbar/pandas
def _isna_new(obj):
    """
    Detect missing values in ``obj``.

    Scalars are checked with ``libmissing.checknull``. Series, ndarrays and
    indexes are passed to ``_isna_ndarraylike``. Other ``ABCGeneric``
    objects are rebuilt through ``obj._constructor`` from
    ``obj._data.isna(func=isna)``. Lists and objects exposing ``__array__``
    are converted to ndarrays first. For anything else the result is
    ``obj is None``.

    Raises
    ------
    NotImplementedError
        If ``obj`` is a MultiIndex.
    """
    if is_scalar(obj):
        return libmissing.checknull(obj)
    # hack (for now) because MI registers as ndarray
    elif isinstance(obj, ABCMultiIndex):
        raise NotImplementedError("isna is not defined for MultiIndex")
    elif isinstance(obj, (ABCSeries, np.ndarray, ABCIndexClass)):
        return _isna_ndarraylike(obj)
    elif isinstance(obj, ABCGeneric):
        return obj._constructor(obj._data.isna(func=isna))
    elif isinstance(obj, list):
        # Force object dtype: letting numpy infer the dtype turns a list
        # such as ['a', np.nan] into a string array where the NaN has become
        # the text 'nan', so the missing value could no longer be detected.
        return _isna_ndarraylike(np.asarray(obj, dtype=object))
    elif hasattr(obj, '__array__'):
        return _isna_ndarraylike(np.asarray(obj))
    else:
        return obj is None
예제 #8
0
def _isna(obj, inf_as_na: bool = False):
    """
    Flag missing values in ``obj``; None, NaN and NA count as null.

    When ``inf_as_na`` is True, infinite values are treated as null too.

    Parameters
    ----------
    obj: ndarray or object value
        Array-like input or a single scalar.
    inf_as_na: bool
        Whether infinity should be reported as null.

    Returns
    -------
    boolean ndarray or boolean
        A Series input is re-boxed with ``obj._constructor``; a DataFrame
        input returns ``obj.isna()``. Types and objects matching no branch
        yield ``False``.

    Raises
    ------
    NotImplementedError
        If ``obj`` is a MultiIndex.
    """
    if is_scalar(obj):
        if not inf_as_na:
            return libmissing.checknull(obj)
        return libmissing.checknull_old(obj)

    # hack (for now) because MI registers as ndarray
    if isinstance(obj, ABCMultiIndex):
        raise NotImplementedError("isna is not defined for MultiIndex")

    if isinstance(obj, type):
        return False

    if isinstance(obj, (np.ndarray, ABCExtensionArray)):
        return _isna_array(obj, inf_as_na=inf_as_na)

    if isinstance(obj, ABCIndex):
        # Try to use cached isna, which also short-circuits for integer dtypes
        #  and avoids materializing RangeIndex._values
        if obj._can_hold_na:
            return _isna_array(obj._values, inf_as_na=inf_as_na)
        return obj.isna()

    if isinstance(obj, ABCSeries):
        mask = _isna_array(obj._values, inf_as_na=inf_as_na)
        # box the raw mask back into the caller's Series type
        return obj._constructor(mask, index=obj.index, name=obj.name, copy=False)

    if isinstance(obj, ABCDataFrame):
        return obj.isna()

    if isinstance(obj, list):
        return _isna_array(np.asarray(obj, dtype=object), inf_as_na=inf_as_na)

    if hasattr(obj, "__array__"):
        return _isna_array(np.asarray(obj), inf_as_na=inf_as_na)

    return False
예제 #9
0
def _isna(obj, inf_as_na: bool = False):
    """
    Flag missing values in ``obj``; None, NaN and NA count as null.

    When ``inf_as_na`` is True, infinite values are treated as null too.

    Parameters
    ----------
    obj: ndarray or object value
        Array-like input or a single scalar.
    inf_as_na: bool
        Whether infinity should be reported as null.

    Returns
    -------
    boolean ndarray or boolean
        A Series input is re-boxed with ``obj._constructor``; a DataFrame
        input returns ``obj.isna()``. Types and objects matching no branch
        yield ``False``.

    Raises
    ------
    NotImplementedError
        If ``obj`` is a MultiIndex.
    """
    if is_scalar(obj):
        if not inf_as_na:
            return libmissing.checknull(obj)
        return libmissing.checknull_old(obj)

    # hack (for now) because MI registers as ndarray
    if isinstance(obj, ABCMultiIndex):
        raise NotImplementedError("isna is not defined for MultiIndex")

    if isinstance(obj, type):
        return False

    if isinstance(obj, (np.ndarray, ABCExtensionArray)):
        # error: Value of type variable "ArrayLike" of "_isna_array" cannot be
        # "Union[ndarray, ExtensionArray]"
        return _isna_array(obj, inf_as_na=inf_as_na)  # type: ignore[type-var]

    if isinstance(obj, (ABCSeries, ABCIndex)):
        # error: Value of type variable "ArrayLike" of "_isna_array" cannot be
        # "Union[Any, ExtensionArray, ndarray]"
        mask = _isna_array(obj._values, inf_as_na=inf_as_na)  # type: ignore[type-var]
        if not isinstance(obj, ABCSeries):
            # an Index gets the raw mask back
            return mask
        # box
        return obj._constructor(mask, index=obj.index, name=obj.name, copy=False)

    if isinstance(obj, ABCDataFrame):
        return obj.isna()

    if isinstance(obj, list):
        return _isna_array(np.asarray(obj, dtype=object), inf_as_na=inf_as_na)

    if hasattr(obj, "__array__"):
        return _isna_array(np.asarray(obj), inf_as_na=inf_as_na)

    return False
예제 #10
0
def _isna_new(obj):
    """
    Detect missing values in ``obj``.

    Scalars are checked with ``libmissing.checknull``. Series, ndarrays,
    indexes and extension arrays are passed to ``_isna_ndarraylike``. A
    DataFrame returns ``obj.isna()``. Lists are converted to object-dtype
    ndarrays, and objects exposing ``__array__`` to ndarrays, before going
    to ``_isna_ndarraylike``. Types and objects matching no branch yield
    ``False``.

    Raises
    ------
    NotImplementedError
        If ``obj`` is a MultiIndex.
    """
    if is_scalar(obj):
        return libmissing.checknull(obj)

    # hack (for now) because MI registers as ndarray
    if isinstance(obj, ABCMultiIndex):
        raise NotImplementedError("isna is not defined for MultiIndex")

    if isinstance(obj, type):
        return False

    ndarray_like = (ABCSeries, np.ndarray, ABCIndexClass, ABCExtensionArray)
    if isinstance(obj, ndarray_like):
        return _isna_ndarraylike(obj)

    if isinstance(obj, ABCDataFrame):
        return obj.isna()

    if isinstance(obj, list):
        return _isna_ndarraylike(np.asarray(obj, dtype=object))

    if hasattr(obj, "__array__"):
        return _isna_ndarraylike(np.asarray(obj))

    return False
예제 #11
0
def _isna(obj, inf_as_na: bool = False):
    """
    Flag missing values in ``obj``; None, NaN and NA count as null.

    When ``inf_as_na`` is True, infinite values are treated as null too.

    Parameters
    ----------
    obj: ndarray or object value
        Array-like input or a single scalar.
    inf_as_na: bool
        Whether infinity should be reported as null.

    Returns
    -------
    boolean ndarray or boolean
        A DataFrame input returns ``obj.isna()``. Types and objects
        matching no branch yield ``False``.

    Raises
    ------
    NotImplementedError
        If ``obj`` is a MultiIndex.
    """
    if is_scalar(obj):
        if not inf_as_na:
            return libmissing.checknull(obj)
        return libmissing.checknull_old(obj)

    # hack (for now) because MI registers as ndarray
    if isinstance(obj, ABCMultiIndex):
        raise NotImplementedError("isna is not defined for MultiIndex")

    if isinstance(obj, type):
        return False

    ndarray_like = (ABCSeries, np.ndarray, ABCIndexClass, ABCExtensionArray)
    if isinstance(obj, ndarray_like):
        return _isna_ndarraylike(obj, inf_as_na=inf_as_na)

    if isinstance(obj, ABCDataFrame):
        return obj.isna()

    if isinstance(obj, list):
        return _isna_ndarraylike(
            np.asarray(obj, dtype=object), inf_as_na=inf_as_na
        )

    if hasattr(obj, "__array__"):
        return _isna_ndarraylike(np.asarray(obj), inf_as_na=inf_as_na)

    return False
예제 #12
0
파일: utils.py 프로젝트: sfu-db/dataprep
def preprocess_dataframe(
    org_df: Union[pd.DataFrame, dd.DataFrame],
    used_columns: Optional[Union[List[str], List[object]]] = None,
    excluded_columns: Optional[Union[List[str], List[object]]] = None,
    detect_small_distinct: bool = True,
) -> dd.DataFrame:
    """
    Make a dask dataframe with only used_columns.

    This function will do the following:
        1. keep only used_columns (in the order they appear in org_df).
        2. transform column names to strings (avoid object column names) and
        rename duplicate column names in the form {col}_{id}.
        3. reset the index.
        4. pass the frame through ``to_dask``.
        5. transform non-NA cells of columns detected as Nominal to strings
        (an object column can contain cells of different types), skipping
        columns listed in excluded_columns.

    Parameters
    ----------
    org_df: dataframe
        the original dataframe
    used_columns: optional list[str], default None
        used columns in org_df; entries may match either a column label
        itself or its ``str()`` form. None keeps every column.
    excluded_columns: optional list[str], default None
        excluded columns from used_columns, mainly used for geo point data processing.
    detect_small_distinct: bool, default True
        whether to detect numerical columns with small distinct values as categorical column.

    Returns
    -------
    dd.DataFrame
        the preprocessed dataframe as returned by ``to_dask``.
    """
    if used_columns is None:
        df = org_df.copy()
    else:
        # Process the case when used_columns are string column name,
        # but org_df column name is object.
        used_columns_set = set(used_columns)
        # Select with an ordered list rather than a set: set indexers are
        # rejected by recent pandas, and a set would scramble column order.
        # dict.fromkeys drops repeated labels while keeping first-seen order,
        # so duplicate-named columns are still selected only once per label.
        used_cols_obj = list(
            dict.fromkeys(
                col
                for col in org_df.columns
                if str(col) in used_columns_set or col in used_columns_set
            )
        )
        df = org_df[used_cols_obj]

    columns = list(df.columns)

    # Resolve duplicate names in columns.
    # Duplicate names will be renamed as col_{id}.
    column_count = Counter(columns)
    current_id: Dict[Any, int] = {}
    for i, col in enumerate(columns):
        if column_count[col] > 1:
            current_id[col] = current_id.get(col, 0) + 1
            new_col_name = f"{col}_{current_id[col]}"
        else:
            new_col_name = f"{col}"
        columns[i] = new_col_name

    df.columns = columns
    df = df.reset_index(drop=True)
    df = to_dask(df)

    # Since an object column could contain multiple types
    # in different cells, transform non-NA values in object columns to string.

    def _notna2str(obj: Any) -> Any:
        # Transform an obj to str if it is not NA.
        # The check for NA is similar to pd.isna, but will treat a list obj as
        # a scalar and return a single boolean, rather than a list of booleans.
        # Otherwise when a cell is tuple or list it will throw an error.
        return obj if libmissing.checknull(obj) else str(obj)

    for col in df.columns:
        col_dtype = detect_dtype(df[col], detect_small_distinct=detect_small_distinct)
        is_excluded = excluded_columns is not None and col in excluded_columns
        if is_dtype(col_dtype, Nominal()) and not is_excluded:
            df[col] = df[col].apply(_notna2str, meta=(col, "object"))
    return df
예제 #13
0
 def test_checknull_old_never_na_vals(self, value):
     """``value`` must not be reported as null, even with ``inf_as_na=True``."""
     is_null = libmissing.checknull(value, inf_as_na=True)
     assert not is_null