Exemplo n.º 1
0
def special_dtype_handling(df, spec_dtypes, spec_timezones, schema):
    """
    Run the dtype-specific preprocessing steps on a dataframe.

    The requested dtypes are applied first. Every datetime column present
    afterwards is handed to the timezone-conversion helper and then to the
    helper that makes datetimes timezone-naive.

    Args:
        df (pd.DataFrame): Dataframe being operated on
        spec_dtypes (dict<str:np.dtype or str>):
            Mapping from column names to dtypes
        spec_timezones (dict<str, str>):
            Mapping from datetime columns to the timezone they represent.
            A timezone-naive column gets the timezone added to its
            metadata while the times themselves stay unmodified; a
            timezone-aware column is converted to the timezone, which
            likely modifies the stored times.
        schema (str): The schema of the table the df is being uploaded to

    Returns:
        The dataframe produced by ``apply_spec_dtypes`` after the datetime
        helpers have been called on it.
    """
    df = apply_spec_dtypes(df, spec_dtypes)

    # Collect every datetime column, timezone-naive and timezone-aware alike
    column_dtypes = df.dtypes
    datetime_cols = []
    for col in df.columns:
        if is_datetime64_any_dtype(column_dtypes[col]):
            datetime_cols.append(col)

    # The return values of both helpers are not used here
    convert_to_spec_timezones(df, datetime_cols, spec_timezones)
    make_datetimes_timezone_naive(df, datetime_cols, schema)

    return df
Exemplo n.º 2
0
def py2rpy_pandasseries(obj):
    """Convert a pandas Series into an R vector.

    The conversion is selected from the dtype of ``obj``:

    - dtype name ``'O'``: a warning is issued and a ``StrVector`` is built;
    - dtype name ``'category'``: a ``FactorVector`` is built from the result
      of ``py2rpy_categoryseries``;
    - any datetime64 dtype: the date/time fields are passed to
      ``ISOdatetime`` and the result is wrapped in ``POSIXct``;
    - ``dt_O_type``: all values other than ``None`` must share a single
      Python type, which selects the R vector constructor;
    - anything else: the numpy converter registered for ``numpy.ndarray``.

    The index of the series is stored in the ``names`` slot of the result
    (``dimnames`` when ``obj`` is not one-dimensional).

    Raises:
        ValueError: if an object series mixes several Python types.
    """
    series_dtype = obj.dtype
    if series_dtype.name == 'O':
        warnings.warn('Element "%s" is of dtype "O" and converted '
                      'to R vector of strings.' % obj.name)
        res = StrVector(obj)
    elif series_dtype.name == 'category':
        res = FactorVector(py2rpy_categoryseries(obj))
    elif is_datetime64_any_dtype(series_dtype):
        # time series
        tz = obj.dt.tz
        tzname = tz.zone if tz else ''
        components = [
            IntVector([stamp.year for stamp in obj]),
            IntVector([stamp.month for stamp in obj]),
            IntVector([stamp.day for stamp in obj]),
            IntVector([stamp.hour for stamp in obj]),
            IntVector([stamp.minute for stamp in obj]),
            # seconds carry the microseconds as a fractional part
            FloatSexpVector([stamp.second + stamp.microsecond * 1e-6
                             for stamp in obj])
        ]
        # TODO: can the POSIXct be created from the POSIXct constructor ?
        # (is '<M8[ns]' mapping to Python datetime.datetime ?)
        res = POSIXct(ISOdatetime(*components, tz=StrSexpVector([tzname])))
    elif series_dtype == dt_O_type:
        # The first value that is not None fixes the expected type
        seen_type = None
        for item in obj.values:
            if item is None:
                continue
            item_type = type(item)
            if seen_type is None:
                seen_type = item_type
            elif item_type is not seen_type:
                raise ValueError('Series can only be of one type, or None.')
        # TODO: Could this be merged with obj.type.name == 'O' case above ?
        constructors = {
            int: IntVector,
            bool: BoolVector,
            None: BoolVector,
            str: StrVector,
            bytes: numpy2ri.converter.py2rpy.registry[numpy.ndarray]
        }
        # A type missing from the table surfaces as a KeyError
        res = constructors[seen_type](obj)
    else:
        # converted as a numpy array, using the conversion currently
        # registered for numpy arrays
        to_r = numpy2ri.converter.py2rpy.registry[numpy.ndarray]
        res = to_r(obj)
        if len(obj.shape) == 1 and series_dtype != dt_O_type:
            # force into an R vector
            res = as_vector(res)

    # "index" is equivalent to "names" in R
    if obj.ndim == 1:
        index_labels = tuple(str(label) for label in obj.index)
        res.do_slot_assign('names', StrVector(index_labels))
    else:
        res.do_slot_assign('dimnames',
                           SexpVector(conversion.py2rpy(obj.index)))
    return res
Exemplo n.º 3
0
    def restore_type(self, dtype, sample=None):
        """Return the type name matching a Pandas dtype.

        The dtype is examined first (boolean, datetime, integer, number, in
        that order). When it matches none of those and a ``sample`` value is
        given, the Python type of the sample decides. ``'string'`` is the
        fallback.
        """

        # Pandas types; bool is tested before the integer/numeric checks
        if pdc.is_bool_dtype(dtype):
            return 'boolean'
        if pdc.is_datetime64_any_dtype(dtype):
            return 'datetime'
        if pdc.is_integer_dtype(dtype):
            return 'integer'
        if pdc.is_numeric_dtype(dtype):
            return 'number'

        # Nothing else to inspect without a sample
        if sample is None:
            return 'string'

        # Python types, checked in a fixed order
        if isinstance(sample, (list, tuple)):
            return 'array'
        if isinstance(sample, datetime.date):
            return 'date'
        if isinstance(sample, isodate.Duration):
            return 'duration'
        if isinstance(sample, dict):
            return 'object'
        if isinstance(sample, six.string_types):
            return 'string'
        if isinstance(sample, datetime.time):
            return 'time'

        return 'string'
Exemplo n.º 4
0
def to_num_datetime(col, name='array', thresh=0.80, **kwargs):
    '''convert col to numeric or datetime if possible, otherwise leave it
    unchanged

    parameters
    ----
    col --> series, scalar or ndarray; it is wrapped in a pd.Series

    name --> name of the col series (accepted but not used by this function)

    thresh --> default 0.8
        - a conversion is kept when at least this fraction of the non-null
          values could be converted

    **kwargs

    - errors - {'ignore', 'raise', 'coerce'}, default --> 'coerce'
        - If 'raise', then invalid parsing will raise an exception
        - If 'coerce', then invalid parsing will be set as NaN
        - If 'ignore', then invalid parsing will return the input
      Any exception raised by pandas during a conversion attempt is
      caught here, so a failed attempt just leaves the column as it was.
    other pandas to_datetime key words (they are forwarded to
    pd.to_datetime; only 'errors' is forwarded to pd.to_numeric)

    return
    ----
    converted series, or the input as a series when nothing was converted
    '''
    try:
        col = pd.Series(col)
    except Exception:
        raise Exception('col must be 1-d array/list/tuple/dict/Series')

    # Columns that already carry a numeric, datetime or categorical dtype
    # are returned as they are.
    if (api.is_numeric_dtype(col)
            or api.is_datetime64_any_dtype(col)
            or isinstance(col.dtype, pd.CategoricalDtype)):
        return col

    not_null_count = col.count()
    if not_null_count == 0:
        return col

    # First attempt: numeric conversion (best effort, failures are ignored).
    try:
        num = pd.to_numeric(col, errors=kwargs.get('errors', 'coerce'))
        if num.count() / not_null_count >= thresh:
            return num
    except Exception:
        pass

    # Second attempt: datetime conversion. No format-inference keyword is
    # passed by default: the previous default used a misspelled keyword,
    # which made pd.to_datetime raise a TypeError that was silently
    # swallowed, so this conversion never took place.
    params = {'errors': 'coerce'}
    params.update(kwargs)
    try:
        date = pd.to_datetime(col, **params)
        if pd.notnull(date).sum() / not_null_count >= thresh:
            return date
    except Exception:
        pass
    return col
Exemplo n.º 5
0
def py2ri_pandasseries(obj):
    """Convert a pandas Series into an R vector.

    The conversion is selected from the dtype of ``obj``:

    - dtype name ``'O'``: a warning is issued and a ``StrVector`` is built;
    - dtype name ``'category'``: a ``FactorVector`` is built from the result
      of ``py2ri_categoryseries``;
    - any datetime64 dtype: the date/time fields are passed to
      ``ISOdatetime`` and the result is wrapped in ``POSIXct``;
    - anything else: the numpy converter registered for ``numpy.ndarray``.

    The index of the series is stored in the ``names`` slot of the result
    (``dimnames`` when ``obj`` is not one-dimensional).
    """
    # The dtype of the series itself has to be inspected here. The previous
    # test read ``numpy.dtype.name``, an attribute of the dtype *class*,
    # which never compares equal to a string.
    # NOTE(review): numpy reports the name of its object dtype as 'object',
    # so this comparison with 'O' may still never be true - confirm the
    # intended spelling before relying on this branch.
    if obj.dtype.name == 'O':
        warnings.warn(
            'Element "%s" is of dtype "O" and converted to R vector of strings.'
            % obj.name)
        res = StrVector(obj)
    elif obj.dtype.name == 'category':
        res = py2ri_categoryseries(obj)
        res = FactorVector(res)
    elif is_datetime64_any_dtype(obj.dtype):
        # time series
        tzname = obj.dt.tz.zone if obj.dt.tz else ''
        # Only whole seconds are transferred (x.second); the microsecond
        # field of the values is not used.
        d = [
            IntVector([x.year for x in obj]),
            IntVector([x.month for x in obj]),
            IntVector([x.day for x in obj]),
            IntVector([x.hour for x in obj]),
            IntVector([x.minute for x in obj]),
            IntVector([x.second for x in obj])
        ]
        res = ISOdatetime(*d, tz=StrSexpVector([tzname]))
        #FIXME: can the POSIXct be created from the POSIXct constructor ?
        # (is '<M8[ns]' mapping to Python datetime.datetime ?)
        res = POSIXct(res)
    else:
        # converted as a numpy array
        func = numpy2ri.converter.py2ri.registry[numpy.ndarray]
        # current conversion as performed by numpy
        res = func(obj)
        if len(obj.shape) == 1:
            if (obj.dtype != dt_O_type):
                # force into an R vector
                res = as_vector(res)

    # "index" is equivalent to "names" in R
    if obj.ndim == 1:
        res.do_slot_assign('names',
                           StrVector(tuple(str(x) for x in obj.index)))
    else:
        res.do_slot_assign('dimnames', SexpVector(conversion.py2ri(obj.index)))
    return res
Exemplo n.º 6
0
def get_datetime_cols(df):
    """Return the names of the columns of ``df`` that hold datetimes.

    A column is kept when ``is_datetime64_any_dtype`` accepts its dtype.
    The result follows the order of ``df.columns``.
    """
    column_dtypes = df.dtypes
    datetime_cols = []
    for col in df.columns:
        if is_datetime64_any_dtype(column_dtypes[col]):
            datetime_cols.append(col)
    return datetime_cols
Exemplo n.º 7
0
def to_num_datetime(col, name='array', thresh=0.75, **kwargs):
    '''convert col to numeric or datetime if possible, otherwise leave it
    unchanged

    parameters
    -----------
    col : series, scalar or ndarray
        input sequence; it is wrapped in a pd.Series

    name : str
        name of the col series (accepted but not used by this function)

    thresh : float
        default 0.75,
        a conversion is kept when at least this fraction of the non-null
        values could be converted

    keyword args
    ------------
    other pandas to_datetime key words (forwarded to pd.to_datetime; only
    'errors' is forwarded to pd.to_numeric)

    errors : {'ignore', 'raise', 'coerce'}
        default 'coerce'

        If 'raise', then invalid parsing will raise an exception

        If 'coerce', then invalid parsing will be set as NaN

        If 'ignore', then invalid parsing will return the input

        Any exception raised by pandas during a conversion attempt is
        caught here, so a failed attempt just leaves the column as it was.


    return
    --------
    s : series
        converted col. When neither conversion is kept, the non-null values
        are turned into str.
    '''
    try:
        col = pd.Series(col)
    except Exception:
        raise Exception('col must be 1-d array/list/tuple/dict/Series')

    # Columns that already carry a numeric, datetime or categorical dtype
    # are returned as they are.
    if (api.is_numeric_dtype(col)
            or api.is_datetime64_any_dtype(col)
            or isinstance(col.dtype, pd.CategoricalDtype)):
        return col

    not_null_count = col.count()
    if not_null_count == 0:
        return col

    # Values made of digits with a leading zero (e.g. '007') are left alone;
    # a numeric conversion would drop the leading zeros.
    if col.astype(str).str.contains(r'^0\d+$').any():
        return col

    # First attempt: numeric conversion (best effort, failures are ignored).
    try:
        num = pd.to_numeric(col, errors=kwargs.get('errors', 'coerce'))
        if num.count() / not_null_count >= thresh:
            return num
    except Exception:
        pass

    # Second attempt: datetime conversion. The deprecated
    # 'infer_datetime_format' keyword is no longer passed by default;
    # callers can still supply it through **kwargs.
    params = {'errors': 'coerce'}
    params.update(kwargs)
    try:
        date = pd.to_datetime(col, **params)
        if pd.notnull(date).sum() / not_null_count >= thresh:
            col = date
        else:
            # Neither conversion was kept: normalise non-null values to str
            col = col.apply(lambda x: x if pd.isna(x) else str(x))
    except Exception:
        pass

    return col
Exemplo n.º 8
0
def dtype_specific_binary(left,
                          right,
                          numerics,
                          datetimes,
                          bools,
                          strings,
                          categoricals,
                          intervals,
                          errors='ignore'):
    """
    A low-level base binary function to perform different binary operations based on the dtypes of left, right inputs.
    Can be used with functools.partial to create a custom binary function that can be passed to apply_columnwise or
    higher order compare_values utilities found elsewhere in this module.

    This function supports 6 distinct groups of pandas dtypes. Both inputs must fall in the same group; the groups
    are tested in this order:

        1) bool - is_bool_dtype
        2) numeric - is_numeric_dtype
        3) datetime-like - is_datetime64_any_dtype or is_timedelta64_dtype
        4) categorical - pd.CategoricalDtype
        5) interval - pd.IntervalDtype
        6) string - is_string_dtype

    bool is tested before numeric because is_numeric_dtype also accepts bool dtypes; testing numeric first would make
    the `bools` callable unreachable for plain bool inputs. Categorical and interval are tested before string so that
    the broader string check cannot claim them.

    If no supported dtype is matched, or `left` & `right` do not have matching dtypes, a pd.Series of NaN values is
    returned unless errors='raise' in which case a ValueError is raised.

    Parameters
    ----------
    left : pd.Series, np.ndarray
        any object exposing a `.dtype` attribute
    right : pd.Series, np.ndarray
        any object exposing a `.dtype` attribute
    numerics : binary callable
        applied to numeric dtypes
    datetimes : binary callable
        applied to datetime-like objects
    bools : binary callable
        applied to bool dtypes
    strings : binary callable
        applied to string-like dtypes
    categoricals : binary callable
        applied to Categorical dtype
    intervals : binary callable
        applied to Interval dtype
    errors : str
        default 'ignore' issues warning and returns NaNs when dtype of left, right do not match
        if 'raise' is passed, will raise ValueError in such cases

    Returns
    -------
    result of applying a specific binary callable to `left` and `right` inputs based on dtype, or a pd.Series of NaN
    (indexed like `right` when it has an index, otherwise by position) when the dtypes are not comparable

    Raises
    ------
    ValueError
        when the dtypes are not comparable and errors='raise'

    """
    _ld = left.dtype
    _rd = right.dtype

    def _is_datetime_like(dtype):
        # timedelta64[ns] is already covered by is_timedelta64_dtype
        return is_datetime64_any_dtype(dtype) or is_timedelta64_dtype(dtype)

    if is_bool_dtype(_ld) and is_bool_dtype(_rd):
        return bools(left, right)
    if is_numeric_dtype(_ld) and is_numeric_dtype(_rd):
        return numerics(left, right)
    if _is_datetime_like(_ld) and _is_datetime_like(_rd):
        return datetimes(left, right)
    if (isinstance(_ld, pd.CategoricalDtype)
            and isinstance(_rd, pd.CategoricalDtype)):
        return categoricals(left, right)
    if (isinstance(_ld, pd.IntervalDtype)
            and isinstance(_rd, pd.IntervalDtype)):
        return intervals(left, right)
    if is_string_dtype(_ld) and is_string_dtype(_rd):
        return strings(left, right)

    # by default when dtypes are mismatched we issue a warning and return NaNs
    # raise if user requires it
    if errors == 'raise':
        raise ValueError(
            f"left and right do not have matching supported dtypes: {_ld.name}, {_rd.name}"
        )

    # ndarray inputs have neither a name nor an index
    left_name = getattr(left, 'name', None)
    right_name = getattr(right, 'name', None)
    warnings.warn(
        f"left: {left_name}, {_ld.name} and right: {right_name}, {_rd.name}"
        f" do not have comparable dtypes, returning NaNs")
    index = getattr(right, 'index', None)
    if index is None:
        index = range(len(right))
    return pd.Series(np.nan, index=index)
Exemplo n.º 9
0
def py2rpy_pandasseries(obj):
    """Convert a pandas Series into an R vector.

    The conversion is selected from the dtype of ``obj``:

    - dtype name ``'O'``: a warning is issued and a ``StrVector`` is built;
    - dtype name ``'category'``: a ``FactorVector`` is built from the result
      of ``py2rpy_categoryseries``;
    - any datetime64 dtype: the date/time fields are passed to
      ``ISOdatetime`` and the result is wrapped in ``POSIXct``;
    - dtype whose ``type`` is ``str``: the ``str`` entry of
      ``_PANDASTYPE2RPY2``;
    - dtype name listed in ``integer_array_types``: the ``int`` entry of
      ``_PANDASTYPE2RPY2``;
    - ``dt_O_type``: values other than ``None`` and missing values must
      share a single Python type, which selects the entry of
      ``_PANDASTYPE2RPY2``;
    - anything else: the numpy converter registered for ``numpy.ndarray``,
      applied to ``obj.values``.

    The index of the series is stored in the ``names`` slot of the result
    (``dimnames`` when ``obj`` is not one-dimensional).

    Raises:
        ValueError: if an object series mixes several Python types.
    """
    series_dtype = obj.dtype
    if series_dtype.name == 'O':
        warnings.warn('Element "%s" is of dtype "O" and converted '
                      'to R vector of strings.' % obj.name)
        res = StrVector(obj)
    elif series_dtype.name == 'category':
        res = FactorVector(py2rpy_categoryseries(obj))
    elif is_datetime64_any_dtype(series_dtype):
        # time series
        tz = obj.dt.tz
        tzname = tz.zone if tz else ''
        components = [
            IntVector([stamp.year for stamp in obj]),
            IntVector([stamp.month for stamp in obj]),
            IntVector([stamp.day for stamp in obj]),
            IntVector([stamp.hour for stamp in obj]),
            IntVector([stamp.minute for stamp in obj]),
            # seconds carry the microseconds as a fractional part
            FloatSexpVector([stamp.second + stamp.microsecond * 1e-6
                             for stamp in obj]),
        ]
        # TODO: can the POSIXct be created from the POSIXct constructor ?
        # (is '<M8[ns]' mapping to Python datetime.datetime ?)
        res = POSIXct(ISOdatetime(*components, tz=StrSexpVector([tzname])))
    elif series_dtype.type == str:
        res = _PANDASTYPE2RPY2[str](obj)
    elif series_dtype.name in integer_array_types:
        res = _PANDASTYPE2RPY2[int](obj)
        if len(obj.shape) == 1 and series_dtype != dt_O_type:
            # force into an R vector
            res = as_vector(res)
    elif series_dtype == dt_O_type:
        # The first value that is not None fixes the expected type
        homogeneous_type = None
        for item in obj.values:
            if item is None:
                continue
            if homogeneous_type is None:
                homogeneous_type = type(item)
                continue
            if type(item) is homogeneous_type:
                continue
            # Missing values (NaN and whatever pandas.isna() recognises)
            # are tolerated next to the common type
            is_missing = ((isinstance(item, float) and math.isnan(item))
                          or pandas.isna(item))
            if not is_missing:
                raise ValueError(
                    'Series can only be of one type, or None '
                    '(and here we have %s and %s). If happening with '
                    'a pandas DataFrame the method infer_objects() '
                    'will normalize data types before conversion.' %
                    (homogeneous_type, type(item)))
        # TODO: Could this be merged with obj.type.name == 'O' case above ?
        res = _PANDASTYPE2RPY2[homogeneous_type](obj)
    else:
        # converted as a numpy array, using the conversion currently
        # registered for numpy arrays
        to_r = numpy2ri.converter.py2rpy.registry[numpy.ndarray]
        res = to_r(obj.values)
        if len(obj.shape) == 1 and series_dtype != dt_O_type:
            # force into an R vector
            res = as_vector(res)

    # "index" is equivalent to "names" in R
    if obj.ndim == 1:
        index_labels = tuple(str(label) for label in obj.index)
        res.do_slot_assign('names', StrVector(index_labels))
    else:
        res.do_slot_assign('dimnames',
                           SexpVector(conversion.py2rpy(obj.index)))
    return res