Exemplo n.º 1
def special_dtype_handling(df, spec_dtypes, spec_timezones, schema):
    """Wrapper around functions for special handling of specific dtypes.

    Args:
        df (pd.DataFrame): Dataframe being operated on
        spec_dtypes (dict<str:np.dtype or str>):
            a dict from column names to dtypes
        schema (str): The schema of the table the df is being uploaded to
        For datetimes:
            spec_timezones (dict<str, str>):
                Dictionary from datetime columns to the timezone they
                represent. If the column is timezone-naive, it will have the
                timezone added to its metadata, leaving the times themselves
                unmodified. If the column is timezone-aware, the timezone
                will be converted, likely modifying the stored times.

    Returns:
        The dataframe produced by ``apply_spec_dtypes`` after both datetime
        helpers have been called on it.
    """
    df = apply_spec_dtypes(df, spec_dtypes)

    # All datetime columns, regardless of timezone naive/aware.
    # Iterating (label, dtype) pairs keeps this correct even when column
    # labels are duplicated, where ``df.dtypes[label]`` is not a single dtype.
    datetime_cols = [col for col, dtype in df.dtypes.items()
                     if is_datetime64_any_dtype(dtype)]

    # The return values of both helpers are ignored here, so they presumably
    # modify ``df`` in place -- confirm against their definitions.
    convert_to_spec_timezones(df, datetime_cols, spec_timezones)
    make_datetimes_timezone_naive(df, datetime_cols, schema)

    return df
Exemplo n.º 2
def py2rpy_pandasseries(obj):
    """Convert a pandas Series into an R vector.

    The conversion is chosen from the dtype of ``obj``:

    - dtype name ``'O'``: warn and build a ``StrVector``;
    - ``category``: build a ``FactorVector``;
    - any datetime64 dtype: build a ``POSIXct`` from the date/time fields;
    - object dtype: every non-``None`` value must share one Python type,
      which selects the R vector constructor;
    - anything else: the conversion registered for ``numpy.ndarray``.

    For a one-dimensional input the index is copied to the R ``names`` slot.

    Raises:
        ValueError: an object-dtype Series mixes several Python types.
    """
    if obj.dtype.name == 'O':
        # NOTE(review): NumPy reports the object dtype's name as 'object',
        # so this branch may never match -- confirm before relying on it.
        warnings.warn('Element "%s" is of dtype "O" and converted '
                      'to R vector of strings.' % obj.name)
        res = StrVector(obj)
    elif obj.dtype.name == 'category':
        res = py2rpy_categoryseries(obj)
        res = FactorVector(res)
    elif is_datetime64_any_dtype(obj.dtype):
        # time series
        tzname = obj.dt.tz.zone if obj.dt.tz else ''
        d = [
            IntVector([x.year for x in obj]),
            IntVector([x.month for x in obj]),
            IntVector([x.day for x in obj]),
            IntVector([x.hour for x in obj]),
            IntVector([x.minute for x in obj]),
            # seconds carry the fractional (microsecond) part
            FloatSexpVector([x.second + x.microsecond * 1e-6 for x in obj]),
        ]
        res = ISOdatetime(*d, tz=StrSexpVector([tzname]))
        # TODO: can the POSIXct be created from the POSIXct constructor ?
        # (is '<M8[ns]' mapping to Python datetime.datetime ?)
        res = POSIXct(res)
    elif (obj.dtype == dt_O_type):
        homogeneous_type = None
        for x in obj.values:
            if x is None:
                # None is allowed alongside any single type
                continue
            if homogeneous_type is None:
                homogeneous_type = type(x)
            if type(x) is not homogeneous_type:
                raise ValueError('Series can only be of one type, or None.')
        # TODO: Could this be merged with obj.type.name == 'O' case above ?
        # The ``None`` key is hit when every value in the Series is None.
        constructors = {
            int: IntVector,
            bool: BoolVector,
            None: BoolVector,
            str: StrVector,
            bytes: numpy2ri.converter.py2rpy.registry[numpy.ndarray],
        }
        res = constructors[homogeneous_type](obj)
    else:
        # converted as a numpy array
        func = numpy2ri.converter.py2rpy.registry[numpy.ndarray]
        # current conversion as performed by numpy

        res = func(obj)
        if len(obj.shape) == 1:
            if (obj.dtype != dt_O_type):
                # force into an R vector
                res = as_vector(res)

    # "index" is equivalent to "names" in R
    if obj.ndim == 1:
        res.do_slot_assign('names',
                           StrVector(tuple(str(x) for x in obj.index)))
    return res
Exemplo n.º 3
    def restore_type(self, dtype, sample=None):
        """Restore type from Pandas

        The pandas dtype is checked first; when it is not a bool, datetime,
        integer or other numeric dtype, the Python type of ``sample`` (if one
        is given) decides instead.

        Args:
            dtype: dtype of the pandas column being inspected.
            sample: optional example value taken from the column.

        Returns:
            str: one of 'boolean', 'datetime', 'integer', 'number', 'array',
            'date', 'duration', 'object', 'string' or 'time'. 'string' is the
            fallback when nothing else matches.
        """

        # Pandas types
        # Order matters: bool is tested before the integer/numeric checks.
        if pdc.is_bool_dtype(dtype):
            return 'boolean'
        elif pdc.is_datetime64_any_dtype(dtype):
            return 'datetime'
        elif pdc.is_integer_dtype(dtype):
            return 'integer'
        elif pdc.is_numeric_dtype(dtype):
            return 'number'

        # Python types
        if sample is not None:
            if isinstance(sample, (list, tuple)):
                return 'array'
            elif isinstance(sample, datetime.date):
                return 'date'
            elif isinstance(sample, isodate.Duration):
                return 'duration'
            elif isinstance(sample, dict):
                return 'object'
            elif isinstance(sample, six.string_types):
                return 'string'
            elif isinstance(sample, datetime.time):
                return 'time'

        return 'string'
Exemplo n.º 4
def to_num_datetime(col, name='array', thresh=0.80, **kwargs):
    '''convert col to numeric or datetime if possible, otherwise remain

    col --> series, scalar or ndarray will be turned into series type
    name --> name of the col series
        (accepted for compatibility; this function does not read it)
    thresh --> default 0.8
        - if at least the thresh fraction of the non-null values could be
          converted, then the conversion is committed
    keyword args
    - errors - {'ignore', 'raise', 'coerce'}, default --> 'coerce'
        - If 'raise', then invalid parsing will raise an exception
        - If 'coerce', then invalid parsing will be set as NaN
        - If 'ignore', then invalid parsing will return the input
      only ``errors`` is read, and only by the numeric conversion; the
      datetime conversion always coerces.

    return
    converted series, or the input series when no conversion reaches thresh
    '''
    try:
        col = pd.Series(col)
    except Exception as err:
        raise Exception(
            'col must be 1-d array/list/tuple/dict/Series') from err

    # Nothing to do for columns that already carry a usable dtype.
    # isinstance() replaces the deprecated ``is_categorical_dtype`` helper.
    if (api.is_numeric_dtype(col)
            or api.is_datetime64_any_dtype(col)
            or isinstance(col.dtype, pd.CategoricalDtype)):
        return col

    not_null_count = col.count()
    if not_null_count == 0:
        return col

    # Numeric attempt: best effort, a parsing failure just means "not numeric".
    try:
        num = pd.to_numeric(col, errors=kwargs.get('errors', 'coerce'))
    except (TypeError, ValueError):
        num = None
    if num is not None and num.count() / not_null_count >= thresh:
        return num

    # Datetime attempt. The option previously passed here was misspelled
    # ('infter_datetime_format'), an unknown keyword for pd.to_datetime, so
    # the call could never succeed. The correctly spelled option is
    # deprecated in current pandas and is therefore not passed at all.
    try:
        date = pd.to_datetime(col, errors='coerce')
    except (TypeError, ValueError, OverflowError):
        return col
    if pd.notnull(date).sum() / not_null_count >= thresh:
        return date
    return col
Exemplo n.º 5
def py2ri_pandasseries(obj):
    """Convert a pandas Series into an R vector.

    The conversion is chosen from the dtype of ``obj``:

    - dtype name ``'O'``: warn and build a ``StrVector``;
    - ``category``: build a ``FactorVector``;
    - any datetime64 dtype: build a ``POSIXct`` from the date/time fields
      (whole seconds only);
    - anything else: the conversion registered for ``numpy.ndarray``.

    The index is stored in the R ``names`` slot for one-dimensional input
    and in ``dimnames`` otherwise.
    """
    # Test the dtype of the series itself; ``numpy.dtype.name`` is an
    # attribute of the dtype class and never equals a string.
    # NOTE(review): NumPy reports the object dtype's name as 'object', so
    # this branch may still never match -- confirm the intended spelling.
    if obj.dtype.name == 'O':
        warnings.warn(
            'Element "%s" is of dtype "O" and converted to R vector of strings.'
            % obj.name)
        res = StrVector(obj)
    elif obj.dtype.name == 'category':
        res = py2ri_categoryseries(obj)
        res = FactorVector(res)
    elif is_datetime64_any_dtype(obj.dtype):
        # time series
        tzname = obj.dt.tz.zone if obj.dt.tz else ''
        d = [
            IntVector([x.year for x in obj]),
            IntVector([x.month for x in obj]),
            IntVector([x.day for x in obj]),
            IntVector([x.hour for x in obj]),
            IntVector([x.minute for x in obj]),
            IntVector([x.second for x in obj]),
        ]
        res = ISOdatetime(*d, tz=StrSexpVector([tzname]))
        # FIXME: can the POSIXct be created from the POSIXct constructor ?
        # (is '<M8[ns]' mapping to Python datetime.datetime ?)
        res = POSIXct(res)
    else:
        # converted as a numpy array
        func = numpy2ri.converter.py2ri.registry[numpy.ndarray]
        # current conversion as performed by numpy
        res = func(obj)
        if len(obj.shape) == 1:
            if (obj.dtype != dt_O_type):
                # force into an R vector
                res = as_vector(res)

    # "index" is equivalent to "names" in R
    if obj.ndim == 1:
        res.do_slot_assign('names',
                           StrVector(tuple(str(x) for x in obj.index)))
    else:
        res.do_slot_assign('dimnames', SexpVector(conversion.py2ri(obj.index)))
    return res
Exemplo n.º 6
def get_datetime_cols(df):
    """Return the labels of all datetime columns of ``df``.

    Both timezone-naive and timezone-aware datetime64 columns are included,
    in column order.
    """
    # Walking (label, dtype) pairs stays correct when labels are duplicated,
    # where ``df.dtypes[label]`` would not be a single dtype.
    return [
        col for col, dtype in df.dtypes.items()
        if is_datetime64_any_dtype(dtype)
    ]
Exemplo n.º 7
def to_num_datetime(col, name='array', thresh=0.75, **kwargs):
    '''convert col to numeric or datetime if possible, otherwise remain

    parameters
    ----------
    col : series scalar or ndarray
        input sequence
    name : str
        name of the col series
        (accepted for compatibility; this function does not read it)
    thresh : float
        default 0.75,
        if at least the thresh fraction of the non-null values could be
        converted, then the conversion is committed

    keyword args
    ------------
    errors : {'ignore', 'raise', 'coerce'}
        default 'coerce'
        If 'raise', then invalid parsing will raise an exception
        If 'coerce', then invalid parsing will be set as NaN
        If 'ignore', then invalid parsing will return the input
        Only ``errors`` is read, and only by the numeric conversion; the
        datetime conversion always coerces.

    return
    ------
    s : series
        converted col. When neither conversion reaches thresh, the non-null
        values are returned as strings.
    '''
    try:
        col = pd.Series(col)
    except Exception as err:
        raise Exception(
            'col must be 1-d array/list/tuple/dict/Series') from err

    # Nothing to do for columns that already carry a usable dtype.
    # isinstance() replaces the deprecated ``is_categorical_dtype`` helper.
    if (api.is_numeric_dtype(col)
            or api.is_datetime64_any_dtype(col)
            or isinstance(col.dtype, pd.CategoricalDtype)):
        return col

    not_null_count = col.count()
    if not_null_count == 0:
        return col

    # Values such as '007' look numeric but would lose their leading zeros,
    # so a column containing any of them is left untouched.
    if col.astype(str).str.contains(r'^0\d+$').any():
        return col

    # Numeric attempt: best effort, a parsing failure just means "not numeric".
    try:
        num = pd.to_numeric(col, errors=kwargs.get('errors', 'coerce'))
    except (TypeError, ValueError):
        num = None
    if num is not None and num.count() / not_null_count >= thresh:
        return num

    # Datetime attempt, also best effort. ``infer_datetime_format`` is
    # deprecated in current pandas and is therefore no longer passed.
    try:
        date = pd.to_datetime(col, errors='coerce')
        if pd.notnull(date).sum() / not_null_count >= thresh:
            col = date
        else:
            # neither numeric nor datetime: normalise non-null values to str
            col = col.apply(lambda x: x if pd.isna(x) else str(x))
    except (TypeError, ValueError, OverflowError):
        pass

    return col
Exemplo n.º 8
def dtype_specific_binary(left,
                          right,
                          numerics,
                          datetimes,
                          bools,
                          strings,
                          categoricals,
                          intervals,
                          errors='ignore'):
    """
    A low-level base binary function to perform different binary operations based on the dtypes of left, right inputs.
    Can be used with functools.partial to create a custom binary function that can be passed to apply_columnwise or
    higher order compare_values utilities found elsewhere in this module.  See examples for more details.

    This function supports 6 distinct groups of pandas dtypes which are validated using a corresponding set of helpers
    provided by the pandas.core.dtypes API:

        1) numeric - is_numeric_dtype
        2) datetime-like - is_datetime64_any_dtype or is_timedelta64_dtype
        3) bool - is_bool_dtype
        4) string - is_string_dtype
        5) categorical - is_categorical_dtype
        6) is_interval_dtype

    If no supported dtype is matched, or `left` & `right` do not have matching dtypes a pd.Series of NaN values is
    returned unless errors='raise' in which case a ValueError is raised.

    NOTE(review): the parameter list below is rebuilt from the documented parameters; confirm the order and any
    default values against existing callers.

    Parameters
    ----------
    left : pd.Series, pd.DataFrame, np.ndarray
    right : pd.Series, pd.DataFrame, np.ndarray
    numerics : binary callable
        applied to numeric dtypes
    datetimes : binary callable
        applied to datetime-like objects
    bools : binary callable
        applied to bool dtypes
    strings : binary callable
        applied to string-like dtypes
    categoricals : binary callable
        applied to Categorical dtype
    intervals : binary callable
        applied to Interval dtype
    errors : str
        default 'ignore' issues warning and returns NaNs when dtype of left, right do not match
        if 'raise' is passed, will raise ValueError in such cases

    Returns
    -------
    result of applying a specific binary callable to `left` and `right` inputs based on dtype

    Raises
    ------
    ValueError
        if errors='raise' and the dtypes of `left` and `right` do not fall into the same supported group
    """
    _ld = left.dtype
    _rd = right.dtype
    # bool must be tested before numeric: is_numeric_dtype also accepts bool,
    # which would otherwise make the `bools` callable unreachable.
    if is_bool_dtype(_ld) and is_bool_dtype(_rd):
        return bools(left, right)
    elif is_numeric_dtype(_ld) and is_numeric_dtype(_rd):
        return numerics(left, right)
    elif ((is_datetime64_any_dtype(_ld) or is_timedelta64_dtype(_ld)
           or is_timedelta64_ns_dtype(_ld))
          and (is_datetime64_any_dtype(_rd) or is_timedelta64_dtype(_rd)
               or is_timedelta64_ns_dtype(_rd))):
        return datetimes(left, right)
    elif is_string_dtype(_ld) and is_string_dtype(_rd):
        return strings(left, right)
    elif is_categorical_dtype(_ld) and is_categorical_dtype(_rd):
        return categoricals(left, right)
    elif is_interval_dtype(_ld) and is_interval_dtype(_rd):
        return intervals(left, right)
    else:
        # by default when dtypes are mismatched we issue a warning and return NaNs
        # raise if user requires it
        if errors == 'raise':
            raise ValueError(
                f"left and right do not have matching supported dtypes: {_ld.name}, {_rd.name}"
            )
        else:
            import warnings

            warnings.warn(
                f"left: {left.name}, {_ld.name} and right: {right.name}, {_rd.name}"
                f" do not have comparable dtypes, returning NaNs")
            return pd.Series(np.nan, index=right.index)
Exemplo n.º 9
def py2rpy_pandasseries(obj):
    """Convert a pandas Series into an R vector.

    The conversion is chosen from the dtype of ``obj``:

    - dtype name ``'O'``: warn and build a ``StrVector``;
    - ``category``: build a ``FactorVector``;
    - any datetime64 dtype: build a ``POSIXct`` from the date/time fields;
    - ``str`` dtype type or a name listed in ``integer_array_types``: the
      matching entry of ``_PANDASTYPE2RPY2``;
    - object dtype: every value must share one Python type (``None`` and
      NaN/NA values are tolerated), which selects the ``_PANDASTYPE2RPY2``
      entry;
    - anything else: the conversion registered for ``numpy.ndarray``.

    For a one-dimensional input the index is copied to the R ``names`` slot.

    Raises:
        ValueError: an object-dtype Series mixes several Python types.
    """
    if obj.dtype.name == 'O':
        # NOTE(review): NumPy reports the object dtype's name as 'object',
        # so this branch may never match -- confirm before relying on it.
        warnings.warn('Element "%s" is of dtype "O" and converted '
                      'to R vector of strings.' % obj.name)
        res = StrVector(obj)
    elif obj.dtype.name == 'category':
        res = py2rpy_categoryseries(obj)
        res = FactorVector(res)
    elif is_datetime64_any_dtype(obj.dtype):
        # time series
        tzname = obj.dt.tz.zone if obj.dt.tz else ''
        d = [IntVector([x.year for x in obj]),
             IntVector([x.month for x in obj]),
             IntVector([x.day for x in obj]),
             IntVector([x.hour for x in obj]),
             IntVector([x.minute for x in obj]),
             # seconds carry the fractional (microsecond) part
             FloatSexpVector([x.second + x.microsecond * 1e-6 for x in obj])]
        res = ISOdatetime(*d, tz=StrSexpVector([tzname]))
        # TODO: can the POSIXct be created from the POSIXct constructor ?
        # (is '<M8[ns]' mapping to Python datetime.datetime ?)
        res = POSIXct(res)
    elif obj.dtype.type == str:
        res = _PANDASTYPE2RPY2[str](obj)
    elif obj.dtype.name in integer_array_types:
        res = _PANDASTYPE2RPY2[int](obj)
        if len(obj.shape) == 1:
            if obj.dtype != dt_O_type:
                # force into an R vector
                res = as_vector(res)
    elif (obj.dtype == dt_O_type):
        homogeneous_type = None
        for x in obj.values:
            if x is None:
                # None is allowed alongside any single type
                continue
            if homogeneous_type is None:
                homogeneous_type = type(x)
            # A differing type is only an error when the value is not a
            # missing-value marker (float NaN or anything pandas.isna flags).
            if ((type(x) is not homogeneous_type)
                and not
                ((isinstance(x, float) and math.isnan(x))
                 or pandas.isna(x))):
                raise ValueError(
                    'Series can only be of one type, or None '
                    '(and here we have %s and %s). If happening with '
                    'a pandas DataFrame the method infer_objects() '
                    'will normalize data types before conversion.' %
                    (homogeneous_type, type(x)))
        # TODO: Could this be merged with obj.type.name == 'O' case above ?
        res = _PANDASTYPE2RPY2[homogeneous_type](obj)
    else:
        # converted as a numpy array
        func = numpy2ri.converter.py2rpy.registry[numpy.ndarray]
        # current conversion as performed by numpy

        res = func(obj.values)
        if len(obj.shape) == 1:
            if (obj.dtype != dt_O_type):
                # force into an R vector
                res = as_vector(res)

    # "index" is equivalent to "names" in R
    if obj.ndim == 1:
        res.do_slot_assign('names',
                           StrVector(tuple(str(x) for x in obj.index)))
    return res