def special_dtype_handling(df, spec_dtypes, spec_timezones, schema):
    """Apply the special-case dtype conversions to ``df``.

    First casts columns to the dtypes requested in ``spec_dtypes``, then
    applies timezone handling to every datetime column found afterwards.

    Args:
        df (pd.DataFrame): Dataframe being operated on
        spec_dtypes (dict<str:np.dtype or str>): mapping of column names
            to the dtypes they should be cast to
        spec_timezones (dict<str, str>): mapping of datetime columns to the
            timezone they represent. A timezone-naive column gets the
            timezone attached to its metadata (values untouched); a
            timezone-aware column is converted to the given timezone,
            likely modifying the stored times.
        schema (str): the schema of the table the df is being uploaded to

    Returns:
        pd.DataFrame: the transformed dataframe
    """
    df = apply_spec_dtypes(df, spec_dtypes)
    # Collect every datetime column, timezone-naive or -aware alike.
    dt_cols = []
    for column in df.columns:
        if is_datetime64_any_dtype(df.dtypes[column]):
            dt_cols.append(column)
    convert_to_spec_timezones(df, dt_cols, spec_timezones)
    make_datetimes_timezone_naive(df, dt_cols, schema)
    return df
def py2rpy_pandasseries(obj):
    """Convert a pandas Series to an R vector (rpy2 py2rpy conversion).

    Dispatch is on the Series dtype:
      * object ('O')  -> R character vector (with a warning)
      * category      -> R factor
      * datetime64    -> R POSIXct, built through ISOdatetime
      * anything else -> converted through the registered numpy converter
    The pandas index is attached to the result as the R 'names'
    (or 'dimnames') attribute.
    """
    if obj.dtype.name == 'O':
        warnings.warn('Element "%s" is of dtype "O" and converted '
                      'to R vector of strings.' % obj.name)
        res = StrVector(obj)
    elif obj.dtype.name == 'category':
        res = py2rpy_categoryseries(obj)
        res = FactorVector(res)
    elif is_datetime64_any_dtype(obj.dtype):
        # time series: decompose each timestamp into calendar fields and
        # rebuild an R POSIXct via ISOdatetime
        tzname = obj.dt.tz.zone if obj.dt.tz else ''
        d = [
            IntVector([x.year for x in obj]),
            IntVector([x.month for x in obj]),
            IntVector([x.day for x in obj]),
            IntVector([x.hour for x in obj]),
            IntVector([x.minute for x in obj]),
            # seconds carry the sub-second part as a fraction
            FloatSexpVector([x.second + x.microsecond * 1e-6 for x in obj])
        ]
        res = ISOdatetime(*d, tz=StrSexpVector([tzname]))
        # TODO: can the POSIXct be created from the POSIXct constructor ?
        # (is '<M8[ns]' mapping to Python datetime.datetime ?)
        res = POSIXct(res)
    elif (obj.dtype == dt_O_type):
        # NOTE(review): if dt_O_type is numpy's object dtype, this branch is
        # shadowed by the first `dtype.name == 'O'` check above — confirm.
        # Scan for a single homogeneous element type (None entries allowed).
        homogeneous_type = None
        for x in obj.values:
            if x is None:
                continue
            if homogeneous_type is None:
                homogeneous_type = type(x)
                continue
            if type(x) is not homogeneous_type:
                raise ValueError('Series can only be of one type, or None.')
        # TODO: Could this be merged with obj.type.name == 'O' case above ?
        res = {
            int: IntVector,
            bool: BoolVector,
            None: BoolVector,
            str: StrVector,
            bytes: numpy2ri.converter.py2rpy.registry[numpy.ndarray]
        }[homogeneous_type](obj)
    else:
        # converted as a numpy array
        func = numpy2ri.converter.py2rpy.registry[numpy.ndarray]
        # current conversion as performed by numpy
        res = func(obj)
        if len(obj.shape) == 1:
            if (obj.dtype != dt_O_type):
                # force into an R vector
                res = as_vector(res)
    # "index" is equivalent to "names" in R
    if obj.ndim == 1:
        res.do_slot_assign('names',
                           StrVector(tuple(str(x) for x in obj.index)))
    else:
        res.do_slot_assign('dimnames',
                           SexpVector(conversion.py2rpy(obj.index)))
    return res
def restore_type(self, dtype, sample=None):
    """Restore a schema type name from a Pandas dtype (and optional sample).

    The dtype is inspected first; if it does not resolve, ``sample`` (a
    representative Python value) is checked. Falls back to 'string'.
    """
    # Pandas dtype checks, in priority order (bool before integer before
    # generic numeric, since the predicates overlap).
    dtype_checks = (
        (pdc.is_bool_dtype, 'boolean'),
        (pdc.is_datetime64_any_dtype, 'datetime'),
        (pdc.is_integer_dtype, 'integer'),
        (pdc.is_numeric_dtype, 'number'),
    )
    for predicate, type_name in dtype_checks:
        if predicate(dtype):
            return type_name
    # Python value checks, in priority order (date before time; string
    # before time so text samples resolve as strings).
    if sample is not None:
        sample_checks = (
            ((list, tuple), 'array'),
            (datetime.date, 'date'),
            (isodate.Duration, 'duration'),
            (dict, 'object'),
            (six.string_types, 'string'),
            (datetime.time, 'time'),
        )
        for classes, type_name in sample_checks:
            if isinstance(sample, classes):
                return type_name
    return 'string'
def to_num_datetime(col, name='array', thresh=0.80, **kwargs):
    """Convert ``col`` to numeric or datetime if possible, otherwise return
    it unchanged.

    Parameters
    ----------
    col : series, scalar or ndarray
        will be turned into a Series
    name : str
        name of the col series (kept for interface compatibility; unused)
    thresh : float, default 0.8
        if more than this fraction of the non-null values can be
        converted, the conversion is committed
    **kwargs
        errors : {'ignore', 'raise', 'coerce'}, default 'coerce'
            - 'raise': invalid parsing raises an exception
            - 'coerce': invalid parsing is set as NaN
            - 'ignore': invalid parsing returns the input
        other pandas ``to_datetime`` keywords are passed through

    Returns
    -------
    pd.Series
        the converted (or original) series

    Raises
    ------
    Exception
        if ``col`` cannot be coerced into a 1-d Series
    """
    try:
        col = pd.Series(col)
    except Exception:
        raise Exception('col must be 1-d array/list/tuple/dict/Series')

    # Already-typed columns are returned untouched.
    if api.is_numeric_dtype(col):
        return col
    if api.is_datetime64_any_dtype(col):
        return col
    # isinstance replaces the deprecated api.is_categorical_dtype helper.
    if isinstance(col.dtype, pd.CategoricalDtype):
        return col
    if col.count() == 0:
        return col

    is_numeric_convertible = False
    not_null_count = col.count()

    try:
        num = pd.to_numeric(col, errors=kwargs.get('errors', 'coerce'))
        if num.count() / not_null_count >= thresh:
            col = num
            is_numeric_convertible = True
    except Exception:
        pass

    if not is_numeric_convertible:
        # BUG FIX: the original passed the misspelled key
        # 'infter_datetime_format' to pd.to_datetime, which raised
        # TypeError inside the bare try/except and silently disabled
        # datetime conversion altogether. Format inference is pandas'
        # default behavior, so the flag is simply dropped.
        params = {'errors': 'coerce'}
        params.update(kwargs)
        try:
            date = pd.to_datetime(col, **params)
            if pd.notnull(date).sum() / not_null_count >= thresh:
                col = date
        except Exception:
            pass

    return col
def py2ri_pandasseries(obj):
    """Convert a pandas Series to an R vector (rpy2 py2ri conversion).

    Dispatch is on the Series dtype:
      * object ('O')  -> R character vector (with a warning)
      * category      -> R factor
      * datetime64    -> R POSIXct, built through ISOdatetime
      * anything else -> converted through the registered numpy converter
    The pandas index is attached as the R 'names' (or 'dimnames')
    attribute.
    """
    # BUG FIX: the original tested ``numpy.dtype.name == 'O'``, comparing
    # the *class-level* descriptor of numpy.dtype against 'O' — always
    # False — so object-dtype Series silently fell through to the numpy
    # converter. The intended check is on the instance: ``obj.dtype.name``.
    if obj.dtype.name == 'O':
        warnings.warn(
            'Element "%s" is of dtype "O" and converted to R vector of strings.'
            % obj.name)
        res = StrVector(obj)
    elif obj.dtype.name == 'category':
        res = py2ri_categoryseries(obj)
        res = FactorVector(res)
    elif is_datetime64_any_dtype(obj.dtype):
        # time series: decompose each timestamp into calendar fields and
        # rebuild an R POSIXct via ISOdatetime
        tzname = obj.dt.tz.zone if obj.dt.tz else ''
        d = [
            IntVector([x.year for x in obj]),
            IntVector([x.month for x in obj]),
            IntVector([x.day for x in obj]),
            IntVector([x.hour for x in obj]),
            IntVector([x.minute for x in obj]),
            IntVector([x.second for x in obj])
        ]
        res = ISOdatetime(*d, tz=StrSexpVector([tzname]))
        # FIXME: can the POSIXct be created from the POSIXct constructor ?
        # (is '<M8[ns]' mapping to Python datetime.datetime ?)
        res = POSIXct(res)
    else:
        # converted as a numpy array
        func = numpy2ri.converter.py2ri.registry[numpy.ndarray]
        # current conversion as performed by numpy
        res = func(obj)
        if len(obj.shape) == 1:
            if (obj.dtype != dt_O_type):
                # force into an R vector
                res = as_vector(res)
    # "index" is equivalent to "names" in R
    if obj.ndim == 1:
        res.do_slot_assign('names',
                           StrVector(tuple(str(x) for x in obj.index)))
    else:
        res.do_slot_assign('dimnames',
                           SexpVector(conversion.py2ri(obj.index)))
    return res
def get_datetime_cols(df):
    """Return the names of all datetime columns in ``df``.

    Both timezone-naive and timezone-aware datetime dtypes are matched
    (via pandas' ``is_datetime64_any_dtype``).
    """
    datetime_cols = []
    for col_name, col_dtype in df.dtypes.items():
        if is_datetime64_any_dtype(col_dtype):
            datetime_cols.append(col_name)
    return datetime_cols
def to_num_datetime(col, name='array', thresh=0.75, **kwargs):
    """Convert ``col`` to numeric or datetime if possible, otherwise return
    it unchanged.

    Parameters
    ----------
    col : series, scalar or ndarray
        input sequence; will be turned into a Series
    name : str
        name of the col series (kept for interface compatibility; unused)
    thresh : float, default 0.75
        if more than this fraction of the non-null values can be
        converted, the conversion is committed

    Keyword Args
    ------------
    errors : {'ignore', 'raise', 'coerce'}, default 'coerce'
        - 'raise': invalid parsing raises an exception
        - 'coerce': invalid parsing is set as NaN
        - 'ignore': invalid parsing returns the input
    other pandas ``to_datetime`` keywords are passed through

    Returns
    -------
    s : series
        converted col

    Raises
    ------
    Exception
        if ``col`` cannot be coerced into a 1-d Series
    """
    try:
        col = pd.Series(col)
    except Exception:
        raise Exception('col must be 1-d array/list/tuple/dict/Series')

    # Already-typed columns are returned untouched.
    if api.is_numeric_dtype(col):
        return col
    if api.is_datetime64_any_dtype(col):
        return col
    # isinstance replaces the deprecated api.is_categorical_dtype helper.
    if isinstance(col.dtype, pd.CategoricalDtype):
        return col
    if col.count() == 0:
        return col
    # Values with leading zeros (e.g. zip or id codes) must stay strings;
    # raw string fixes the invalid '\d' escape in the original pattern.
    if col.astype(str).str.contains(r'^0\d+$').any():
        return col

    is_numeric_convertible = False
    not_null_count = col.count()

    try:
        num = pd.to_numeric(col, errors=kwargs.get('errors', 'coerce'))
        if num.count() / not_null_count >= thresh:
            col = num
            is_numeric_convertible = True
    except Exception:
        pass

    if not is_numeric_convertible:
        # infer_datetime_format is deprecated in modern pandas and format
        # inference is the default, so the flag is dropped from the params.
        params = {'errors': 'coerce'}
        params.update(kwargs)
        try:
            date = pd.to_datetime(col, **params)
            if pd.notnull(date).sum() / not_null_count >= thresh:
                col = date
            else:
                # not date-like enough: normalize remaining values to str
                col = col.apply(lambda x: x if pd.isna(x) else str(x))
        except Exception:
            pass

    return col
def dtype_specific_binary(left, right, numerics, datetimes, bools, strings,
                          categoricals, intervals, errors='ignore'):
    """
    A low-level base binary function that applies one of several binary
    callables to ``left`` and ``right`` depending on their dtypes. Can be
    used with functools.partial to create a custom binary function that can
    be passed to apply_columnwise or higher order compare_values utilities
    found elsewhere in this module. See examples for more details.

    This function supports 6 distinct groups of pandas dtypes, validated
    using the corresponding pandas dtype helpers:

    1) numeric        - is_numeric_dtype
    2) datetime-like  - is_datetime64_any_dtype or is_timedelta64_dtype
    3) bool           - is_bool_dtype
    4) string         - is_string_dtype
    5) categorical    - CategoricalDtype
    6) interval       - IntervalDtype

    If no supported dtype is matched, or ``left`` & ``right`` do not have
    matching dtypes, a pd.Series of NaN values is returned unless
    errors='raise' is passed, in which case a ValueError is raised.

    Parameters
    ----------
    left : pd.Series, pd.DataFrame, np.ndarray
    right : pd.Series, pd.DataFrame, np.ndarray
    numerics : binary callable applied to numeric dtypes
    datetimes : binary callable applied to datetime-like objects
    bools : binary callable applied to bool dtypes
    strings : binary callable applied to string-like dtypes
    categoricals : binary callable applied to Categorical dtype
    intervals : binary callable applied to Interval dtype
    errors : str default 'ignore'
        issues warning and returns NaNs when dtype of left, right do not
        match; if 'raise' is passed, will raise ValueError in such cases

    Returns
    -------
    result of applying a specific binary callable to `left` and `right`
    inputs based on dtype
    """
    _ld = left.dtype
    _rd = right.dtype
    if is_numeric_dtype(_ld) and is_numeric_dtype(_rd):
        # NOTE: pandas treats bool as numeric, so plain-bool Series land
        # here before the is_bool_dtype branch below.
        return numerics(left, right)
    elif ((is_datetime64_any_dtype(_ld) or is_timedelta64_dtype(_ld) or
           is_timedelta64_ns_dtype(_ld)) and
          (is_datetime64_any_dtype(_rd) or is_timedelta64_dtype(_rd) or
           is_timedelta64_ns_dtype(_rd))):
        return datetimes(left, right)
    elif is_bool_dtype(_ld) and is_bool_dtype(_rd):
        return bools(left, right)
    elif is_string_dtype(_ld) and is_string_dtype(_rd):
        return strings(left, right)
    # isinstance checks replace the deprecated is_categorical_dtype /
    # is_interval_dtype helpers; equivalent when given dtype objects.
    elif (isinstance(_ld, pd.CategoricalDtype) and
          isinstance(_rd, pd.CategoricalDtype)):
        return categoricals(left, right)
    elif (isinstance(_ld, pd.IntervalDtype) and
          isinstance(_rd, pd.IntervalDtype)):
        return intervals(left, right)
    else:
        # by default when dtypes are mismatched we issue a warning and
        # return NaNs; raise if the caller requires it
        if errors == 'raise':
            raise ValueError(
                f"left and right do not have matching supported dtypes: {_ld.name}, {_rd.name}"
            )
        else:
            # getattr guards fix an AttributeError in the original when
            # given np.ndarray inputs (which have no .name / .index).
            warnings.warn(
                f"left: {getattr(left, 'name', None)}, {_ld.name} and "
                f"right: {getattr(right, 'name', None)}, {_rd.name}"
                f" do not have comparable dtypes, returning NaNs")
            index = getattr(right, 'index', pd.RangeIndex(len(right)))
            return pd.Series(np.nan, index=index)
def py2rpy_pandasseries(obj):
    """Convert a pandas Series to an R vector (rpy2 py2rpy conversion).

    Dispatch is on the Series dtype:
      * object ('O')        -> R character vector (with a warning)
      * category            -> R factor
      * datetime64          -> R POSIXct, built through ISOdatetime
      * str-typed dtype     -> via _PANDASTYPE2RPY2[str]
      * nullable integers   -> via _PANDASTYPE2RPY2[int]
      * homogeneous object  -> via _PANDASTYPE2RPY2[<element type>]
      * anything else       -> through the registered numpy converter
    The pandas index is attached to the result as the R 'names'
    (or 'dimnames') attribute.
    """
    if obj.dtype.name == 'O':
        warnings.warn('Element "%s" is of dtype "O" and converted '
                      'to R vector of strings.' % obj.name)
        res = StrVector(obj)
    elif obj.dtype.name == 'category':
        res = py2rpy_categoryseries(obj)
        res = FactorVector(res)
    elif is_datetime64_any_dtype(obj.dtype):
        # time series: decompose each timestamp into calendar fields and
        # rebuild an R POSIXct via ISOdatetime
        tzname = obj.dt.tz.zone if obj.dt.tz else ''
        d = [IntVector([x.year for x in obj]),
             IntVector([x.month for x in obj]),
             IntVector([x.day for x in obj]),
             IntVector([x.hour for x in obj]),
             IntVector([x.minute for x in obj]),
             # seconds carry the sub-second part as a fraction
             FloatSexpVector([x.second + x.microsecond * 1e-6 for x in obj])]
        res = ISOdatetime(*d, tz=StrSexpVector([tzname]))
        # TODO: can the POSIXct be created from the POSIXct constructor ?
        # (is '<M8[ns]' mapping to Python datetime.datetime ?)
        res = POSIXct(res)
    elif obj.dtype.type == str:
        # pandas string dtype (element type is str)
        res = _PANDASTYPE2RPY2[str](obj)
    elif obj.dtype.name in integer_array_types:
        # pandas nullable integer dtypes (e.g. Int32/Int64)
        res = _PANDASTYPE2RPY2[int](obj)
        if len(obj.shape) == 1:
            if obj.dtype != dt_O_type:
                # force into an R vector
                res = as_vector(res)
    elif (obj.dtype == dt_O_type):
        # NOTE(review): if dt_O_type is numpy's object dtype, this branch is
        # shadowed by the first `dtype.name == 'O'` check above — confirm.
        # Scan for a single homogeneous element type; None and NA/NaN
        # entries are allowed alongside it.
        homogeneous_type = None
        for x in obj.values:
            if x is None:
                continue
            if homogeneous_type is None:
                homogeneous_type = type(x)
                continue
            if ((type(x) is not homogeneous_type)
                    and not ((isinstance(x, float) and math.isnan(x))
                             or pandas.isna(x))):
                raise ValueError(
                    'Series can only be of one type, or None '
                    '(and here we have %s and %s). If happening with '
                    'a pandas DataFrame the method infer_objects() '
                    'will normalize data types before conversion.'
                    % (homogeneous_type, type(x)))
        # TODO: Could this be merged with obj.type.name == 'O' case above ?
        res = _PANDASTYPE2RPY2[homogeneous_type](obj)
    else:
        # converted as a numpy array
        func = numpy2ri.converter.py2rpy.registry[numpy.ndarray]
        # current conversion as performed by numpy
        res = func(obj.values)
        if len(obj.shape) == 1:
            if (obj.dtype != dt_O_type):
                # force into an R vector
                res = as_vector(res)
    # "index" is equivalent to "names" in R
    if obj.ndim == 1:
        res.do_slot_assign('names',
                           StrVector(tuple(str(x) for x in obj.index)))
    else:
        res.do_slot_assign('dimnames',
                           SexpVector(conversion.py2rpy(obj.index)))
    return res