def series2col(s, name):
    """Build an ``fpb.Column`` of kind ``SLICE`` from a pandas Series.

    The column's value field and ``dtype`` are chosen from ``s.dtype``:

    * integer      -> ``ints``    / ``fpb.INTEGER``
    * float        -> ``floats``  / ``fpb.FLOAT``
    * object       -> ``strings`` / ``fpb.STRING``
    * bool         -> ``bools``   / ``fpb.BOOLEAN``
    * datetime     -> ``times``   / ``fpb.TIME`` (values cast to ``np.int64``)
    * categorical  -> ``strings`` / ``fpb.STRING`` (values cast to ``str``)

    Args:
        s: The pandas Series holding the column values.
        name: The name to give the resulting column.

    Returns:
        The ``fpb.Column`` constructed from the collected keyword arguments.

    Raises:
        WriteError: If ``s.dtype`` matches none of the supported kinds.
    """
    kw = {
        'name': name,
        'kind': fpb.Column.SLICE,
    }

    if is_integer(s.dtype):
        kw['dtype'] = fpb.INTEGER
        kw['ints'] = s
    elif is_float(s.dtype):
        kw['dtype'] = fpb.FLOAT
        kw['floats'] = s
    # Pandas dtype for str is object. Compare against the builtin ``object``:
    # the ``np.object`` alias is no longer available in current NumPy.
    elif s.dtype == object:
        kw['strings'] = s
        kw['dtype'] = fpb.STRING
    elif is_bool(s.dtype):
        kw['bools'] = s
        kw['dtype'] = fpb.BOOLEAN
    elif is_datetime(s.dtype):
        if s.dt.tz is not None:
            # NOTE(review): localizing an already tz-aware series is expected
            # to raise TypeError, in which case the values are converted to
            # UTC instead - confirm the localize attempt is still needed.
            try:
                s = s.dt.tz_localize(pytz.UTC)
            except TypeError:
                s = s.dt.tz_convert('UTC')
        kw['times'] = s.astype(np.int64)
        kw['dtype'] = fpb.TIME
    elif is_categorical_dtype(s.dtype):
        # We assume categorical data is strings
        kw['strings'] = s.astype(str)
        kw['dtype'] = fpb.STRING
    else:
        raise WriteError(
            '{} - unsupported type - {}'.format(s.name, s.dtype))

    return fpb.Column(**kw)
def _check_op(self, s, op_name, other, exc=None):
    """Apply the named operator to ``s`` and ``other`` and verify the result.

    The operator is looked up via ``self.get_op_from_name``. An expected
    value is computed on a plain (non-masked) version of ``s`` and handed,
    together with an NA mask, to ``self._check_op_float`` or
    ``self._check_op_integer``.

    Args:
        s: Left operand; must provide ``isna``, ``astype`` and ``values._data``.
        op_name: Dunder name of the operator (e.g. ``'__add__'``).
        other: Right operand.
        exc: Unused by this method.
    """
    operator_fn = self.get_op_from_name(op_name)
    actual = operator_fn(s, other)

    # Start from the NA positions of the left operand.
    na_mask = s.isna()

    if isinstance(other, IntegerArray):
        # The mask is replaced by other's ``data`` attribute (or ``other``
        # itself when absent) and OR-ed in place with other's ``mask``
        # attribute when present.
        # NOTE(review): intent of replacing (rather than combining) the mask
        # is not evident from this block - confirm.
        other_mask = getattr(other, 'mask', None)
        na_mask = getattr(other, 'data', other)
        if other_mask is not None:
            na_mask |= other_mask

    division_ops = ['__rtruediv__', '__truediv__', '__rdiv__', '__div__']
    float_expected = (
        is_float_dtype(other) or is_float(other) or op_name in division_ops
    )

    if float_expected:
        # Float operand or division: compare against a float computation.
        as_float = s.astype('float')
        expected = operator_fn(as_float, other)
        self._check_op_float(actual, expected, na_mask, s, op_name, other)
        return

    # Otherwise compare against the raw underlying data.
    raw = pd.Series(s.values._data)
    expected = operator_fn(raw, other)
    self._check_op_integer(actual, expected, na_mask, s, op_name, other)
def infer_dtype_by_scaladata(data):
    """Infer the ``DataType`` member that corresponds to a scalar value.

    Checks run in a fixed order and the first match wins. ``bool`` is tested
    before ``int`` because ``bool`` is a subclass of ``int``.

    Args:
        data: The scalar value to classify.

    Returns:
        The matching ``DataType`` member, or ``DataType.UNKNOWN`` when no
        check matches.
    """
    if isinstance(data, float):
        return DataType.DOUBLE
    if isinstance(data, bool):
        return DataType.BOOL
    if isinstance(data, int):
        return DataType.INT64
    if isinstance(data, str):
        return DataType.STRING
    if isinstance(data, np.float64):
        return DataType.DOUBLE
    if isinstance(data, np.float32):
        return DataType.FLOAT
    if isinstance(data, np.int64):
        return DataType.INT64
    if isinstance(data, np.int32):
        return DataType.INT32
    if isinstance(data, np.int16):
        return DataType.INT16
    if isinstance(data, np.int8):
        return DataType.INT8
    # ``np.bool8`` was only an alias of ``np.bool_`` and no longer exists in
    # NumPy 2.x; the single ``np.bool_`` check covers both spellings.
    if isinstance(data, np.bool_):
        return DataType.BOOL
    if isinstance(data, bytes):
        return DataType.BINARY_VECTOR
    # Fallback for any remaining value the ``is_float`` helper recognises.
    if is_float(data):
        return DataType.DOUBLE
    return DataType.UNKNOWN
def default_display_func(x):
    """Return the default display representation of a single value.

    ``self`` is a free variable taken from the enclosing scope; its
    ``na_rep`` and ``precision`` attributes are read here.

    * Missing values are replaced by ``self.na_rep`` when it is not ``None``.
    * Floats are formatted with the ``n`` presentation type, using
      ``self.precision`` plus the length of the integer part as precision.
    * Integers are formatted with the ``n`` presentation type.
    * Anything else is returned unchanged.

    Args:
        x: The value to format.

    Returns:
        A formatted string for missing values (when ``na_rep`` is set),
        floats and integers; otherwise ``x`` itself.
    """
    if self.na_rep is not None and pd.isna(x):
        return self.na_rep
    elif is_float(x):
        try:
            n_integer_chars = len(str(int(x)))
        except (ValueError, OverflowError):
            # NaN and +/-infinity cannot be converted to int. Without a
            # ``na_rep`` they reach this branch, so treat them as having no
            # integer part instead of crashing.
            n_integer_chars = 0
        # NOTE(review): for negative values the length includes the minus
        # sign, so they get one more digit than positives - confirm intent.
        n_precision = n_integer_chars + self.precision
        return f"{x:.{n_precision}n}"
    elif is_integer(x):
        return f"{x:n}"
    else:
        return x
def get_actual_types(df):
    """Map every column of ``df`` to an ``fpb`` type constant.

    Integer, float, bool, datetime and categorical dtypes map directly to a
    constant. For columns matched by ``is_string``, the first non-null value
    decides between ``fpb.STRING``, ``fpb.BOOLEAN`` and ``fpb.TIME``.

    Args:
        df: The DataFrame whose columns are inspected.

    Returns:
        A dict keyed by column name with the ``fpb`` type constant as value.

    Raises:
        WriteError: If a column dtype is unsupported, or if the first
            non-null value of a string-typed column has an unsupported type.
    """
    column_types = {}

    for col_name in df.columns:
        series = df[col_name]
        dtype = series.dtype

        if is_integer(dtype):
            inferred = fpb.INTEGER
        elif is_float(dtype):
            inferred = fpb.FLOAT
        elif is_string(dtype):
            # The first non-null value determines the column type.
            for value in series:
                if pd.isnull(value):
                    continue
                if isinstance(value, str):
                    inferred = fpb.STRING
                elif isinstance(value, bool):
                    inferred = fpb.BOOLEAN
                elif isinstance(value, (pd.Timestamp, datetime)):
                    inferred = fpb.TIME
                else:
                    raise WriteError(
                        '{} - contains an unsupported value type - {}'.format(
                            col_name, type(value)))
                break
            else:
                # Every item is null, so the actual type does not matter;
                # mark the column as NULL.
                inferred = fpb.NULL
        elif is_bool(dtype):
            inferred = fpb.BOOLEAN
        elif is_datetime(dtype):
            inferred = fpb.TIME
        elif is_categorical_dtype(dtype):
            # We assume categorical data is strings
            inferred = fpb.STRING
        else:
            raise WriteError('{} - unsupported type - {}'.format(
                col_name, dtype))

        column_types[series.name] = inferred

    return column_types
def _check_op(self, s, op_name, other, exc=None):
    """Apply the named operator to ``s`` and ``other`` and verify the result.

    The operator is looked up via ``self.get_op_from_name``. An expected
    value is computed on a plain (non-masked) version of ``s`` and handed,
    together with an NA mask, to ``self._check_op_float`` or
    ``self._check_op_integer``.

    Args:
        s: Left operand (Series or single-column DataFrame).
        op_name: Dunder name of the operator (e.g. ``"__add__"``).
        other: Right operand.
        exc: Unused by this method.
    """
    operator_fn = self.get_op_from_name(op_name)
    actual = operator_fn(s, other)

    # Start from the NA positions of the left operand.
    na_mask = s.isna()

    # A DataFrame is squeezed so the comparison below works on Series.
    if isinstance(s, pd.DataFrame):
        actual = actual.squeeze()
        s = s.squeeze()
        na_mask = na_mask.squeeze()

    if isinstance(other, IntegerArray):
        # The mask is replaced by other's ``data`` attribute (or ``other``
        # itself when absent) and OR-ed in place with other's ``mask``
        # attribute when present.
        # NOTE(review): intent of replacing (rather than combining) the mask
        # is not evident from this block - confirm.
        other_mask = getattr(other, "mask", None)
        na_mask = getattr(other, "data", other)
        if other_mask is not None:
            na_mask |= other_mask

    # Power with a base of 1: those positions are removed from the mask.
    if op_name == "__pow__":
        na_mask = np.where(~s.isna() & (s == 1), False, na_mask)
    elif op_name == "__rpow__":
        other_is_one = other == 1
        if isinstance(other_is_one, pd.Series):
            other_is_one = other_is_one.fillna(False)
        na_mask = np.where(other_is_one, False, na_mask)

    division_ops = ["__rtruediv__", "__truediv__", "__rdiv__", "__div__"]
    float_expected = (
        is_float_dtype(other) or is_float(other) or op_name in division_ops
    )

    if float_expected:
        # Float operand or division: compare against a float computation.
        as_float = s.astype("float")
        expected = operator_fn(as_float, other)
        self._check_op_float(actual, expected, na_mask, s, op_name, other)
        return

    # Otherwise compare against the raw underlying data, keeping the name.
    raw = pd.Series(s.values._data, name=s.name)
    expected = operator_fn(raw, other)
    self._check_op_integer(actual, expected, na_mask, s, op_name, other)
def _check_op(self, s, op_name, other, exc=None):
    """Apply the named operator to ``s`` and ``other`` and verify the result.

    The operator is looked up via ``self.get_op_from_name``. An expected
    value is computed on a plain (non-masked) version of ``s`` and handed,
    together with an NA mask, to ``self._check_op_float`` or
    ``self._check_op_integer``.

    Args:
        s: Left operand (Series or single-column DataFrame).
        op_name: Dunder name of the operator (e.g. ``'__add__'``).
        other: Right operand.
        exc: Unused by this method.
    """
    operator_fn = self.get_op_from_name(op_name)
    actual = operator_fn(s, other)

    # Start from the NA positions of the left operand.
    na_mask = s.isna()

    # A DataFrame is squeezed so the comparison below works on Series.
    if isinstance(s, pd.DataFrame):
        actual = actual.squeeze()
        s = s.squeeze()
        na_mask = na_mask.squeeze()

    if isinstance(other, IntegerArray):
        # The mask is replaced by other's ``data`` attribute (or ``other``
        # itself when absent) and OR-ed in place with other's ``mask``
        # attribute when present.
        # NOTE(review): intent of replacing (rather than combining) the mask
        # is not evident from this block - confirm.
        other_mask = getattr(other, 'mask', None)
        na_mask = getattr(other, 'data', other)
        if other_mask is not None:
            na_mask |= other_mask

    # Power with a base of 1: those positions are removed from the mask.
    if op_name == '__pow__':
        na_mask = np.where(s == 1, False, na_mask)
    elif op_name == '__rpow__':
        na_mask = np.where(other == 1, False, na_mask)

    division_ops = ['__rtruediv__', '__truediv__', '__rdiv__', '__div__']
    float_expected = (
        is_float_dtype(other) or is_float(other) or op_name in division_ops
    )

    if float_expected:
        # Float operand or division: compare against a float computation.
        as_float = s.astype('float')
        expected = operator_fn(as_float, other)
        self._check_op_float(actual, expected, na_mask, s, op_name, other)
        return

    # Otherwise compare against the raw underlying data.
    raw = pd.Series(s.values._data)
    expected = operator_fn(raw, other)
    self._check_op_integer(actual, expected, na_mask, s, op_name, other)