def _check_header(data): """Validates the first few header bytes of a serialized DataFrame. If the serialized DataFrame is compressed, then this function performs decomression and returns the result Args: data: A bytearray representing the serialized DataFrame Returns: The argument bytearray, possibly decomressed Raises: DataFrameException: If the validation failed """ if not isinstance(data, bytearray): raise ValueError( ("Invalid argument 'data'. " "Expected bytearray but found {}").format(type(data))) if data[0] == 0x64 and data[1] == 0x66: data = _decompress(data) # validate the first bytes of the header and # the used format version must start with '{v:' if (data[0] != 0x7b or data[1] != 0x76 or data[2] != 0x3a or (data[3] != 0x32 and data[3] != 0x31)): raise dataframe.DataFrameException("Unsupported encoding") if data[3] != 0x32: # encoding version 2 raise dataframe.DataFrameException( ("Unsupported encoding version (v:{})").format(data[3])) return data
def _decompress(data): """Decompresses the given bytearray. Args: data: The bytearray to decompress Returns: The decompressed bytearray """ if not len(data) > 2: raise dataframe.DataFrameException("Invalid data format") if data[0] != 0x64 or data[1] != 0x66: raise dataframe.DataFrameException( ("Invalid data format. Is not a .df file. " "Starts with 0x{} 0x{}").format( bytearray(data[0].to_bytes(1, byteorder="big", signed=True)).hex(), bytearray(data[1].to_bytes(1, byteorder="big", signed=True)).hex())) data[0] = 0x78 data[1] = 0x9c data = zlib.decompress(data) return bytearray(data)
def _check_type(self, value): if isinstance(value, int): if (value < -2147483648) or (value > 2147483647): raise dataframe.DataFrameException( "Out of range int (int32): {}".format(value)) elif value is not None and not isinstance(value, np.int32): raise dataframe.DataFrameException( ("Invalid argument. Expected " "int (int32) but found {}".format(type(value))))
def _check_type(self, value): if isinstance(value, int): if (value < -9223372036854775808) or (value > 9223372036854775807): raise dataframe.DataFrameException( "Out of range long (int64): {}".format(value)) elif value is not None and not isinstance(value, np.int64): raise dataframe.DataFrameException( ("Invalid argument. Expected " "long (int64) but found {}".format(type(value))))
def _check_type(self, value): if value is None: raise dataframe.DataFrameException( ("Invalid argument. " "BinaryColumn cannot use None values")) if not isinstance(value, bytearray): raise dataframe.DataFrameException( ("Invalid argument. Expected " "bytearray but found {}".format(type(value))))
def _check_type(self, value): if isinstance(value, int): if (value < -32768) or (value > 32767): raise dataframe.DataFrameException( "Out of range short (int16): {}".format(value)) elif value is not None and not isinstance(value, np.int16): raise dataframe.DataFrameException( ("Invalid argument. Expected " "short (int16) but found {}".format(type(value))))
def _check_type(self, value): if value is None: raise dataframe.DataFrameException( ("Invalid argument. " "BooleanColumn cannot use None values")) if not isinstance(value, (bool, np.bool, np.bool_)): raise dataframe.DataFrameException( ("Invalid argument. Expected " "boolean (bool) but found {}").format(type(value)))
def _check_type(self, value): if value is None: raise dataframe.DataFrameException( ("Invalid argument. " "DoubleColumn cannot use None values")) if not isinstance(value, float) and not isinstance(value, np.float64): raise dataframe.DataFrameException( ("Invalid argument. Expected " "double (float64) but found {}".format(type(value))))
def __init__(self, name=None, values=None):
    """Constructs a new BinaryColumn.

    The name is passed on to the parent constructor unchanged. The
    content is taken from the given list or numpy array. If an int is
    given instead, the content is obtained from _create_array() with
    that size. If no values are given, _create_array() is called
    without arguments.

    Args:
        name: The name of the BinaryColumn as a string
        values: The content of the BinaryColumn.
            Must be a list or numpy array with dtype object, or an int

    Raises:
        DataFrameException: If the values contain None or anything
            other than bytearrays, if a numpy array does not have dtype
            object, or if the values argument has an unsupported type
    """
    if values is None:
        values = self._create_array()

    if isinstance(values, list):
        # Entries are copied one at a time: letting numpy convert the
        # list would turn bytearrays of equal length into ndarray types
        content = np.empty(len(values), dtype="object")
        for position, item in enumerate(values):
            if item is None:
                raise dataframe.DataFrameException(
                    "BinaryColumn cannot use None values")

            if not isinstance(item, bytearray):
                raise dataframe.DataFrameException(
                    "List must only contain bytearrays")

            content[position] = item

        values = content
    elif isinstance(values, np.ndarray):
        if values.dtype != "object":
            raise dataframe.DataFrameException(
                ("Invalid argument array. Expected "
                 "bytearray array (object) but found {}".format(
                     values.dtype)))

        for item in values:
            self._check_type(item)
    elif isinstance(values, int):
        values = self._create_array(size=values)
    else:
        raise dataframe.DataFrameException(
            ("Invalid argument array. Expected "
             "list or numpy array but found {}".format(type(values))))

    super().__init__(name, values)
def __init__(self, name=None, values=None):
    """Constructs a new StringColumn.

    The name is passed on to the parent constructor unchanged. The
    content is taken from the given list or numpy array. Entries that
    are falsy (for example empty strings) are stored as
    StringColumn.DEFAULT_VALUE. If an int is given instead, the Column
    gets that length and every entry is StringColumn.DEFAULT_VALUE.

    A list argument is copied and never modified. A numpy array
    argument is used directly as the Column content, so falsy entries
    are replaced inside the given array.

    Args:
        name: The name of the StringColumn as a string
        values: The content of the StringColumn.
            Must be a list or numpy array with dtype object, or an int

    Raises:
        DataFrameException: If a list entry is rejected by
            _check_type(), if a numpy array does not have dtype object
            or holds a non-string, or if the values argument has an
            unsupported type
    """
    # the builtin 'object' is used as dtype because the 'np.object'
    # alias is not available in current NumPy releases
    if values is None:
        values = np.empty(0, dtype=object)

    if isinstance(values, list):
        # fill a fresh array so that the caller's list stays untouched
        content = np.empty(len(values), dtype=object)
        for i, value in enumerate(values):
            self._check_type(value)
            content[i] = value if value else StringColumn.DEFAULT_VALUE

        values = content
    elif isinstance(values, np.ndarray):
        if values.dtype != "object":
            raise dataframe.DataFrameException(
                ("Invalid argument array. Expected "
                 "string array (object) but found {}".format(values.dtype)))

        for i, value in enumerate(values):
            if not isinstance(value, str):
                # report the type of the offending element; the dtype
                # of the array is always 'object' at this point
                raise dataframe.DataFrameException(
                    ("Invalid element in argument array. Expected "
                     "string (str) but found {}".format(type(value))))

            if not value:
                values[i] = StringColumn.DEFAULT_VALUE
    elif isinstance(values, int):
        values = np.empty(values, dtype=object)
        for i in range(values.shape[0]):
            values[i] = StringColumn.DEFAULT_VALUE
    else:
        raise dataframe.DataFrameException(
            ("Invalid argument array. Expected "
             "list or numpy array but found {}".format(type(values))))

    super().__init__(name, values)
def __init__(self, name=None, values=None):
    """Constructs a new CharColumn.

    The name is passed on to the parent constructor unchanged. A list
    argument is expected to hold single characters, which are stored as
    their code points. A numpy array must have dtype uint8 and may only
    hold the codes 32 to 126. If an int is given instead, the Column
    gets that length and every entry is the code point of
    CharColumn.DEFAULT_VALUE.

    Args:
        name: The name of the CharColumn as a string
        values: The content of the CharColumn.
            Must be a list or numpy array with dtype uint8, or an int

    Raises:
        DataFrameException: If a list entry is rejected by
            _check_type(), if a numpy array has the wrong dtype or
            holds a code outside of the printable ASCII range, or if
            the values argument has an unsupported type
    """
    if values is None:
        values = np.empty(0, dtype=np.uint8)

    if isinstance(values, list):
        codes = self._create_array(len(values))
        for position, char in enumerate(values):
            self._check_type(char)
            codes[position] = ord(char)

        values = codes
    elif isinstance(values, np.ndarray):
        if values.dtype != "uint8":
            raise dataframe.DataFrameException(
                ("Invalid argument array. Expected "
                 "char array (uint8) but found {}".format(values.dtype)))

        for position, code in enumerate(values):
            if not 32 <= code <= 126:
                raise dataframe.DataFrameException(
                    ("Invalid character value for CharColumn at index {}. "
                     "Only printable ASCII is permitted").format(position))
    elif isinstance(values, int):
        values = np.zeros(values, dtype=np.uint8)
        # every entry starts out as the default character
        values[:] = ord(CharColumn.DEFAULT_VALUE)
    else:
        raise dataframe.DataFrameException(
            ("Invalid argument array. Expected "
             "list or numpy array but found {}".format(type(values))))

    super().__init__(name, values)
def _check_type(self, value): if value is None: raise dataframe.DataFrameException( ("Invalid argument. " "ByteColumn cannot use None values")) if isinstance(value, int): if (value < -128) or (value > 127): raise dataframe.DataFrameException( "Out of range byte (int8): {}".format(value)) elif not isinstance(value, np.int8): raise dataframe.DataFrameException( ("Invalid argument. Expected " "byte (int8) but found {}".format(type(value))))
def serialize(df, compress=False):
    """Serializes the given DataFrame to a bytearray.

    The DataFrame is always serialized with _serialize_v2(). When the
    compress flag is set, the serialized form is additionally passed
    through _compress() before it is returned.

    Args:
        df: The DataFrame to serialize. Must not be None
        compress: A boolean flag indicating whether the returned bytearray
            should be compressed. Must be a bool

    Returns:
        A bytearray representing the given DataFrame in a serialized form

    Raises:
        ValueError: If df is not a DataFrame or compress is not a bool
        DataFrameException: If df is None or if any errors occur during
            serialization or compression
    """
    if df is None:
        raise dataframe.DataFrameException(
            "DataFrame argument must not be None")

    if not isinstance(df, dataframe.DataFrame):
        raise ValueError(
            ("Invalid argument 'df'. "
             "Expected raven.struct.dataframe.DataFrame but found {}").format(
                 type(df)))

    if not isinstance(compress, bool):
        raise ValueError(("Invalid argument 'compress'. "
                          "Expected bool but found {}").format(type(compress)))

    payload = _serialize_v2(df)
    if compress:
        payload = _compress(payload)

    return payload
def _check_bounds(self, index): """Checks array bounds for the specified index. This method raises a DataFrameException if the specified index is out of bounds. """ if index < 0 or index >= self._values.shape[0]: raise dataframe.DataFrameException("Invalid row index: {}".format(index))
def _check_type(self, value): if value is not None: if isinstance(value, str): if len(value) != 1: raise dataframe.DataFrameException( ("Invalid character value. Expected string of " "length 1 but found length {}".format(len(value)))) byte = ord(value) if (byte < 32) or (byte > 126): raise dataframe.DataFrameException( ("Invalid character value for NullableCharColumn. " "Only printable ASCII is permitted")) else: raise dataframe.DataFrameException( ("Invalid argument. Expected " "char (str) but found {}".format(type(value))))
def _cast_to_numeric_type(col, value):
    """Casts the specified double to the number type of the specified Column.

    For double and float Columns the value is returned as a float. For
    byte, short, int and long Columns it is returned as an int. For the
    nullable integer Columns a NaN value is returned as None instead.

    Args:
        col: The Column which specifies the numeric type
        value: The float value to cast

    Returns:
        A number which has the concrete type used by the specified Column

    Raises:
        DataFrameException: If the type code of the Column is not one
            of the numeric Column type codes
    """
    code = col.type_code()

    if col.is_nullable():
        if code in (doublecolumn.NullableDoubleColumn.TYPE_CODE,
                    floatcolumn.NullableFloatColumn.TYPE_CODE):
            return float(value)

        if code in (bytecolumn.NullableByteColumn.TYPE_CODE,
                    shortcolumn.NullableShortColumn.TYPE_CODE,
                    intcolumn.NullableIntColumn.TYPE_CODE,
                    longcolumn.NullableLongColumn.TYPE_CODE):
            # NaN has no integer representation and becomes a null entry
            return None if np.isnan(value) else int(value)

        raise dataframe.DataFrameException("Unrecognized column type")

    if code in (doublecolumn.DoubleColumn.TYPE_CODE,
                floatcolumn.FloatColumn.TYPE_CODE):
        return float(value)

    if code in (bytecolumn.ByteColumn.TYPE_CODE,
                shortcolumn.ShortColumn.TYPE_CODE,
                intcolumn.IntColumn.TYPE_CODE,
                longcolumn.LongColumn.TYPE_CODE):
        return int(value)

    raise dataframe.DataFrameException("Unrecognized column type")
def __init__(self, name=None, values=None):
    """Constructs a new NullableBooleanColumn.

    The name is passed on to the parent constructor unchanged. The
    content is taken from the given list or numpy array, with every
    entry being checked by _check_type(). If an int is given instead,
    the Column content is a new object array of that length created
    with numpy.empty().

    Args:
        name: The name of the NullableBooleanColumn as a string
        values: The content of the NullableBooleanColumn.
            Must be a list or numpy array with dtype object, or an int

    Raises:
        DataFrameException: If an entry is rejected by _check_type(),
            if a numpy array does not have dtype object, or if the
            values argument has an unsupported type
    """
    # the builtin 'object' is used as dtype because the 'np.object'
    # alias is not available in current NumPy releases
    if values is None:
        values = np.empty(0, dtype=object)

    if isinstance(values, list):
        for value in values:
            self._check_type(value)

        values = np.array(values, dtype=object)
    elif isinstance(values, np.ndarray):
        if values.dtype != "object":
            raise dataframe.DataFrameException(
                ("Invalid argument array. Expected "
                 "boolean array (object) but found {}".format(
                     values.dtype)))

        for value in values:
            self._check_type(value)
    elif isinstance(values, int):
        values = np.empty(values, dtype=object)
    else:
        raise dataframe.DataFrameException(
            ("Invalid argument array. Expected "
             "list or numpy array but found {}".format(type(values))))

    super().__init__(name, values)
def __init__(self, name=None, values=None): """Assigns this Column instance the specified name and values. This constructor should be called by all subclasses. Args: name: The name to assign to this Column. Must be a string values: The values to assign to this Column. Must be a numpy array """ if name is not None and not isinstance(name, str): raise dataframe.DataFrameException( ("Invalid argument 'name'. " "Expected str but found {}").format(type(name))) if not isinstance(values, np.ndarray): raise dataframe.DataFrameException( ("Invalid argument 'values'. " "Expected numpy.ndarray but found {}").format(type(values))) self._name = name self._values = values
def convert_to(self, typecode):
    """Converts this NullableCharColumn to a Column of the given type.

    Every entry of this Column is either None or a character code
    which is turned into a character with chr() before it is converted.
    For non-nullable target types, None entries are replaced by a
    placeholder value (0, 0.0, False, the default string, the default
    character or a single zero byte). For nullable target types, None
    entries stay None.

    Args:
        typecode: The type code of the Column type to convert to

    Returns:
        A new Column of the requested type which carries the name
        of this Column

    Raises:
        DataFrameException: If the type code is unknown or if an entry
            cannot be interpreted as a boolean
        ValueError: If an entry cannot be parsed as a number when
            converting to a numeric type
    """
    source = self._values
    length = source.shape[0]
    columns = dataframe.DataFrame

    def mapped(dtype, convert, missing):
        # Creates the content of the converted Column: every non-None
        # entry is converted, every None entry becomes 'missing'
        out = np.empty([length], dtype=dtype)
        for i, code in np.ndenumerate(source):
            out[i] = convert(code) if code is not None else missing
        return out

    def as_int(code):
        return int(chr(code))

    def as_float(code):
        return float(chr(code))

    def as_bytes(code):
        return bytearray(chr(code).encode("utf-8"))

    def as_bool(code):
        char = chr(code)
        lowered = char.lower()
        if lowered in ("t", "1", "y"):
            return True
        if lowered in ("f", "0", "n"):
            return False
        # show the offending character itself rather than its code
        raise dataframe.DataFrameException(
            ("Invalid boolean character: '{}'".format(char)))

    # NOTE: the builtin 'object' and 'bool' are used as dtypes because
    # the 'np.object' alias (and in some releases the 'np.bool' alias)
    # is not available in current NumPy versions
    if typecode == utils.type_code_byte_column():
        converted = columns.ByteColumn(values=mapped(np.int8, as_int, 0))
    elif typecode == utils.type_code_short_column():
        converted = columns.ShortColumn(values=mapped(np.int16, as_int, 0))
    elif typecode == utils.type_code_int_column():
        converted = columns.IntColumn(values=mapped(np.int32, as_int, 0))
    elif typecode == utils.type_code_long_column():
        converted = columns.LongColumn(values=mapped(np.int64, as_int, 0))
    elif typecode == utils.type_code_string_column():
        converted = columns.StringColumn(
            values=mapped(object, chr, utils.default_value_string_column()))
    elif typecode == utils.type_code_float_column():
        converted = columns.FloatColumn(
            values=mapped(np.float32, as_float, 0.0))
    elif typecode == utils.type_code_double_column():
        converted = columns.DoubleColumn(
            values=mapped(np.float64, as_float, 0.0))
    elif typecode == utils.type_code_char_column():
        # codes are carried over as they are
        converted = columns.CharColumn(
            values=mapped(np.uint8, lambda code: code,
                          ord(CharColumn.DEFAULT_VALUE)))
    elif typecode == utils.type_code_boolean_column():
        converted = columns.BooleanColumn(
            values=mapped(bool, as_bool, False))
    elif typecode == utils.type_code_binary_column():
        vals = np.empty([length], dtype=object)
        for i, code in np.ndenumerate(source):
            if code is not None:
                vals[i] = as_bytes(code)
            else:
                # a new bytearray per entry, so that rows never share
                # one mutable placeholder
                vals[i] = bytearray.fromhex("00")

        converted = columns.BinaryColumn(values=vals)
    elif typecode == utils.type_code_nullable_byte_column():
        converted = columns.NullableByteColumn(
            values=mapped(object, as_int, None))
    elif typecode == utils.type_code_nullable_short_column():
        converted = columns.NullableShortColumn(
            values=mapped(object, as_int, None))
    elif typecode == utils.type_code_nullable_int_column():
        converted = columns.NullableIntColumn(
            values=mapped(object, as_int, None))
    elif typecode == utils.type_code_nullable_long_column():
        converted = columns.NullableLongColumn(
            values=mapped(object, as_int, None))
    elif typecode == utils.type_code_nullable_string_column():
        converted = columns.NullableStringColumn(
            values=mapped(object, chr, None))
    elif typecode == utils.type_code_nullable_float_column():
        converted = columns.NullableFloatColumn(
            values=mapped(object, as_float, None))
    elif typecode == utils.type_code_nullable_double_column():
        converted = columns.NullableDoubleColumn(
            values=mapped(object, as_float, None))
    elif typecode == NullableCharColumn.TYPE_CODE:
        converted = self.clone()
    elif typecode == utils.type_code_nullable_boolean_column():
        converted = columns.NullableBooleanColumn(
            values=mapped(object, as_bool, None))
    elif typecode == utils.type_code_nullable_binary_column():
        converted = columns.NullableBinaryColumn(
            values=mapped(object, as_bytes, None))
    else:
        raise dataframe.DataFrameException(
            "Unknown column type code: {}".format(typecode))

    # pylint: disable=protected-access
    converted._name = self._name
    return converted
def _wrap_negative_index(index, size, label):
    """Translates a negative index into the equivalent non-negative one."""
    if abs(index) > size:
        raise dataframe.DataFrameException(
            "Invalid {} index: {}".format(label, index))

    return index % size


def _setitem_single_column(arg, cols, rows, value):
    """Handles assignments which address one column by index or name."""
    if isinstance(cols, int) and cols < 0:
        cols = _wrap_negative_index(cols, arg.columns(), "column")

    if rows is None:
        # implements df[x, :] = Column
        # and        df["x", :] = Column
        arg.set_column(cols, value)
    elif isinstance(rows, int):
        # implements df[x, y] = v
        # and        df["x", y] = v
        if rows < 0:
            rows = _wrap_negative_index(rows, arg.rows(), "row")

        arg.get_column(cols).set_value(rows, value)
    elif isinstance(rows, str):
        # implements df[x, "y_regex"] = v | func | lambda
        # and        df["x", "y_regex"] = v | func | lambda
        arg.replace(cols, rows, replacement=value)
    elif isinstance(rows, tuple):
        col = arg.get_column(cols)
        if isinstance(value, (list, tuple)):
            # implements df[x, (y0, y1, ..., yn)] = (v0, v1, ..., vn)
            # and        df["x", (y0, y1, ..., yn)] = (v0, v1, ..., vn)
            if len(rows) != len(value):
                raise dataframe.DataFrameException((
                    "Invalid value argument. The specified "
                    "list/tuple has a size of {} but the row position "
                    "argument has a size of {}").format(
                        len(value), len(rows)))

            for index, item in zip(rows, value):
                col.set_value(index, item)
        else:
            # implements df[x, (y0, y1, ..., yn)] = v
            # and        df["x", (y0, y1, ..., yn)] = v
            for index in rows:
                col.set_value(index, value)
    elif isinstance(rows, slice):
        # A range knows exactly how many indices the slice selects.
        # Computing (stop - start) // step instead is off whenever the
        # span is not a multiple of the step or the slice is empty
        selected = range(*rows.indices(arg.rows()))
        col = arg.get_column(cols)
        if isinstance(value, (list, tuple)):
            # implements df[x, y0:y1:y2] = (v0, v1, ..., vn)
            # and        df["x", y0:y1:y2] = (v0, v1, ..., vn)
            if len(selected) != len(value):
                raise dataframe.DataFrameException((
                    "Invalid value argument. The specified "
                    "list/tuple has a size of {} but the row position "
                    "argument has a size of {}").format(
                        len(value), len(selected)))

            for index, item in zip(selected, value):
                col.set_value(index, item)
        else:
            # implements df[x, y0:y1:y2] = v
            # and        df["x", y0:y1:y2] = v
            for index in selected:
                col.set_value(index, value)
    else:
        # invalid type for row position arg
        raise dataframe.DataFrameException(
            ("Invalid row position type. "
             "Expected int or str but found {}").format(type(rows)))


def _setitem_multi_columns(arg, cols, rows, value):
    """Handles assignments which address columns by tuple or slice."""
    # prefetch the selected columns as a DataFrame
    if isinstance(cols, tuple):
        cols_selected = arg.get_columns(cols=cols)
    else:  # is slice
        # pylint: disable=protected-access
        cols_selected = (dataframe.NullableDataFrame(
            arg._internal_columns()[cols])
                         if arg.is_nullable()
                         else dataframe.DefaultDataFrame(
                             arg._internal_columns()[cols]))

    if isinstance(rows, int):
        if rows < 0:
            rows = _wrap_negative_index(rows, arg.rows(), "row")

        if isinstance(value, (tuple, list)):
            # implements df[(x0, x1, ..., xn), y] = [v0, v1, ..., vn]
            # and        df[x0:x1:x2, y] = [v0, v1, ..., vn]
            cols_selected.set_row(rows, value)
        else:
            # implements df[(x0, x1, ..., xn), y] = v
            # and        df[x0:x1:x2, y] = v
            cols_selected.set_row(rows, [value] * cols_selected.columns())
    elif isinstance(rows, tuple):
        if isinstance(value, (list, tuple)):
            # implements df[(x0, ..., xn), (y0, ..., ym)] = [[ ], ..., [ ]]
            # and        df[x0:x1:x2, (y0, ..., ym)] = [[ ], ..., [ ]]
            if len(value) == 0:
                raise dataframe.DataFrameException((
                    "Invalid value argument. The specified list/tuple "
                    "of row values is empty"))

            if isinstance(value[0], (list, tuple)):
                # one row of values per selected row index
                if len(rows) != len(value):
                    raise dataframe.DataFrameException((
                        "Invalid value argument. The specified list/tuple "
                        "has a size of {} but the row position argument "
                        "has a size of {}").format(
                            len(value), len(rows)))

                for index, row in zip(rows, value):
                    cols_selected.set_row(index, row)
            else:
                # a single row of values used for every selected row
                for index in rows:
                    cols_selected.set_row(index, value)
        elif isinstance(value, dataframe.DataFrame):
            # implements df[(x0, ..., xn), (y0, ..., ym)] = vDataFrame
            # and        df[x0:x1:x2, (y0, ..., ym)] = vDataFrame
            if len(rows) != value.rows():
                rmsg1 = "rows" if value.rows() != 1 else "row"
                rmsg2 = "rows" if len(rows) != 1 else "row"
                raise dataframe.DataFrameException(
                    ("Invalid value argument. The specified "
                     "DataFrame has {} {} but the row position "
                     "argument specified {} {}").format(
                         value.rows(), rmsg1, len(rows), rmsg2))

            for i, index in enumerate(rows):
                cols_selected.set_row(index, value.get_row(i))
        else:
            # implements df[(x0, ..., xn), (y0, ..., ym)] = v
            # and        df[x0:x1:x2, (y0, ..., ym)] = v
            value = [value] * cols_selected.columns()
            for index in rows:
                cols_selected.set_row(index, value)
    elif isinstance(rows, slice):
        selected = range(*rows.indices(cols_selected.rows()))
        if isinstance(value, (list, tuple)):
            # implements df[(x0, x1, ..., xn), y0:y1:y2] = [ .. ]
            # and        df[x0:x1:x2, y0:y1:y2] = [ .. ]
            for index in selected:
                cols_selected.set_row(index, value)
        elif isinstance(value, dataframe.DataFrame):
            # implements df[(x0, x1, ..., xn), y0:y1:y2] = vDataFrame
            # and        df[x0:x1:x2, y0:y1:y2] = vDataFrame
            for i, index in enumerate(selected):
                cols_selected.set_row(index, value.get_row(i))
        else:
            # implements df[(x0, x1, ..., xn), y0:y1:y2] = v
            # and        df[x0:x1:x2, y0:y1:y2] = v
            value = [value] * cols_selected.columns()
            for index in selected:
                cols_selected.set_row(index, value)
    elif isinstance(rows, str):
        raise dataframe.DataFrameException(
            ("Invalid column position type. A replacement operation "
             "must only specify a single column "
             "but found {}").format(type(cols)))
    else:
        # invalid type for row position arg
        raise dataframe.DataFrameException(
            ("Invalid row position type. "
             "Expected int or str but found {}").format(type(rows)))


def setitem_impl(arg, position, value):
    """Implementation of the __setitem__() function.

    The position is either a single column position (int or str) or a
    tuple of a column position and a row position. Column positions may
    be an int, str, tuple or slice. Row positions may be None, an int,
    str, tuple or slice. Negative int positions are counted from the end.

    Args:
        arg: The DataFrame instance on which the function was called upon
        position: The position argument passed to the function
        value: The value argument passed to the function

    Raises:
        DataFrameException: If the position has an unsupported type or
            length, if an index is out of range, or if the size of the
            value does not match the number of selected rows
    """
    if isinstance(position, tuple):
        if len(position) > 2:
            raise dataframe.DataFrameException(
                ("Invalid position argument. Too many "
                 "positions specified: {}").format(len(position)))

        if len(position) < 2:
            # unpacking below needs exactly a column and a row position
            raise dataframe.DataFrameException(
                ("Invalid position argument. Too few "
                 "positions specified: {}").format(len(position)))

        cols, rows = position
        if isinstance(cols, (int, str)):
            _setitem_single_column(arg, cols, rows, value)
        elif isinstance(cols, (tuple, slice)):
            _setitem_multi_columns(arg, cols, rows, value)
        else:
            # invalid type for column position arg
            raise dataframe.DataFrameException(
                ("Invalid column position type. "
                 "Expected int or str but found {}").format(type(cols)))
    elif isinstance(position, int):
        # check for negative column indices
        if position < 0:
            position = _wrap_negative_index(
                position, arg.columns(), "column")

        # implements df[x] = Column
        # An index equal to the column count appends a new column
        if position == arg.columns():
            arg.add_column(value)
        else:
            arg.set_column(position, value)
    elif isinstance(position, str):
        # and        df["x"] = Column
        arg.set_column(position, value)
    else:
        # invalid type for entire position arg
        raise dataframe.DataFrameException(("Invalid position type. "
                                            "Expected int or str but "
                                            "found {}").format(type(position)))
def __init__(self, name=None, values=None):
    """Constructs a new NullableCharColumn.

    The name is passed on to the parent constructor unchanged. A list
    argument may hold None and single characters, the characters being
    stored as their code points. A numpy array must have dtype object
    and may hold None, single character strings and int codes.
    Strings inside the array are replaced by their code points in
    place. If an int is given instead, the Column content is a new
    object array of that length created with numpy.empty().

    Args:
        name: The name of the NullableCharColumn as a string
        values: The content of the NullableCharColumn.
            Must be a list or numpy array with dtype object, or an int

    Raises:
        DataFrameException: If a list entry is rejected by
            _check_type(), if a numpy array has the wrong dtype or
            holds an invalid entry, or if the values argument has an
            unsupported type
    """
    # the builtin 'object' is used as dtype because the 'np.object'
    # alias is not available in current NumPy releases
    if values is None:
        values = np.empty(0, dtype=object)

    if isinstance(values, list):
        charvals = np.empty(len(values), dtype=object)
        for i, value in enumerate(values):
            self._check_type(value)
            charvals[i] = None if value is None else ord(value)

        values = charvals
    elif isinstance(values, np.ndarray):
        if values.dtype != "object":
            raise dataframe.DataFrameException(
                ("Invalid argument array. Expected "
                 "char array (object) but found {}".format(values.dtype)))

        for i, value in enumerate(values):
            if value is None:
                continue

            if isinstance(value, str):
                if len(value) != 1:
                    raise dataframe.DataFrameException((
                        "Invalid character value in numpy array argument. "
                        "Expected string of length 1 but found "
                        "length {}".format(len(value))))

                byte = ord(value)
                if (byte < 32) or (byte > 126):
                    raise dataframe.DataFrameException((
                        "Invalid character value for NullableCharColumn at index {}. "
                        "Only printable ASCII is permitted").format(i))

                # characters are stored as their code points
                values[i] = byte
            elif isinstance(value, int):
                if (value < 32) or (value > 126):
                    raise dataframe.DataFrameException((
                        "Invalid character value for NullableCharColumn at index {}. "
                        "Only printable ASCII is permitted").format(i))
            else:
                raise dataframe.DataFrameException(
                    ("Invalid argument. Expected "
                     "char (str) but found {}".format(type(value))))
    elif isinstance(values, int):
        values = np.empty(values, dtype=object)
    else:
        raise dataframe.DataFrameException(
            ("Invalid argument array. Expected "
             "list or numpy array but found {}".format(type(values))))

    super().__init__(name, values)
def convert_to(self, typecode):
    """Converts this NullableShortColumn to a Column of the given type.

    Every entry of this Column is either None or a number. For
    non-nullable target types, None entries are replaced by a
    placeholder value (0, 0.0, False, the default string, the default
    character or two zero bytes). For nullable target types, None
    entries stay None.

    Binary targets hold each number as two bytes in big-endian order.
    Char targets hold the code point of the first character of the
    number's string form.

    Args:
        typecode: The type code of the Column type to convert to

    Returns:
        A new Column of the requested type which carries the name
        of this Column

    Raises:
        DataFrameException: If the type code is unknown
    """
    source = self._values
    length = source.shape[0]
    columns = dataframe.DataFrame

    def mapped(dtype, convert, missing):
        # Creates the content of the converted Column: every non-None
        # entry is converted, every None entry becomes 'missing'
        out = np.empty([length], dtype=dtype)
        for i, number in np.ndenumerate(source):
            out[i] = convert(number) if number is not None else missing
        return out

    def via(np_type):
        # conversion that passes the number through a numpy integer type
        return lambda number: int(np_type(number))

    def as_bytes(number):
        return bytearray(int(number).to_bytes(2, byteorder="big", signed=True))

    def first_char_code(number):
        return ord(str(number)[0])

    def is_nonzero(number):
        return number != 0

    # NOTE: the builtin 'object' and 'bool' are used as dtypes because
    # the 'np.object' alias (and in some releases the 'np.bool' alias)
    # is not available in current NumPy versions
    if typecode == utils.type_code_byte_column():
        converted = columns.ByteColumn(
            values=mapped(np.int8, via(np.int8), 0))
    elif typecode == ShortColumn.TYPE_CODE:
        converted = ShortColumn(values=mapped(np.int16, via(np.int16), 0))
    elif typecode == utils.type_code_int_column():
        converted = columns.IntColumn(
            values=mapped(np.int32, via(np.int32), 0))
    elif typecode == utils.type_code_long_column():
        converted = columns.LongColumn(
            values=mapped(np.int64, via(np.int64), 0))
    elif typecode == utils.type_code_string_column():
        converted = columns.StringColumn(
            values=mapped(object, str, utils.default_value_string_column()))
    elif typecode == utils.type_code_float_column():
        converted = columns.FloatColumn(
            values=mapped(np.float32, float, 0.0))
    elif typecode == utils.type_code_double_column():
        converted = columns.DoubleColumn(
            values=mapped(np.float64, float, 0.0))
    elif typecode == utils.type_code_char_column():
        converted = columns.CharColumn(
            values=mapped(np.uint8, first_char_code,
                          ord(utils.default_value_char_column())))
    elif typecode == utils.type_code_boolean_column():
        converted = columns.BooleanColumn(
            values=mapped(bool, is_nonzero, False))
    elif typecode == utils.type_code_binary_column():
        vals = np.empty([length], dtype=object)
        for i, number in np.ndenumerate(source):
            # None is encoded like the number zero; as_bytes() returns
            # a new bytearray on every call, so no buffer is shared
            vals[i] = as_bytes(number if number is not None else 0)

        converted = columns.BinaryColumn(values=vals)
    elif typecode == utils.type_code_nullable_byte_column():
        converted = columns.NullableByteColumn(
            values=mapped(object, via(np.int8), None))
    elif typecode == NullableShortColumn.TYPE_CODE:
        converted = self.clone()
    elif typecode == utils.type_code_nullable_int_column():
        converted = columns.NullableIntColumn(
            values=mapped(object, via(np.int32), None))
    elif typecode == utils.type_code_nullable_long_column():
        converted = columns.NullableLongColumn(
            values=mapped(object, via(np.int64), None))
    elif typecode == utils.type_code_nullable_string_column():
        converted = columns.NullableStringColumn(
            values=mapped(object, str, None))
    elif typecode == utils.type_code_nullable_float_column():
        converted = columns.NullableFloatColumn(
            values=mapped(object, float, None))
    elif typecode == utils.type_code_nullable_double_column():
        converted = columns.NullableDoubleColumn(
            values=mapped(object, float, None))
    elif typecode == utils.type_code_nullable_char_column():
        converted = columns.NullableCharColumn(
            values=mapped(object, first_char_code, None))
    elif typecode == utils.type_code_nullable_boolean_column():
        converted = columns.NullableBooleanColumn(
            values=mapped(object, is_nonzero, None))
    elif typecode == utils.type_code_nullable_binary_column():
        converted = columns.NullableBinaryColumn(
            values=mapped(object, as_bytes, None))
    else:
        raise dataframe.DataFrameException(
            "Unknown column type code: {}".format(typecode))

    # pylint: disable=protected-access
    converted._name = self._name
    return converted
def _check_type(self, value): if value is not None: if not isinstance(value, bytearray): raise dataframe.DataFrameException( ("Invalid argument. Expected " "bytearray but found {}".format(type(value))))
def convert_to(self, typecode):
    """Converts this string column to a column of the specified type.

    Numeric targets pass every string through the matching numpy scalar
    type. Boolean targets accept a fixed set of case-insensitive literals
    and binary targets interpret every string as hexadecimal digits.
    Requesting the type code of StringColumn returns a clone of this column.

    Args:
        typecode: The type code of the column type to convert to

    Returns:
        The converted column, carrying the name of this column

    Raises:
        DataFrameException: If the type code is unknown or if a string
            is not a recognized boolean literal
        ValueError: If a string is not valid hexadecimal when converting
            to a binary column (raised by bytearray.fromhex)
    """
    converted = None
    if typecode == utils.type_code_byte_column():
        vals = np.empty([self._values.shape[0]], dtype=np.int8)
        for i, x in np.ndenumerate(self._values):
            vals[i] = int(np.int8(x))
        converted = dataframe.DataFrame.ByteColumn(values=vals)
    elif typecode == utils.type_code_short_column():
        vals = np.empty([self._values.shape[0]], dtype=np.int16)
        for i, x in np.ndenumerate(self._values):
            vals[i] = int(np.int16(x))
        converted = dataframe.DataFrame.ShortColumn(values=vals)
    elif typecode == utils.type_code_int_column():
        vals = np.empty([self._values.shape[0]], dtype=np.int32)
        for i, x in np.ndenumerate(self._values):
            vals[i] = int(np.int32(x))
        converted = dataframe.DataFrame.IntColumn(values=vals)
    elif typecode == utils.type_code_long_column():
        vals = np.empty([self._values.shape[0]], dtype=np.int64)
        for i, x in np.ndenumerate(self._values):
            vals[i] = int(np.int64(x))
        converted = dataframe.DataFrame.LongColumn(values=vals)
    elif typecode == StringColumn.TYPE_CODE:
        converted = self.clone()
    elif typecode == utils.type_code_float_column():
        vals = np.empty([self._values.shape[0]], dtype=np.float32)
        for i, x in np.ndenumerate(self._values):
            vals[i] = float(np.float32(x))
        converted = dataframe.DataFrame.FloatColumn(values=vals)
    elif typecode == utils.type_code_double_column():
        vals = np.empty([self._values.shape[0]], dtype=np.float64)
        for i, x in np.ndenumerate(self._values):
            vals[i] = float(np.float64(x))
        converted = dataframe.DataFrame.DoubleColumn(values=vals)
    elif typecode == utils.type_code_char_column():
        vals = np.empty([self._values.shape[0]], dtype=np.uint8)
        for i, x in np.ndenumerate(self._values):
            if x is not None:
                vals[i] = ord(x[0])
            else:
                # the array holds character codes, so the default char
                # must be stored as its code point like everywhere else
                vals[i] = ord(utils.default_value_char_column())
        converted = dataframe.DataFrame.CharColumn(values=vals)
    elif typecode == utils.type_code_boolean_column():
        values_true = {"true", "t", "1", "yes", "y", "on"}
        values_false = {"false", "f", "0", "no", "n", "off"}
        # builtin bool instead of the np.bool alias removed in NumPy 1.24
        vals = np.empty([self._values.shape[0]], dtype=bool)
        for i, x in np.ndenumerate(self._values):
            if x is not None:
                x = x.lower()
                is_true = x in values_true
                is_false = x in values_false
                if not is_true and not is_false:
                    # report the original (not lowercased) string
                    raise dataframe.DataFrameException(
                        ("Invalid boolean string: '{}'".format(
                            self._values[i])))
                vals[i] = is_true
            else:
                vals[i] = False
        converted = dataframe.DataFrame.BooleanColumn(values=vals)
    elif typecode == utils.type_code_binary_column():
        # builtin object instead of the np.object alias removed in NumPy 1.24
        vals = np.empty([self._values.shape[0]], dtype=object)
        for i, x in np.ndenumerate(self._values):
            if x is not None:
                vals[i] = bytearray.fromhex(x)
            else:
                vals[i] = bytearray(b'\x00')
        converted = dataframe.DataFrame.BinaryColumn(values=vals)
    elif typecode == utils.type_code_nullable_byte_column():
        vals = np.empty([self._values.shape[0]], dtype=object)
        for i, x in np.ndenumerate(self._values):
            if x is not None:
                vals[i] = int(np.int8(x))
            else:
                vals[i] = None
        converted = dataframe.DataFrame.NullableByteColumn(values=vals)
    elif typecode == utils.type_code_nullable_short_column():
        vals = np.empty([self._values.shape[0]], dtype=object)
        for i, x in np.ndenumerate(self._values):
            if x is not None:
                vals[i] = int(np.int16(x))
            else:
                vals[i] = None
        converted = dataframe.DataFrame.NullableShortColumn(values=vals)
    elif typecode == utils.type_code_nullable_int_column():
        vals = np.empty([self._values.shape[0]], dtype=object)
        for i, x in np.ndenumerate(self._values):
            if x is not None:
                vals[i] = int(np.int32(x))
            else:
                vals[i] = None
        converted = dataframe.DataFrame.NullableIntColumn(values=vals)
    elif typecode == utils.type_code_nullable_long_column():
        vals = np.empty([self._values.shape[0]], dtype=object)
        for i, x in np.ndenumerate(self._values):
            if x is not None:
                vals[i] = int(np.int64(x))
            else:
                vals[i] = None
        converted = dataframe.DataFrame.NullableLongColumn(values=vals)
    elif typecode == NullableStringColumn.TYPE_CODE:
        vals = np.empty([self._values.shape[0]], dtype=object)
        # copy from the source values; enumerating the freshly allocated
        # target array would only copy its None placeholders onto itself
        for i, x in np.ndenumerate(self._values):
            vals[i] = x
        converted = NullableStringColumn(values=vals)
    elif typecode == utils.type_code_nullable_float_column():
        vals = np.empty([self._values.shape[0]], dtype=object)
        for i, x in np.ndenumerate(self._values):
            if x is not None:
                vals[i] = float(np.float32(x))
            else:
                vals[i] = None
        converted = dataframe.DataFrame.NullableFloatColumn(values=vals)
    elif typecode == utils.type_code_nullable_double_column():
        vals = np.empty([self._values.shape[0]], dtype=object)
        for i, x in np.ndenumerate(self._values):
            if x is not None:
                vals[i] = float(np.float64(x))
            else:
                vals[i] = None
        converted = dataframe.DataFrame.NullableDoubleColumn(values=vals)
    elif typecode == utils.type_code_nullable_char_column():
        vals = np.empty([self._values.shape[0]], dtype=object)
        for i, x in np.ndenumerate(self._values):
            if x is not None:
                vals[i] = ord(x[0])
            else:
                vals[i] = None
        converted = dataframe.DataFrame.NullableCharColumn(values=vals)
    elif typecode == utils.type_code_nullable_boolean_column():
        values_true = {"true", "t", "1", "yes", "y", "on"}
        values_false = {"false", "f", "0", "no", "n", "off"}
        vals = np.empty([self._values.shape[0]], dtype=object)
        for i, x in np.ndenumerate(self._values):
            if x is not None:
                x = x.lower()
                is_true = x in values_true
                is_false = x in values_false
                if not is_true and not is_false:
                    # report the original (not lowercased) string
                    raise dataframe.DataFrameException(
                        ("Invalid boolean string: '{}'".format(
                            self._values[i])))
                vals[i] = is_true
            else:
                vals[i] = None
        converted = dataframe.DataFrame.NullableBooleanColumn(values=vals)
    elif typecode == utils.type_code_nullable_binary_column():
        vals = np.empty([self._values.shape[0]], dtype=object)
        for i, x in np.ndenumerate(self._values):
            if x is not None:
                vals[i] = bytearray.fromhex(x)
            else:
                vals[i] = None
        converted = dataframe.DataFrame.NullableBinaryColumn(values=vals)
    else:
        raise dataframe.DataFrameException(
            "Unknown column type code: {}".format(typecode))

    # the converted column keeps the name of the source column
    # pylint: disable=protected-access
    converted._name = self._name
    return converted
def convert_to(self, typecode):
    """Converts this nullable binary column to a column of the specified type.

    Numeric targets decode the leading bytes of every element as a
    big-endian value. Elements that are None or too short for the
    requested width become 0 (or the default char) in non-nullable
    targets and None in nullable targets. String targets hold the
    hexadecimal representation of every element. Boolean targets are
    True for every element that contains at least one non-zero byte.
    Requesting the type code of NullableBinaryColumn returns a clone
    of this column.

    Args:
        typecode: The type code of the column type to convert to

    Returns:
        The converted column, carrying the name of this column

    Raises:
        DataFrameException: If the type code is unknown
    """
    converted = None
    if typecode == utils.type_code_byte_column():
        vals = np.empty([self._values.shape[0]], dtype=np.int8)
        for i, x in np.ndenumerate(self._values):
            if x is not None and len(x) > 0:
                # decode as a signed byte, consistent with the wider
                # integer conversions; the raw value (0..255) does not
                # fit into int8 for bytes above 0x7f
                vals[i] = int.from_bytes(x[0:1], byteorder="big",
                                         signed=True)
            else:
                vals[i] = 0
        converted = dataframe.DataFrame.ByteColumn(values=vals)
    elif typecode == utils.type_code_short_column():
        vals = np.empty([self._values.shape[0]], dtype=np.int16)
        for i, x in np.ndenumerate(self._values):
            if x is not None and len(x) >= 2:
                vals[i] = int.from_bytes(x[0:2], byteorder="big",
                                         signed=True)
            else:
                vals[i] = 0
        converted = dataframe.DataFrame.ShortColumn(values=vals)
    elif typecode == utils.type_code_int_column():
        vals = np.empty([self._values.shape[0]], dtype=np.int32)
        for i, x in np.ndenumerate(self._values):
            if x is not None and len(x) >= 4:
                vals[i] = int.from_bytes(x[0:4], byteorder="big",
                                         signed=True)
            else:
                vals[i] = 0
        converted = dataframe.DataFrame.IntColumn(values=vals)
    elif typecode == utils.type_code_long_column():
        vals = np.empty([self._values.shape[0]], dtype=np.int64)
        for i, x in np.ndenumerate(self._values):
            if x is not None and len(x) >= 8:
                vals[i] = int.from_bytes(x[0:8], byteorder="big",
                                         signed=True)
            else:
                vals[i] = 0
        converted = dataframe.DataFrame.LongColumn(values=vals)
    elif typecode == utils.type_code_string_column():
        # builtin object instead of the np.object alias removed in NumPy 1.24
        vals = np.empty([self._values.shape[0]], dtype=object)
        for i, x in np.ndenumerate(self._values):
            if x is not None:
                vals[i] = x.hex()
            else:
                vals[i] = utils.default_value_string_column()
        converted = dataframe.DataFrame.StringColumn(values=vals)
    elif typecode == utils.type_code_float_column():
        vals = np.empty([self._values.shape[0]], dtype=np.float32)
        for i, x in np.ndenumerate(self._values):
            if x is not None and len(x) >= 4:
                vals[i] = unpack(">f", x[0:4])[0]
            else:
                vals[i] = 0.0
        converted = dataframe.DataFrame.FloatColumn(values=vals)
    elif typecode == utils.type_code_double_column():
        vals = np.empty([self._values.shape[0]], dtype=np.float64)
        for i, x in np.ndenumerate(self._values):
            if x is not None and len(x) >= 8:
                vals[i] = unpack(">d", x[0:8])[0]
            else:
                vals[i] = 0.0
        converted = dataframe.DataFrame.DoubleColumn(values=vals)
    elif typecode == utils.type_code_char_column():
        vals = np.empty([self._values.shape[0]], dtype=np.uint8)
        ord_default = ord(utils.default_value_char_column())
        for i, x in np.ndenumerate(self._values):
            if x is not None and len(x) > 0:
                vals[i] = int(x[0])
            else:
                vals[i] = ord_default
        converted = dataframe.DataFrame.CharColumn(values=vals)
    elif typecode == utils.type_code_boolean_column():
        # builtin bool instead of the np.bool alias removed in NumPy 1.24
        vals = np.empty([self._values.shape[0]], dtype=bool)
        for i, x in np.ndenumerate(self._values):
            if x is not None:
                # True as soon as any byte differs from zero
                vals[i] = any(y != 0 for y in x)
            else:
                vals[i] = False
        converted = dataframe.DataFrame.BooleanColumn(values=vals)
    elif typecode == BinaryColumn.TYPE_CODE:
        vals = np.empty([self._values.shape[0]], dtype=object)
        for i, x in np.ndenumerate(self._values):
            if x is not None and len(x) > 0:
                # independent copy so both columns never share a buffer
                vals[i] = bytearray(x)
            else:
                vals[i] = bytearray(b"\x00")
        converted = BinaryColumn(values=vals)
    elif typecode == utils.type_code_nullable_byte_column():
        vals = np.empty([self._values.shape[0]], dtype=object)
        for i, x in np.ndenumerate(self._values):
            if x is not None and len(x) > 0:
                # signed decoding, see the non-nullable byte branch
                vals[i] = int.from_bytes(x[0:1], byteorder="big",
                                         signed=True)
            else:
                vals[i] = None
        converted = dataframe.DataFrame.NullableByteColumn(values=vals)
    elif typecode == utils.type_code_nullable_short_column():
        vals = np.empty([self._values.shape[0]], dtype=object)
        for i, x in np.ndenumerate(self._values):
            if x is not None and len(x) >= 2:
                vals[i] = int.from_bytes(x[0:2], byteorder="big",
                                         signed=True)
            else:
                vals[i] = None
        converted = dataframe.DataFrame.NullableShortColumn(values=vals)
    elif typecode == utils.type_code_nullable_int_column():
        vals = np.empty([self._values.shape[0]], dtype=object)
        for i, x in np.ndenumerate(self._values):
            if x is not None and len(x) >= 4:
                vals[i] = int.from_bytes(x[0:4], byteorder="big",
                                         signed=True)
            else:
                vals[i] = None
        converted = dataframe.DataFrame.NullableIntColumn(values=vals)
    elif typecode == utils.type_code_nullable_long_column():
        vals = np.empty([self._values.shape[0]], dtype=object)
        for i, x in np.ndenumerate(self._values):
            if x is not None and len(x) >= 8:
                vals[i] = int.from_bytes(x[0:8], byteorder="big",
                                         signed=True)
            else:
                vals[i] = None
        converted = dataframe.DataFrame.NullableLongColumn(values=vals)
    elif typecode == utils.type_code_nullable_string_column():
        vals = np.empty([self._values.shape[0]], dtype=object)
        for i, x in np.ndenumerate(self._values):
            if x is not None:
                vals[i] = x.hex()
            else:
                vals[i] = None
        converted = dataframe.DataFrame.NullableStringColumn(values=vals)
    elif typecode == utils.type_code_nullable_float_column():
        vals = np.empty([self._values.shape[0]], dtype=object)
        for i, x in np.ndenumerate(self._values):
            if x is not None and len(x) >= 4:
                vals[i] = unpack(">f", x[0:4])[0]
            else:
                vals[i] = None
        converted = dataframe.DataFrame.NullableFloatColumn(values=vals)
    elif typecode == utils.type_code_nullable_double_column():
        vals = np.empty([self._values.shape[0]], dtype=object)
        for i, x in np.ndenumerate(self._values):
            if x is not None and len(x) >= 8:
                vals[i] = unpack(">d", x[0:8])[0]
            else:
                vals[i] = None
        converted = dataframe.DataFrame.NullableDoubleColumn(values=vals)
    elif typecode == utils.type_code_nullable_char_column():
        vals = np.empty([self._values.shape[0]], dtype=object)
        for i, x in np.ndenumerate(self._values):
            if x is not None and len(x) > 0:
                vals[i] = int(x[0])
            else:
                vals[i] = None
        converted = dataframe.DataFrame.NullableCharColumn(values=vals)
    elif typecode == utils.type_code_nullable_boolean_column():
        vals = np.empty([self._values.shape[0]], dtype=object)
        for i, x in np.ndenumerate(self._values):
            if x is not None:
                # True as soon as any byte differs from zero
                vals[i] = any(y != 0 for y in x)
            else:
                vals[i] = None
        converted = dataframe.DataFrame.NullableBooleanColumn(values=vals)
    elif typecode == NullableBinaryColumn.TYPE_CODE:
        converted = self.clone()
    else:
        raise dataframe.DataFrameException(
            "Unknown column type code: {}".format(typecode))

    # the converted column keeps the name of the source column
    # pylint: disable=protected-access
    converted._name = self._name
    return converted
def convert_to(self, typecode):
    """Converts this long column to a column of the specified type.

    Narrower numeric targets are produced by casting the underlying
    array. String targets hold the decimal representation of every value,
    char targets hold the code of the first character of that
    representation and binary targets hold the value as 8 big-endian
    signed bytes. Requesting the type code of LongColumn returns a clone
    of this column.

    Args:
        typecode: The type code of the column type to convert to

    Returns:
        The converted column, carrying the name of this column

    Raises:
        DataFrameException: If the type code is unknown
    """
    converted = None
    if typecode == utils.type_code_byte_column():
        converted = dataframe.DataFrame.ByteColumn(
            values=self._values.astype(np.int8))
    elif typecode == utils.type_code_short_column():
        converted = dataframe.DataFrame.ShortColumn(
            values=self._values.astype(np.int16))
    elif typecode == utils.type_code_int_column():
        converted = dataframe.DataFrame.IntColumn(
            values=self._values.astype(np.int32))
    elif typecode == LongColumn.TYPE_CODE:
        converted = self.clone()
    elif typecode == utils.type_code_string_column():
        # builtin object instead of the np.object alias removed in NumPy 1.24
        vals = self._values.astype(object)
        for i, x in np.ndenumerate(vals):
            vals[i] = str(x)
        converted = dataframe.DataFrame.StringColumn(values=vals)
    elif typecode == utils.type_code_float_column():
        converted = dataframe.DataFrame.FloatColumn(
            values=self._values.astype(np.float32))
    elif typecode == utils.type_code_double_column():
        converted = dataframe.DataFrame.DoubleColumn(
            values=self._values.astype(np.float64))
    elif typecode == utils.type_code_char_column():
        vals = np.empty([self._values.shape[0]], dtype=np.uint8)
        # take the first character from the original values; casting to
        # uint8 first would truncate them and yield the wrong character
        for i, x in np.ndenumerate(self._values):
            vals[i] = ord(str(x)[0])
        converted = dataframe.DataFrame.CharColumn(values=vals)
    elif typecode == utils.type_code_boolean_column():
        # builtin bool instead of the np.bool alias removed in NumPy 1.24
        converted = dataframe.DataFrame.BooleanColumn(
            values=self._values.astype(bool))
    elif typecode == utils.type_code_binary_column():
        vals = np.empty([self._values.shape[0]], dtype=object)
        for i, x in np.ndenumerate(self._values):
            vals[i] = bytearray(
                int(x).to_bytes(8, byteorder="big", signed=True))
        converted = dataframe.DataFrame.BinaryColumn(values=vals)
    elif typecode == utils.type_code_nullable_byte_column():
        # narrow first, then box the values as Python objects
        vals = self._values.astype(np.int8)
        converted = dataframe.DataFrame.NullableByteColumn(
            values=vals.astype(object))
    elif typecode == utils.type_code_nullable_short_column():
        vals = self._values.astype(np.int16)
        converted = dataframe.DataFrame.NullableShortColumn(
            values=vals.astype(object))
    elif typecode == utils.type_code_nullable_int_column():
        vals = self._values.astype(np.int32)
        converted = dataframe.DataFrame.NullableIntColumn(
            values=vals.astype(object))
    elif typecode == NullableLongColumn.TYPE_CODE:
        converted = NullableLongColumn(
            values=self._values.astype(object))
    elif typecode == utils.type_code_nullable_string_column():
        vals = self._values.astype(object)
        for i, x in np.ndenumerate(vals):
            vals[i] = str(x)
        converted = dataframe.DataFrame.NullableStringColumn(values=vals)
    elif typecode == utils.type_code_nullable_float_column():
        vals = self._values.astype(np.float32)
        vals = vals.astype(object)
        converted = dataframe.DataFrame.NullableFloatColumn(values=vals)
    elif typecode == utils.type_code_nullable_double_column():
        vals = self._values.astype(np.float64)
        vals = vals.astype(object)
        converted = dataframe.DataFrame.NullableDoubleColumn(values=vals)
    elif typecode == utils.type_code_nullable_char_column():
        vals = self._values.astype(object)
        for i, x in np.ndenumerate(vals):
            vals[i] = ord(str(x)[0])
        converted = dataframe.DataFrame.NullableCharColumn(values=vals)
    elif typecode == utils.type_code_nullable_boolean_column():
        vals = self._values.astype(bool)
        vals = vals.astype(object)
        converted = dataframe.DataFrame.NullableBooleanColumn(values=vals)
    elif typecode == utils.type_code_nullable_binary_column():
        vals = np.empty([self._values.shape[0]], dtype=object)
        for i, x in np.ndenumerate(self._values):
            vals[i] = bytearray(
                int(x).to_bytes(8, byteorder="big", signed=True))
        converted = dataframe.DataFrame.NullableBinaryColumn(values=vals)
    else:
        raise dataframe.DataFrameException(
            "Unknown column type code: {}".format(typecode))

    # the converted column keeps the name of the source column
    # pylint: disable=protected-access
    converted._name = self._name
    return converted
def _check_type(self, value): if value is not None and not isinstance(value, str): raise dataframe.DataFrameException( ("Invalid argument. Expected " "string (str) but found {}".format(type(value))))
def _deserialize_v2(buffer):
    """Deserialization from the binary-based version 2 format (v2).

    The header is read first: implementation code, row count, column
    count, null-terminated column names and one type code per column.
    For a NullableDataFrame the header additionally carries a lookup bit
    vector. Every payload value that is encoded as zero (or as an empty
    string) consumes one bit of that vector; a set bit marks the value
    as null, an unset bit marks it as an actual zero/empty value.

    While reading the payload, the index pointer always rests on the last
    byte already consumed. Every read therefore advances the pointer first
    and then addresses the value relative to the new position.

    Args:
        buffer: The bytearray representing the DataFrame to deserialize

    Returns:
        A DataFrame from the given bytearray

    Raises:
        DataFrameException: If the implementation code is unsupported,
            the header is not terminated by a closing brace or a column
            type code is unknown
    """
    #HEADER
    ptr = 5  # index pointer
    dftype = buffer[ptr]
    if dftype not in (0x64, 0x6e):
        raise dataframe.DataFrameException(
            "Unsupported DataFrame implementation")

    # header format is {v:2;irrrrccccName1.Name2.ttllllbbb}0x...
    # code of the DataFrame implementation
    impl_default = (dftype == 0x64)
    ptr += 1
    rows = int.from_bytes(buffer[ptr:ptr + 4], byteorder="big", signed=False)
    ptr += 4
    cols = int.from_bytes(buffer[ptr:ptr + 4], byteorder="big", signed=False)
    ptr += 4

    # column labels
    names = []
    for _ in range(cols):
        c0 = ptr  # first char
        while buffer[ptr] != 0x00:
            ptr += 1

        ptr += 1  # skip the null delimiter
        names.append(buffer[c0:ptr - 1].decode("utf-8"))

    # column types
    types = []
    for _ in range(cols):
        types.append(buffer[ptr])
        ptr += 1

    df = None
    columns = []
    if not impl_default:  # NullableDataFrame
        # first read the entire lookup list into memory
        lookup_length = int.from_bytes(buffer[ptr:ptr + 4],
                                       byteorder="big",
                                       signed=False)
        ptr += 4
        lookup_bits = BitVector(buffer[ptr:ptr + lookup_length])
        # list index pointing to the next readable bit within the lookup list
        li = 0
        ptr += lookup_length
        if buffer[ptr] != 0x7d:  # header closing brace '}' missing
            raise dataframe.DataFrameException("Invalid format")

        #END HEADER

        #PAYLOAD
        for i in range(cols):
            # builtin object instead of the np.object alias removed in
            # NumPy 1.24; all entries start out as None
            val = np.empty(rows, dtype=object)
            if types[i] == bytecolumn.NullableByteColumn.TYPE_CODE:
                for j in range(rows):
                    ptr += 1
                    b = int.from_bytes(buffer[ptr:ptr + 1],
                                       byteorder="big",
                                       signed=True)
                    if b == 0:
                        # unset lookup bit: actual zero, otherwise null
                        if not lookup_bits.get(li):
                            val[j] = 0
                        li += 1
                    else:
                        val[j] = b

                columns.append(bytecolumn.NullableByteColumn(names[i], val))

            elif types[i] == shortcolumn.NullableShortColumn.TYPE_CODE:
                for j in range(rows):
                    ptr += 2
                    b = int.from_bytes(buffer[ptr - 1:ptr + 1],
                                       byteorder="big",
                                       signed=True)
                    if b == 0:
                        if not lookup_bits.get(li):
                            val[j] = 0
                        li += 1
                    else:
                        val[j] = b

                columns.append(shortcolumn.NullableShortColumn(names[i], val))

            elif types[i] == intcolumn.NullableIntColumn.TYPE_CODE:
                for j in range(rows):
                    ptr += 4
                    b = int.from_bytes(buffer[ptr - 3:ptr + 1],
                                       byteorder="big",
                                       signed=True)
                    if b == 0:
                        if not lookup_bits.get(li):
                            val[j] = 0
                        li += 1
                    else:
                        val[j] = b

                columns.append(intcolumn.NullableIntColumn(names[i], val))

            elif types[i] == longcolumn.NullableLongColumn.TYPE_CODE:
                for j in range(rows):
                    ptr += 8
                    b = int.from_bytes(buffer[ptr - 7:ptr + 1],
                                       byteorder="big",
                                       signed=True)
                    if b == 0:
                        if not lookup_bits.get(li):
                            val[j] = 0
                        li += 1
                    else:
                        val[j] = b

                columns.append(longcolumn.NullableLongColumn(names[i], val))

            elif types[i] == stringcolumn.NullableStringColumn.TYPE_CODE:
                for j in range(rows):
                    ptr += 1
                    c0 = ptr  # marks the first character of each string
                    while buffer[ptr] != 0x00:
                        ptr += 1

                    if (ptr - c0) == 0:
                        # unset lookup bit: empty string, otherwise null
                        if not lookup_bits.get(li):
                            val[j] = ""
                        li += 1
                    else:
                        val[j] = buffer[c0:ptr].decode("utf-8")

                columns.append(stringcolumn.NullableStringColumn(
                    names[i], val))

            elif types[i] == floatcolumn.NullableFloatColumn.TYPE_CODE:
                for j in range(rows):
                    ptr += 4
                    # since Python does not have float32, we need to do a
                    # conversion over numpy and str to get the same
                    # precision as the original value
                    f = float(
                        str(
                            np.float32(
                                unpack(">f", buffer[ptr - 3:ptr + 1])[0])))
                    if f == 0.0:
                        if not lookup_bits.get(li):
                            val[j] = 0.0
                        li += 1
                    else:
                        val[j] = f

                columns.append(floatcolumn.NullableFloatColumn(names[i], val))

            elif types[i] == doublecolumn.NullableDoubleColumn.TYPE_CODE:
                for j in range(rows):
                    ptr += 8
                    f = unpack(">d", buffer[ptr - 7:ptr + 1])[0]
                    if f == 0.0:
                        if not lookup_bits.get(li):
                            val[j] = 0.0
                        li += 1
                    else:
                        val[j] = f

                columns.append(doublecolumn.NullableDoubleColumn(
                    names[i], val))

            elif types[i] == charcolumn.NullableCharColumn.TYPE_CODE:
                for j in range(rows):
                    ptr += 1
                    c = int.from_bytes(buffer[ptr:ptr + 1],
                                       byteorder="big",
                                       signed=False)
                    # a zero byte denotes a null char, no lookup bit needed
                    if c == 0:
                        val[j] = None
                    else:
                        val[j] = chr(c)

                columns.append(charcolumn.NullableCharColumn(names[i], val))

            elif types[i] == booleancolumn.NullableBooleanColumn.TYPE_CODE:
                # number of bytes holding one bit per row (integer ceil)
                length = (rows + 7) // 8
                ptr += 1  # focus on next readable position
                bits = BitVector(buffer[ptr:ptr + length])
                for j in range(rows):
                    if not bits.get(j):
                        # unset lookup bit: actual False, otherwise null
                        if not lookup_bits.get(li):
                            val[j] = False
                        li += 1
                    else:
                        val[j] = True

                # let the base pointer jump forward to the last read byte
                ptr += (length - 1)
                columns.append(
                    booleancolumn.NullableBooleanColumn(names[i], val))

            elif types[i] == binarycolumn.NullableBinaryColumn.TYPE_CODE:
                for j in range(rows):
                    ptr += 1
                    length = int.from_bytes(buffer[ptr:ptr + 4],
                                            byteorder="big",
                                            signed=False)
                    ptr += 3  # now at the last byte of the length field
                    # a zero length leaves the entry as None
                    if length != 0:
                        data = bytearray(length)
                        for k in range(length):
                            ptr += 1
                            data[k] = buffer[ptr]

                        val[j] = data

                columns.append(binarycolumn.NullableBinaryColumn(
                    names[i], val))

            else:
                raise dataframe.DataFrameException(
                    ("Unknown column with type code {}").format(types[i]))

        #END PAYLOAD

        if cols == 0:  # uninitialized instance
            df = dataframe.NullableDataFrame()
        else:
            df = dataframe.NullableDataFrame(columns)

    else:  # DefaultDataFrame
        if buffer[ptr] != 0x7d:  # header closing brace '}'
            raise dataframe.DataFrameException("Invalid format")

        #END HEADER

        #PAYLOAD
        for i in range(cols):
            if types[i] == bytecolumn.ByteColumn.TYPE_CODE:
                val = np.empty(rows, dtype=np.int8)
                for j in range(rows):
                    ptr += 1
                    val[j] = int.from_bytes(buffer[ptr:ptr + 1],
                                            byteorder="big",
                                            signed=True)

                columns.append(bytecolumn.ByteColumn(names[i], val))

            elif types[i] == shortcolumn.ShortColumn.TYPE_CODE:
                val = np.empty(rows, dtype=np.int16)
                for j in range(rows):
                    ptr += 2
                    val[j] = int.from_bytes(buffer[ptr - 1:ptr + 1],
                                            byteorder="big",
                                            signed=True)

                columns.append(shortcolumn.ShortColumn(names[i], val))

            elif types[i] == intcolumn.IntColumn.TYPE_CODE:
                val = np.empty(rows, dtype=np.int32)
                for j in range(rows):
                    ptr += 4
                    val[j] = int.from_bytes(buffer[ptr - 3:ptr + 1],
                                            byteorder="big",
                                            signed=True)

                columns.append(intcolumn.IntColumn(names[i], val))

            elif types[i] == longcolumn.LongColumn.TYPE_CODE:
                val = np.empty(rows, dtype=np.int64)
                for j in range(rows):
                    ptr += 8
                    val[j] = int.from_bytes(buffer[ptr - 7:ptr + 1],
                                            byteorder="big",
                                            signed=True)

                columns.append(longcolumn.LongColumn(names[i], val))

            elif types[i] == stringcolumn.StringColumn.TYPE_CODE:
                val = np.empty(rows, dtype=object)
                for j in range(rows):
                    ptr += 1
                    c0 = ptr  # marks the first character of each string
                    while buffer[ptr] != 0x00:
                        ptr += 1

                    if (ptr - c0) == 0:
                        val[j] = stringcolumn.StringColumn.DEFAULT_VALUE
                    else:
                        val[j] = buffer[c0:ptr].decode("utf-8")

                columns.append(stringcolumn.StringColumn(names[i], val))

            elif types[i] == floatcolumn.FloatColumn.TYPE_CODE:
                val = np.empty(rows, dtype=np.float32)
                for j in range(rows):
                    ptr += 4
                    # since Python does not have float32, we need to do a
                    # conversion over numpy and str to get the same
                    # precision as the original value
                    val[j] = float(
                        str(
                            np.float32(
                                unpack(">f", buffer[ptr - 3:ptr + 1])[0])))

                columns.append(floatcolumn.FloatColumn(names[i], val))

            elif types[i] == doublecolumn.DoubleColumn.TYPE_CODE:
                val = np.empty(rows, dtype=np.float64)
                for j in range(rows):
                    ptr += 8
                    val[j] = unpack(">d", buffer[ptr - 7:ptr + 1])[0]

                columns.append(doublecolumn.DoubleColumn(names[i], val))

            elif types[i] == charcolumn.CharColumn.TYPE_CODE:
                val = np.empty(rows, dtype=np.uint8)
                for j in range(rows):
                    ptr += 1
                    c = int.from_bytes(buffer[ptr:ptr + 1],
                                       byteorder="big",
                                       signed=False)
                    val[j] = c

                columns.append(charcolumn.CharColumn(names[i], val))

            elif types[i] == booleancolumn.BooleanColumn.TYPE_CODE:
                # builtin bool instead of the np.bool alias removed in
                # NumPy 1.24
                val = np.empty(rows, dtype=bool)
                # number of bytes holding one bit per row (integer ceil)
                length = (rows + 7) // 8
                ptr += 1  # focus on next readable position
                bits = BitVector(buffer[ptr:ptr + length])
                for j in range(rows):
                    val[j] = bits.get(j)

                # let the base pointer jump forward to the last read byte
                ptr += (length - 1)
                columns.append(booleancolumn.BooleanColumn(names[i], val))

            elif types[i] == binarycolumn.BinaryColumn.TYPE_CODE:
                val = np.empty(rows, dtype=object)
                for j in range(rows):
                    ptr += 1
                    length = int.from_bytes(buffer[ptr:ptr + 4],
                                            byteorder="big",
                                            signed=False)
                    ptr += 3  # now at the last byte of the length field
                    data = bytearray(length)
                    for k in range(length):
                        ptr += 1
                        data[k] = buffer[ptr]

                    val[j] = data

                columns.append(binarycolumn.BinaryColumn(names[i], val))

            else:
                raise dataframe.DataFrameException(
                    ("Unknown column with type code {}").format(types[i]))

        #END PAYLOAD

        if cols == 0:  # uninitialized instance
            df = dataframe.DefaultDataFrame()
        else:
            df = dataframe.DefaultDataFrame(columns)

    return df
def getitem_impl(arg, position):
    """Implementation of the __getitem__() function

    A plain int or str position selects a single column. A tuple position
    is interpreted as (columns, rows) and dispatched on the type of both
    of its elements.

    Args:
        arg: The DataFrame instance on which the function was called upon
        position: The position argument passed to the function

    Returns:
        The value at the specified position

    Raises:
        DataFrameException: If the position argument is invalid
    """
    if isinstance(position, tuple):
        if len(position) > 2:
            raise dataframe.DataFrameException(
                ("Invalid position argument. Too many "
                 "positions specified: {}").format(len(position)))

        col_pos = position[0]
        row_pos = position[1]
        if isinstance(col_pos, (int, str)):
            return _getitem_single_column(arg, col_pos, row_pos)
        if isinstance(col_pos, (tuple, slice)):
            return _getitem_multiple_columns(arg, col_pos, row_pos)

        # invalid type for column position arg
        raise dataframe.DataFrameException(
            ("Invalid column position type. "
             "Expected int or str but found {}").format(type(col_pos)))

    if isinstance(position, int):
        # implements df[x]
        index = _getitem_wrap_index(
            position, arg.columns, "Invalid column index: {}")
        return arg.get_column(index)

    if isinstance(position, str):
        # implements df["x"]
        return arg.get_column(position)

    # invalid type for entire position arg
    raise dataframe.DataFrameException(("Invalid position type. "
                                        "Expected int or str but "
                                        "found {}").format(type(position)))


def _getitem_wrap_index(index, size, message):
    """Maps a negative index onto its non-negative counterpart.

    Args:
        index: The int index to map. Non-negative values are returned as is
        size: A callable returning the number of addressable items
        message: The format string of the error raised for an
            out of range negative index

    Returns:
        The non-negative index
    """
    if index >= 0:
        return index
    if abs(index) > size():
        raise dataframe.DataFrameException(message.format(index))
    return index % size()


def _getitem_new_frame(arg, content):
    """Creates a DataFrame of the same implementation as the given one."""
    if arg.is_nullable():
        return dataframe.NullableDataFrame(content)
    return dataframe.DefaultDataFrame(content)


def _getitem_single_column(arg, cols, rows):
    """Implements all (column, rows) positions with an int or str column."""
    # check for negative column indices
    if isinstance(cols, int):
        cols = _getitem_wrap_index(
            cols, arg.columns, "Invalid column index: {}")

    if rows is None:
        # implements df[x, :] and df["x", :]
        return arg.get_columns(cols=cols)

    if isinstance(rows, int):
        # implements df[x, y] and df["x", y]
        rows = _getitem_wrap_index(rows, arg.rows, "Invalid row index: {}")
        return arg.get_column(cols).get_value(rows)

    if isinstance(rows, str):
        # implements df[x, "y_regex"] and df["x", "y_regex"]
        return arg.filter(cols, rows)

    if isinstance(rows, tuple):
        # implements df[x, (y0, y1, ..., yn)]
        # and df["x", (y0, y1, ..., yn)]
        source = arg.get_column(cols)
        target = column.Column.like(source, length=len(rows))
        frame = _getitem_new_frame(arg, target)
        for i, row_index in enumerate(rows):
            target[i] = source[row_index]
        return frame

    if isinstance(rows, slice):
        # implements df[x, y0:y1:y2]
        # and df["x", y0:y1:y2]
        source = arg.get_column(cols)
        # numpy returns an array view when slicing
        # so we have to copy the array explicitly
        # to get an independent instance
        sliced = source._values[rows.start:rows.stop:rows.step].copy()
        target = column.Column.like(source, length=0)
        target._values = sliced
        return _getitem_new_frame(arg, target)

    # make pylint happy about missing return statement
    raise dataframe.DataFrameException("Implementation error")


def _getitem_multiple_columns(arg, cols, rows):
    """Implements all (columns, rows) positions with a tuple or slice column."""
    # prefetch the selected columns as a DataFrame
    if isinstance(cols, tuple):
        selection = arg.get_columns(cols=cols)
    else:  # is slice
        selection = _getitem_new_frame(arg, arg._internal_columns()[cols])

    if rows is None:
        # implements df[(x0, x1, ..., xn), ]
        # and df[x0:x1:x2, ]
        return selection

    if isinstance(rows, int):
        # implements df[(x0, x1, ..., xn), y]
        # and df[x0:x1:x2, y]
        rows = _getitem_wrap_index(rows, arg.rows, "Invalid row index: {}")
        return selection.get_row(rows)

    if isinstance(rows, tuple):
        # implements df[(x0, x1, ..., xn), (y0, y1, ..., ym)]
        # and df[x0:x1:x2, (y0, y1, ..., ym)]
        blanks = [
            column.Column.like(col, length=len(rows))
            for col in selection._internal_columns()
        ]
        frame = _getitem_new_frame(arg, blanks)
        for i, row_index in enumerate(rows):
            frame.set_row(i, selection.get_row(row_index))
        return frame

    if isinstance(rows, slice):
        # implements df[(x0, x1, ..., xn), y0:y1:y2]
        # and df[x0:x1:x2, y0:y1:y2]
        sliced_cols = []
        for col in selection._internal_columns():
            # explicit copy to detach from the numpy array view
            values = col._values[rows.start:rows.stop:rows.step].copy()
            col_sliced = column.Column.like(col, length=values.shape[0])
            col_sliced._values = values
            sliced_cols.append(col_sliced)
        return _getitem_new_frame(arg, sliced_cols)

    if isinstance(rows, str):
        raise dataframe.DataFrameException(
            ("Invalid column position type. A filter operation "
             "must only specify a single column "
             "but found {}").format(type(cols)))

    # make pylint happy about missing return statement
    raise dataframe.DataFrameException("Implementation error")
def _serialize_v2(df): """Serialization to the binary-based version 2 format (v2). Args: df: The DataFrame to serialize Returns: A bytearray representing the given DataFrame """ buffer = bytearray() #HEADER # must start with {v:2; buffer.append(0x7b) buffer.append(0x76) buffer.append(0x3a) buffer.append(0x32) buffer.append(0x3b) # impl: default=0x64 nullable=0x6e buffer.append(0x6e if df.is_nullable() else 0x64) rows = df.rows() if rows > 0xffffffff: raise dataframe.DataFrameException( ("Unable to serialize DataFrame with " "row count greater than 0xffffffff")) buffer.extend(rows.to_bytes(4, byteorder="big", signed=False)) cols = df.columns() if cols > 0xffffffff: raise dataframe.DataFrameException( ("Unable to serialize DataFrame with " "column count greater than 0xffffffff")) buffer.extend(cols.to_bytes(4, byteorder="big", signed=False)) if df.has_column_names(): for name in df.get_column_names(): buffer.extend(bytearray(name, "utf-8")) # add null byte as name delimeter buffer.append(0x00) else: # set indices as strings for i in range(cols): buffer.extend(bytearray(str(i), "utf-8")) buffer.append(0x00) for col in df: buffer.append(col.type_code()) if df.is_nullable(): # NullableDataFrame # The specification requires a lookup list for differentiating between # default values (for example: zeros for numbers) and actual null values. # This is implemented here as a bit vector initialized with all bits # set to zero. 
# As the lookup list is part of the header, we must first serialize the # entire payload and build the lookup list and then bind all the parts # together at the end header = buffer buffer = bytearray() # the lookup list lookup_bits = BitVector() #PAYLOAD for col in df: type_code = col.type_code() val = col.as_array() if type_code == bytecolumn.NullableByteColumn.TYPE_CODE: for i in range(rows): if val[i] is None: buffer.append(0x00) lookup_bits.add1() elif val[i] == 0: buffer.append(0x00) lookup_bits.add0() else: buffer.extend( int(val[i]).to_bytes(1, byteorder="big", signed=True)) elif type_code == shortcolumn.NullableShortColumn.TYPE_CODE: for i in range(rows): if val[i] is None: buffer.extend(b'\x00\x00') lookup_bits.add1() elif val[i] == 0: buffer.extend(b'\x00\x00') lookup_bits.add0() else: buffer.extend( int(val[i]).to_bytes(2, byteorder="big", signed=True)) elif type_code == intcolumn.NullableIntColumn.TYPE_CODE: for i in range(rows): if val[i] is None: buffer.extend(b'\x00\x00\x00\x00') lookup_bits.add1() elif val[i] == 0: buffer.extend(b'\x00\x00\x00\x00') lookup_bits.add0() else: buffer.extend( int(val[i]).to_bytes(4, byteorder="big", signed=True)) elif type_code == longcolumn.NullableLongColumn.TYPE_CODE: for i in range(rows): if val[i] is None: buffer.extend(b'\x00\x00\x00\x00\x00\x00\x00\x00') lookup_bits.add1() elif val[i] == 0: buffer.extend(b'\x00\x00\x00\x00\x00\x00\x00\x00') lookup_bits.add0() else: buffer.extend( int(val[i]).to_bytes(8, byteorder="big", signed=True)) elif type_code == stringcolumn.NullableStringColumn.TYPE_CODE: for i in range(rows): if val[i] is None: lookup_bits.add1() elif len(val[i]) == 0: lookup_bits.add0() else: buffer.extend(val[i].encode("utf-8")) # add null character as string delimeter buffer.append(0x00) elif type_code == floatcolumn.NullableFloatColumn.TYPE_CODE: for i in range(rows): if val[i] is None: buffer.extend(b'\x00\x00\x00\x00') lookup_bits.add1() # bit representation of zero is strictly # defined so we 
compare directly elif val[i] == 0.0: buffer.extend(b'\x00\x00\x00\x00') lookup_bits.add0() else: buffer.extend(pack(">f", val[i])) elif type_code == doublecolumn.NullableDoubleColumn.TYPE_CODE: for i in range(rows): if val[i] is None: buffer.extend(b'\x00\x00\x00\x00\x00\x00\x00\x00') lookup_bits.add1() # bit representation of zero is strictly # defined so we compare directly elif val[i] == 0.0: buffer.extend(b'\x00\x00\x00\x00\x00\x00\x00\x00') lookup_bits.add0() else: buffer.extend(pack(">d", val[i])) elif type_code == charcolumn.NullableCharColumn.TYPE_CODE: for i in range(rows): if val[i] is None: buffer.append(0x00) else: buffer.extend(val[i].to_bytes(1, byteorder="big", signed=True)) elif type_code == booleancolumn.NullableBooleanColumn.TYPE_CODE: bits = BitVector() for i in range(rows): if val[i] is None: bits.add0() lookup_bits.add1() else: if val[i]: bits.add1() else: bits.add0() lookup_bits.add0() buffer.extend(bits.tobytearray()) elif type_code == binarycolumn.NullableBinaryColumn.TYPE_CODE: for i in range(rows): dataLength = len(val[i]) if val[i] is not None else 0 buffer.extend(dataLength.to_bytes(4, byteorder="big")) if val[i] is not None: buffer.extend(val[i]) else: raise dataframe.DataFrameException( "Unknown column type: {}".format(type_code)) #END PAYLOAD # copy operations to stick everything together payload = buffer # allocate buffer for the final result buffer = header # Number of byte blocks of the lookup list. 
# The specification requires that the lookup # list has a minimum length of one block blength = int(((lookup_bits.size() - 1) / 8) + 1) buffer.extend(blength.to_bytes(4, byteorder="big", signed=False)) # copy lookup bits buffer.extend(lookup_bits.tobytearray()) # add header closing brace '}' buffer.append(0x7d) # copy payload buffer buffer.extend(payload) else: # DefaultDataFrame buffer.append(0x7d) # add header closing brace '}' #END HEADER # As DefaultDataFrames do not have null values, no lookup list # is required and we just serialize all bytes as they are to # the payload section #PAYLOAD for col in df: type_code = col.type_code() val = col.as_array() if type_code == bytecolumn.ByteColumn.TYPE_CODE: for i in range(rows): buffer.extend( int(val[i]).to_bytes(1, byteorder="big", signed=True)) elif type_code == shortcolumn.ShortColumn.TYPE_CODE: for i in range(rows): buffer.extend( int(val[i]).to_bytes(2, byteorder="big", signed=True)) elif type_code == intcolumn.IntColumn.TYPE_CODE: for i in range(rows): buffer.extend( int(val[i]).to_bytes(4, byteorder="big", signed=True)) elif type_code == longcolumn.LongColumn.TYPE_CODE: for i in range(rows): buffer.extend( int(val[i]).to_bytes(8, byteorder="big", signed=True)) elif type_code == stringcolumn.StringColumn.TYPE_CODE: for i in range(rows): buffer.extend(val[i].encode("utf-8")) # add null character as string delimeter buffer.append(0x00) elif type_code == floatcolumn.FloatColumn.TYPE_CODE: for i in range(rows): buffer.extend(pack(">f", val[i])) elif type_code == doublecolumn.DoubleColumn.TYPE_CODE: for i in range(rows): buffer.extend(pack(">d", val[i])) elif type_code == charcolumn.CharColumn.TYPE_CODE: for i in range(rows): buffer.extend( int(val[i]).to_bytes(1, byteorder="big", signed=True)) elif type_code == booleancolumn.BooleanColumn.TYPE_CODE: bits = BitVector() for i in range(rows): if val[i]: bits.add1() else: bits.add0() buffer.extend(bits.tobytearray()) elif type_code == 
binarycolumn.BinaryColumn.TYPE_CODE: for i in range(rows): buffer.extend(len(val[i]).to_bytes(4, byteorder="big")) buffer.extend(val[i]) else: raise dataframe.DataFrameException( "Unknown column type: {}".format(type_code)) #END PAYLOAD return buffer