def encode_value(cls, value: tp.Any) -> _meta.Value:

    """Encode a native Python value as a TRAC metadata Value.

    Raises ETracInternal for null values and for Python types with no TRAC mapping.
    """

    if value is None:
        raise _ex.ETracInternal("Cannot encode a null value")

    # The isinstance checks are ordered deliberately:
    # bool must come before int (bool is a subclass of int) and
    # datetime must come before date (datetime is a subclass of date)

    if isinstance(value, bool):
        return _meta.Value(_meta.TypeDescriptor(_meta.BasicType.BOOLEAN), booleanValue=value)

    if isinstance(value, int):
        return _meta.Value(_meta.TypeDescriptor(_meta.BasicType.INTEGER), integerValue=value)

    if isinstance(value, float):
        return _meta.Value(_meta.TypeDescriptor(_meta.BasicType.FLOAT), floatValue=value)

    if isinstance(value, decimal.Decimal):
        return _meta.Value(
            _meta.TypeDescriptor(_meta.BasicType.DECIMAL),
            decimalValue=_meta.DecimalValue(str(value)))

    if isinstance(value, str):
        return _meta.Value(_meta.TypeDescriptor(_meta.BasicType.STRING), stringValue=value)

    if isinstance(value, dt.datetime):
        return _meta.Value(
            _meta.TypeDescriptor(_meta.BasicType.DATETIME),
            datetimeValue=_meta.DatetimeValue(value.isoformat()))

    if isinstance(value, dt.date):
        return _meta.Value(
            _meta.TypeDescriptor(_meta.BasicType.DATE),
            dateValue=_meta.DateValue(value.isoformat()))

    raise _ex.ETracInternal(f"Encoding value type [{type(value)}] is not supported yet")
def view_to_pandas(cls, view: DataView, part: DataPartKey) -> pd.DataFrame:

    """Combine all the deltas for one part of a data view into a single pandas DataFrame.

    :raises _ex.ETracInternal: if the view has no schema or the part has no deltas
    """

    deltas = view.parts.get(part)

    # Sanity checks

    if not view.arrow_schema:
        raise _ex.ETracInternal(f"Data view schema not set")

    if not deltas:
        raise _ex.ETracInternal(f"Data view for part [{part.opaque_key}] does not contain any items")

    if len(deltas) == 1:
        return cls.item_to_pandas(deltas[0])

    # Collect batches into a list, NOT a set: set iteration order is arbitrary,
    # which would scramble the row order of the combined table across deltas
    batches = [
        batch for delta in deltas
        for batch in (delta.batches if delta.batches else delta.table.to_batches())]

    table = pa.Table.from_batches(batches)  # noqa
    return table.to_pandas()
def decode_value(value: _meta.Value) -> tp.Any:

    """Decode a TRAC metadata Value into the equivalent native Python value.

    :raises _ex.ETracInternal: if the value is missing, has no type information,
        or its basic type is not supported for decoding
    """

    # isinstance already rejects None, so a separate None check is not needed
    # Give the error a message so failures can be diagnosed (the original raise had none)
    if not isinstance(value, _meta.Value):
        raise _ex.ETracInternal("Expected a metadata value object")

    if value.type is None or \
            value.type.basicType is None or \
            value.type.basicType == _meta.BasicType.BASIC_TYPE_NOT_SET:
        raise _ex.ETracInternal("Missing type information")

    basic_type = value.type.basicType

    if basic_type == _meta.BasicType.BOOLEAN:
        return value.booleanValue

    if basic_type == _meta.BasicType.INTEGER:
        return value.integerValue

    if basic_type == _meta.BasicType.FLOAT:
        return value.floatValue

    if basic_type == _meta.BasicType.DECIMAL:
        return decimal.Decimal(value.decimalValue.decimal)

    if basic_type == _meta.BasicType.STRING:
        return value.stringValue

    if basic_type == _meta.BasicType.DATE:
        return dt.date.fromisoformat(value.dateValue.isoDate)

    if basic_type == _meta.BasicType.DATETIME:
        return dt.datetime.fromisoformat(value.datetimeValue.isoDatetime)

    raise _ex.ETracInternal(f"Decoding value type [{basic_type}] is not supported yet")
def lookup(self, node_id: NodeId[__T]) -> __T:

    """Fetch the completed result of a node from the execution context.

    All failure modes are internal errors: the engine is expected to guarantee
    the node exists, has completed successfully, and produced the right type
    before a result lookup ever happens.
    """

    engine_node = self.__nodes.get(node_id)

    if engine_node is None:
        raise _ex.ETracInternal(f"Node [{node_id.name}] does not exist in execution context [{node_id.namespace}]")

    if not engine_node.complete:
        raise _ex.ETracInternal(f"Node [{node_id.name}] still pending in execution context [{node_id.namespace}]")

    if engine_node.error:
        raise _ex.ETracInternal(f"Node [{node_id.name}] failed in execution context [{node_id.namespace}]")

    if not NodeProcessor.result_matches_type(engine_node.result, node_id.result_type):
        # A missing result type means the node is expected to produce None
        expected_type = node_id.result_type or type(None)
        actual_type = type(engine_node.result)
        raise _ex.ETracInternal(
            f"Wrong type for node [{node_id.name}] in execution context [{node_id.namespace}]"
            + f" (expected [{expected_type}], got [{actual_type}])")

    return engine_node.result
def arrow_to_python_type(arrow_type: pa.DataType) -> type:

    """Map an Arrow data type onto the corresponding native Python type.

    :raises _ex.ETracInternal: if no mapping is available for the Arrow type
    """

    # Arrow types are matched by predicate, not identity, so a simple dict lookup will not do
    type_checks = [
        (pa.types.is_boolean, bool),
        (pa.types.is_integer, int),
        (pa.types.is_floating, float),
        (pa.types.is_decimal, decimal.Decimal),
        (pa.types.is_string, str),
        (pa.types.is_date, dt.date),
        (pa.types.is_timestamp, dt.datetime)]

    for type_check, python_type in type_checks:
        if type_check(arrow_type):
            return python_type

    raise _ex.ETracInternal(f"No Python type mapping available for Arrow type [{arrow_type}]")
def _update_results(self, updates: tp.Dict[NodeId, _EngineNode]):

    """Fold a batch of completed node results into the engine context graph.

    Each updated node is moved out of the active (or pending) set and into the
    succeeded or failed set, its processor actor is stopped, and the engine
    graph is replaced with the updated context before re-checking job status.
    """

    nodes = {**self.graph.nodes, **updates}

    # The node sets are copied so the existing graph context is never mutated in place
    pending_nodes = cp.copy(self.graph.pending_nodes)
    active_nodes = cp.copy(self.graph.active_nodes)
    succeeded_nodes = cp.copy(self.graph.succeeded_nodes)
    failed_nodes = cp.copy(self.graph.failed_nodes)

    for node_id, node in updates.items():

        if node_id in active_nodes:
            active_nodes.remove(node_id)
        elif node_id in pending_nodes:
            # TODO: check pending node ID is part of main node id bundle
            pending_nodes.remove(node_id)
        else:
            # Results should only ever arrive for nodes the engine is tracking
            # (give the internal error a message, so unexpected updates can be diagnosed)
            raise _ex.ETracInternal(f"Received a result for node [{node_id.name}], which is not active or pending")

        if node.error:
            failed_nodes.add(node_id)
        else:
            succeeded_nodes.add(node_id)

        # Stop the processor actor for any node that has finished
        if node_id in self.processors:
            node_ref = self.processors.pop(node_id)
            self.actors().stop(node_ref)

    graph = _EngineContext(nodes, pending_nodes, active_nodes, succeeded_nodes, failed_nodes)
    self.graph = graph

    self.check_job_status()
def python_to_arrow_type(cls, python_type: type) -> pa.DataType:

    """Map a native Python type onto the corresponding Arrow data type.

    :raises _ex.ETracInternal: if no mapping is available for the Python type
    """

    # Types whose Arrow factory takes no arguments can go through a simple lookup table
    fixed_types = {
        bool: pa.bool_,
        int: pa.int64,
        float: pa.float64,
        str: pa.utf8,
        dt.date: pa.date32}

    factory = fixed_types.get(python_type)

    if factory is not None:
        return factory()

    # Decimal and timestamp types are parameterized by TRAC-wide settings

    if python_type == decimal.Decimal:
        return pa.decimal128(cls.__TRAC_DECIMAL_PRECISION, cls.__TRAC_DECIMAL_SCALE)

    if python_type == dt.datetime:
        return pa.timestamp(cls.__TRAC_TIMESTAMP_UNIT, cls.__TRAC_TIMESTAMP_ZONE)

    raise _ex.ETracInternal(f"No Arrow type mapping available for Python type [{python_type}]")
def trac_to_python_basic_type(cls, trac_basic_type: _meta.BasicType) -> type:

    """Look up the native Python type for a TRAC basic type.

    :raises _ex.ETracInternal: if no mapping is available
    """

    python_type = cls.__TRAC_TO_PYTHON_BASIC_TYPE.get(trac_basic_type)

    if python_type is not None:
        return python_type

    raise _ex.ETracInternal(f"No Python type mapping available for TRAC type [{trac_basic_type}]")
def python_to_trac_basic_type(cls, python_type: type) -> _meta.BasicType:

    """Look up the TRAC basic type for a native Python type.

    :raises _ex.ETracInternal: if no mapping is available
    """

    basic_type = cls.__PYTHON_TO_TRAC_BASIC_TYPE.get(python_type)

    if basic_type is not None:
        return basic_type

    raise _ex.ETracInternal(f"No TRAC type mapping available for Python type [{python_type}]")
def convert_boolean_value(raw_value: tp.Any) -> _meta.Value:

    """Convert a raw value to a TRAC BOOLEAN value.

    Only genuine Python bools are accepted; anything else is rejected.
    """

    if not isinstance(raw_value, bool):
        msg = f"Value of type [{type(raw_value)}] cannot be converted to {_meta.BasicType.BOOLEAN.name}"
        raise _ex.ETracInternal(msg)

    type_desc = _meta.TypeDescriptor(_meta.BasicType.BOOLEAN)
    return _meta.Value(type_desc, booleanValue=raw_value)
def _check_result_type(self, result):

    """Raise an internal error if a node function returned a result of the wrong type.

    Node functions should only ever return the expected type, so a mismatch
    here is always an internal (engine) error rather than a user error.
    """

    # A missing result type on the node ID means the node is expected to produce None
    expected_type = self.node.node.id.result_type or self.__NONE_TYPE

    if not self.result_matches_type(result, expected_type):
        actual_type = type(result)
        err = f"Node result is the wrong type, expected [{expected_type.__name__}], got [{actual_type.__name__}]"
        raise _ex.ETracInternal(err)
def trac_to_arrow_basic_type(cls, trac_basic_type: _meta.BasicType) -> pa.DataType:

    """Look up the Arrow data type for a TRAC basic type.

    :raises _ex.ETracInternal: if no mapping is available
    """

    arrow_type = cls.__TRAC_TO_ARROW_BASIC_TYPE_MAPPING.get(trac_basic_type)

    if arrow_type is not None:
        return arrow_type

    raise _ex.ETracInternal(f"No Arrow type mapping available for TRAC type [{trac_basic_type}]")
def convert_decimal_value(raw_value: tp.Any) -> _meta.Value:

    """Convert a raw value to a TRAC DECIMAL value.

    Accepts decimals, ints and floats. Bools are explicitly rejected:
    bool is a subclass of int, so without a guard True/False would slip
    through the int check and be stored as DecimalValue("True") /
    DecimalValue("False"), which is not valid decimal text.
    """

    type_desc = _meta.TypeDescriptor(_meta.BasicType.DECIMAL)

    if isinstance(raw_value, decimal.Decimal):
        return _meta.Value(type_desc, decimalValue=_meta.DecimalValue(str(raw_value)))

    if isinstance(raw_value, (int, float)) and not isinstance(raw_value, bool):
        return _meta.Value(type_desc, decimalValue=_meta.DecimalValue(str(raw_value)))

    msg = f"Value of type [{type(raw_value)}] cannot be converted to {_meta.BasicType.DECIMAL.name}"
    raise _ex.ETracInternal(msg)
def convert_float_value(raw_value: tp.Any) -> _meta.Value:

    """Convert a raw value to a TRAC FLOAT value.

    Accepts floats directly and widens ints to float. Bools are explicitly
    rejected: bool is a subclass of int, so without a guard True/False
    would be silently converted to 1.0/0.0.
    """

    type_desc = _meta.TypeDescriptor(_meta.BasicType.FLOAT)

    if isinstance(raw_value, float):
        return _meta.Value(type_desc, floatValue=raw_value)

    if isinstance(raw_value, int) and not isinstance(raw_value, bool):
        return _meta.Value(type_desc, floatValue=float(raw_value))

    msg = f"Value of type [{type(raw_value)}] cannot be converted to {_meta.BasicType.FLOAT.name}"
    raise _ex.ETracInternal(msg)
def convert_integer_value(raw_value: tp.Any) -> _meta.Value:

    """Convert a raw value to a TRAC INTEGER value.

    Accepts ints directly, and floats that hold an exact integral value.
    Bools are explicitly rejected: bool is a subclass of int, so without
    a guard True/False would be silently stored as integer values.
    """

    type_desc = _meta.TypeDescriptor(_meta.BasicType.INTEGER)

    if isinstance(raw_value, int) and not isinstance(raw_value, bool):
        return _meta.Value(type_desc, integerValue=raw_value)

    if isinstance(raw_value, float) and raw_value.is_integer():
        return _meta.Value(type_desc, integerValue=int(raw_value))

    msg = f"Value of type [{type(raw_value)}] cannot be converted to {_meta.BasicType.INTEGER.name}"
    raise _ex.ETracInternal(msg)
def convert_datetime_value(raw_value: tp.Any) -> _meta.Value:

    """Convert a raw value to a TRAC DATETIME value.

    Accepts datetime objects and ISO-format strings; strings are parsed
    first, so invalid ISO text raises the usual ValueError from fromisoformat.
    """

    type_desc = _meta.TypeDescriptor(_meta.BasicType.DATETIME)

    # Normalize string input by parsing it, then fall through to the datetime branch
    if isinstance(raw_value, str):
        raw_value = dt.datetime.fromisoformat(raw_value)

    if isinstance(raw_value, dt.datetime):
        iso_datetime = raw_value.isoformat()
        return _meta.Value(type_desc, datetimeValue=_meta.DatetimeValue(isoDatetime=iso_datetime))

    msg = f"Value of type [{type(raw_value)}] cannot be converted to {_meta.BasicType.DATETIME.name}"
    raise _ex.ETracInternal(msg)
def trac_to_arrow_schema(cls, trac_schema: _meta.SchemaDefinition) -> pa.Schema:

    """Build an Arrow schema from a TRAC schema definition.

    Only TABLE schemas can be converted; other schema types raise an internal error.
    """

    if trac_schema.schemaType != _meta.SchemaType.TABLE:
        raise _ex.ETracInternal(f"Schema type [{trac_schema.schemaType}] cannot be converted for Apache Arrow")

    arrow_fields = []

    for trac_field in trac_schema.table.fields:
        arrow_type = cls.trac_to_arrow_basic_type(trac_field.fieldType)
        arrow_fields.append((trac_field.fieldName, arrow_type))

    return pa.schema(arrow_fields, metadata={})
def item_to_pandas(cls, item: DataItem) -> pd.DataFrame:

    """Render a single data item as a pandas DataFrame.

    Uses whichever representation the item holds: an existing pandas frame
    (returned as a copy, so callers cannot mutate the stored data), an Arrow
    table, or a list of Arrow record batches.
    """

    if item.pandas is not None:
        return item.pandas.copy()

    if item.table is not None:
        table = item.table
    elif item.batches is not None:
        table = pa.Table.from_batches(item.batches, item.schema)  # noqa
    else:
        raise _ex.ETracInternal(f"Data item does not contain any usable data")

    return cls.arrow_to_pandas(table)
def convert_string_value(raw_value: tp.Any) -> _meta.Value:

    """Convert a raw value to a TRAC STRING value.

    Strings pass through unchanged; other primitive types (bool, int,
    float, decimal) are formatted using their standard str() representation.
    """

    type_desc = _meta.TypeDescriptor(_meta.BasicType.STRING)

    if isinstance(raw_value, str):
        return _meta.Value(type_desc, stringValue=raw_value)

    if isinstance(raw_value, (bool, int, float, decimal.Decimal)):
        return _meta.Value(type_desc, stringValue=str(raw_value))

    msg = f"Value of type [{type(raw_value)}] cannot be converted to {_meta.BasicType.STRING.name}"
    raise _ex.ETracInternal(msg)
def read_table(
        self, storage_path: str, storage_format: str,
        schema: tp.Optional[pa.Schema],
        storage_options: tp.Dict[str, tp.Any] = None) \
        -> pa.Table:

    """Read a single table from storage using the named data format.

    :param storage_path: location of the table in file storage; may be a file,
        or a directory containing exactly one file
    :param storage_format: name of the data format to decode with
    :param schema: optional schema to conform the table to after reading
    :param storage_options: format-specific options passed to the format implementation
    :return: the decoded (and, if a schema was given, conformed) Arrow table
    :raises: storage/data errors are re-raised with their original type plus context;
        anything else is wrapped as ETracInternal
    """

    try:

        format_impl = FormatManager.get_data_format(storage_format, storage_options)

        # If the path is a directory with exactly one entry, read that entry instead
        # Full directory storage formats are not implemented yet
        # NOTE(review): this assumes ls() returns names relative to storage_path - confirm against the storage impl
        stat = self.__file_storage.stat(storage_path)

        if stat.file_type == FileType.DIRECTORY:
            dir_content = self.__file_storage.ls(storage_path)
            if len(dir_content) == 1:
                storage_path = storage_path.rstrip("/\\") + "/" + dir_content[0]
            else:
                raise NotImplementedError("Directory storage format not available yet")

        with self.__file_storage.read_byte_stream(storage_path) as byte_stream:
            table = format_impl.read_table(byte_stream, schema)

        if schema is not None:
            # Apply conformance, in case the format was not able to apply it fully on read
            # It is fine to silently ignore extra columns of an input
            return _data.DataConformance.conform_to_schema(table, schema, warn_extra_columns=False)
        else:
            return table

    except (_ex.EStorage, _ex.EData) as e:
        # Storage and data errors keep their original type, with the storage path added for context
        err = f"Failed to read table [{storage_path}]: {str(e)}"
        self.__log.error(err)
        raise type(e)(err) from e

    except Exception as e:
        # Anything unexpected is logged with its traceback and wrapped as an internal error
        err = f"Failed to read table [{storage_path}]: An unexpected error occurred"
        self.__log.error(err)
        self.__log.exception(str(e))
        raise _ex.ETracInternal(err) from e
def write_table(self, storage_path: str, storage_format: str,
                table: pa.Table,
                storage_options: tp.Dict[str, tp.Any] = None,
                overwrite: bool = False):

    """Write an Arrow table to storage using the named data format.

    :param storage_path: target location; if it does not end in the format's file
        extension it is treated as a directory and a single chunk file is written inside
    :param storage_format: name of the data format to encode with
    :param table: the Arrow table to write
    :param storage_options: format-specific options passed to the format implementation
    :param overwrite: whether an existing file (or chunk directory) may be replaced
    :raises: storage/data errors are re-raised with their original type plus context;
        anything else is wrapped as ETracInternal
    """

    try:

        format_impl = FormatManager.get_data_format(storage_format, storage_options)
        format_extension = FormatManager.extension_for_format(storage_format)

        # TODO: Full handling of directory storage formats

        if not storage_path.endswith(format_extension):
            # No format extension: treat the path as a directory and write one chunk file inside it
            # In this branch exists_ok is tied to overwrite, so a pre-existing directory
            # is only accepted when the caller asked to overwrite
            parent_dir_ = storage_path
            storage_path_ = storage_path.rstrip("/\\") + f"/chunk-0.{format_extension}"
            self.__file_storage.mkdir(parent_dir_, True, exists_ok=overwrite)
        else:
            # Regular file path: make sure the parent directory exists (always allowed to pre-exist)
            parent_dir_ = str(pathlib.PurePath(storage_path).parent)
            storage_path_ = storage_path
            self.__file_storage.mkdir(parent_dir_, True, True)

        with self.__file_storage.write_byte_stream(storage_path_, overwrite=overwrite) as byte_stream:
            format_impl.write_table(byte_stream, table)

    except (_ex.EStorage, _ex.EData) as e:
        # Storage and data errors keep their original type, with the storage path added for context
        err = f"Failed to write table [{storage_path}]: {str(e)}"
        self.__log.error(err)
        raise type(e)(err) from e

    except Exception as e:
        # Anything unexpected is logged with its traceback and wrapped as an internal error
        err = f"Failed to write table [{storage_path}]: An unexpected error occurred"
        self.__log.error(err)
        self.__log.exception(str(e))
        raise _ex.ETracInternal(err) from e
def result_matches_type(cls, result, expected_type) -> bool:

    """Check whether a node result is compatible with an expected (possibly generic) type.

    Handles None/NoneType, tp.Any, plain types, and the generic containers
    list[T] and dict[K, V] (checked recursively for element/value types).
    Other generic types raise an internal error.
    """

    # None / NoneType means the node is expected to produce no result
    if expected_type is None or expected_type == cls.__NONE_TYPE:
        return result is None

    if expected_type == tp.Any:
        return True

    generic_type = _util.get_origin(expected_type)

    # Non-generic types can be checked directly with isinstance
    if generic_type is None:
        return isinstance(result, expected_type)

    if generic_type == list:
        item_type = _util.get_args(expected_type)[0]
        if not isinstance(result, generic_type):
            return False
        return all(cls.result_matches_type(item, item_type) for item in result)

    if generic_type == dict:
        type_args = _util.get_args(expected_type)
        key_type = type_args[0]
        value_type = type_args[1]
        if not isinstance(result, generic_type):
            return False
        # Keys are checked with isinstance; values are checked recursively
        return all(
            isinstance(key, key_type) and cls.result_matches_type(value, value_type)
            for key, value in result.items())

    raise _ex.ETracInternal(f"Cannot enforce type check for generic type [{str(generic_type)}]")
def convert_value(cls, raw_value: tp.Any, type_desc: _meta.TypeDescriptor):

    """Convert a raw value to a TRAC Value of the basic type given in type_desc.

    Dispatches to the per-type convert functions; unsupported basic types
    raise an internal error.
    """

    converters = {
        _meta.BasicType.BOOLEAN: cls.convert_boolean_value,
        _meta.BasicType.INTEGER: cls.convert_integer_value,
        _meta.BasicType.FLOAT: cls.convert_float_value,
        _meta.BasicType.DECIMAL: cls.convert_decimal_value,
        _meta.BasicType.STRING: cls.convert_string_value,
        _meta.BasicType.DATE: cls.convert_date_value,
        _meta.BasicType.DATETIME: cls.convert_datetime_value}

    converter = converters.get(type_desc.basicType)

    if converter is None:
        raise _ex.ETracInternal(f"Conversion to value type [{type_desc.basicType.name}] is not supported yet")

    return converter(raw_value)