Example #1
def _DEPRECATED_overwrite_to_fix_arrow_table_schema(
        path: Path, fallback_schema: pa.Schema) -> None:
    if not path.stat().st_size:
        return

    table = load_trusted_arrow_file(path)

    untyped_schema = table.schema
    fields = [
        __DEPRECATED_fix_field(
            untyped_schema.field(i),
            (None if fallback_schema.get_field_index(name) == -1 else
             fallback_schema.field(fallback_schema.get_field_index(name))),
        ) for i, name in enumerate(untyped_schema.names)
    ]
    schema = pa.schema(fields)

    # Overwrite with new data
    #
    # We don't short-circuit by comparing schemas: two pa.Schema values
    # with different number formats evaluate as equal.
    #
    # We write a separate file to /var/tmp and then copy it: our sandbox
    # won't let us `rename(2)` in `path`'s directory.
    with tempfile_context(dir="/var/tmp") as rewrite_path:
        with pa.ipc.RecordBatchFileWriter(rewrite_path, schema) as writer:
            writer.write_table(pa.table(table.columns, schema=schema))
        shutil.copyfile(rewrite_path, path)
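The schema-comparison caveat above hinges on pyarrow's default schema equality ignoring field-level metadata (presumably where the number-format hints live). A minimal illustration of that behavior:

import pyarrow as pa

# Two schemas that differ only in field metadata (e.g. a number-format hint).
a = pa.schema([pa.field("n", pa.int64(), metadata={"format": "{:,d}"})])
b = pa.schema([pa.field("n", pa.int64(), metadata={"format": "{:,.2f}"})])

assert a == b                                # default equality ignores metadata
assert not a.equals(b, check_metadata=True)  # metadata-aware comparison differs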
Example #2

def register_schema(cls: type, schema: pa.Schema):
    global TYPE_TO_SCHEMA
    assert isinstance(cls, type)
    assert isinstance(schema, pa.Schema)
    if not schema.metadata:
        # pa.Schema is immutable: add_metadata/with_metadata return a new
        # schema rather than modifying in place, so keep the result.
        schema = schema.with_metadata({"type": cls.__name__})
    TYPE_TO_SCHEMA[cls] = schema
    SCHEMA_TO_TYPE[schema.metadata[b"type"]] = cls
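A hypothetical usage sketch, assuming TYPE_TO_SCHEMA and SCHEMA_TO_TYPE are module-level registries defined alongside the function (the Point class is purely illustrative):

import pyarrow as pa

TYPE_TO_SCHEMA: dict = {}
SCHEMA_TO_TYPE: dict = {}

class Point:
    pass

register_schema(Point, pa.schema([("x", pa.float64()), ("y", pa.float64())]))
# The registered schema carries a {"type": "Point"} metadata tag and both
# lookup tables are populated.
assert TYPE_TO_SCHEMA[Point].metadata == {b"type": b"Point"}
assert SCHEMA_TO_TYPE[b"Point"] is Point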
Example #3
 def __init__(self, arrow_schema: pa.Schema,
              tensor_representation: schema_pb2.TensorRepresentation):
   super().__init__(arrow_schema, tensor_representation)
   sparse_representation = tensor_representation.sparse_tensor
   self._index_column_indices = tuple(
       arrow_schema.get_field_index(c)
       for c in sparse_representation.index_column_names)
   self._value_column_index = arrow_schema.get_field_index(
       sparse_representation.value_column_name)
   self._shape = [dim.size for dim in sparse_representation.dense_shape.dim]
   _, value_type = _GetNestDepthAndValueType(
       arrow_schema, path.ColumnPath(sparse_representation.value_column_name))
   self._dtype = _ArrowTypeToTfDtype(value_type)
   self._coo_size = len(self._shape) + 1
   self._convert_to_binary_fn = _GetConvertToBinaryFn(value_type)
Example #4
 def __init__(self, arrow_schema: pa.Schema,
              tensor_representation: schema_pb2.TensorRepresentation):
   super(_SparseTensorHandler, self).__init__(
       arrow_schema, tensor_representation)
   sparse_representation = tensor_representation.sparse_tensor
   self._index_column_indices = tuple(
       arrow_schema.get_field_index(c)
       for c in sparse_representation.index_column_names)
   self._value_column_index = arrow_schema.get_field_index(
       sparse_representation.value_column_name)
   self._shape = [dim.size for dim in sparse_representation.dense_shape.dim]
   _, value_type = _GetNestDepthAndValueType(
       arrow_schema[self._value_column_index])
   self._dtype = _ArrowTypeToTfDtype(value_type)
   self._coo_size = len(self._shape) + 1
Example #5

def update_existing_schema(current_schema: pa.Schema,
                           new_schema: pa.Schema) -> pa.Schema:
    """
    Takes the current schema and updates any fields in the current
    schema with fields from the new_schema. If current_schema has
    fields that do not exist in new_schema then they are unchanged.
    If current_schema has fields that also exist in new_schema then
    the field in new_schema is chosen. If fields exist in new_schema
    but not in current, these will be ignored.
    Args:
        current_schema (pa.Schema): Schema to update
        new_schema (pa.Schema): Schema with fields that you wish to be
          used to update current_schema
    Returns:
        pa.Schema: Returns a schema with the same column order as
        current_schema but with the fields updated for any fields
        that matched new_schema.
    """

    updated_schema = pa.schema([])

    for field in current_schema:
        if field.name in new_schema.names:
            updated_schema = updated_schema.append(new_schema.field(
                field.name))
        else:
            updated_schema = updated_schema.append(field)
    return updated_schema
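To make the merge semantics concrete, a short usage sketch (field names are illustrative):

import pyarrow as pa

current = pa.schema([("id", pa.int32()), ("name", pa.string())])
new = pa.schema([("id", pa.int64()), ("extra", pa.bool_())])

merged = update_existing_schema(current, new)
# "id" takes the int64 definition from new, "name" is unchanged, and
# "extra" (absent from current) is ignored.
assert merged == pa.schema([("id", pa.int64()), ("name", pa.string())])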
Example #6
 def BaseCanHandle(
     arrow_schema: pa.Schema,
     tensor_representation: schema_pb2.TensorRepresentation) -> bool:
   depth, value_type = _GetNestDepthAndValueType(
       arrow_schema.field_by_name(
           tensor_representation.dense_tensor.column_name))
   # Can only handle 1-nested lists.
   return depth == 1 and _IsSupportedArrowValueType(value_type)
Example #7
 def CanHandle(arrow_schema: pa.Schema,
               tensor_representation: schema_pb2.TensorRepresentation) -> bool:
   depth, value_type = _GetNestDepthAndValueType(
       arrow_schema.field_by_name(
           tensor_representation.varlen_sparse_tensor.column_name))
   # Currently can only handle 1-nested lists, but can easily support
   # arbitrarily nested ListArrays.
   return depth == 1 and _IsSupportedArrowValueType(value_type)
Example #8
 def __init__(self, arrow_schema: pa.Schema,
              tensor_representation: schema_pb2.TensorRepresentation):
   super(_VarLenSparseTensorHandler, self).__init__(
       arrow_schema, tensor_representation)
   column_name = tensor_representation.varlen_sparse_tensor.column_name
   self._column_index = arrow_schema.get_field_index(column_name)
   _, value_type = _GetNestDepthAndValueType(arrow_schema[self._column_index])
   self._dtype = _ArrowTypeToTfDtype(value_type)
Example #9
 def __init__(self, arrow_schema: pa.Schema,
              tensor_representation: schema_pb2.TensorRepresentation):
   super().__init__(arrow_schema, tensor_representation)
   column_name = tensor_representation.varlen_sparse_tensor.column_name
   self._column_index = arrow_schema.get_field_index(column_name)
   _, value_type = _GetNestDepthAndValueType(arrow_schema,
                                             path.ColumnPath(column_name))
   self._dtype = _ArrowTypeToTfDtype(value_type)
   self._convert_to_binary_fn = _GetConvertToBinaryFn(value_type)
Example #10
def _EnumerateTypesAlongPath(arrow_schema: pa.Schema,
                             column_path: path.ColumnPath) -> pa.DataType:
    """Enumerates nested types along a column_path.

  A nested type is either a list-like type or a struct type.

  It uses `column_path`[0] to first address a field in the schema, and
  enumerates its type. If that type is nested, it enumerates its child and
  continues recursively until the column_path reaches an end. The child of a
  list-like type is its value type. The child of a struct type is the type of
  the child field of the name given by the corresponding step in the
  column_path.

  Args:
    arrow_schema: The arrow schema to traverse.
    column_path: A path of field names.

  Yields:
    The arrow type of each level in the schema.

  Raises:
    ValueError: If a step does not exist in the arrow schema.
    ValueError: If arrow_schema has no more struct fields, but we did not
                iterate through every field in column_path.
  """
    field_name = column_path.initial_step()
    column_path = column_path.suffix(1)

    arrow_field = arrow_schema.field(field_name)
    arrow_type = arrow_field.type
    yield arrow_type

    while True:
        if pa.types.is_struct(arrow_type):
            # get the field from the StructType
            if not column_path:
                break
            curr_field_name = column_path.initial_step()
            column_path = column_path.suffix(1)
            try:
                arrow_field = arrow_type[curr_field_name]
            except KeyError:
                raise ValueError(
                    "The field: {} could not be found in the current Struct: {}"
                    .format(curr_field_name, arrow_type))
            arrow_type = arrow_field.type
        elif _IsListLike(arrow_type):
            arrow_type = arrow_type.value_type
        else:
            yield arrow_type
            if column_path:
                raise ValueError(
                    "The arrow_schema fields are exhausted, but there are remaining "
                    "fields in the column_path: {}".format(column_path))
            break
        yield arrow_type
Example #11
def schemas_equal(a: pa.Schema,
                  b: pa.Schema,
                  check_order: bool = True,
                  check_metadata: bool = True) -> bool:
    """check if two schemas are equal

    :param a: first pyarrow schema
    :param b: second pyarrow schema
    :param compare_order: whether to compare order
    :param compare_order: whether to compare metadata
    :return: if the two schema equal
    """
    if check_order:
        return a.equals(b, check_metadata=check_metadata)
    if check_metadata and a.metadata != b.metadata:
        return False
    da = {k: a.field(k) for k in a.names}
    db = {k: b.field(k) for k in b.names}
    return da == db
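For example, the same fields in a different order compare equal only when order checking is disabled:

import pyarrow as pa

a = pa.schema([("x", pa.int64()), ("y", pa.string())])
b = pa.schema([("y", pa.string()), ("x", pa.int64())])

assert not schemas_equal(a, b)                 # field order differs
assert schemas_equal(a, b, check_order=False)  # same fields, order ignored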
Example #12

def _set_date_column_type_to_timestamp_ms(schema: pa.Schema) -> pa.Schema:
    dt_timestamp_ms = pa.timestamp("ms")

    indexof_date_field = schema.get_field_index("DATE")

    types = schema.types
    types[indexof_date_field] = dt_timestamp_ms

    field_list = zip(schema.names, types)
    return pa.schema(field_list)
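A quick sketch of the transformation, assuming the input schema has a DATE column (column names are illustrative):

import pyarrow as pa

schema = pa.schema([("DATE", pa.date32()), ("VALUE", pa.float64())])
converted = _set_date_column_type_to_timestamp_ms(schema)
assert converted.field("DATE").type == pa.timestamp("ms")
assert converted.field("VALUE").type == pa.float64()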
Example #13
  def CanHandle(
      arrow_schema: pa.Schema,
      tensor_representation: schema_pb2.TensorRepresentation) -> bool:
    """Returns whether `tensor_representation` can be handled."""
    sparse_representation = tensor_representation.sparse_tensor
    if (len(sparse_representation.dense_shape.dim) !=
        len(sparse_representation.index_column_names)):
      return False
    if any([d.size <= 0 for d in sparse_representation.dense_shape.dim]):
      return False

    # All the index columns must be of integral types.
    for index_column in sparse_representation.index_column_names:
      depth, value_type = _GetNestDepthAndValueType(
          arrow_schema.field_by_name(index_column))
      if depth != 1 or not pa.types.is_integer(value_type):
        return False

    depth, value_type = _GetNestDepthAndValueType(
        arrow_schema.field_by_name(sparse_representation.value_column_name))
    return depth == 1 and _IsSupportedArrowValueType(value_type)
Example #14
 def __init__(self, arrow_schema: pa.Schema,
              tensor_representation: schema_pb2.TensorRepresentation):
     super(_RaggedTensorHandler, self).__init__(arrow_schema,
                                                tensor_representation)
     ragged_representation = tensor_representation.ragged_tensor
     self._steps = list(ragged_representation.feature_path.step)
     self._column_index = arrow_schema.get_field_index(self._steps[0])
     self._ragged_rank, value_type = _GetNestDepthAndValueType(
         arrow_schema, self._steps)
     self._dtype = _ArrowTypeToTfDtype(value_type)
     self._row_partition_dtype = ragged_representation.row_partition_dtype
     self._convert_to_binary_fn = _GetConvertToBinaryFn(value_type)
Example #15
 def __init__(self, arrow_schema: pa.Schema,
              tensor_representation: schema_pb2.TensorRepresentation):
   super(_BaseDenseTensorHandler, self).__init__(arrow_schema,
                                                 tensor_representation)
   dense_rep = tensor_representation.dense_tensor
   column_name = dense_rep.column_name
   self._column_index = arrow_schema.get_field_index(column_name)
   _, value_type = _GetNestDepthAndValueType(arrow_schema[self._column_index])
   self._dtype = _ArrowTypeToTfDtype(value_type)
   unbatched_shape = [
       d.size for d in tensor_representation.dense_tensor.shape.dim
   ]
   self._shape = [None] + unbatched_shape
   self._unbatched_flat_len = int(np.prod(unbatched_shape, initial=1))
Example #16
 def __init__(self, arrow_schema: pa.Schema,
              tensor_representation: schema_pb2.TensorRepresentation):
   super().__init__(arrow_schema, tensor_representation)
   dense_rep = tensor_representation.dense_tensor
   column_name = dense_rep.column_name
   self._column_index = arrow_schema.get_field_index(column_name)
   _, value_type = _GetNestDepthAndValueType(arrow_schema,
                                             path.ColumnPath(column_name))
   self._dtype = _ArrowTypeToTfDtype(value_type)
   self._convert_to_binary_fn = _GetConvertToBinaryFn(value_type)
   unbatched_shape = [
       d.size for d in tensor_representation.dense_tensor.shape.dim
   ]
   self._shape = [None] + unbatched_shape
   self._unbatched_flat_len = int(np.prod(unbatched_shape, initial=1))
Example #17
def _EnumerateTypesAlongPath(arrow_schema: pa.Schema,
                             path: List[Text]) -> pa.DataType:
    """Enumerates nested types along a path.

  A nested type is either a list-like type or a struct type.

  It uses `path`[0] to first address a field in the schema, and enumerates its
  type. If that type is nested, it enumerates its child and continues
  recursively until the path reaches an end. The child of a list-like type is
  its value type. The child of a struct type is the type of the child field of
  the name given by the corresponding step in the path.

  Args:
    arrow_schema: The arrow schema to traverse.
    path: A path of field names.

  Yields:
    The arrow type of each level in the schema.

  Raises:
    ValueError: If a step does not exist in the arrow schema.
  """
    path = collections.deque(path)
    field_name = path.popleft()
    arrow_field = arrow_schema.field(field_name)
    arrow_type = arrow_field.type
    yield arrow_type

    while True:
        if pa.types.is_struct(arrow_type):
            # get the field from the StructType
            if not path:  # path is empty
                break
            curr_field_name = path.popleft()
            try:
                arrow_field = arrow_type[curr_field_name]
            except KeyError:
                raise ValueError(
                    "The field: {} could not be found in the current Struct: {}"
                    .format(curr_field_name, arrow_type))
            arrow_type = arrow_field.type
        elif _IsListLike(arrow_type):
            arrow_type = arrow_type.value_type
        else:
            yield arrow_type
            break
        yield arrow_type
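To illustrate the traversal, a hypothetical nested schema and the types the generator walks through (assuming _IsListLike covers plain list types):

import pyarrow as pa

schema = pa.schema([("tokens", pa.list_(pa.list_(pa.int64())))])

for level_type in _EnumerateTypesAlongPath(schema, ["tokens"]):
    print(level_type)
# Walks list<list<int64>> -> list<int64> -> int64, yielding the type at each
# level of nesting (the leaf value type is yielded once more when the path
# is exhausted).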
Example #18
def _read_table(
    table_as_folder: "Table",
    columns: t.List[str],
    filter_expression: pds.Expression,
    partitioning: pds.Partitioning,
    table_schema: pa.Schema,
) -> pa.Table:
    """
    Refer: https://arrow.apache.org/docs/python/dataset.html#dataset

    todo: need to find a way to preserve indexes while writing or
     else find a way to read with sort with pyarrow ... then there
     will be no need to use to_pandas() and also no need ofr casting
    """
    if bool(columns):
        table_schema = pa.schema(
            fields=[table_schema.field(_c) for _c in columns],
            metadata=table_schema.metadata
        )
    # noinspection PyProtectedMember
    _path = table_as_folder.path
    _table = pa.Table.from_batches(
        batches=pds.dataset(
            source=_path.full_path,
            filesystem=_path.fs,
            format=_FILE_FORMAT,
            schema=table_schema,
            partitioning=partitioning,
        ).to_batches(
            # todo: verify below claim and test if this will remain generally correct
            # Applying columns and filter_expression here is more efficient, as
            # they are applied per batch as it is loaded rather than after
            # loading the entire table.
            columns=columns,
            filter=filter_expression,
        ),
        # if columns are specified, table_schema has already been rebuilt above
        # to keep only those columns, so it stays consistent with the batches
        schema=table_schema,
    )

    # todo: should we reconsider sort overhead ???
    # return self.file_type.deserialize(
    #     _table
    # ).sort_index(axis=0)
    return _table
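The function leans on pyarrow.dataset applying projection and filtering per batch; a stripped-down, hypothetical sketch of that pattern (path, columns and filter are illustrative):

import pyarrow as pa
import pyarrow.dataset as pds

dataset = pds.dataset("/tmp/events", format="parquet")
batches = dataset.to_batches(
    columns=["user_id", "amount"],       # projection applied per batch
    filter=pds.field("amount") > 100,    # predicate applied per batch
)
table = pa.Table.from_batches(list(batches))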
Example #19
def _GetNestDepthAndValueType(arrow_schema: pa.Schema,
                              path: List[Text]) -> Tuple[int, pa.DataType]:
    """Returns the depth of a leaf field, and its innermost value type.

  The Depth is constituted by the number of nested lists in the leaf field.

  Args:
    arrow_schema: The arrow schema to traverse.
    path: A path of field names. The path must describe a leaf struct.
  Returns: A Tuple of depth and arrow type
  """
    arrow_type = arrow_schema.field(path[0]).type
    depth = 0

    for arrow_type in _EnumerateTypesAlongPath(arrow_schema, path):
        if _IsListLike(arrow_type):
            depth += 1

    return depth, arrow_type
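A small worked example with the path-based helper above (field name illustrative):

import pyarrow as pa

schema = pa.schema([("tokens", pa.list_(pa.list_(pa.int64())))])
depth, value_type = _GetNestDepthAndValueType(schema, ["tokens"])
assert depth == 2                # two nested list levels
assert value_type == pa.int64()  # innermost value type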
Example #20
  def __init__(self, arrow_schema: pa.Schema,
               tensor_representation: schema_pb2.TensorRepresentation):
    super().__init__(arrow_schema, tensor_representation)
    ragged_representation = tensor_representation.ragged_tensor

    self._value_path = path.ColumnPath.from_proto(
        ragged_representation.feature_path)
    self._column_index = arrow_schema.get_field_index(
        ragged_representation.feature_path.step[0])
    self._outer_ragged_rank, value_type = _GetNestDepthAndValueType(
        arrow_schema, self._value_path)

    # Split partitions to the ones defining Ragged dimensions and the ones
    # defining the outer dimensions shape (through uniform row length
    # partitions).
    fixed_dimension = True
    ragged_partitions = []
    fixed_dimension_partitions = []
    # Reverse through the partitions (from outer partition to inner), in order
    # to extract the inner fixed shape of the resulting RaggedTensor.
    for partition in reversed(ragged_representation.partition):
      if partition.HasField("uniform_row_length") and fixed_dimension:
        fixed_dimension_partitions.append(partition)
      else:
        fixed_dimension = False
        ragged_partitions.append(partition)
    self._ragged_partitions = ragged_partitions[::-1]
    self._fixed_dimension_partitions = fixed_dimension_partitions[::-1]

    inner_fixed_shape = []
    inferred_dimensions_elements = 1
    for partition in self._fixed_dimension_partitions:
      inner_fixed_shape.append(partition.uniform_row_length)
      inferred_dimensions_elements *= partition.uniform_row_length
    self._inner_fixed_shape = inner_fixed_shape
    self._values_fixed_shape = [-1] + inner_fixed_shape
    self._inferred_dimensions_elements = inferred_dimensions_elements

    self._dtype = _ArrowTypeToTfDtype(value_type)
    self._row_partition_dtype = ragged_representation.row_partition_dtype
    self._convert_to_binary_fn = _GetConvertToBinaryFn(value_type)
Example #21
    def conform_to_schema(
            cls, table: pa.Table, schema: pa.Schema,
            pandas_types=None, warn_extra_columns=True) \
            -> pa.Table:
        """
        Align an Arrow table to an Arrow schema.

        Columns will be matched using case-insensitive matching and columns not in the schema will be dropped.
        The resulting table will have the field order and case defined in the schema.

        Where column types do not match exactly, type coercion will be applied if possible.
        In some cases type coercion may result in overflows,
        for example casting int64 -> int32 will fail if any values are greater than the maximum int32 value.

        If the incoming data has been converted from Pandas, there are some conversions that can be applied
        if the original Pandas dtype is known. These dtypes can be supplied via the pandas_types parameter
        and should line up with the data in the table (i.e. dtypes are for the source data, not the target schema).

        The method will return a dataset whose schema exactly matches the requested schema.
        If it is not possible to make the data conform to the schema for any reason, EDataConformance will be raised.

        :param table: The data to be conformed
        :param schema: The schema to conform to
        :param pandas_types: Pandas dtypes for the table, if the table has been converted from Pandas
        :param warn_extra_columns: Whether to log warnings if the table contains columns not in the schema
        :return: The conformed data, whose schema will exactly match the supplied schema parameter
        :raises: _ex.EDataConformance if conformance is not possible for any reason
        """

        # If Pandas types are supplied they must match the table, i.e. table has been converted from Pandas
        if pandas_types is not None and len(pandas_types) != len(
                table.schema.types):
            raise _ex.EUnexpected()

        cls._check_duplicate_fields(schema, True)
        cls._check_duplicate_fields(table.schema, False)

        table_indices = {
            f.lower(): i
            for (i, f) in enumerate(table.schema.names)
        }
        conformed_data = []
        conformance_errors = []

        # Coerce types to match expected schema where possible
        for schema_index in range(len(schema.names)):

            try:
                schema_field = schema.field(schema_index)
                table_index = table_indices.get(schema_field.name.lower())

                if table_index is None:
                    message = cls.__E_FIELD_MISSING.format(
                        field_name=schema_field.name)
                    cls.__log.error(message)
                    raise _ex.EDataConformance(message)

                table_column: pa.Array = table.column(table_index)

                pandas_type = pandas_types[table_index] \
                    if pandas_types is not None \
                    else None

                if table_column.type == schema_field.type:
                    conformed_column = table_column
                else:
                    conformed_column = cls._coerce_vector(
                        table_column, schema_field, pandas_type)

                if not schema_field.nullable and table_column.null_count > 0:
                    message = f"Null values present in non-null field [{schema_field.name}]"
                    cls.__log.error(message)
                    raise _ex.EDataConformance(message)

                conformed_data.append(conformed_column)

            except _ex.EDataConformance as e:
                conformance_errors.append(e)

        # Columns not defined in the schema will not be included in the conformed output
        if warn_extra_columns and table.num_columns > len(schema.types):

            schema_columns = set(map(str.lower, schema.names))
            extra_columns = [
                f"[{col}]" for col in table.schema.names
                if col.lower() not in schema_columns
            ]

            message = f"Columns not defined in the schema will be dropped: {', '.join(extra_columns)}"
            cls.__log.warning(message)

        if any(conformance_errors):
            if len(conformance_errors) == 1:
                raise conformance_errors[0]
            else:
                cls.__log.error("There were multiple data conformance errors")
                raise _ex.EDataConformance(
                    "There were multiple data conformance errors",
                    conformance_errors)

        return pa.Table.from_arrays(conformed_data, schema=schema)  # noqa
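The coercion step presumably relies on Arrow's checked ("safe") casts; a standalone illustration of why an int64 -> int32 cast can fail on overflow (not the class's actual _coerce_vector implementation):

import pyarrow as pa

ok = pa.array([1, 2, 3], type=pa.int64())
ok.cast(pa.int32())          # succeeds: every value fits in int32

big = pa.array([2**40], type=pa.int64())
try:
    big.cast(pa.int32())     # safe casting (the default) raises on overflow
except pa.ArrowInvalid as err:
    print(err)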
Example #22
def arrow_schema_to_render_columns(schema: pa.Schema) -> Dict[str, RenderColumn]:
    return {
        name: _arrow_field_to_render_column(schema.field(i))
        for i, name in enumerate(schema.names)
    }
Example #23
  def CanHandle(arrow_schema: pa.Schema,
                tensor_representation: schema_pb2.TensorRepresentation) -> bool:
    """Returns whether `tensor_representation` can be handled.

    The case where the tensor_representation cannot be handled is when:
    1. Wrong column name / field name requested.
    2. Non-leaf field is requested (for StructTypes).
    3. There does not exist a ListType along the path.
    4. Requested partition paths are not integer-valued or do not exist.

    Args:
      arrow_schema: The pyarrow schema.
      tensor_representation: The TensorRepresentation proto.
    """
    ragged_tensor = tensor_representation.ragged_tensor
    if len(ragged_tensor.feature_path.step) < 1:
      return False

    value_path = path.ColumnPath.from_proto(ragged_tensor.feature_path)

    # Checking the outer dimensions represented by the value feature path.
    contains_list = False
    try:
      arrow_type = None
      for arrow_type in _EnumerateTypesAlongPath(arrow_schema, value_path):
        if _IsListLike(arrow_type):
          contains_list = True
      if pa.types.is_struct(arrow_type):
        # The path is depleted, but the last arrow_type is a struct. This means
        # the path is a Non-leaf field.
        return False
    except ValueError:
      # ValueError signifies wrong column name / field name requested.
      return False
    if not contains_list:
      return False

    # Check the auxiliary features that need to be accessed to form the inner
    # dimension partitions.
    parent_path = value_path.parent()

    # Check the columns exist and have the correct depth and type.
    for partition in ragged_tensor.partition:
      if partition.HasField("row_length"):
        try:
          field_path = parent_path.child(partition.row_length)
          # Pre-assign so the loop variable is defined (avoids a lint error).
          partition_type = arrow_schema.field(field_path.initial_step()).type
          for partition_type in _EnumerateTypesAlongPath(
              arrow_schema, field_path, stop_at_path_end=True):
            # Iterate through them all. Only interested on the last type.
            pass
          if not _IsListLike(partition_type) or not pa.types.is_integer(
              partition_type.value_type):
            return False
        except ValueError:
          # ValueError signifies wrong column name / field name requested.
          return False

      elif partition.HasField("uniform_row_length"):
        if partition.uniform_row_length <= 0:
          return False
      else:
        return False

    # All checks passed successfully.
    return True