def test_element_lengths_list_array(self, list_type_factory):
    """GetElementLengths yields int64 lengths for list-like arrays.

    Args:
        list_type_factory: Callable that maps a value type to the list type
            under test; supplied by the test parameterization.
    """
    # (input values, value type, expected per-element lengths).
    # A null list entry is expected to be reported with length 0.
    cases = (
        ([], pa.int64(), []),
        ([[1., 2.], [], [3.]], pa.float32(), [2, 0, 1]),
        ([[1., 2.], None, [3.]], pa.float64(), [2, 0, 1]),
    )
    for values, value_type, expected_lengths in cases:
        list_array = pa.array(values, type=list_type_factory(value_type))
        actual = array_util.GetElementLengths(list_array)
        expected = pa.array(expected_lengths, type=pa.int64())
        self.assertTrue(actual.equals(expected))
def test_element_lengths(self):
    """GetElementLengths on list, binary and string arrays, plus a failure.

    Expected lengths are int32; null entries are expected to count as 0.
    """
    # NOTE(review): other tests of GetElementLengths visible alongside this
    # one expect int64 lengths and an "Unimplemented" error message --
    # confirm which library version this test is meant to target.
    binary_values = [b"a", b"bb", None, b"", b"ccc"]
    string_values = [u"a", u"bb", None, u"", u"ccc"]
    # (input values, explicit arrow type or None to infer, expected lengths).
    cases = (
        ([], pa.list_(pa.int64()), []),
        ([[1., 2.], [], [3.]], None, [2, 0, 1]),
        ([[1., 2.], None, [3.]], None, [2, 0, 1]),
        (binary_values, pa.binary(), [1, 2, 0, 0, 3]),
        (string_values, pa.string(), [1, 2, 0, 0, 3]),
    )
    for values, arrow_type, expected_lengths in cases:
        actual = array_util.GetElementLengths(
            pa.array(values, type=arrow_type))
        expected = pa.array(expected_lengths, type=pa.int32())
        self.assertTrue(actual.equals(expected))

    # Arrays whose elements have no length (here: int32) must be rejected.
    with self.assertRaisesRegex(RuntimeError, "NotImplemented"):
        array_util.GetElementLengths(pa.array([1, 2, 3], type=pa.int32()))
def update(self, feature_array: pa.Array) -> None:
    """Fold the element lengths of `feature_array` into the partial stats.

    The array is flattened first; the length of every flattened value is
    then merged into `min_num_bytes`, `max_num_bytes` and `total_num_bytes`.

    Args:
        feature_array: Arrow array holding the feature values. Arrays of
            null type are ignored.

    Raises:
        ValueError: If the flattened values are of integer or floating type.
    """
    if pa.types.is_null(feature_array.type):
        return

    flat_values, _ = arrow_util.flatten_nested(feature_array)
    value_type = flat_values.type
    if pa.types.is_floating(value_type) or pa.types.is_integer(value_type):
        raise ValueError('Bytes stats cannot be computed on INT/FLOAT features.')

    # Nothing to aggregate; also keeps np.min/np.max off an empty array.
    if not flat_values:
        return

    lengths = array_util.GetElementLengths(flat_values).to_numpy()
    self.min_num_bytes = min(self.min_num_bytes, np.min(lengths))
    self.max_num_bytes = max(self.max_num_bytes, np.max(lengths))
    self.total_num_bytes += np.sum(lengths)
def test_element_lengths_unsupported_type(self):
    """GetElementLengths raises RuntimeError for an int32 array."""
    int_array = pa.array([1, 2, 3], type=pa.int32())
    with self.assertRaisesRegex(RuntimeError, "Unimplemented"):
        array_util.GetElementLengths(int_array)
def test_element_lengths_binary_like(self, binary_like_type):
    """GetElementLengths yields int64 lengths for binary-like arrays.

    Args:
        binary_like_type: Arrow type used to build the input array; supplied
            by the test parameterization.
    """
    # Null and empty entries are both expected to have length 0.
    values = pa.array([b"a", b"bb", None, b"", b"ccc"], type=binary_like_type)
    actual = array_util.GetElementLengths(values)
    expected = pa.array([1, 2, 0, 0, 3], type=pa.int64())
    self.assertTrue(actual.equals(expected))