def testMultipleColumnsTwoRowGroupsAndEqualBatchSize_OutputsPrensor(self):
  """Verifies the first batch read for three value paths.

  A ParquetDataset is built over `self._rowgroup_test_filenames` with a batch
  size of 2. Every batch is iterated, but only the batch at index 0 is
  compared against the expected prensor.
  """

  def _int64(values):
    # All index tensors (and the DocId values) in this test are int64.
    return tf.constant(values, dtype=tf.int64)

  value_paths = ["DocId", "Name.Language.Code", "Name.Language.Country"]
  dataset = parquet.ParquetDataset(
      filenames=self._rowgroup_test_filenames,
      value_paths=value_paths,
      batch_size=2)

  expected_nodes = {
      path.Path([]):
          prensor.RootNodeTensor(_int64(2)),
      path.Path(["DocId"]):
          prensor.LeafNodeTensor(_int64([0, 1]), _int64([10, 20]), True),
      path.Path(["Name"]):
          prensor.ChildNodeTensor(_int64([0, 0, 0, 1]), True),
      path.Path(["Name", "Language"]):
          prensor.ChildNodeTensor(_int64([0, 0, 2]), True),
      path.Path(["Name", "Language", "Code"]):
          prensor.LeafNodeTensor(
              _int64([0, 1, 2]),
              tf.constant([b"en-us", b"en", b"en-gb"]),
              True),
      path.Path(["Name", "Language", "Country"]):
          prensor.LeafNodeTensor(
              _int64([0, 2]),
              tf.constant([b"us", b"gb"]),
              True),
  }
  expected = prensor.create_prensor_from_descendant_nodes(expected_nodes)

  # Walk the whole dataset so later batches are still read; only the first
  # batch is checked against the expectation.
  for batch_index, actual in enumerate(dataset):
    if batch_index != 0:
      continue
    self._assertPrensorEqual(actual, expected)
def calculate(
    self,
    sources: Sequence[prensor.NodeTensor],
    destinations: Sequence[expression.Expression],
    options: calculate_options.Options,
    side_info: Optional[prensor.Prensor] = None) -> prensor.NodeTensor:
  """Computes the broadcasted node from an (origin, parent) pair of sources.

  Args:
    sources: exactly two node tensors, unpacked as (origin, parent). The
      origin must not be a RootNodeTensor and the parent must be a
      ChildNodeTensor; both are enforced with assertions.
    destinations: not used by this method.
    options: not used by this method.
    side_info: not used by this method.

  Returns:
    A LeafNodeTensor holding the gathered origin values when the origin is a
    leaf; otherwise a ChildNodeTensor. Either result uses the first output of
    the index join as its parent index and `self.is_repeated` as its
    repeated flag.
  """
  origin_node, parent_node = sources

  # A RootNodeTensor should never be recalculated.
  assert not isinstance(origin_node, prensor.RootNodeTensor), origin_node
  # The parent can be neither a leaf (a leaf cannot have a submessage) nor a
  # root (you cannot broadcast into a root), so it has to be a child node.
  assert isinstance(parent_node, prensor.ChildNodeTensor), parent_node

  # The parent's `index_to_value` records which child nodes were duplicated,
  # so joining it against the origin's parent index tells us which origin
  # entries have to be duplicated as well.
  join_result = struct2tensor_ops.equi_join_any_indices(
      parent_node.index_to_value, origin_node.parent_index)
  [new_parent_index, source_positions] = join_result

  if not isinstance(origin_node, prensor.LeafNodeTensor):
    return prensor.ChildNodeTensor(new_parent_index, self.is_repeated,
                                   source_positions)

  gathered_values = tf.gather(origin_node.values, source_positions)
  return prensor.LeafNodeTensor(new_parent_index, gathered_values,
                                self.is_repeated)
def calculate(
    self,
    sources: Sequence[prensor.NodeTensor],
    destinations: Sequence[expression.Expression],
    options: calculate_options.Options,
    side_info: Optional[prensor.Prensor] = None) -> prensor.NodeTensor:
  """Builds a child node whose parent index skips one level upward.

  Args:
    sources: exactly two node tensors, unpacked as (origin, origin parent).
      Both must be ChildNodeTensors.
    destinations: not used by this method.
    options: not used by this method.
    side_info: not used by this method.

  Returns:
    A ChildNodeTensor whose parent index is the origin parent's
    `parent_index` gathered at the origin's `parent_index`, with
    `self.is_repeated` as its repeated flag.

  Raises:
    ValueError: if the origin, or else the origin parent, is not a
      ChildNodeTensor. The origin is checked first.
  """
  child_node, child_parent_node = sources

  # Validate in the same order as the unpacking so the origin is reported
  # first when both inputs are wrong.
  checks = (
      (child_node, "origin_value must be a child"),
      (child_parent_node, "origin_parent_value must be a child node"),
  )
  for node, message in checks:
    if not isinstance(node, prensor.ChildNodeTensor):
      raise ValueError(message)

  grandparent_index = tf.gather(child_parent_node.parent_index,
                                child_node.parent_index)
  return prensor.ChildNodeTensor(grandparent_index, self.is_repeated)
def calculate(
    self,
    sources: Sequence[prensor.NodeTensor],
    destinations: Sequence[expression.Expression],
    options: calculate_options.Options,
    side_info: Optional[prensor.Prensor] = None) -> prensor.NodeTensor:
  """Computes a child node by joining the origin against a sibling.

  Args:
    sources: exactly two node tensors, unpacked as (origin, sibling). Both
      must be ChildNodeTensors.
    destinations: not used by this method.
    options: not used by this method.
    side_info: not used by this method.

  Returns:
    A ChildNodeTensor built from the two outputs of
    `equi_join_any_indices(sibling.parent_index, origin.parent_index)`: the
    first becomes the parent index and the second is passed as
    `index_to_value`. `self.is_repeated` is the repeated flag.

  Raises:
    ValueError: if the origin, or else the sibling, is not a
      ChildNodeTensor. The origin is checked first.
  """
  origin_node, sibling_node = sources

  checks = (
      (origin_node, "origin not a ChildNodeTensor"),
      (sibling_node, "sibling value is not a ChildNodeTensor"),
  )
  for node, message in checks:
    if not isinstance(node, prensor.ChildNodeTensor):
      raise ValueError(message)

  join_result = struct2tensor_ops.equi_join_any_indices(
      sibling_node.parent_index, origin_node.parent_index)
  [new_parent_index, source_positions] = join_result
  return prensor.ChildNodeTensor(
      new_parent_index, self.is_repeated, index_to_value=source_positions)
def testPromoteAndProjectExpression(self):
  """Checks promote + project results computed from a parquet file.

  `Name.Language.Code` is promoted to `Name.new_code` and projected; `DocId`
  is projected separately from the unmodified expression. Both projections
  are evaluated together with a batch size of 2, and every element yielded
  is compared against the two expected prensors.
  """

  def _int64(values):
    # Index tensors and DocId values in this test are all int64.
    return tf.constant(values, dtype=tf.int64)

  parquet_files = [
      "struct2tensor/testdata/parquet_testdata/dremel_example.parquet"
  ]
  rows_per_batch = 2

  root_exp = parquet.create_expression_from_parquet_file(parquet_files)
  promoted_exp = promote.promote(
      root_exp, path.Path(["Name", "Language", "Code"]), "new_code")
  new_code_projection = project.project(
      promoted_exp, [path.Path(["Name", "new_code"])])
  docid_projection = project.project(root_exp, [path.Path(["DocId"])])

  results = parquet.calculate_parquet_values(
      [new_code_projection, docid_projection], root_exp, parquet_files,
      rows_per_batch)

  expected_new_code = prensor.create_prensor_from_descendant_nodes({
      path.Path([]):
          prensor.RootNodeTensor(_int64(2)),
      path.Path(["Name"]):
          prensor.ChildNodeTensor(_int64([0, 0, 0, 1]), True),
      path.Path(["Name", "new_code"]):
          prensor.LeafNodeTensor(
              _int64([0, 0, 2]),
              tf.constant([b"en-us", b"en", b"en-gb"]),
              True),
  })
  expected_docid = prensor.create_prensor_from_descendant_nodes({
      path.Path([]):
          prensor.RootNodeTensor(_int64(2)),
      path.Path(["DocId"]):
          prensor.LeafNodeTensor(_int64([0, 1]), _int64([10, 20]), False),
  })

  # Each yielded element is indexed in the order the projections were passed
  # in: position 0 is the promoted-code result, position 1 is DocId.
  for element in results:
    actual_new_code = element[0]
    actual_docid = element[1]
    self._assertPrensorEqual(actual_new_code, expected_new_code)
    self._assertPrensorEqual(actual_docid, expected_docid)
def create_child_node(parent_index: Sequence[int],
                      is_repeated: bool) -> prensor.ChildNodeTensor:
  """Wraps a plain sequence of parent indices in a ChildNodeTensor.

  Args:
    parent_index: integers converted to an int64 constant tensor and used as
      the first argument to ChildNodeTensor.
    is_repeated: forwarded unchanged as the second argument to
      ChildNodeTensor.

  Returns:
    The newly constructed ChildNodeTensor.
  """
  parent_index_tensor = tf.constant(parent_index, dtype=tf.int64)
  child_node = prensor.ChildNodeTensor(parent_index_tensor, is_repeated)
  return child_node
def _row_partition_to_child_node_tensor(row_partition: RowPartition):
  """Builds a repeated ChildNodeTensor from a RowPartition's value row ids.

  The partition is first converted with `with_row_splits_dtype(tf.int64)`;
  the `value_rowids()` of that converted partition become the node's first
  argument, and the node is always created with `is_repeated=True`.
  """
  int64_partition = row_partition.with_row_splits_dtype(tf.int64)
  value_row_ids = int64_partition.value_rowids()
  return prensor.ChildNodeTensor(value_row_ids, is_repeated=True)
def create_child_node(parent_index, is_repeated):
  """Wraps `parent_index` in an int64 constant and builds a ChildNodeTensor.

  Args:
    parent_index: value handed to `tf.constant(..., dtype=tf.int64)`; the
      resulting tensor is the first argument to ChildNodeTensor.
    is_repeated: forwarded unchanged as the second argument to
      ChildNodeTensor.

  Returns:
    The newly constructed ChildNodeTensor.
  """
  index_tensor = tf.constant(parent_index, dtype=tf.int64)
  node = prensor.ChildNodeTensor(index_tensor, is_repeated)
  return node
def _row_partition_to_child_node_tensor(row_partition: RowPartition):
  """Builds a repeated ChildNodeTensor from a RowPartition's value row ids.

  The partition's `value_rowids()` are cast to int64 and used as the node's
  first argument; the node is always created with `is_repeated=True`.
  """
  value_row_ids = row_partition.value_rowids()
  int64_row_ids = tf.cast(value_row_ids, tf.int64)
  return prensor.ChildNodeTensor(int64_row_ids, is_repeated=True)