def test_shared_embedding_column_with_non_sequence_categorical(self):
        """Tests that error is raised for non-sequence shared embedding column."""
        vocabulary_size = 3
        sparse_input_a = sparse_tensor.SparseTensorValue(
            # example 0, ids [2]
            # example 1, ids [0, 1]
            indices=((0, 0), (1, 0), (1, 1)),
            values=(2, 0, 1),
            dense_shape=(2, 2))
        sparse_input_b = sparse_tensor.SparseTensorValue(
            # example 0, ids [2]
            # example 1, ids [0, 1]
            indices=((0, 0), (1, 0), (1, 1)),
            values=(2, 0, 1),
            dense_shape=(2, 2))

        categorical_column_a = fc.categorical_column_with_identity(
            key='aaa', num_buckets=vocabulary_size)
        categorical_column_b = fc.categorical_column_with_identity(
            key='bbb', num_buckets=vocabulary_size)
        shared_embedding_columns = fc.shared_embedding_columns_v2(
            [categorical_column_a, categorical_column_b], dimension=2)

        sequence_input_layer = ksfc.SequenceFeatures(shared_embedding_columns)
        with self.assertRaisesRegex(
                ValueError,
                r'In embedding_column: aaa_shared_embedding\. categorical_column must '
                r'be of type SequenceCategoricalColumn to use SequenceFeatures\.'
        ):
            _, _ = sequence_input_layer({
                'aaa': sparse_input_a,
                'bbb': sparse_input_b
            })
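
        # The error above is expected: plain (non-sequence) shared embedding
        # columns are meant for a DenseFeatures layer, not SequenceFeatures.
        # A minimal sketch of the working counterpart, reusing the locals
        # defined above and assuming the `df` DenseFeatures alias used in the
        # later examples (illustration only, not part of the original test):
        #
        #     dense_layer = df.DenseFeatures(shared_embedding_columns)
        #     dense_output = dense_layer({'aaa': sparse_input_a,
        #                                 'bbb': sparse_input_b})
        #     # dense_output has shape (2, 4): two examples, two shared
        #     # embedding columns of dimension 2 concatenated.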
Example #2
    def test_multiple_layers_with_same_shared_embedding_column(self):
        categorical_column_a = fc.categorical_column_with_identity(
            key='aaa', num_buckets=3)
        categorical_column_b = fc.categorical_column_with_identity(
            key='bbb', num_buckets=3)
        embedding_dimension = 2
        embedding_column_b, embedding_column_a = fc.shared_embedding_columns_v2(
            [categorical_column_b, categorical_column_a],
            dimension=embedding_dimension)

        with ops.Graph().as_default():
            features = {
                'aaa': sparse_tensor.SparseTensor(
                    indices=((0, 0), (1, 0), (1, 1)),
                    values=(0, 1, 0),
                    dense_shape=(2, 2)),
                'bbb': sparse_tensor.SparseTensor(
                    indices=((0, 0), (1, 0), (1, 1)),
                    values=(1, 2, 1),
                    dense_shape=(2, 2)),
            }
            all_cols = [embedding_column_a, embedding_column_b]
            df.DenseFeatures(all_cols)(features)
            df.DenseFeatures(all_cols)(features)
            # Make sure that only 1 variable gets created in this case.
            self.assertEqual(
                1, len(ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)))
            self.assertCountEqual(['aaa_bbb_shared_embedding:0'], [
                v.name
                for v in ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
            ])

    def test_shared_sequence_non_sequence_into_input_layer(self):
        non_seq = fc.categorical_column_with_identity('non_seq',
                                                      num_buckets=10)
        seq = sfc.sequence_categorical_column_with_identity('seq',
                                                            num_buckets=10)
        shared_non_seq, shared_seq = fc.shared_embedding_columns_v2(
            [non_seq, seq],
            dimension=4,
            combiner='sum',
            initializer=init_ops_v2.Ones(),
            shared_embedding_collection_name='shared')

        seq = sparse_tensor.SparseTensor(indices=[[0, 0], [0, 1], [1, 0]],
                                         values=[0, 1, 2],
                                         dense_shape=[2, 2])
        non_seq = sparse_tensor.SparseTensor(indices=[[0, 0], [0, 1], [1, 0]],
                                             values=[0, 1, 2],
                                             dense_shape=[2, 2])
        features = {'seq': seq, 'non_seq': non_seq}

        # Tile the context features across the sequence features
        seq_input, seq_length = ksfc.SequenceFeatures([shared_seq])(features)
        non_seq_input = dense_features.DenseFeatures(
            [shared_non_seq])(features)

        with self.cached_session() as sess:
            sess.run(variables.global_variables_initializer())
            output_seq, output_seq_length, output_non_seq = sess.run(
                [seq_input, seq_length, non_seq_input])
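
            # Why the expected values below hold: the embedding table is
            # initialized to all ones (init_ops_v2.Ones()), so every looked-up
            # embedding is [1, 1, 1, 1].
            #   - seq_input has one embedding per timestep; example 1 has only
            #     one id, so its second step is zero-padded.
            #   - non_seq_input uses combiner='sum': example 0 has ids [0, 1],
            #     so two all-ones embeddings sum to [2, 2, 2, 2], while
            #     example 1 has a single id and stays [1, 1, 1, 1].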
            self.assertAllEqual(
                output_seq,
                [[[1, 1, 1, 1], [1, 1, 1, 1]], [[1, 1, 1, 1], [0, 0, 0, 0]]])
            self.assertAllEqual(output_seq_length, [2, 1])
            self.assertAllEqual(output_non_seq, [[2, 2, 2, 2], [1, 1, 1, 1]])
Example #4
    def test_sequence_length_with_empty_rows(self):
        """Tests _sequence_length when some examples do not have ids."""
        with ops.Graph().as_default():
            vocabulary_size = 3
            sparse_input_a = sparse_tensor.SparseTensorValue(
                # example 0, ids []
                # example 1, ids [2]
                # example 2, ids [0, 1]
                # example 3, ids []
                # example 4, ids [1]
                # example 5, ids []
                indices=((1, 0), (2, 0), (2, 1), (4, 0)),
                values=(2, 0, 1, 1),
                dense_shape=(6, 2))
            expected_sequence_length_a = [0, 1, 2, 0, 1, 0]
            categorical_column_a = sfc.sequence_categorical_column_with_identity(
                key='aaa', num_buckets=vocabulary_size)

            sparse_input_b = sparse_tensor.SparseTensorValue(
                # example 0, ids [2]
                # example 1, ids []
                # example 2, ids []
                # example 3, ids []
                # example 4, ids [1]
                # example 5, ids [0, 1]
                indices=((0, 0), (4, 0), (5, 0), (5, 1)),
                values=(2, 1, 0, 1),
                dense_shape=(6, 2))
            expected_sequence_length_b = [1, 0, 0, 0, 1, 2]
            categorical_column_b = sfc.sequence_categorical_column_with_identity(
                key='bbb', num_buckets=vocabulary_size)

            shared_embedding_columns = fc.shared_embedding_columns_v2(
                [categorical_column_a, categorical_column_b], dimension=2)

            sequence_length_a = _get_sequence_dense_tensor(
                shared_embedding_columns[0], {'aaa': sparse_input_a})[1]
            sequence_length_b = _get_sequence_dense_tensor(
                shared_embedding_columns[1], {'bbb': sparse_input_b})[1]

            with _initialized_session() as sess:
                self.assertAllEqual(expected_sequence_length_a,
                                    sequence_length_a.eval(session=sess))
                self.assertAllEqual(expected_sequence_length_b,
                                    sequence_length_b.eval(session=sess))
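
# The helpers `_get_sequence_dense_tensor` and `_initialized_session` used in
# this and the following examples are not part of this listing. A minimal
# sketch of plausible equivalents is given below, assuming the v2 FeatureColumn
# API, the module aliases already used in these examples, and a `session` alias
# for creating TF1-style sessions; these are assumptions, not the original
# definitions.


def _get_sequence_dense_tensor(column, features):
    # Resolve the sequence column against the raw features; returns a
    # (dense_tensor, sequence_length) pair, which the tests index with [0]/[1].
    return column.get_sequence_dense_tensor(
        fc.FeatureTransformationCache(features), None)


def _initialized_session(config=None):
    # Open a session and run the variable/table initializers so the embedding
    # variables created by the columns can be evaluated.
    sess = session.Session(config=config)
    sess.run(variables.global_variables_initializer())
    sess.run(lookup_ops.tables_initializer())
    return sess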
Example #5
    def test_sequence_length(self):
        with ops.Graph().as_default():
            vocabulary_size = 3

            sparse_input_a = sparse_tensor.SparseTensorValue(
                # example 0, ids [2]
                # example 1, ids [0, 1]
                indices=((0, 0), (1, 0), (1, 1)),
                values=(2, 0, 1),
                dense_shape=(2, 2))
            expected_sequence_length_a = [1, 2]
            categorical_column_a = sfc.sequence_categorical_column_with_identity(
                key='aaa', num_buckets=vocabulary_size)

            sparse_input_b = sparse_tensor.SparseTensorValue(
                # example 0, ids [0, 2]
                # example 1, ids [1]
                indices=((0, 0), (0, 1), (1, 0)),
                values=(0, 2, 1),
                dense_shape=(2, 2))
            expected_sequence_length_b = [2, 1]
            categorical_column_b = sfc.sequence_categorical_column_with_identity(
                key='bbb', num_buckets=vocabulary_size)
            shared_embedding_columns = fc.shared_embedding_columns_v2(
                [categorical_column_a, categorical_column_b], dimension=2)

            sequence_length_a = _get_sequence_dense_tensor(
                shared_embedding_columns[0], {'aaa': sparse_input_a})[1]
            sequence_length_b = _get_sequence_dense_tensor(
                shared_embedding_columns[1], {'bbb': sparse_input_b})[1]

            with _initialized_session() as sess:
                sequence_length_a = sess.run(sequence_length_a)
                self.assertAllEqual(expected_sequence_length_a,
                                    sequence_length_a)
                self.assertEqual(np.int64, sequence_length_a.dtype)
                sequence_length_b = sess.run(sequence_length_b)
                self.assertAllEqual(expected_sequence_length_b,
                                    sequence_length_b)
                self.assertEqual(np.int64, sequence_length_b.dtype)

    def test_shared_embedding_column(self):
        with ops.Graph().as_default():
            vocabulary_size = 3
            sparse_input_a = sparse_tensor.SparseTensorValue(
                # example 0, ids [2]
                # example 1, ids [0, 1]
                indices=((0, 0), (1, 0), (1, 1)),
                values=(2, 0, 1),
                dense_shape=(2, 2))
            sparse_input_b = sparse_tensor.SparseTensorValue(
                # example 0, ids [1]
                # example 1, ids [2, 0]
                indices=((0, 0), (1, 0), (1, 1)),
                values=(1, 2, 0),
                dense_shape=(2, 2))

            embedding_dimension = 2
            embedding_values = (
                (1., 2.),  # id 0
                (3., 4.),  # id 1
                (5., 6.)  # id 2
            )

            def _get_initializer(embedding_dimension, embedding_values):
                def _initializer(shape, dtype, partition_info=None):
                    self.assertAllEqual((vocabulary_size, embedding_dimension),
                                        shape)
                    self.assertEqual(dtypes.float32, dtype)
                    self.assertIsNone(partition_info)
                    return embedding_values

                return _initializer

            expected_input_layer = [
                # example 0, ids_a [2], ids_b [1]
                [[5., 6., 3., 4.], [0., 0., 0., 0.]],
                # example 1, ids_a [0, 1], ids_b [2, 0]
                [[1., 2., 5., 6.], [3., 4., 1., 2.]],
            ]
            expected_sequence_length = [1, 2]
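
            # Worked check, example 0: 'aaa' id 2 maps to (5., 6.) and 'bbb'
            # id 1 maps to (3., 4.); SequenceFeatures orders columns by key,
            # so step 0 is the concatenation [5., 6., 3., 4.] and the unused
            # second step is zero-padded.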

            categorical_column_a = sfc.sequence_categorical_column_with_identity(
                key='aaa', num_buckets=vocabulary_size)
            categorical_column_b = sfc.sequence_categorical_column_with_identity(
                key='bbb', num_buckets=vocabulary_size)
            # Test that columns are reordered alphabetically.
            shared_embedding_columns = fc.shared_embedding_columns_v2(
                [categorical_column_b, categorical_column_a],
                dimension=embedding_dimension,
                initializer=_get_initializer(embedding_dimension,
                                             embedding_values))

            sequence_input_layer = ksfc.SequenceFeatures(
                shared_embedding_columns)
            input_layer, sequence_length = sequence_input_layer({
                'aaa': sparse_input_a,
                'bbb': sparse_input_b
            })

            global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
            self.assertCountEqual(('aaa_bbb_shared_embedding:0', ),
                                  tuple([v.name for v in global_vars]))
            with _initialized_session() as sess:
                self.assertAllEqual(embedding_values,
                                    global_vars[0].eval(session=sess))
                self.assertAllEqual(expected_input_layer,
                                    input_layer.eval(session=sess))
                self.assertAllEqual(expected_sequence_length,
                                    sequence_length.eval(session=sess))
Example #7
    def test_get_sequence_dense_tensor(self):
        vocabulary_size = 3
        embedding_dimension = 2
        embedding_values = (
            (1., 2.),  # id 0
            (3., 5.),  # id 1
            (7., 11.)  # id 2
        )

        def _initializer(shape, dtype, partition_info=None):
            self.assertAllEqual((vocabulary_size, embedding_dimension), shape)
            self.assertEqual(dtypes.float32, dtype)
            self.assertIsNone(partition_info)
            return embedding_values

        sparse_input_a = sparse_tensor.SparseTensorValue(
            # example 0, ids [2]
            # example 1, ids [0, 1]
            # example 2, ids []
            # example 3, ids [1]
            indices=((0, 0), (1, 0), (1, 1), (3, 0)),
            values=(2, 0, 1, 1),
            dense_shape=(4, 2))
        sparse_input_b = sparse_tensor.SparseTensorValue(
            # example 0, ids [1]
            # example 1, ids [0, 2]
            # example 2, ids [0]
            # example 3, ids []
            indices=((0, 0), (1, 0), (1, 1), (2, 0)),
            values=(1, 0, 2, 0),
            dense_shape=(4, 2))

        expected_lookups_a = [
            # example 0, ids [2]
            [[7., 11.], [0., 0.]],
            # example 1, ids [0, 1]
            [[1., 2.], [3., 5.]],
            # example 2, ids []
            [[0., 0.], [0., 0.]],
            # example 3, ids [1]
            [[3., 5.], [0., 0.]],
        ]

        expected_lookups_b = [
            # example 0, ids [1]
            [[3., 5.], [0., 0.]],
            # example 1, ids [0, 2]
            [[1., 2.], [7., 11.]],
            # example 2, ids [0]
            [[1., 2.], [0., 0.]],
            # example 3, ids []
            [[0., 0.], [0., 0.]],
        ]

        categorical_column_a = sfc.sequence_categorical_column_with_identity(
            key='aaa', num_buckets=vocabulary_size)
        categorical_column_b = sfc.sequence_categorical_column_with_identity(
            key='bbb', num_buckets=vocabulary_size)
        shared_embedding_columns = fc.shared_embedding_columns_v2(
            [categorical_column_a, categorical_column_b],
            dimension=embedding_dimension,
            initializer=_initializer)

        embedding_lookup_a = _get_sequence_dense_tensor(
            shared_embedding_columns[0], {'aaa': sparse_input_a})[0]
        embedding_lookup_b = _get_sequence_dense_tensor(
            shared_embedding_columns[1], {'bbb': sparse_input_b})[0]

        self.evaluate(variables_lib.global_variables_initializer())
        global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
        self.assertCountEqual(('aaa_bbb_shared_embedding:0', ),
                              tuple([v.name for v in global_vars]))
        self.assertAllEqual(embedding_values, self.evaluate(global_vars[0]))
        self.assertAllEqual(expected_lookups_a,
                            self.evaluate(embedding_lookup_a))
        self.assertAllEqual(expected_lookups_b,
                            self.evaluate(embedding_lookup_b))
Example #8
    def _test_dense_features(self, trainable=True):
        # Inputs.
        vocabulary_size = 3
        sparse_input_a = sparse_tensor.SparseTensorValue(
            # example 0, ids [2]
            # example 1, ids [0, 1]
            indices=((0, 0), (1, 0), (1, 4)),
            values=(2, 0, 1),
            dense_shape=(2, 5))
        sparse_input_b = sparse_tensor.SparseTensorValue(
            # example 0, ids [0]
            # example 1, ids []
            indices=((0, 0), ),
            values=(0, ),
            dense_shape=(2, 5))
        sparse_input_c = sparse_tensor.SparseTensorValue(
            # example 0, ids [2]
            # example 1, ids [0, 1]
            indices=((0, 1), (1, 1), (1, 3)),
            values=(2, 0, 1),
            dense_shape=(2, 5))
        sparse_input_d = sparse_tensor.SparseTensorValue(
            # example 0, ids [2]
            # example 1, ids []
            indices=((0, 1), ),
            values=(2, ),
            dense_shape=(2, 5))

        # Embedding variable.
        embedding_dimension = 2
        embedding_values = (
            (1., 2.),  # id 0
            (3., 5.),  # id 1
            (7., 11.)  # id 2
        )

        def _initializer(shape, dtype, partition_info=None):
            self.assertAllEqual((vocabulary_size, embedding_dimension), shape)
            self.assertEqual(dtypes.float32, dtype)
            self.assertIsNone(partition_info)
            return embedding_values

        # Expected lookup result, using combiner='mean'.
        expected_lookups = (
            # example 0:
            # A ids [2], embedding = [7, 11]
            # B ids [0], embedding = [1, 2]
            # C ids [2], embedding = [7, 11]
            # D ids [2], embedding = [7, 11]
            (7., 11., 1., 2., 7., 11., 7., 11.),
            # example 1:
            # A ids [0, 1], embedding = mean([1, 2] + [3, 5]) = [2, 3.5]
            # B ids [], embedding = [0, 0]
            # C ids [0, 1], embedding = mean([1, 2] + [3, 5]) = [2, 3.5]
            # D ids [], embedding = [0, 0]
            (2., 3.5, 0., 0., 2., 3.5, 0., 0.),
        )

        # Build columns.
        categorical_column_a = fc.categorical_column_with_identity(
            key='aaa', num_buckets=vocabulary_size)
        categorical_column_b = fc.categorical_column_with_identity(
            key='bbb', num_buckets=vocabulary_size)
        categorical_column_c = fc.categorical_column_with_identity(
            key='ccc', num_buckets=vocabulary_size)
        categorical_column_d = fc.categorical_column_with_identity(
            key='ddd', num_buckets=vocabulary_size)

        embedding_column_a, embedding_column_b = fc.shared_embedding_columns_v2(
            [categorical_column_a, categorical_column_b],
            dimension=embedding_dimension,
            initializer=_initializer,
            trainable=trainable)
        embedding_column_c, embedding_column_d = fc.shared_embedding_columns_v2(
            [categorical_column_c, categorical_column_d],
            dimension=embedding_dimension,
            initializer=_initializer,
            trainable=trainable)

        features = {
            'aaa': sparse_input_a,
            'bbb': sparse_input_b,
            'ccc': sparse_input_c,
            'ddd': sparse_input_d
        }

        # Provide sparse input and get dense result.
        dense_features = df.DenseFeatures(
            feature_columns=(embedding_column_b, embedding_column_a,
                             embedding_column_c, embedding_column_d))(features)

        # Assert expected embedding variable and lookups.
        global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
        self.assertCountEqual(
            ['aaa_bbb_shared_embedding:0', 'ccc_ddd_shared_embedding:0'],
            tuple([v.name for v in global_vars]))
        for v in global_vars:
            self.assertIsInstance(v, variables_lib.Variable)
        trainable_vars = ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
        if trainable:
            self.assertCountEqual(
                ['aaa_bbb_shared_embedding:0', 'ccc_ddd_shared_embedding:0'],
                tuple([v.name for v in trainable_vars]))
        else:
            self.assertCountEqual([], tuple([v.name for v in trainable_vars]))
        shared_embedding_vars = global_vars

        self.evaluate(variables_lib.global_variables_initializer())
        self.evaluate(lookup_ops.tables_initializer())

        self.assertAllEqual(embedding_values,
                            self.evaluate(shared_embedding_vars[0]))
        self.assertAllEqual(expected_lookups, self.evaluate(dense_features))
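
    # `_test_dense_features` is a parameterized helper rather than a test in
    # its own right; the concrete test methods that call it are not included
    # in this listing. A plausible pairing (an assumption, shown only for
    # illustration) would be:

    def test_dense_features(self):
        self._test_dense_features()

    def test_dense_features_not_trainable(self):
        self._test_dense_features(trainable=False)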