  def test_load_embedding_initializer(self):
    """Tests for the load_embedding_initializer wrapper."""
    embedding_loading_initializer = (checkpoint_ops._load_embedding_initializer(
        new_vocab_file=self.new_feature_vocab_file,
        old_vocab_file=self.old_feature_vocab_file,
        new_vocab_size=5,
        embedding_dim=16,
        embedding_tensor_name='some_scope/embeddings',
        ckpt_path=[self.checkpoint_file],
        num_oov_buckets=1,
        initializer=self.initializer))
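    # The returned initializer copies rows of 'some_scope/embeddings' from the
    # checkpoint for tokens shared between the old and new vocabularies, and
    # falls back to self.initializer for new tokens and OOV buckets (see the
    # expected values below).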

    # The new weight matrix is of size
    # [5 feature vocab + 1 feature OOV, 16 (embedding dimension)], where the
    # last vocab row (the second-to-last row overall) is newly initialized
    # because its token wasn't found in the previous vocab, and the actual
    # last row is the OOV bucket, also newly initialized.
    # Use a partitioned variable to confirm that the offset logic works.
    expected_remapped_embeddings = np.concatenate(
        [
            np.reshape(range(64), [4, 16]),
            np.reshape([self.init_val] * 32, [2, 16]),
        ],
        axis=0)
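    # Row-by-row: rows 0-3 are copied straight from the checkpoint (values
    # 0..63, tokens shared with the old vocab), while rows 4 (the new vocab
    # token) and 5 (the OOV bucket) are filled with self.init_val.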
    remapped_embeddings = variable_scope.get_variable(
        name='embedding/obtained_embedding_matrix',
        shape=[6, 16],
        initializer=embedding_loading_initializer,
        partitioner=partitioned_variables.fixed_size_partitioner(2))
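    # fixed_size_partitioner(2) splits the [6, 16] variable along axis 0 into
    # two [3, 16] shards, so the initializer is invoked once per shard with
    # row offsets 0 and 3; this is what exercises the offset logic noted
    # above.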

    with self.cached_session():
      self.evaluate(variables.global_variables_initializer())
      self.assertAllClose(expected_remapped_embeddings,
                          remapped_embeddings.as_tensor())

  def test_load_embedding_initializer_large_oov(self):
    """Tests for the large OOV case for load_embedding_initializer wrapper."""
    self.new_feature_vocab_file = os.path.join(
        self.get_temp_dir(), 'new_feature_vocab.txt')
    with open(self.new_feature_vocab_file, 'w') as f:
      f.write('\n'.join(['one', 'zero', 'two', 'four']) + '\n')

    # Checkpoint has 5 entries, 3 of which correspond to OOV.
    self.old_feature_vocab_file = os.path.join(
        self.get_temp_dir(), 'old_feature_vocab.txt')
    with open(self.old_feature_vocab_file, 'w') as f:
      f.write('\n'.join(['zero', 'one']) + '\n')

    embedding_loading_initializer = (checkpoint_ops._load_embedding_initializer(
        new_vocab_file=self.new_feature_vocab_file,
        old_vocab_file=self.old_feature_vocab_file,
        new_vocab_size=4,
        embedding_dim=16,
        embedding_tensor_name='some_scope/embeddings',
        ckpt_path=[self.checkpoint_file],
        num_oov_buckets=5,
        initializer=self.initializer))

    expected_remapped_embeddings = np.concatenate(
        [
            np.reshape(range(16, 32), [1, 16]),
            np.reshape(range(16), [1, 16]),
            np.reshape([self.init_val] * 112, [7, 16]),
        ],
        axis=0)
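    # Row-by-row: new vocab row 0 ('one') comes from old vocab row 1 (values
    # 16..31), row 1 ('zero') from old row 0 (values 0..15), and the remaining
    # 7 rows (2 unmatched new tokens plus 5 OOV buckets) are filled with
    # self.init_val.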

    # The new weight matrix is of size
    # [4 feature vocab + 5 feature OOV, 16 (embedding dimension)], where the
    # 3rd and 4th rows are not found in the old vocabulary and therefore newly
    # initialized.  The last five rows are OOV and also newly initialized.
    # Use a partitioned variable to confirm that the offset logic works.
    remapped_embeddings = variable_scope.get_variable(
        name='embedding/obtained_embedding_matrix',
        shape=[9, 16],
        initializer=embedding_loading_initializer,
        partitioner=partitioned_variables.fixed_size_partitioner(2))

    with self.cached_session():
      self.evaluate(variables.global_variables_initializer())
      self.assertAllClose(expected_remapped_embeddings,
                          remapped_embeddings.as_tensor())

  def test_load_embedding_initializer_old_row_vocab(self):
    """Tests for load_embedding_initializer where we constrain old vocab."""
    embedding_loading_initializer = (
        checkpoint_ops._load_embedding_initializer(
            new_vocab_file=self.new_feature_vocab_file,
            old_vocab_file=self.old_feature_vocab_file,
            # Considered old vocabulary becomes ['zero', 'one', 'two'].  This
            # means 'three' in the new vocabulary is newly initialized.
            old_vocab_size=3,
            new_vocab_size=5,
            embedding_dim=16,
            embedding_tensor_name='some_scope/embeddings',
            ckpt_path=[self.checkpoint_file],
            num_oov_buckets=1,
            initializer=self.initializer))

    # The new weight matrix is of size
    # [5 feature vocab + 1 feature OOV, 16 (embedding dimension)].  With the
    # old vocab constrained to 3 entries, the last two vocab rows are newly
    # initialized (including 'three', which now falls outside the considered
    # old vocab), and the actual last row is the OOV bucket, also newly
    # initialized.
    # Use a partitioned variable to confirm that the offset logic works.
    expected_remapped_embeddings = np.concatenate(
        [
            np.reshape(range(48), [3, 16]),
            np.reshape([self.init_val] * 48, [3, 16]),
        ],
        axis=0)
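    # Row-by-row: rows 0-2 are copied from the first old_vocab_size=3
    # checkpoint rows (values 0..47), while rows 3-5 (the two newly
    # initialized vocab rows plus the OOV bucket) are filled with
    # self.init_val.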
    remapped_embeddings = variable_scope.get_variable(
        name='embedding/obtained_embedding_matrix',
        shape=[6, 16],
        initializer=embedding_loading_initializer,
        partitioner=partitioned_variables.fixed_size_partitioner(2))

    with self.cached_session():
      self.evaluate(variables.global_variables_initializer())
      self.assertAllClose(expected_remapped_embeddings,
                          remapped_embeddings.as_tensor())