Example #1
    def test_cache_xlu_embeds(self):
        embeddings_ref = PretrainedEmbedding()

        dialects = ["en_US", "en_UK", "es_XX"]
        for dialect in dialects:
            embeddings_ref.load_pretrained_embeddings(
                EMBED_RAW_PATH, append=True, dialect=dialect
            )
        with tempfile.NamedTemporaryFile(
            delete=False, suffix=".{}".format("cached")
        ) as cached_path:
            embeddings_ref.cache_pretrained_embeddings(cached_path.name)
            embeddings_cached = PretrainedEmbedding()
            embeddings_cached.load_cached_embeddings(cached_path.name)

        np.testing.assert_array_equal(
            sorted(embeddings_cached.stoi.keys()), sorted(embeddings_ref.stoi.keys())
        )
        np.testing.assert_array_equal(
            embeddings_cached.embed_vocab, embeddings_ref.embed_vocab
        )
        np.testing.assert_array_equal(
            sorted(embeddings_cached.stoi.values()),
            sorted(embeddings_ref.stoi.values()),
        )
        for word_idx in embeddings_ref.stoi.values():
            np.testing.assert_array_almost_equal(
                embeddings_cached.embedding_vectors[word_idx],
                embeddings_ref.embedding_vectors[word_idx],
            )
Example #2
    def from_config(
        cls,
        config: WordFeatConfig,
        metadata: Optional[FieldMeta] = None,
        tensorizer: Optional[Tensorizer] = None,
        init_from_saved_state: Optional[bool] = False,
    ):
        """Factory method to construct an instance of WordEmbedding from
        the module's config object and the field's metadata object.

        Args:
            config (WordFeatConfig): Configuration object specifying all the
            parameters of WordEmbedding.
            metadata (FieldMeta): Object containing this field's metadata.

        Returns:
            type: An instance of WordEmbedding.

        """
        if tensorizer is not None:
            if config.vocab_from_pretrained_embeddings:
                raise ValueError(
                    "In new data design, to add tokens from a pretrained embeddings "
                    "file to the vocab, specify `vocab_file` in the token tensorizer."
                )

            embeddings_weight = None
            # We don't need to load pretrained embeddings if we know the
            # embedding weights are going to be loaded from a snapshot.
            if config.pretrained_embeddings_path and not init_from_saved_state:
                pretrained_embedding = PretrainedEmbedding(
                    config.pretrained_embeddings_path,  # doesn't support fbpkg
                    lowercase_tokens=config.lowercase_tokens,
                    skip_header=config.skip_header,
                )
                embeddings_weight = pretrained_embedding.initialize_embeddings_weights(
                    tensorizer.vocab.idx,
                    tensorizer.vocab.unk_token,
                    config.embed_dim,
                    config.embedding_init_strategy,
                )
            num_embeddings = len(tensorizer.vocab)
            unk_token_idx = tensorizer.vocab.get_unk_index()
            vocab = tensorizer.vocab
        else:  # This else condition should go away after metadata goes away.
            num_embeddings = metadata.vocab_size
            embeddings_weight = metadata.pretrained_embeds_weight
            unk_token_idx = metadata.unk_token_idx
            vocab = metadata.vocab

        return cls(
            num_embeddings=num_embeddings,
            embedding_dim=config.embed_dim,
            embeddings_weight=embeddings_weight,
            init_range=config.embedding_init_range,
            unk_token_idx=unk_token_idx,
            mlp_layer_dims=config.mlp_layer_dims,
            padding_idx=config.padding_idx,
            vocab=vocab,
        )
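
A minimal sketch of how this factory might be wired up under the new data design. WordEmbedding, WordFeatConfig, and the from_config signature come from the example above; the concrete field values and the token_tensorizer object are illustrative assumptions, not part of the scraped code.

# Hedged sketch: invoking the factory above with an already-built token
# tensorizer. Field names mirror the ones read inside from_config; the
# values and the token_tensorizer variable are assumptions for illustration.
config = WordFeatConfig(
    embed_dim=100,
    pretrained_embeddings_path="",          # empty: skip loading pretrained weights
    vocab_from_pretrained_embeddings=False,
)
word_embedding = WordEmbedding.from_config(
    config,
    tensorizer=token_tensorizer,            # a Tensorizer whose vocab is already built
    init_from_saved_state=False,
)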
Example #3
    def from_config(
        cls,
        config: WordFeatConfig,
        metadata: Optional[FieldMeta] = None,
        tensorizer: Optional[Tensorizer] = None,
    ):
        """Factory method to construct an instance of WordEmbedding from
        the module's config object and the field's metadata object.

        Args:
            config (WordFeatConfig): Configuration object specifying all the
            parameters of WordEmbedding.
            metadata (FieldMeta): Object containing this field's metadata.

        Returns:
            type: An instance of WordEmbedding.

        """
        if tensorizer is not None:
            embeddings_weight = None
            if config.pretrained_embeddings_path:
                pretrained_embedding = PretrainedEmbedding(
                    config.pretrained_embeddings_path,  # doesn't support fbpkg
                    lowercase_tokens=tensorizer.tokenizer.lowercase,
                )
                if config.vocab_from_pretrained_embeddings:
                    if not config.vocab_from_train_data:  # Reset token counter.
                        tensorizer.vocab_builder._counter = collections.Counter()
                    tensorizer.vocab_builder.add_all(pretrained_embedding.embed_vocab)
                    tensorizer.vocab = tensorizer.vocab_builder.make_vocab()
                embeddings_weight = pretrained_embedding.initialize_embeddings_weights(
                    tensorizer.vocab.idx,
                    UNK,
                    config.embed_dim,
                    config.embedding_init_strategy,
                )
            num_embeddings = len(tensorizer.vocab)
            unk_token_idx = tensorizer.vocab.idx[UNK]
        else:  # This else condition should go away after metadata goes away.
            num_embeddings = metadata.vocab_size
            embeddings_weight = metadata.pretrained_embeds_weight
            unk_token_idx = metadata.unk_token_idx

        return cls(
            num_embeddings=num_embeddings,
            embedding_dim=config.embed_dim,
            embeddings_weight=embeddings_weight,
            init_range=config.embedding_init_range,
            unk_token_idx=unk_token_idx,
            mlp_layer_dims=config.mlp_layer_dims,
        )
Example #4
    def __init__(
        self,
        pretrained_embeddings_path: str,
        embedding_dim: int,
        mlp_layer_dims: Optional[Sequence[int]] = None,
        lowercase_tokens: bool = False,
        skip_header: bool = True,
        delimiter: str = " ",
        vocab: Optional[ScriptVocabulary] = None,
    ) -> None:
        super().__init__()
        vocab = vocab or build_vocab(pretrained_embeddings_path)
        pretrained_embedding = PretrainedEmbedding(
            pretrained_embeddings_path,
            lowercase_tokens=lowercase_tokens,
            skip_header=skip_header,
            delimiter=delimiter,
        )
        embeddings_weight = pretrained_embedding.initialize_embeddings_weights(
            vocab.idx,  # tensorizer.vocab.idx,
            vocab.unk_token,  # tensorizer.vocab.unk_token,
            embedding_dim,
            EmbedInitStrategy.RANDOM,
        )
        num_embeddings = len(vocab.idx)

        self.embedding = nn.Embedding(
            num_embeddings,
            embedding_dim,
            _weight=embeddings_weight,
            padding_idx=vocab.get_pad_index(),
        )

        # Initialize unk embedding with zeros
        # to guard the model against randomized decisions based on unknown words
        unk_token_idx = vocab.get_unk_index()
        if unk_token_idx >= 0:
            self.embedding.weight.data[unk_token_idx].fill_(0.0)

        # Create MLP layers
        if mlp_layer_dims is None:
            mlp_layer_dims = []

        self.mlp = nn.Sequential(
            *(
                nn.Sequential(nn.Linear(m, n), nn.ReLU())
                for m, n in zip([embedding_dim] + list(mlp_layer_dims), mlp_layer_dims)
            )
        )
        self.output_dim = mlp_layer_dims[-1] if mlp_layer_dims else embedding_dim
Example #5
    def from_config(
        cls,
        config: Config,
        tensorizer: Tensorizer = None,
        init_from_saved_state: Optional[bool] = False,
    ):
        """Factory method to construct an instance of WordEmbedding from
        the module's config object and the field's metadata object.

        Args:
            config (WordSeqEmbedding.Config): Configuration object specifying all the
            parameters of WordEmbedding.

        Returns:
            type: An instance of WordSeqEmbedding.
        """
        embeddings_weight = None
        # We don't need to load pretrained embeddings if we know the
        # embedding weights are going to be loaded from a snapshot.
        if config.pretrained_embeddings_path and not init_from_saved_state:
            pretrained_embedding = PretrainedEmbedding(
                config.pretrained_embeddings_path,  # doesn't support fbpkg
                lowercase_tokens=config.lowercase_tokens,
                skip_header=config.skip_header,
                delimiter=config.delimiter,
            )
            embeddings_weight = pretrained_embedding.initialize_embeddings_weights(
                tensorizer.vocab.idx,
                tensorizer.vocab.unk_token,
                config.word_embed_dim,
                config.embedding_init_strategy,
            )
        num_embeddings = len(tensorizer.vocab)
        unk_token_idx = tensorizer.vocab.get_unk_index()
        vocab = tensorizer.vocab
        vocab_pad_idx = vocab.get_pad_index(value=-1)
        if vocab_pad_idx == -1:
            vocab_pad_idx = None

        return cls(
            lstm_config=config.lstm,
            num_embeddings=num_embeddings,
            word_embed_dim=config.word_embed_dim,
            embeddings_weight=embeddings_weight,
            init_range=config.embedding_init_range,
            init_std=config.embeddding_init_std,
            unk_token_idx=unk_token_idx,
            padding_idx=config.padding_idx or vocab_pad_idx,
            vocab=vocab,
        )
Example #6
    def test_assign_pretrained_weights(self):
        embeddings_ref = PretrainedEmbedding()
        embeddings_ref.load_cached_embeddings(EMBED_CACHED_PATH)
        VOCAB = ["UNK", "aloha", "the"]
        embed_vocab_to_idx = {tok: i for i, tok in enumerate(VOCAB)}
        pretrained_embeds = embeddings_ref.initialize_embeddings_weights(
            embed_vocab_to_idx, "UNK", EMBED_DIM, EmbedInitStrategy.RANDOM
        )
        assert pretrained_embeds.shape[0] == len(VOCAB)
        assert pretrained_embeds.shape[1] == EMBED_DIM
        np.testing.assert_array_almost_equal(
            pretrained_embeds[1].numpy(),
            [-0.43124, 0.014934, -0.50635, 0.60506, 0.56051],
        )  # embedding vector for 'aloha'
        np.testing.assert_array_almost_equal(
            pretrained_embeds[2].numpy(),
            [-0.39153, -0.19803, 0.2573, -0.18617, 0.25551],
        )  # embedding vector for 'the'
Example #7
    def test_load_pretrained_embeddings(self):
        pretrained_emb = PretrainedEmbedding(EMBED_RAW_PATH)

        self.assertEqual(len(pretrained_emb.embed_vocab), VOCAB_SIZE)
        self.assertEqual(pretrained_emb.embed_vocab[0], "</s>")
        self.assertEqual(pretrained_emb.embed_vocab[2], "to")

        self.assertEqual(len(pretrained_emb.stoi), VOCAB_SIZE)
        self.assertEqual(pretrained_emb.stoi["</s>"], 0)
        self.assertEqual(pretrained_emb.stoi["to"], 2)

        self.assertEqual(pretrained_emb.embedding_vectors.size(0), VOCAB_SIZE)
        self.assertEqual(pretrained_emb.embedding_vectors.size(1), EMBED_DIM)
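
The test above reads a raw embeddings file from EMBED_RAW_PATH. The sketch below illustrates the assumed file format (an optional header line followed by one token and its space-separated vector per line) and loads it through the same class; the import path and the tiny inline file are assumptions for illustration, not the actual test fixtures.

# Hedged sketch: build a tiny raw embeddings file in the assumed
# "token v1 v2 ..." text format (with a "<vocab_size> <dim>" header line)
# and load it with PretrainedEmbedding. The import path below is an
# assumption; adjust it to wherever PretrainedEmbedding lives in your tree.
import tempfile

from pytext.utils.embeddings import PretrainedEmbedding

with tempfile.NamedTemporaryFile("w", suffix=".vec", delete=False) as raw_file:
    raw_file.write("2 3\n")                 # header: vocab size, embedding dim
    raw_file.write("hello 0.1 0.2 0.3\n")
    raw_file.write("world 0.4 0.5 0.6\n")
    raw_path = raw_file.name

embeddings = PretrainedEmbedding(raw_path, lowercase_tokens=True, skip_header=True)
print(embeddings.embed_vocab)               # expected: ["hello", "world"]
print(embeddings.embedding_vectors.size())  # expected: torch.Size([2, 3])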
Example #8
    def from_config(
        cls,
        config: WordFeatConfig,
        metadata: Optional[FieldMeta] = None,
        tensorizer: Optional[Tensorizer] = None,
        init_from_saved_state: Optional[bool] = False,
    ):
        """Factory method to construct an instance of WordEmbedding from
        the module's config object and the field's metadata object.

        Args:
            config (WordFeatConfig): Configuration object specifying all the
            parameters of WordEmbedding.
            metadata (FieldMeta): Object containing this field's metadata.

        Returns:
            type: An instance of WordEmbedding.

        """
        if tensorizer is not None:
            embeddings_weight = None
            if config.pretrained_embeddings_path and (
                    # We don't need to load pretrained embeddings if we know the
                    # embedding weights are going to be loaded from a snapshot. The
                    # exception is if we rely on the pretrained embeddings to give us
                    # the vocab, in which case, we have to load it regardless.
                    config.vocab_from_pretrained_embeddings
                    or not init_from_saved_state):
                pretrained_embedding = PretrainedEmbedding(
                    config.pretrained_embeddings_path,  # doesn't support fbpkg
                    lowercase_tokens=config.lowercase_tokens,
                )

                if config.vocab_from_pretrained_embeddings:
                    # pretrained embeddings will get a freq count of 1
                    assert config.min_freq == 1, (
                        "If `vocab_from_pretrained_embeddings` is set, the vocab's "
                        "`min_freq` must be 1")
                    if not config.vocab_from_train_data:  # Reset token counter.
                        tensorizer.vocab_builder._counter = collections.Counter()
                    pretrained_vocab = pretrained_embedding.embed_vocab
                    if config.vocab_size:
                        pretrained_vocab = pretrained_vocab[:config.vocab_size]
                    tensorizer.vocab_builder.add_all(pretrained_vocab)
                    tensorizer.vocab = tensorizer.vocab_builder.make_vocab()

                embeddings_weight = pretrained_embedding.initialize_embeddings_weights(
                    tensorizer.vocab.idx,
                    UNK,
                    config.embed_dim,
                    config.embedding_init_strategy,
                )
            num_embeddings = len(tensorizer.vocab)
            unk_token_idx = tensorizer.vocab.idx[UNK]
        else:  # This else condition should go away after metadata goes away.
            num_embeddings = metadata.vocab_size
            embeddings_weight = metadata.pretrained_embeds_weight
            unk_token_idx = metadata.unk_token_idx

        return cls(
            num_embeddings=num_embeddings,
            embedding_dim=config.embed_dim,
            embeddings_weight=embeddings_weight,
            init_range=config.embedding_init_range,
            unk_token_idx=unk_token_idx,
            mlp_layer_dims=config.mlp_layer_dims,
        )