Example #1
    def __init__(self,
                 options_file: str,
                 weight_file: str,
                 requires_grad: bool = False) -> None:
        super(_ElmoBiLm, self).__init__()

        self._token_embedder = _ElmoCharacterEncoder(
            options_file, weight_file, requires_grad=requires_grad)

        with open(cached_path(options_file), 'r') as fin:
            options = json.load(fin)
        if not options['lstm'].get('use_skip_connections'):
            raise ConfigurationError(
                'We only support pretrained biLMs with residual connections')
        self._elmo_lstm = ElmoLstm(
            input_size=options['lstm']['projection_dim'],
            hidden_size=options['lstm']['projection_dim'],
            cell_size=options['lstm']['dim'],
            num_layers=options['lstm']['n_layers'],
            memory_cell_clip_value=options['lstm']['cell_clip'],
            state_projection_clip_value=options['lstm']['proj_clip'],
            requires_grad=requires_grad)
        self._elmo_lstm.load_weights(weight_file)
        # Number of representation layers including context independent layer
        self.num_layers = options['lstm']['n_layers'] + 1
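For context, here is a minimal usage sketch of the constructor shown in Example #1 (it is not taken from the projects above). It assumes an AllenNLP 0.x installation where _ElmoBiLm and batch_to_ids live in allennlp.modules.elmo; the options/weights paths are placeholders for a real pretrained ELMo pair.

from allennlp.modules.elmo import batch_to_ids, _ElmoBiLm

options_file = "/path/to/elmo_options.json"   # placeholder
weight_file = "/path/to/elmo_weights.hdf5"    # placeholder

bilm = _ElmoBiLm(options_file, weight_file, requires_grad=False)

# batch_to_ids turns tokenized sentences into character ids of shape
# (batch_size, timesteps, 50), the input format _ElmoBiLm.forward expects.
character_ids = batch_to_ids([["The", "cat", "sat", "."], ["ELMo", "works", "."]])
output = bilm(character_ids)

# 'activations' holds num_layers tensors, each of shape
# (batch_size, timesteps + 2, 2 * projection_dim); 'mask' covers the tokens plus BOS/EOS.
print(len(output["activations"]), output["mask"].shape)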
Example #2
    def __init__(self,
                 options_file: str,
                 weight_file: str,
                 requires_grad: bool = False,
                 vocab_to_cache: List[str] = None) -> None:
        super(_ElmoBiLm, self).__init__()

        self._token_embedder = _ElmoCharacterEncoder(
            options_file,
            weight_file,
            requires_grad=False
            if vocab_to_cache is not None else requires_grad)

        self._requires_grad = requires_grad
        if requires_grad and vocab_to_cache:
            logging.warning(
                "You are fine tuning ELMo and caching char CNN word vectors. "
                "This behaviour is not guaranteed to be well defined, particularly. "
                "if not all of your inputs will occur in the vocabulary cache. "
                "_ElmoCharacterEncoder will be frozen because "
                "it is not used after word embedding caching.")
        # This is an embedding, used to look up cached
        # word vectors built from character level cnn embeddings.
        self._word_embedding = None
        self._bos_embedding: torch.Tensor = None
        self._eos_embedding: torch.Tensor = None
        if vocab_to_cache:
            logging.info(
                "Caching character cnn layers for words in vocabulary.")
            # This sets 3 attributes, _word_embedding, _bos_embedding and _eos_embedding.
            # They are set in the method so they can be accessed from outside the
            # constructor.
            self.create_cached_cnn_embeddings(vocab_to_cache)

        with open(cached_path(options_file), 'r') as fin:
            options = json.load(fin)
        if not options['lstm'].get('use_skip_connections'):
            raise ConfigurationError(
                'We only support pretrained biLMs with residual connections')
        self._elmo_lstm = ElmoLstm(
            input_size=options['lstm']['projection_dim'],
            hidden_size=options['lstm']['projection_dim'],
            cell_size=options['lstm']['dim'],
            num_layers=options['lstm']['n_layers'],
            memory_cell_clip_value=options['lstm']['cell_clip'],
            state_projection_clip_value=options['lstm']['proj_clip'],
            requires_grad=requires_grad)
        self._elmo_lstm.load_weights(weight_file)
        # Number of representation layers including context independent layer
        self.num_layers = options['lstm']['n_layers'] + 1
Example #3
    def test_elmo_lstm(self):
        input_tensor = torch.rand(4, 5, 3)
        input_tensor[1, 4:, :] = 0.
        input_tensor[2, 2:, :] = 0.
        input_tensor[3, 1:, :] = 0.
        mask = torch.ones([4, 5])
        mask[1, 4:] = 0.
        mask[2, 2:] = 0.
        mask[3, 1:] = 0.

        lstm = ElmoLstm(num_layers=2,
                        input_size=3,
                        hidden_size=5,
                        cell_size=7,
                        memory_cell_clip_value=2,
                        state_projection_clip_value=1)
        output_sequence = lstm(input_tensor, mask)

        # Check all the layer outputs are masked properly.
        numpy.testing.assert_array_equal(
            output_sequence.data[:, 1, 4:, :].numpy(), 0.0)
        numpy.testing.assert_array_equal(
            output_sequence.data[:, 2, 2:, :].numpy(), 0.0)
        numpy.testing.assert_array_equal(
            output_sequence.data[:, 3, 1:, :].numpy(), 0.0)

        # LSTM hidden state should be (num_layers, batch_size, 2 * hidden_size),
        # because the forward and backward directions are concatenated.
        assert list(lstm._states[0].size()) == [2, 4, 10]
        # LSTM memory cell should be (num_layers, batch_size, 2 * cell_size).
        assert list(lstm._states[1].size()) == [2, 4, 14]
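As a companion to the assertions above, the standalone sketch below spells out the full output shape of ElmoLstm with the same hyperparameters: layers are stacked on dimension 0 and the two directions are concatenated on the last dimension. It assumes ElmoLstm is importable from allennlp.modules.elmo_lstm, as in AllenNLP 0.x.

import torch
from allennlp.modules.elmo_lstm import ElmoLstm

lstm = ElmoLstm(num_layers=2, input_size=3, hidden_size=5, cell_size=7,
                memory_cell_clip_value=2, state_projection_clip_value=1)
inputs = torch.rand(4, 5, 3)
mask = torch.ones(4, 5)

output_sequence = lstm(inputs, mask)
# (num_layers, batch_size, timesteps, 2 * hidden_size)
assert list(output_sequence.size()) == [2, 4, 5, 10]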
Example #4
File: elmo.py  Project: pyknife/allennlp
    def __init__(self,
                 options_file: str,
                 weight_file: str,
                 requires_grad: bool = False,
                 vocab_to_cache: List[str] = None) -> None:
        super(_ElmoBiLm, self).__init__()

        self._token_embedder = _ElmoCharacterEncoder(options_file, weight_file, requires_grad=requires_grad)

        self._requires_grad = requires_grad
        if requires_grad and vocab_to_cache:
            logging.warning("You are fine tuning ELMo and caching char CNN word vectors. "
                            "This behaviour is not guaranteed to be well defined, particularly. "
                            "if not all of your inputs will occur in the vocabulary cache.")
        # This is an embedding, used to look up cached
        # word vectors built from character level cnn embeddings.
        self._word_embedding = None
        self._bos_embedding: torch.Tensor = None
        self._eos_embedding: torch.Tensor = None
        if vocab_to_cache:
            logging.info("Caching character cnn layers for words in vocabulary.")
            # This sets 3 attributes, _word_embedding, _bos_embedding and _eos_embedding.
            # They are set in the method so they can be accessed from outside the
            # constructor.
            self.create_cached_cnn_embeddings(vocab_to_cache)

        with open(cached_path(options_file), 'r') as fin:
            options = json.load(fin)
        if not options['lstm'].get('use_skip_connections'):
            raise ConfigurationError('We only support pretrained biLMs with residual connections')
        self._elmo_lstm = ElmoLstm(input_size=options['lstm']['projection_dim'],
                                   hidden_size=options['lstm']['projection_dim'],
                                   cell_size=options['lstm']['dim'],
                                   num_layers=options['lstm']['n_layers'],
                                   memory_cell_clip_value=options['lstm']['cell_clip'],
                                   state_projection_clip_value=options['lstm']['proj_clip'],
                                   requires_grad=requires_grad)
        self._elmo_lstm.load_weights(weight_file)
        # Number of representation layers including context independent layer
        self.num_layers = options['lstm']['n_layers'] + 1
Example #5
    def __init__(self,
                 options_file: str,
                 weight_file: str,
                 requires_grad: bool = False) -> None:
        super(_ElmoBiLm, self).__init__()

        self._token_embedder = _ElmoCharacterEncoder(options_file, weight_file, requires_grad=requires_grad)

        with open(cached_path(options_file), 'r') as fin:
            options = json.load(fin)
        if not options['lstm'].get('use_skip_connections'):
            raise ConfigurationError('We only support pretrained biLMs with residual connections')
        self._elmo_lstm = ElmoLstm(input_size=options['lstm']['projection_dim'],
                                   hidden_size=options['lstm']['projection_dim'],
                                   cell_size=options['lstm']['dim'],
                                   num_layers=options['lstm']['n_layers'],
                                   memory_cell_clip_value=options['lstm']['cell_clip'],
                                   state_projection_clip_value=options['lstm']['proj_clip'],
                                   requires_grad=requires_grad)
        self._elmo_lstm.load_weights(weight_file)
        # Number of representation layers including context independent layer
        self.num_layers = options['lstm']['n_layers'] + 1
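Every constructor above pulls the same handful of fields out of the options JSON. The sketch below shows just that section; the values are illustrative (roughly the small pretrained ELMo configuration), and only the key names are load-bearing for the code above.

options = {
    "lstm": {
        "use_skip_connections": True,  # must be truthy, otherwise ConfigurationError is raised
        "projection_dim": 128,         # becomes input_size and hidden_size of ElmoLstm
        "dim": 1024,                   # becomes cell_size
        "n_layers": 2,                 # becomes num_layers; num_layers + 1 representation layers
        "cell_clip": 3,                # becomes memory_cell_clip_value
        "proj_clip": 3,                # becomes state_projection_clip_value
    }
}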
Example #6
class _ElmoBiLm(torch.nn.Module):
    u"""
    Run a pre-trained bidirectional language model, outputting the activations at each
    layer for weighting together into an ELMo representation (with
    ``allennlp.modules.seq2seq_encoders.Elmo``).  This is a lower level class, useful
    for advanced uses, but most users should use ``allennlp.modules.seq2seq_encoders.Elmo``
    directly.

    Parameters
    ----------
    options_file : ``str``
        ELMo JSON options file
    weight_file : ``str``
        ELMo hdf5 weight file
    requires_grad: ``bool``, optional
        If True, compute gradient of ELMo parameters for fine tuning.
    vocab_to_cache : ``List[str]``, optional, (default = None).
        A list of words to pre-compute and cache character convolutions
        for. If you use this option, _ElmoBiLm expects that you pass word
        indices of shape (batch_size, timesteps) to forward, instead
        of character indices. If you use this option and pass a word which
        wasn't pre-cached, this will break.
    """
    def __init__(self,
                 options_file,
                 weight_file,
                 requires_grad=False,
                 vocab_to_cache=None):
        super(_ElmoBiLm, self).__init__()

        self._token_embedder = _ElmoCharacterEncoder(options_file, weight_file, requires_grad=requires_grad)

        self._requires_grad = requires_grad
        if requires_grad and vocab_to_cache:
            logging.warning(u"You are fine tuning ELMo and caching char CNN word vectors. "
                            u"This behaviour is not guaranteed to be well defined, particularly. "
                            u"if not all of your inputs will occur in the vocabulary cache.")
        # This is an embedding, used to look up cached
        # word vectors built from character level cnn embeddings.
        self._word_embedding = None
        self._bos_embedding = None
        self._eos_embedding = None
        if vocab_to_cache:
            logging.info(u"Caching character cnn layers for words in vocabulary.")
            # This sets 3 attributes, _word_embedding, _bos_embedding and _eos_embedding.
            # They are set in the method so they can be accessed from outside the
            # constructor.
            self.create_cached_cnn_embeddings(vocab_to_cache)

        with open(cached_path(options_file), u'r') as fin:
            options = json.load(fin)
        if not options[u'lstm'].get(u'use_skip_connections'):
            raise ConfigurationError(u'We only support pretrained biLMs with residual connections')
        self._elmo_lstm = ElmoLstm(input_size=options[u'lstm'][u'projection_dim'],
                                   hidden_size=options[u'lstm'][u'projection_dim'],
                                   cell_size=options[u'lstm'][u'dim'],
                                   num_layers=options[u'lstm'][u'n_layers'],
                                   memory_cell_clip_value=options[u'lstm'][u'cell_clip'],
                                   state_projection_clip_value=options[u'lstm'][u'proj_clip'],
                                   requires_grad=requires_grad)
        self._elmo_lstm.load_weights(weight_file)
        # Number of representation layers including context independent layer
        self.num_layers = options[u'lstm'][u'n_layers'] + 1

    def get_output_dim(self):
        return 2 * self._token_embedder.get_output_dim()

    def forward(self,  # pylint: disable=arguments-differ
                inputs,
                word_inputs=None):
        u"""
        Parameters
        ----------
        inputs: ``torch.Tensor``, required.
            Shape ``(batch_size, timesteps, 50)`` of character ids representing the current batch.
        word_inputs : ``torch.Tensor``, optional.
            If you passed a cached vocab, you can in addition pass a tensor of shape ``(batch_size, timesteps)``,
            which represent word ids which have been pre-cached.

        Returns
        -------
        Dict with keys:

        ``'activations'``: ``List[torch.Tensor]``
            A list of activations at each layer of the network, each of shape
            ``(batch_size, timesteps + 2, embedding_dim)``
        ``'mask'``:  ``torch.Tensor``
            Shape ``(batch_size, timesteps + 2)`` long tensor with sequence mask.

        Note that the output tensors all include additional special begin and end of sequence
        markers.
        """
        if self._word_embedding is not None and word_inputs is not None:
            try:
                mask_without_bos_eos = (word_inputs > 0).long()
                # The character cnn part is cached - just look it up.
                embedded_inputs = self._word_embedding(word_inputs) # type: ignore
                # shape (batch_size, timesteps + 2, embedding_dim)
                type_representation, mask = add_sentence_boundary_token_ids(
                        embedded_inputs,
                        mask_without_bos_eos,
                        self._bos_embedding,
                        self._eos_embedding
                )
            except RuntimeError:
                # Back off to running the character convolutions,
                # as we might not have the words in the cache.
                token_embedding = self._token_embedder(inputs)
                mask = token_embedding[u'mask']
                type_representation = token_embedding[u'token_embedding']
        else:
            token_embedding = self._token_embedder(inputs)
            mask = token_embedding[u'mask']
            type_representation = token_embedding[u'token_embedding']
        lstm_outputs = self._elmo_lstm(type_representation, mask)

        # Prepare the output.  The first layer is duplicated.
        # Because of minor differences in how masking is applied depending
        # on whether the char cnn layers are cached, we'll be defensive and
        # multiply by the mask here. It's not strictly necessary, as the
        # mask passed on is correct, but the values in the padded areas
        # of the char cnn representations can change.
        output_tensors = [
                torch.cat([type_representation, type_representation], dim=-1) * mask.float().unsqueeze(-1)
        ]
        for layer_activations in torch.chunk(lstm_outputs, lstm_outputs.size(0), dim=0):
            output_tensors.append(layer_activations.squeeze(0))

        return {
                u'activations': output_tensors,
                u'mask': mask,
        }

    def create_cached_cnn_embeddings(self, tokens):
        u"""
        Given a list of tokens, this method precomputes word representations
        by running just the character convolutions and highway layers of elmo,
        essentially creating uncontextual word vectors. On subsequent forward passes,
        the word ids are looked up from an embedding, rather than being computed on
        the fly via the CNN encoder.

        This function sets 3 attributes:

        _word_embedding : ``torch.Tensor``
            The word embedding for each word in the tokens passed to this method.
        _bos_embedding : ``torch.Tensor``
            The embedding for the BOS token.
        _eos_embedding : ``torch.Tensor``
            The embedding for the EOS token.

        Parameters
        ----------
        tokens : ``List[str]``, required.
            A list of tokens to precompute character convolutions for.
        """
        tokens = [ELMoCharacterMapper.bos_token, ELMoCharacterMapper.eos_token] + tokens
        timesteps = 32
        batch_size = 32
        chunked_tokens = lazy_groups_of(iter(tokens), timesteps)

        all_embeddings = []
        device = get_device_of(next(self.parameters()))
        for batch in lazy_groups_of(chunked_tokens, batch_size):
            # Shape (batch_size, timesteps, 50)
            batched_tensor = batch_to_ids(batch)
            # NOTE: This device check is for when a user calls this method having
            # already placed the model on a device. If this is called in the
            # constructor, it will probably happen on the CPU. This isn't too bad,
            # because it's only a few convolutions and will likely be very fast.
            if device >= 0:
                batched_tensor = batched_tensor.cuda(device)
            output = self._token_embedder(batched_tensor)
            token_embedding = output[u"token_embedding"]
            mask = output[u"mask"]
            token_embedding, _ = remove_sentence_boundaries(token_embedding, mask)
            all_embeddings.append(token_embedding.view(-1, token_embedding.size(-1)))
        full_embedding = torch.cat(all_embeddings, 0)

        # We might have some trailing embeddings from padding in the batch, so
        # we clip the embedding and lookup to the right size.
        full_embedding = full_embedding[:len(tokens), :]
        embedding = full_embedding[2:len(tokens), :]
        vocab_size, embedding_dim = list(embedding.size())

        from allennlp.modules.token_embedders import Embedding # type: ignore
        self._bos_embedding = full_embedding[0, :]
        self._eos_embedding = full_embedding[1, :]
        self._word_embedding = Embedding(vocab_size, # type: ignore
                                         embedding_dim,
                                         weight=embedding.data,
                                         trainable=self._requires_grad,
                                         padding_index=0)
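To show where vocab_to_cache and word_inputs fit together, here is a hedged usage sketch of the caching path. The paths are placeholders, and the word-id convention is inferred from create_cached_cnn_embeddings above: id i refers to vocab_to_cache[i] and id 0 is treated as padding, so the first slot is reserved for a padding token.

import torch
from allennlp.modules.elmo import batch_to_ids, _ElmoBiLm

vocab = ["<pad>", "the", "cat", "sat"]              # assumption: slot 0 reserved for padding
bilm = _ElmoBiLm("/path/to/elmo_options.json",      # placeholder
                 "/path/to/elmo_weights.hdf5",      # placeholder
                 vocab_to_cache=vocab)

sentences = [["the", "cat", "sat"]]
character_ids = batch_to_ids(sentences)             # still passed, used as a fallback
word_ids = torch.tensor([[1, 2, 3]])                # indices into vocab

# With a cached vocabulary, forward looks the words up in the cached embedding
# instead of running the character CNN.
output = bilm(character_ids, word_inputs=word_ids)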
Example #7
class ElmoBilmDebias(torch.nn.Module):
    """
    This is a customized version of the elmo bilm + debiasing at the first layer
    """
    def __init__(self,
                 options_file: str,
                 weight_file: str,
                 requires_grad: bool = False,
                 vocab_to_cache: List[str] = None) -> None:
        super(ElmoBilmDebias, self).__init__()

        self._token_embedder = _ElmoCharacterEncoder(
            options_file, weight_file, requires_grad=requires_grad)

        self._requires_grad = requires_grad
        if requires_grad and vocab_to_cache:
            logging.warning(
                "You are fine tuning ELMo and caching char CNN word vectors. "
                "This behaviour is not guaranteed to be well defined, particularly. "
                "if not all of your inputs will occur in the vocabulary cache."
            )
        # This is an embedding, used to look up cached
        # word vectors built from character level cnn embeddings.
        self._word_embedding = None
        self._bos_embedding: torch.Tensor = None
        self._eos_embedding: torch.Tensor = None
        if vocab_to_cache:
            logging.info(
                "Caching character cnn layers for words in vocabulary.")
            # This sets 3 attributes, _word_embedding, _bos_embedding and _eos_embedding.
            # They are set in the method so they can be accessed from outside the
            # constructor.
            self.create_cached_cnn_embeddings(vocab_to_cache)

        with open(cached_path(options_file), 'r') as fin:
            options = json.load(fin)
        if not options['lstm'].get('use_skip_connections'):
            raise ConfigurationError(
                'We only support pretrained biLMs with residual connections')
        self._elmo_lstm = ElmoLstm(
            input_size=options['lstm']['projection_dim'],
            hidden_size=options['lstm']['projection_dim'],
            cell_size=options['lstm']['dim'],
            num_layers=options['lstm']['n_layers'],
            memory_cell_clip_value=options['lstm']['cell_clip'],
            state_projection_clip_value=options['lstm']['proj_clip'],
            requires_grad=requires_grad)
        self._elmo_lstm.load_weights(weight_file)
        # Number of representation layers including context independent layer
        self.num_layers = options['lstm']['n_layers'] + 1

    def get_output_dim(self):
        return 2 * self._token_embedder.get_output_dim()

    def forward(
        self,  # pylint: disable=arguments-differ
        inputs: torch.Tensor,
        bias: torch.Tensor = None,
        num_bias: int = 1,
        contraction: (torch.Tensor, torch.Tensor) = None,
        word_inputs: torch.Tensor = None
    ) -> Dict[str, Union[torch.Tensor, List[torch.Tensor]]]:
        """
        Parameters
        ----------
        inputs: ``torch.Tensor``, required.
            Shape ``(batch_size, timesteps, 50)`` of character ids representing the current batch.
        word_inputs : ``torch.Tensor``, optional.
            If you passed a cached vocab, you can in addition pass a tensor of shape ``(batch_size, timesteps)``,
            which represent word ids which have been pre-cached.
        Returns
        -------
        Dict with keys:
        ``'activations'``: ``List[torch.Tensor]``
            A list of activations at each layer of the network, each of shape
            ``(batch_size, timesteps + 2, embedding_dim)``
        ``'mask'``:  ``torch.Tensor``
            Shape ``(batch_size, timesteps + 2)`` long tensor with sequence mask.
        Note that the output tensors all include additional special begin and end of sequence
        markers.
        """
        if self._word_embedding is not None and word_inputs is not None:
            try:
                mask_without_bos_eos = (word_inputs > 0).long()
                # The character cnn part is cached - just look it up.
                embedded_inputs = self._word_embedding(
                    word_inputs)  # type: ignore
                # shape (batch_size, timesteps + 2, embedding_dim)
                type_representation, mask = add_sentence_boundary_token_ids(
                    embedded_inputs, mask_without_bos_eos, self._bos_embedding,
                    self._eos_embedding)
            except RuntimeError:
                # Back off to running the character convolutions,
                # as we might not have the words in the cache.
                token_embedding = self._token_embedder(inputs)
                mask = token_embedding['mask']
                type_representation = token_embedding['token_embedding']
        else:
            token_embedding = self._token_embedder(inputs)
            mask = token_embedding['mask']
            type_representation = token_embedding['token_embedding']

        # debiasing the input embeddings
        #   1. take out the boundaries, i.e. len - 2
        batch_l, seq_l, elmo_size = type_representation.shape
        l0 = type_representation[:, 1:-1, :]
        #   2. debiasing
        if bias is not None:
            if num_bias == 1:
                bias = bias.expand(batch_l, 1, elmo_size)
                proj = l0.bmm(bias.transpose(1, 2))
                l0 = l0 - (proj * bias)
            elif num_bias == 2:
                bias = bias.expand(batch_l, 2, elmo_size)
                bias1 = bias[:, 0:1, :]
                bias2 = bias[:, 1:2, :]
                proj1 = l0.bmm(bias1.transpose(1, 2))
                proj2 = l0.bmm(bias2.transpose(1, 2))
                l0 = l0 - (proj1 * bias1) - (proj2 * bias2)
            else:
                raise Exception('unrecognized num_bias: {0}'.format(num_bias))
        #   3. contraction
        if contraction is not None:
            if not hasattr(self, 'contract_U'):
                v1 = contraction[0].view(-1, elmo_size).cpu().numpy()
                v2 = contraction[1].view(-1, elmo_size).cpu().numpy()

                v1, v2 = maxSpan(v1, v2)
                U = np.identity(elmo_size)
                U = gsConstrained(U, v1, basis(np.vstack((v1, v2))))

                self.contract_v1 = torch.from_numpy(v1).view(1, 1, elmo_size)
                self.contract_v2 = torch.from_numpy(v2).view(1, 1, elmo_size)
                self.contract_U = torch.from_numpy(U).view(
                    1, elmo_size, elmo_size).float()
                gpuid = contraction[0].get_device()
                if gpuid != -1:
                    self.contract_v1 = self.contract_v1.cuda(gpuid)
                    self.contract_v2 = self.contract_v2.cuda(gpuid)
                    self.contract_U = self.contract_U.cuda(gpuid)

            opt = Holder()
            opt.gpuid = contraction[0].get_device()
            l0 = correction(opt, self.contract_U, self.contract_v1,
                            self.contract_v2, l0.contiguous())

        #   4. reconcat with boundaries
        type_representation = torch.cat([
            type_representation[:, 0:1, :], l0, type_representation[:, -1:, :]
        ], 1)

        # continue the lm
        lstm_outputs = self._elmo_lstm(type_representation, mask)

        # Prepare the output.  The first layer is duplicated.
        # Because of minor differences in how masking is applied depending
        # on whether the char cnn layers are cached, we'll be defensive and
        # multiply by the mask here. It's not strictly necessary, as the
        # mask passed on is correct, but the values in the padded areas
        # of the char cnn representations can change.
        output_tensors = [
            torch.cat([type_representation, type_representation], dim=-1) *
            mask.float().unsqueeze(-1)
        ]
        for layer_activations in torch.chunk(lstm_outputs,
                                             lstm_outputs.size(0),
                                             dim=0):
            output_tensors.append(layer_activations.squeeze(0))

        return {
            'activations': output_tensors,
            'mask': mask,
        }
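The debiasing step in ElmoBilmDebias.forward is plain projection removal. The standalone sketch below reproduces the num_bias == 1 branch with random data, assuming (as the expand/bmm in the code implies) that bias is a unit vector of shape (1, 1, elmo_size).

import torch

batch_l, seq_l, elmo_size = 2, 7, 16
l0 = torch.randn(batch_l, seq_l, elmo_size)
bias = torch.nn.functional.normalize(torch.randn(1, 1, elmo_size), dim=-1)

bias = bias.expand(batch_l, 1, elmo_size)
proj = l0.bmm(bias.transpose(1, 2))   # (batch_l, seq_l, 1): component along the bias direction
debiased = l0 - proj * bias           # remove that component from every position

# After removal, the representations are orthogonal to the bias direction.
assert torch.allclose(debiased.bmm(bias.transpose(1, 2)),
                      torch.zeros(batch_l, seq_l, 1), atol=1e-5)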
Example #8
class _ElmoBiLm(torch.nn.Module):
    """
    Run a pre-trained bidirectional language model, outputting the activations at each
    layer for weighting together into an ELMo representation (with
    ``allennlp.modules.seq2seq_encoders.Elmo``).  This is a lower level class, useful
    for advanced uses, but most users should use ``allennlp.modules.seq2seq_encoders.Elmo``
    directly.

    Parameters
    ----------
    options_file : ``str``
        ELMo JSON options file
    weight_file : ``str``
        ELMo hdf5 weight file
    requires_grad: ``bool``, optional
        If True, compute gradient of ELMo parameters for fine tuning.
    """
    def __init__(self,
                 options_file: str,
                 weight_file: str,
                 requires_grad: bool = False) -> None:
        super(_ElmoBiLm, self).__init__()

        self._token_embedder = _ElmoCharacterEncoder(options_file, weight_file, requires_grad=requires_grad)

        with open(cached_path(options_file), 'r') as fin:
            options = json.load(fin)
        if not options['lstm'].get('use_skip_connections'):
            raise ConfigurationError('We only support pretrained biLMs with residual connections')
        self._elmo_lstm = ElmoLstm(input_size=options['lstm']['projection_dim'],
                                   hidden_size=options['lstm']['projection_dim'],
                                   cell_size=options['lstm']['dim'],
                                   num_layers=options['lstm']['n_layers'],
                                   memory_cell_clip_value=options['lstm']['cell_clip'],
                                   state_projection_clip_value=options['lstm']['proj_clip'],
                                   requires_grad=requires_grad)
        self._elmo_lstm.load_weights(weight_file)
        # Number of representation layers including context independent layer
        self.num_layers = options['lstm']['n_layers'] + 1

    def forward(self,  # pylint: disable=arguments-differ
                inputs: torch.Tensor) -> Dict[str, Union[torch.Tensor, List[torch.Tensor]]]:
        """
        Parameters
        ----------
        inputs: ``torch.autograd.Variable``
            Shape ``(batch_size, timesteps, 50)`` of character ids representing the current batch.

        Returns
        -------
        Dict with keys:

        ``'activations'``: ``List[torch.autograd.Variable]``
            A list of activations at each layer of the network, each of shape
            ``(batch_size, timesteps + 2, embedding_dim)``
        ``'mask'``:  ``torch.autograd.Variable``
            Shape ``(batch_size, timesteps + 2)`` long tensor with sequence mask.

        Note that the output tensors all include additional special begin and end of sequence
        markers.
        """
        token_embedding = self._token_embedder(inputs)
        type_representation = token_embedding['token_embedding']
        mask = token_embedding['mask']
        lstm_outputs = self._elmo_lstm(type_representation, mask)

        # Prepare the output.  The first layer is duplicated.
        output_tensors = [
                torch.cat([type_representation, type_representation], dim=-1)
        ]
        for layer_activations in torch.chunk(lstm_outputs, lstm_outputs.size(0), dim=0):
            output_tensors.append(layer_activations.squeeze(0))

        return {
                'activations': output_tensors,
                'mask': mask,
        }
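Example #8 returns the raw per-layer activations; turning them into a single ELMo vector is left to the caller. The sketch below mixes the layers with softmax-normalized scalar weights in plain torch. It mirrors what the higher-level Elmo/ScalarMix wrappers do but is not those wrappers, and the names are made up for illustration.

import torch

def mix_layers(activations, scalar_parameters, gamma):
    # activations: the 'activations' list returned by _ElmoBiLm.forward,
    # each tensor of shape (batch_size, timesteps + 2, dim).
    weights = torch.softmax(scalar_parameters, dim=0)
    mixed = sum(w * layer for w, layer in zip(weights, activations))
    return gamma * mixed

num_layers = 3  # n_layers + 1 for a 2-layer biLM
scalar_parameters = torch.nn.Parameter(torch.zeros(num_layers))
gamma = torch.nn.Parameter(torch.ones(1))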
Example #9
File: elmo.py  Project: pyknife/allennlp
class _ElmoBiLm(torch.nn.Module):
    """
    Run a pre-trained bidirectional language model, outputting the activations at each
    layer for weighting together into an ELMo representation (with
    ``allennlp.modules.seq2seq_encoders.Elmo``).  This is a lower level class, useful
    for advanced uses, but most users should use ``allennlp.modules.seq2seq_encoders.Elmo``
    directly.

    Parameters
    ----------
    options_file : ``str``
        ELMo JSON options file
    weight_file : ``str``
        ELMo hdf5 weight file
    requires_grad: ``bool``, optional
        If True, compute gradient of ELMo parameters for fine tuning.
    vocab_to_cache : ``List[str]``, optional, (default = None).
        A list of words to pre-compute and cache character convolutions
        for. If you use this option, _ElmoBiLm expects that you pass word
        indices of shape (batch_size, timesteps) to forward, instead
        of character indices. If you use this option and pass a word which
        wasn't pre-cached, this will break.
    """
    def __init__(self,
                 options_file: str,
                 weight_file: str,
                 requires_grad: bool = False,
                 vocab_to_cache: List[str] = None) -> None:
        super(_ElmoBiLm, self).__init__()

        self._token_embedder = _ElmoCharacterEncoder(options_file, weight_file, requires_grad=requires_grad)

        self._requires_grad = requires_grad
        if requires_grad and vocab_to_cache:
            logging.warning("You are fine tuning ELMo and caching char CNN word vectors. "
                            "This behaviour is not guaranteed to be well defined, particularly. "
                            "if not all of your inputs will occur in the vocabulary cache.")
        # This is an embedding, used to look up cached
        # word vectors built from character level cnn embeddings.
        self._word_embedding = None
        self._bos_embedding: torch.Tensor = None
        self._eos_embedding: torch.Tensor = None
        if vocab_to_cache:
            logging.info("Caching character cnn layers for words in vocabulary.")
            # This sets 3 attributes, _word_embedding, _bos_embedding and _eos_embedding.
            # They are set in the method so they can be accessed from outside the
            # constructor.
            self.create_cached_cnn_embeddings(vocab_to_cache)

        with open(cached_path(options_file), 'r') as fin:
            options = json.load(fin)
        if not options['lstm'].get('use_skip_connections'):
            raise ConfigurationError('We only support pretrained biLMs with residual connections')
        self._elmo_lstm = ElmoLstm(input_size=options['lstm']['projection_dim'],
                                   hidden_size=options['lstm']['projection_dim'],
                                   cell_size=options['lstm']['dim'],
                                   num_layers=options['lstm']['n_layers'],
                                   memory_cell_clip_value=options['lstm']['cell_clip'],
                                   state_projection_clip_value=options['lstm']['proj_clip'],
                                   requires_grad=requires_grad)
        self._elmo_lstm.load_weights(weight_file)
        # Number of representation layers including context independent layer
        self.num_layers = options['lstm']['n_layers'] + 1

    def get_output_dim(self):
        return 2 * self._token_embedder.get_output_dim()

    def forward(self,  # pylint: disable=arguments-differ
                inputs: torch.Tensor,
                word_inputs: torch.Tensor = None) -> Dict[str, Union[torch.Tensor, List[torch.Tensor]]]:
        """
        Parameters
        ----------
        inputs: ``torch.Tensor``, required.
            Shape ``(batch_size, timesteps, 50)`` of character ids representing the current batch.
        word_inputs : ``torch.Tensor``, optional.
            If you passed a cached vocab, you can in addition pass a tensor of shape ``(batch_size, timesteps)``,
            which represent word ids which have been pre-cached.

        Returns
        -------
        Dict with keys:

        ``'activations'``: ``List[torch.Tensor]``
            A list of activations at each layer of the network, each of shape
            ``(batch_size, timesteps + 2, embedding_dim)``
        ``'mask'``:  ``torch.Tensor``
            Shape ``(batch_size, timesteps + 2)`` long tensor with sequence mask.

        Note that the output tensors all include additional special begin and end of sequence
        markers.
        """
        if self._word_embedding is not None and word_inputs is not None:
            try:
                mask_without_bos_eos = (word_inputs > 0).long()
                # The character cnn part is cached - just look it up.
                embedded_inputs = self._word_embedding(word_inputs) # type: ignore
                # shape (batch_size, timesteps + 2, embedding_dim)
                type_representation, mask = add_sentence_boundary_token_ids(
                        embedded_inputs,
                        mask_without_bos_eos,
                        self._bos_embedding,
                        self._eos_embedding
                )
            except RuntimeError:
                # Back off to running the character convolutions,
                # as we might not have the words in the cache.
                token_embedding = self._token_embedder(inputs)
                mask = token_embedding['mask']
                type_representation = token_embedding['token_embedding']
        else:
            token_embedding = self._token_embedder(inputs)
            mask = token_embedding['mask']
            type_representation = token_embedding['token_embedding']
        lstm_outputs = self._elmo_lstm(type_representation, mask)

        # Prepare the output.  The first layer is duplicated.
        # Because of minor differences in how masking is applied depending
        # on whether the char cnn layers are cached, we'll be defensive and
        # multiply by the mask here. It's not strictly necessary, as the
        # mask passed on is correct, but the values in the padded areas
        # of the char cnn representations can change.
        output_tensors = [
                torch.cat([type_representation, type_representation], dim=-1) * mask.float().unsqueeze(-1)
        ]
        for layer_activations in torch.chunk(lstm_outputs, lstm_outputs.size(0), dim=0):
            output_tensors.append(layer_activations.squeeze(0))

        return {
                'activations': output_tensors,
                'mask': mask,
        }

    def create_cached_cnn_embeddings(self, tokens: List[str]) -> None:
        """
        Given a list of tokens, this method precomputes word representations
        by running just the character convolutions and highway layers of elmo,
        essentially creating uncontextual word vectors. On subsequent forward passes,
        the word ids are looked up from an embedding, rather than being computed on
        the fly via the CNN encoder.

        This function sets 3 attributes:

        _word_embedding : ``torch.Tensor``
            The word embedding for each word in the tokens passed to this method.
        _bos_embedding : ``torch.Tensor``
            The embedding for the BOS token.
        _eos_embedding : ``torch.Tensor``
            The embedding for the EOS token.

        Parameters
        ----------
        tokens : ``List[str]``, required.
            A list of tokens to precompute character convolutions for.
        """
        tokens = [ELMoCharacterMapper.bos_token, ELMoCharacterMapper.eos_token] + tokens
        timesteps = 32
        batch_size = 32
        chunked_tokens = lazy_groups_of(iter(tokens), timesteps)

        all_embeddings = []
        device = get_device_of(next(self.parameters()))
        for batch in lazy_groups_of(chunked_tokens, batch_size):
            # Shape (batch_size, timesteps, 50)
            batched_tensor = batch_to_ids(batch)
            # NOTE: This device check is for when a user calls this method having
            # already placed the model on a device. If this is called in the
            # constructor, it will probably happen on the CPU. This isn't too bad,
            # because it's only a few convolutions and will likely be very fast.
            if device >= 0:
                batched_tensor = batched_tensor.cuda(device)
            output = self._token_embedder(batched_tensor)
            token_embedding = output["token_embedding"]
            mask = output["mask"]
            token_embedding, _ = remove_sentence_boundaries(token_embedding, mask)
            all_embeddings.append(token_embedding.view(-1, token_embedding.size(-1)))
        full_embedding = torch.cat(all_embeddings, 0)

        # We might have some trailing embeddings from padding in the batch, so
        # we clip the embedding and lookup to the right size.
        full_embedding = full_embedding[:len(tokens), :]
        embedding = full_embedding[2:len(tokens), :]
        vocab_size, embedding_dim = list(embedding.size())

        from allennlp.modules.token_embedders import Embedding # type: ignore
        self._bos_embedding = full_embedding[0, :]
        self._eos_embedding = full_embedding[1, :]
        self._word_embedding = Embedding(vocab_size, # type: ignore
                                         embedding_dim,
                                         weight=embedding.data,
                                         trainable=self._requires_grad,
                                         padding_index=0)
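The double lazy_groups_of call in create_cached_cnn_embeddings can look opaque, so the standalone sketch below shows the same chunking eagerly with dummy tokens: each inner group of up to 32 tokens is treated as a pseudo-sentence, and up to 32 pseudo-sentences form one character-CNN batch.

def groups_of(seq, size):
    # Eager stand-in for allennlp.common.util.lazy_groups_of, for illustration only.
    return [seq[i:i + size] for i in range(0, len(seq), size)]

tokens = ["<S>", "</S>"] + ["word%d" % i for i in range(100)]  # BOS/EOS prepended, as above
chunked_tokens = groups_of(tokens, 32)    # pseudo-sentences of <= 32 tokens
batches = groups_of(chunked_tokens, 32)   # batches of <= 32 pseudo-sentences
print(len(chunked_tokens), len(batches))  # 4 1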
Example #10
    def __init__(self, conf: Dict,
                 word_batch: WordBatch,
                 char_batch: CharacterBatch):
        super(BiLMBase, self).__init__()
        self.conf = conf

        c = conf['token_embedder']
        if word_batch is not None:
            if 'pretrained' in c:
                embs = load_embedding_txt(c['pretrained'], c['has_header'])
                logger.info('loaded {0} embedding entries.'.format(len(embs[0])))
            else:
                embs = None
            word_embedder = Embeddings(c['word_dim'], word_batch.mapping, embs=embs, fix_emb=False, normalize=False)
        else:
            word_embedder = None

        if char_batch is not None:
            dim = c.get('char_dim') if c.get('char_dim', 0) > 0 else c.get('wordpiece_dim')
            char_embedder = Embeddings(dim, char_batch.mapping, embs=None, fix_emb=False, normalize=False)
        else:
            char_embedder = None

        token_embedder_name = c['name'].lower()
        if token_embedder_name == 'cnn':
            self.token_embedder = ConvTokenEmbedder(output_dim=conf['encoder']['projection_dim'],
                                                    word_embedder=word_embedder,
                                                    char_embedder=char_embedder,
                                                    filters=c['filters'],
                                                    n_highway=c['n_highway'],
                                                    activation=c['activation'])
        elif token_embedder_name == 'lstm':
            self.token_embedder = LstmTokenEmbedder(output_dim=conf['encoder']['projection_dim'],
                                                    word_embedder=word_embedder,
                                                    char_embedder=char_embedder,
                                                    dropout=conf['dropout'])
        elif token_embedder_name == 'grecnn':
            self.token_embedder = GatedRecNNTokenEmbedder(output_dim=conf['encoder']['projection_dim'],
                                                          word_embedder=word_embedder,
                                                          char_embedder=char_embedder)
        elif token_embedder_name == 'sum':
            self.token_embedder = SumTokenEmbedder(output_dim=conf['encoder']['projection_dim'],
                                                   word_embedder=word_embedder,
                                                   char_embedder=char_embedder)
        else:
            raise ValueError('Unknown token embedder name: {}'.format(token_embedder_name))

        self.add_sentence_boundary = c.get('add_sentence_boundary', False)
        self.add_sentence_boundary_ids = c.get('add_sentence_boundary_ids', False)
        assert not (self.add_sentence_boundary and self.add_sentence_boundary_ids)

        if self.add_sentence_boundary:
            dim = self.token_embedder.get_output_dim()
            self.bos_embeddings = torch.nn.Parameter(torch.randn(dim) / math.sqrt(dim))
            self.eos_embeddings = torch.nn.Parameter(torch.randn(dim) / math.sqrt(dim))

        c = conf['encoder']
        encoder_name = c['name'].lower()
        if encoder_name == 'elmo':
            # NOTE: for a fair comparison, we set stateful to False
            self.encoder = ElmoLstm(input_size=c['projection_dim'],
                                    hidden_size=c['projection_dim'],
                                    cell_size=c['dim'],
                                    requires_grad=True,
                                    num_layers=c['n_layers'],
                                    recurrent_dropout_probability=conf['dropout'],
                                    memory_cell_clip_value=c['cell_clip'],
                                    state_projection_clip_value=c['proj_clip'],
                                    stateful=False)
        elif encoder_name == 'lstm':
            self.encoder = LstmbiLm(input_size=c['projection_dim'],
                                    hidden_size=c['projection_dim'],
                                    num_layers=c['n_layers'],
                                    dropout=conf['dropout'])
        elif encoder_name == 'bengio03highway':
            self.encoder = Bengio03HighwayBiLm(width=c['width'],
                                               input_size=c['projection_dim'],
                                               hidden_size=c['projection_dim'],
                                               n_layers=c['n_layers'],
                                               n_highway=c['n_highway'],
                                               use_position=c.get('position', False),
                                               dropout=conf['dropout'])
        elif encoder_name == 'bengio03highway_v2':
            self.encoder = Bengio03HighwayBiLmV2(width=c['width'],
                                                 input_size=c['projection_dim'],
                                                 hidden_size=c['projection_dim'],
                                                 n_layers=c['n_layers'],
                                                 n_highway=c['n_highway'],
                                                 use_position=c.get('position', False),
                                                 dropout=conf['dropout'])
        elif encoder_name == 'bengio03resnet':
            self.encoder = Bengio03ResNetBiLm(width=c['width'],
                                              input_size=c['projection_dim'],
                                              hidden_size=c['projection_dim'],
                                              n_layers=c['n_layers'],
                                              use_position=c.get('position', False),
                                              dropout=conf['dropout'])
        elif encoder_name == 'lblhighway':
            self.encoder = LBLHighwayBiLm(width=c['width'],
                                          input_size=c['projection_dim'],
                                          hidden_size=c['projection_dim'],
                                          n_layers=c['n_layers'],
                                          n_highway=c['n_highway'],
                                          use_position=c.get('position', False),
                                          dropout=conf['dropout'])
        elif encoder_name == 'lblhighway_v2':
            self.encoder = LBLHighwayBiLmV2(width=c['width'],
                                            input_size=c['projection_dim'],
                                            hidden_size=c['projection_dim'],
                                            n_layers=c['n_layers'],
                                            n_highway=c['n_highway'],
                                            use_position=c.get('position', False),
                                            dropout=conf['dropout'])
        elif encoder_name == 'lblresnet':
            self.encoder = LBLResNetBiLm(width=c['width'],
                                         input_size=c['projection_dim'],
                                         hidden_size=c['projection_dim'],
                                         n_layers=c['n_layers'],
                                         use_position=c.get('position', False),
                                         dropout=conf['dropout'])
        elif encoder_name == 'selfattn':
            self.encoder = SelfAttentiveLBLBiLM(width=c['width'],
                                                input_size=c['projection_dim'],
                                                hidden_size=c['projection_dim'],
                                                n_heads=c['n_heads'],
                                                n_layers=c['n_layers'],
                                                n_highway=c['n_highway'],
                                                use_position=c.get('position', False),
                                                use_relative_position=c.get('relative_position_weights', False),
                                                dropout=conf['dropout'])
        elif encoder_name == 'selfattn_v2':
            self.encoder = SelfAttentiveLBLBiLMV2(width=c['width'],
                                                  input_size=c['projection_dim'],
                                                  hidden_size=c['projection_dim'],
                                                  n_heads=c['n_heads'],
                                                  n_layers=c['n_layers'],
                                                  n_highway=c['n_highway'],
                                                  use_position=c.get('position', False),
                                                  use_relative_position=c.get('relative_position_weights', False),
                                                  dropout=conf['dropout'])
        elif encoder_name == 'selfattn_v3':
            self.encoder = SelfAttentiveLBLBiLMV3(width=c['width'],
                                                  input_size=c['projection_dim'],
                                                  hidden_size=c['projection_dim'],
                                                  n_heads=c['n_heads'],
                                                  n_layers=c['n_layers'],
                                                  n_highway=c['n_highway'],
                                                  use_position=c.get('position', False),
                                                  use_relative_position=c.get('relative_position_weights', False),
                                                  dropout=conf['dropout'])
        elif encoder_name == 'cnn':
            self.encoder = GatedCnnLm(input_size=c['projection_dim'],
                                      layers=c['layers'],
                                      dropout=conf['dropout'])
        else:
            raise ValueError('Unknown encoder name: {}'.format(encoder_name))

        self.output_dim = conf['encoder']['projection_dim']

        self.token_embedding_time = 0
        self.encoding_time = 0
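Example #10 is driven entirely by a conf dictionary. The sketch below lists only the keys this constructor reads for the 'cnn' token embedder plus 'elmo' encoder combination; the values are illustrative placeholders, not a recommended configuration.

conf = {
    "token_embedder": {
        "name": "cnn",
        "char_dim": 50,
        "word_dim": 100,
        "filters": [[1, 32], [2, 32], [3, 64]],
        "n_highway": 2,
        "activation": "relu",
        # "pretrained" / "has_header" would additionally load word embeddings from text.
        "add_sentence_boundary": False,
        "add_sentence_boundary_ids": False,
    },
    "encoder": {
        "name": "elmo",
        "projection_dim": 128,
        "dim": 1024,
        "n_layers": 2,
        "cell_clip": 3,
        "proj_clip": 3,
    },
    "dropout": 0.1,
}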