Example #1
class Vocab(object):
    """ Class for vocabulary (feature map) """
    def __init__(self, filename, bpe_codes=None, reverse_seq=False):
        """ Initializes the object.

        Args:
            filename: Path to a vocabulary file containing one word per line.
              Each word is mapped to its line number (starting from 0).
            bpe_codes: A dict of BPE parameters.
            reverse_seq: Whether to reverse the sequence when encoding the
              words into ids.

        Raises:
            ValueError: if `filename` or the BPE codes file referenced by
              `bpe_codes` does not exist.
        """
        self.vocab_dict, self.vocab_r_dict, _ = create_vocabulary_lookup_table_numpy(
            filename)
        self._sos_id = self.vocab_dict[Constants.SEQUENCE_START]
        self._eos_id = self.vocab_dict[Constants.SEQUENCE_END]
        self._unk_id = self.vocab_dict[Constants.UNKOWN]
        self._vocab_size = len(self.vocab_dict)
        self._reverse_seq = reverse_seq
        self._bpe = None
        if bpe_codes and "codes" in bpe_codes:
            if "vocab" not in bpe_codes:
                bpe_codes["vocab"] = filename
            self._bpe = BPE(**bpe_codes)

    @property
    def sos_id(self):
        """ Returns the id of the symbol indicating the start of sentence. """
        return self._sos_id

    @property
    def eos_id(self):
        """ Returns the id of the symbol indicating the end of sentence. """
        return self._eos_id

    @property
    def pad_id(self):
        """ Returns the id of the symbol for padding. """
        return self._eos_id

    @property
    def unk_id(self):
        """ Returns the id of the special UNK symbol. """
        return self._unk_id

    @property
    def vocab_size(self):
        """ Returns the size of vocabulary. """
        return self._vocab_size

    def __call__(self, words):
        """ A wrapper method of `convert_to_idlist()` for `map` function,
        because this method is serializable.

        Args:
            words: A list of word tokens.

        Returns: A list of token ids with an extra `eos_id`.
        """
        return self.convert_to_idlist(words)

    def bpe_encode(self, sentence):
        """ Applies BPE encoding.

        Args:
            sentence: A string of sentence or a list of word tokens.

        Returns: The BPE encoding result of the same type as `sentence`.
        """
        if self._bpe:
            return self._bpe.encode(sentence)
        return sentence

    def convert_to_idlist(self, words, n_words=-1):
        """ Maps the sentence into sequence of ids.
              If BPE provided, apply BPE first.

        Args:
            words: A list of word tokens.
            n_words: An integer. If provided and > 0, token ids that equal
              or exceed this value are mapped to the UNK id.

        Returns: A list of token ids with an extra `eos_id`.
        """
        if self._bpe:
            words = self._bpe.encode(words)
        if not isinstance(words, list):
            words = words.split()
        ss = [
            self.vocab_dict[w] if w in self.vocab_dict else self.unk_id
            for w in words
        ]
        if n_words > 0:
            ss = [w if w < n_words else self.unk_id for w in ss]
        if self._reverse_seq:
            ss = ss[::-1]
        ss += [self.eos_id]
        return ss

    def decorate_with_unk(self, words, unk_symbol=Constants.UNKOWN):
        """ Append (UNK) to the words that are not in the vocabulary.

        Args:
            words: A string or a list of word tokens.
            unk_symbol: The UNK symbol used to mark out-of-vocabulary words.

        Returns: A string or a list of word tokens.
        """
        if isinstance(words, list):
            return [
                w if w in self.vocab_dict else w + "({})".format(unk_symbol)
                for w in words
            ]
        elif isinstance(words, six.string_types):
            return " ".join([
                w if w in self.vocab_dict else w + "({})".format(unk_symbol)
                for w in words.strip().split()
            ])
        else:
            raise ValueError("Unrecognized type: {}".format(type(words)))

    def convert_to_wordlist(self,
                            pred_ids,
                            bpe_decoding=True,
                            reverse_seq=True):
        """ Converts list of token ids to list of word tokens.

        Args:
            pred_ids: A list of integers indicating token ids.
            bpe_decoding: Whether to recover from BPE. Set to
              false only when using this for displaying attention.
            reverse_seq: Whether to reverse the sequence after transformation.
              Set to false only when using this for displaying attention.

        Returns: A list of word tokens.
        """
        pred_tokens = [self.vocab_r_dict[i] for i in pred_ids]
        if bpe_decoding and self._bpe:
            pred_tokens = self._bpe.decode(pred_tokens)
        if Constants.SEQUENCE_END in pred_tokens:
            if len(pred_tokens) == 1:
                return ['']
            pred_tokens = pred_tokens[:pred_tokens.index(Constants.SEQUENCE_END)]
        if reverse_seq and self._reverse_seq:
            return pred_tokens[::-1]
        return pred_tokens

    def __getitem__(self, item):
        """ Function for operator [].

        Args:
            item: A string or an integer number.

        Returns: The word token if `item` is an integer number,
          or token id if `item` is a string.

        """
        if type(item) is int:
            if item >= self.vocab_size:
                raise ValueError(
                    "id {} exceeded the size of vocabulary (size={})".format(
                        item, self.vocab_size))
            return self.vocab_r_dict[item]
        elif isinstance(item, six.string_types):
            return self.vocab_dict[
                item] if item in self.vocab_dict else self.unk_id
        else:
            raise ValueError("Unrecognized type of item: %s" % str(type(item)))

    @staticmethod
    def equals(vocab1, vocab2):
        """ Compares two `Vocab` objects.

        Args:
            vocab1: A `Vocab` object.
            vocab2: A `Vocab` object.

        Returns: True if two objects are the same, False otherwise.
        """
        if vocab1.vocab_size != vocab2.vocab_size:
            return False
        for key, val in vocab1.vocab_dict.items():
            if key not in vocab2.vocab_dict:
                return False
            elif vocab2[key] != val:
                return False
        return True

    def equals_to(self, vocab):
        """ Compares `self` and `vocab`

        Args:
            vocab: A `Vocab object`

        Returns: True if two objects are the same, False otherwise.
        """
        assert isinstance(vocab, Vocab)
        return Vocab.equals(self, vocab)
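
Below is a minimal usage sketch for the class in Example #1 (not part of the original source). The file paths and the sentence are hypothetical placeholders, and it assumes the NJUNMT-tf helpers the class depends on (Constants, BPE, create_vocabulary_lookup_table_numpy) are importable and that the vocabulary file contains the special start/end/UNK symbols.

# Hedged usage sketch -- paths and sentence are placeholders, not from the source.
vocab = Vocab(
    filename="data/vocab.en",               # hypothetical vocabulary file
    bpe_codes={"codes": "data/codes.bpe"},  # "vocab" defaults to `filename` above
    reverse_seq=False)

# __call__ delegates to convert_to_idlist(); an eos_id is appended at the end.
token_ids = vocab("a simple test sentence".split())

# Round trip back to tokens: BPE merges are undone and SEQUENCE_END is stripped.
print(vocab.convert_to_wordlist(token_ids))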
Example #2
File: vocab.py  Project: cjliux/NJUNMT-tf
class Vocab(object):
    """ Class for vocabulary (feature map) """
    def __init__(self, filename, bpe_codes_file=None):
        """ Initializes the object.

        Args:
            filename: Path to a vocabulary file containing one word per line.
              Each word is mapped to its line number (starting from 0).
            bpe_codes_file: Path to a BPE codes file. If provided, BPE is
              applied before feature mapping.

        Raises:
            ValueError: if `filename` or `bpe_codes_file` does not exist.
        """
        self.vocab_dict, self.vocab_r_dict, _ = create_vocabulary_lookup_table_numpy(
            filename)
        self._sos_id = self.vocab_dict["SEQUENCE_START"]
        self._eos_id = self.vocab_dict["SEQUENCE_END"]
        self._unk_id = self.vocab_dict["UNK"]
        self._vocab_size = len(self.vocab_dict)
        self._bpe = None
        if bpe_codes_file:
            if not gfile.Exists(bpe_codes_file):
                raise ValueError(
                    "bpe_codes_file: {} does not exist".format(bpe_codes_file))
            self._bpe = BPE(bpe_codes_file)

    @property
    def sos_id(self):
        """ Returns the id of the symbol indicating the start of sentence. """
        return self._sos_id

    @property
    def eos_id(self):
        """ Returns the id of the symbol indicating the end of sentence. """
        return self._eos_id

    @property
    def unk_id(self):
        """ Returns the id of the special UNK symbol. """
        return self._unk_id

    @property
    def vocab_size(self):
        """ Returns the size of vocabulary. """
        return self._vocab_size

    def __call__(self, words):
        """ A wrapper method of `convert_to_idlist()` for `map` function,
        because this method is serializable.

        Args:
            words: A list of word tokens.

        Returns: A list of token ids with an extra `eos_id`.
        """
        return self.convert_to_idlist(words)

    def bpe_encode(self, sentence):
        """ Applies BPE encoding.

        Args:
            sentence: A string of sentence or a list of word tokens.

        Returns: The BPE encoding result of the same type as `sentence`.
        """
        if self._bpe:
            return self._bpe.encode(sentence)
        return sentence

    def convert_to_idlist(self, words, n_words=-1):
        """ Maps the sentence into sequence of ids.
              If BPE provided, apply BPE first.

        Args:
            words: A list of word tokens.
            n_words: An integer. If provided and > 0, token ids that equal
              or exceed this value are mapped to the UNK id.

        Returns: A list of token ids with an extra `eos_id`.
        """
        if self._bpe:
            words = self._bpe.encode(words)
        ss = [
            self.vocab_dict[w] if w in self.vocab_dict else self.unk_id
            for w in words
        ]
        if n_words > 0:
            ss = [w if w < n_words else self.unk_id for w in ss]
        ss += [self.eos_id]
        return ss

    def convert_to_wordlist(self, pred_ids, bpe_decoding=True):
        """ Converts list of token ids to list of word tokens.

        Args:
            pred_ids: A list of integers indicating token ids.
            bpe_decoding: Whether to recover from BPE. Set
              bpe_decoding to false only when using this
              for displaying attention.

        Returns: A list of word tokens.
        """
        pred_tokens = [self.vocab_r_dict[i] for i in pred_ids]
        if bpe_decoding and self._bpe:
            pred_tokens = self._bpe.decode(pred_tokens)
        if "SEQUENCE_END" in pred_tokens:
            if len(pred_tokens) == 1:
                return ['']
            pred_tokens = pred_tokens[:pred_tokens.index("SEQUENCE_END")]
        # if sys.version_info < (3, 0):
        #     return [w.decode("utf-8") for w in pred_tokens]
        # else:
        return pred_tokens

    def __getitem__(self, item):
        """ Function for operator [].

        Args:
            item: A string or an integer number.

        Returns: The word token if `item` is an integer number,
          or token id if `item` is a string.

        """
        if type(item) is int:
            if item >= self.vocab_size:
                raise ValueError(
                    "id {} exceeded the size of vocabulary (size={})".format(
                        item, self.vocab_size))
            return self.vocab_r_dict[item]
        elif type(item) is str:
            return self.vocab_dict[
                item] if item in self.vocab_dict else self.unk_id
        else:
            raise ValueError("Unrecognized type of item: %s" % str(type(item)))

    @staticmethod
    def equals(vocab1, vocab2):
        """ Compares two `Vocab` objects.

        Args:
            vocab1: A `Vocab` object.
            vocab2: A `Vocab` object.

        Returns: True if two objects are the same, False otherwise.
        """
        if vocab1.vocab_size != vocab2.vocab_size:
            return False
        for key, val in vocab1.vocab_dict.items():
            if key not in vocab2.vocab_dict:
                return False
            elif vocab2[key] != val:
                return False
        return True

    def equals_to(self, vocab):
        """ Compares `self` and `vocab`

        Args:
            vocab: A `Vocab object`

        Returns: True if two objects are the same, False otherwise.
        """
        assert isinstance(vocab, Vocab)
        return Vocab.equals(self, vocab)
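
A short sketch of the operator [] and comparison helpers in the example above; the file names are again hypothetical, and it assumes both Vocab objects are built from the same underlying word list.

# Hedged sketch -- file paths are placeholders.
src_vocab = Vocab("data/vocab.zh")
bpe_vocab = Vocab("data/vocab.zh", bpe_codes_file="data/codes.bpe")

print(src_vocab["SEQUENCE_END"])    # word -> id via operator []
print(src_vocab[src_vocab.eos_id])  # id -> word via operator []
print(src_vocab["never-seen-token"] == src_vocab.unk_id)  # OOV words map to UNK

# Two vocabularies compare equal iff they have the same size and every word
# maps to the same id in both; the BPE setting plays no part.
print(Vocab.equals(src_vocab, bpe_vocab))
print(src_vocab.equals_to(bpe_vocab))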
Example #3
class Vocab(object):
    """ Class for vocabulary (feature map) """

    def __init__(self, filename, bpe_codes=None, reverse_seq=False):
        """ Initializes the object.

        Args:
            filename: Path to a vocabulary file containing one word per line.
              Each word is mapped to its line number (starting from 0).
            bpe_codes: A dict of BPE parameters.
            reverse_seq: Whether to reverse the sequence when encoding the
              words into ids.

        Raises:
            ValueError: if `filename` or the BPE codes file referenced by
              `bpe_codes` does not exist.
        """
        self.vocab_dict, self.vocab_r_dict, _ = create_vocabulary_lookup_table_numpy(filename)
        self._sos_id = self.vocab_dict[Constants.SEQUENCE_START]
        self._eos_id = self.vocab_dict[Constants.SEQUENCE_END]
        self._unk_id = self.vocab_dict[Constants.UNKOWN]
        self._vocab_size = len(self.vocab_dict)
        self._reverse_seq = reverse_seq
        self._bpe = None
        if bpe_codes and "codes" in bpe_codes:
            if "vocab" not in bpe_codes:
                bpe_codes["vocab"] = filename
            self._bpe = BPE(**bpe_codes)

    @property
    def sos_id(self):
        """ Returns the id of the symbol indicating the start of sentence. """
        return self._sos_id

    @property
    def eos_id(self):
        """ Returns the id of the symbol indicating the end of sentence. """
        return self._eos_id

    @property
    def pad_id(self):
        """ Returns the id of the symbol for padding. """
        return self._eos_id

    @property
    def unk_id(self):
        """ Returns the id of the special UNK symbol. """
        return self._unk_id

    @property
    def vocab_size(self):
        """ Returns the size of vocabulary. """
        return self._vocab_size

    def __call__(self, words):
        """ A wrapper method of `convert_to_idlist()` for `map` function,
        because this method is serializable.

        Args:
            words: A list of word tokens.

        Returns: A list of token ids with an extra `eos_id`.
        """
        return self.convert_to_idlist(words)

    def bpe_encode(self, sentence):
        """ Applies BPE encoding.

        Args:
            sentence: A string of sentence or a list of word tokens.

        Returns: The BPE encoding result of the same type as `sentence`.
        """
        if self._bpe:
            return self._bpe.encode(sentence)
        return sentence

    def convert_to_idlist(self, words, n_words=-1):
        """ Maps the sentence into sequence of ids.
              If BPE provided, apply BPE first.

        Args:
            words: A list of word tokens.
            n_words: An integer. If provided and > 0, token ids that equal
              or exceed this value are mapped to the UNK id.

        Returns: A list of token ids with an extra `eos_id`.
        """
        if self._bpe:
            words = self._bpe.encode(words)
        if not isinstance(words, list):
            words = words.split()
        ss = [self.vocab_dict[w] if w in self.vocab_dict else self.unk_id
              for w in words]
        if n_words > 0:
            ss = [w if w < n_words else self.unk_id for w in ss]
        if self._reverse_seq:
            ss = ss[::-1]
        ss += [self.eos_id]
        return ss

    def decorate_with_unk(self, words, unk_symbol=Constants.UNKOWN):
        """ Append (UNK) to the words that are not in the vocabulary.

        Args:
            words: A string or a list of word tokens.
            unk_symbol: The UNK symbol used to mark out-of-vocabulary words.

        Returns: A string or a list of word tokens.
        """
        if isinstance(words, list):
            return [w if w in self.vocab_dict else w + "({})".format(unk_symbol)
                    for w in words]
        elif isinstance(words, six.string_types):
            return " ".join([w if w in self.vocab_dict else w + "({})".format(unk_symbol)
                             for w in words.strip().split()])
        else:
            raise ValueError("Unrecognized type: {}".format(type(words)))

    def convert_to_wordlist(self, pred_ids, bpe_decoding=True, reverse_seq=True):
        """ Converts list of token ids to list of word tokens.

        Args:
            pred_ids: A list of integers indicating token ids.
            bpe_decoding: Whether to recover from BPE. Set to
              false only when using this for displaying attention.
            reverse_seq: Whether to reverse the sequence after transformation.
              Set to false only when using this for displaying attention.

        Returns: A list of word tokens.
        """
        pred_tokens = [self.vocab_r_dict[i] for i in pred_ids]
        if bpe_decoding and self._bpe:
            pred_tokens = self._bpe.decode(pred_tokens)
        if Constants.SEQUENCE_END in pred_tokens:
            if len(pred_tokens) == 1:
                return ['']
            pred_tokens = pred_tokens[:pred_tokens.index(Constants.SEQUENCE_END)]
        if reverse_seq and self._reverse_seq:
            return pred_tokens[::-1]
        return pred_tokens

    def __getitem__(self, item):
        """ Function for operator [].

        Args:
            item: A string or an integer number.

        Returns: The word token if `item` is an integer number,
          or token id if `item` is a string.

        """
        if type(item) is int:
            if item >= self.vocab_size:
                raise ValueError("id {} exceeded the size of vocabulary (size={})".format(item, self.vocab_size))
            return self.vocab_r_dict[item]
        elif isinstance(item, six.string_types):
            return self.vocab_dict[item] if item in self.vocab_dict else self.unk_id
        else:
            raise ValueError("Unrecognized type of item: %s" % str(type(item)))

    @staticmethod
    def equals(vocab1, vocab2):
        """ Compares two `Vocab` objects.

        Args:
            vocab1: A `Vocab` object.
            vocab2: A `Vocab` object.

        Returns: True if two objects are the same, False otherwise.
        """
        if vocab1.vocab_size != vocab2.vocab_size:
            return False
        for key, val in vocab1.vocab_dict.items():
            if key not in vocab2.vocab_dict:
                return False
            elif vocab2[key] != val:
                return False
        return True

    def equals_to(self, vocab):
        """ Compares `self` and `vocab`

        Args:
            vocab: A `Vocab object`

        Returns: True if two objects are the same, False otherwise.
        """
        assert isinstance(vocab, Vocab)
        return Vocab.equals(self, vocab)
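
A brief sketch of the UNK decoration and sequence reversal that this variant supports; all data here is hypothetical.

# Hedged sketch -- paths and tokens are placeholders.
vocab = Vocab("data/vocab.de", reverse_seq=True)

# Out-of-vocabulary words are marked for display, e.g. "seltenesWort(UNK)".
print(vocab.decorate_with_unk("ein seltenesWort test"))

# With reverse_seq=True the id list is reversed before eos_id is appended,
# and convert_to_wordlist() reverses it back by default.
ids = vocab.convert_to_idlist(["ein", "test"])
print(vocab.convert_to_wordlist(ids))   # -> ['ein', 'test']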