class Vocab(object):
    """ Class for vocabulary (feature map).

    Maps word tokens to integer ids and back. Optionally applies BPE
    before the lookup and reverses the id sequence on encoding.
    """

    def __init__(self, filename, bpe_codes=None, reverse_seq=False):
        """ Initializes the object.

        Args:
            filename: Path to a vocabulary file containing one word per
              line. Each word is mapped to its line number (starting
              from 0).
            bpe_codes: A dict of BPE parameters. BPE is enabled only when
              the dict contains a "codes" entry; a missing "vocab" entry
              defaults to `filename`. The dict itself is never modified.
            reverse_seq: Whether to reverse the sequence when encode the
              words to ids.

        Raises:
            ValueError: presumably if `filename` does not exist (raised
              by the lookup-table helper -- confirm there).
        """
        self.vocab_dict, self.vocab_r_dict, _ = \
            create_vocabulary_lookup_table_numpy(filename)
        self._sos_id = self.vocab_dict[Constants.SEQUENCE_START]
        self._eos_id = self.vocab_dict[Constants.SEQUENCE_END]
        self._unk_id = self.vocab_dict[Constants.UNKOWN]
        self._vocab_size = len(self.vocab_dict)
        self._reverse_seq = reverse_seq
        self._bpe = None
        if bpe_codes and "codes" in bpe_codes:
            # Work on a copy: writing "vocab" into the caller's dict would
            # leak this vocabulary's path into any other Vocab that is
            # later built from the same parameter dict.
            bpe_params = dict(bpe_codes)
            bpe_params.setdefault("vocab", filename)
            self._bpe = BPE(**bpe_params)

    @property
    def sos_id(self):
        """ Returns the id of the symbol indicating the start of sentence. """
        return self._sos_id

    @property
    def eos_id(self):
        """ Returns the id of the symbol indicating the end of sentence. """
        return self._eos_id

    @property
    def pad_id(self):
        """ Returns the id of the symbol for padding (the same id as
        `eos_id`). """
        return self._eos_id

    @property
    def unk_id(self):
        """ Returns the id of the special UNK symbol. """
        return self._unk_id

    @property
    def vocab_size(self):
        """ Returns the size of vocabulary. """
        return self._vocab_size

    def __call__(self, words):
        """ A wrapper method of `convert_to_idlist()` for `map` function,
        because this method is serializable.

        Args:
            words: A list of word tokens.

        Returns: A list of token ids with an extra `eos_id`.
        """
        return self.convert_to_idlist(words)

    def bpe_encode(self, sentence):
        """ Applies BPE encoding.

        Args:
            sentence: A string of sentence or a list of word tokens.

        Returns: The BPE encoding result of the same type as `sentence`,
          or `sentence` unchanged when no BPE is configured.
        """
        if self._bpe:
            return self._bpe.encode(sentence)
        return sentence

    def convert_to_idlist(self, words, n_words=-1):
        """ Maps the sentence into sequence of ids.
          If BPE provided, apply BPE first.

        Args:
            words: A list of word tokens, or a whitespace-separated string.
            n_words: An integer number. If provided and > 0, token ids
              greater than or equal to this value are mapped into UNK id.

        Returns: A list of token ids with an extra `eos_id`.
        """
        if self._bpe:
            words = self._bpe.encode(words)
        if not isinstance(words, list):
            words = words.split()
        unk = self.unk_id
        ids = [self.vocab_dict.get(w, unk) for w in words]
        if n_words > 0:
            ids = [i if i < n_words else unk for i in ids]
        if self._reverse_seq:
            ids = ids[::-1]
        # EOS is appended after the optional reversal so that it always
        # terminates the sequence.
        ids.append(self.eos_id)
        return ids

    def decorate_with_unk(self, words, unk_symbol=Constants.UNKOWN):
        """ Append (UNK) to the words that are not in the vocabulary.

        Args:
            words: A string or a list of word tokens.
            unk_symbol: A unk symbol.

        Returns: A string or a list of word tokens.

        Raises:
            ValueError: if `words` is neither a list nor a string.
        """
        suffix = "({})".format(unk_symbol)

        def _mark(token):
            return token if token in self.vocab_dict else token + suffix

        if isinstance(words, list):
            return [_mark(w) for w in words]
        if isinstance(words, six.string_types):
            return " ".join(_mark(w) for w in words.strip().split())
        raise ValueError("Unrecognized type: {}".format(type(words)))

    def convert_to_wordlist(self, pred_ids, bpe_decoding=True,
                            reverse_seq=True):
        """ Converts list of token ids to list of word tokens.

        The result is truncated at the first end-of-sentence symbol; a
        sequence that consists solely of that symbol yields `['']`.

        Args:
            pred_ids: A list of integers indicating token ids.
            bpe_decoding: Whether to recover from BPE. Set to
              false only when using this for displaying attention.
            reverse_seq: Whether to reverse the sequence after
              transformation. Set to false only when using this
              for displaying attention.

        Returns: A list of word tokens.
        """
        pred_tokens = [self.vocab_r_dict[i] for i in pred_ids]
        if bpe_decoding and self._bpe:
            pred_tokens = self._bpe.decode(pred_tokens)
        if Constants.SEQUENCE_END in pred_tokens:
            if len(pred_tokens) == 1:
                return ['']
            pred_tokens = pred_tokens[
                :pred_tokens.index(Constants.SEQUENCE_END)]
        if reverse_seq and self._reverse_seq:
            return pred_tokens[::-1]
        return pred_tokens

    def __getitem__(self, item):
        """ Function for operator [].

        Args:
            item: A string or an integer number.

        Returns: The word token if `item` is an integer number,
          or token id if `item` is a string (UNK id for unknown strings).

        Raises:
            ValueError: if an integer `item` lies outside
              `[0, vocab_size)` or `item` has an unsupported type.
        """
        if isinstance(item, six.integer_types):
            if item >= self.vocab_size:
                raise ValueError(
                    "id {} exceeded the size of vocabulary (size={})".format(
                        item, self.vocab_size))
            if item < 0:
                # Negative ids are never valid; fail with the same error
                # type as ids that are too large instead of falling
                # through to the raw table lookup.
                raise ValueError(
                    "id {} must be non-negative".format(item))
            return self.vocab_r_dict[item]
        if isinstance(item, six.string_types):
            return self.vocab_dict.get(item, self.unk_id)
        raise ValueError("Unrecognized type of item: %s" % str(type(item)))

    @staticmethod
    def equals(vocab1, vocab2):
        """ Compares two `Vocab` objects.

        Args:
            vocab1: A `Vocab` object.
            vocab2: A `Vocab` object.

        Returns: True if both have the same size and map every word to
          the same id, False otherwise.
        """
        if vocab1.vocab_size != vocab2.vocab_size:
            return False
        other = vocab2.vocab_dict
        # Compare the raw tables directly so the UNK fallback and type
        # checks of `__getitem__` cannot interfere with the comparison.
        return all(key in other and other[key] == val
                   for key, val in vocab1.vocab_dict.items())

    def equals_to(self, vocab):
        """ Compares `self` and `vocab`.

        Args:
            vocab: A `Vocab` object.

        Returns: True if two objects are the same, False otherwise.
        """
        assert isinstance(vocab, Vocab)
        return Vocab.equals(self, vocab)
class Vocab(object):
    """ Class for vocabulary (feature map).

    Maps word tokens to integer ids and back, optionally applying BPE
    before the lookup.
    """

    def __init__(self, filename, bpe_codes_file=None):
        """ Initializes the object.

        Args:
            filename: Path to a vocabulary file containing one word per
              line. Each word is mapped to its line number (starting
              from 0).
            bpe_codes_file: Path to a BPE code file. If provided,
              do BPE before feature mapping.

        Raises:
            ValueError: if `bpe_codes_file` is given but does not exist
              (a missing `filename` is presumably reported by the
              lookup-table helper -- confirm there).
        """
        self.vocab_dict, self.vocab_r_dict, _ = \
            create_vocabulary_lookup_table_numpy(filename)
        self._sos_id = self.vocab_dict["SEQUENCE_START"]
        self._eos_id = self.vocab_dict["SEQUENCE_END"]
        self._unk_id = self.vocab_dict["UNK"]
        self._vocab_size = len(self.vocab_dict)
        self._bpe = None
        # Truthiness already rules out both None and the empty string.
        if bpe_codes_file:
            if not gfile.Exists(bpe_codes_file):
                raise ValueError(
                    "bpe_codes_file: {} not exists".format(bpe_codes_file))
            self._bpe = BPE(bpe_codes_file)

    @property
    def sos_id(self):
        """ Returns the id of the symbol indicating the start of sentence. """
        return self._sos_id

    @property
    def eos_id(self):
        """ Returns the id of the symbol indicating the end of sentence. """
        return self._eos_id

    @property
    def unk_id(self):
        """ Returns the id of the special UNK symbol. """
        return self._unk_id

    @property
    def vocab_size(self):
        """ Returns the size of vocabulary. """
        return self._vocab_size

    def __call__(self, words):
        """ A wrapper method of `convert_to_idlist()` for `map` function,
        because this method is serializable.

        Args:
            words: A list of word tokens.

        Returns: A list of token ids with an extra `eos_id`.
        """
        return self.convert_to_idlist(words)

    def bpe_encode(self, sentence):
        """ Applies BPE encoding.

        Args:
            sentence: A string of sentence or a list of word tokens.

        Returns: The BPE encoding result of the same type as `sentence`,
          or `sentence` unchanged when no BPE is configured.
        """
        if self._bpe:
            return self._bpe.encode(sentence)
        return sentence

    def convert_to_idlist(self, words, n_words=-1):
        """ Maps the sentence into sequence of ids.
          If BPE provided, apply BPE first.

        Args:
            words: A list of word tokens, or a whitespace-separated string.
            n_words: An integer number. If provided and > 0, token ids
              greater than or equal to this value are mapped into UNK id.

        Returns: A list of token ids with an extra `eos_id`.
        """
        if self._bpe:
            words = self._bpe.encode(words)
        if isinstance(words, str):
            # A sentence string (e.g. what BPE returns for string input)
            # must be tokenized on whitespace; iterating it directly
            # would look up single characters instead of words.
            words = words.split()
        unk = self.unk_id
        ids = [self.vocab_dict.get(w, unk) for w in words]
        if n_words > 0:
            ids = [i if i < n_words else unk for i in ids]
        ids.append(self.eos_id)
        return ids

    def convert_to_wordlist(self, pred_ids, bpe_decoding=True):
        """ Converts list of token ids to list of word tokens.

        The result is truncated at the first "SEQUENCE_END" token; a
        sequence that consists solely of that token yields `['']`.

        Args:
            pred_ids: A list of integers indicating token ids.
            bpe_decoding: Whether to recover from BPE. Set
              bpe_decoding to false only when using this for
              displaying attention.

        Returns: A list of word tokens.
        """
        pred_tokens = [self.vocab_r_dict[i] for i in pred_ids]
        if bpe_decoding and self._bpe:
            pred_tokens = self._bpe.decode(pred_tokens)
        if "SEQUENCE_END" in pred_tokens:
            if len(pred_tokens) == 1:
                return ['']
            pred_tokens = pred_tokens[:pred_tokens.index("SEQUENCE_END")]
        return pred_tokens

    def __getitem__(self, item):
        """ Function for operator [].

        Args:
            item: A string or an integer number.

        Returns: The word token if `item` is an integer number,
          or token id if `item` is a string (UNK id for unknown strings).

        Raises:
            ValueError: if an integer `item` lies outside
              `[0, vocab_size)` or `item` has an unsupported type.
        """
        if isinstance(item, int):
            if item >= self.vocab_size:
                raise ValueError(
                    "id {} exceeded the size of vocabulary (size={})".format(
                        item, self.vocab_size))
            if item < 0:
                # Negative ids are never valid; fail with the same error
                # type as ids that are too large instead of falling
                # through to the raw table lookup.
                raise ValueError(
                    "id {} must be non-negative".format(item))
            return self.vocab_r_dict[item]
        if isinstance(item, str):
            return self.vocab_dict.get(item, self.unk_id)
        raise ValueError("Unrecognized type of item: %s" % str(type(item)))

    @staticmethod
    def equals(vocab1, vocab2):
        """ Compares two `Vocab` objects.

        Args:
            vocab1: A `Vocab` object.
            vocab2: A `Vocab` object.

        Returns: True if both have the same size and map every word to
          the same id, False otherwise.
        """
        if vocab1.vocab_size != vocab2.vocab_size:
            return False
        other = vocab2.vocab_dict
        # Compare the raw tables directly so the UNK fallback and type
        # checks of `__getitem__` cannot interfere with the comparison.
        return all(key in other and other[key] == val
                   for key, val in vocab1.vocab_dict.items())

    def equals_to(self, vocab):
        """ Compares `self` and `vocab`.

        Args:
            vocab: A `Vocab` object.

        Returns: True if two objects are the same, False otherwise.
        """
        assert isinstance(vocab, Vocab)
        return Vocab.equals(self, vocab)
class Vocab(object):
    """ Class for vocabulary (feature map).

    Translates between word tokens and integer ids, with optional BPE
    segmentation and optional reversal of the encoded sequence.
    """

    def __init__(self, filename, bpe_codes=None, reverse_seq=False):
        """ Initializes the object.

        Args:
            filename: Path to a vocabulary file containing one word per
              line. Each word is mapped to its line number (starting
              from 0).
            bpe_codes: A dict of BPE parameters. BPE is only set up when
              the dict has a "codes" key; "vocab" falls back to
              `filename` when absent. The caller's dict is left untouched.
            reverse_seq: Whether to reverse the sequence when encode the
              words to ids.

        Raises:
            ValueError: presumably if `filename` does not exist (raised
              by the lookup-table helper -- confirm there).
        """
        lookup = create_vocabulary_lookup_table_numpy(filename)
        self.vocab_dict, self.vocab_r_dict, _ = lookup
        self._sos_id = self.vocab_dict[Constants.SEQUENCE_START]
        self._eos_id = self.vocab_dict[Constants.SEQUENCE_END]
        self._unk_id = self.vocab_dict[Constants.UNKOWN]
        self._vocab_size = len(self.vocab_dict)
        self._reverse_seq = reverse_seq
        self._bpe = None
        if bpe_codes and "codes" in bpe_codes:
            # Copy before filling in the default: mutating the argument
            # would make a parameter dict shared by several vocabularies
            # carry the first vocabulary's path into the others.
            bpe_kwargs = dict(bpe_codes)
            if "vocab" not in bpe_kwargs:
                bpe_kwargs["vocab"] = filename
            self._bpe = BPE(**bpe_kwargs)

    @property
    def sos_id(self):
        """ Returns the id of the symbol indicating the start of sentence. """
        return self._sos_id

    @property
    def eos_id(self):
        """ Returns the id of the symbol indicating the end of sentence. """
        return self._eos_id

    @property
    def pad_id(self):
        """ Returns the id of the symbol for padding (the same id as
        `eos_id`). """
        return self._eos_id

    @property
    def unk_id(self):
        """ Returns the id of the special UNK symbol. """
        return self._unk_id

    @property
    def vocab_size(self):
        """ Returns the size of vocabulary. """
        return self._vocab_size

    def __call__(self, words):
        """ A wrapper method of `convert_to_idlist()` for `map` function,
        because this method is serializable.

        Args:
            words: A list of word tokens.

        Returns: A list of token ids with an extra `eos_id`.
        """
        return self.convert_to_idlist(words)

    def bpe_encode(self, sentence):
        """ Applies BPE encoding.

        Args:
            sentence: A string of sentence or a list of word tokens.

        Returns: The BPE encoding result of the same type as `sentence`,
          or `sentence` unchanged when no BPE is configured.
        """
        if not self._bpe:
            return sentence
        return self._bpe.encode(sentence)

    def convert_to_idlist(self, words, n_words=-1):
        """ Maps the sentence into sequence of ids.
          If BPE provided, apply BPE first.

        Args:
            words: A list of word tokens, or a whitespace-separated string.
            n_words: An integer number. If provided and > 0, token ids
              greater than or equal to this value are mapped into UNK id.

        Returns: A list of token ids with an extra `eos_id`.
        """
        if self._bpe:
            words = self._bpe.encode(words)
        if not isinstance(words, list):
            words = words.split()
        unk = self.unk_id
        ids = [self.vocab_dict.get(token, unk) for token in words]
        if n_words > 0:
            ids = [idx if idx < n_words else unk for idx in ids]
        if self._reverse_seq:
            ids.reverse()
        # EOS goes on after the optional reversal so it stays last.
        ids.append(self.eos_id)
        return ids

    def decorate_with_unk(self, words, unk_symbol=Constants.UNKOWN):
        """ Append (UNK) to the words that are not in the vocabulary.

        Args:
            words: A string or a list of word tokens.
            unk_symbol: A unk symbol.

        Returns: A string or a list of word tokens.

        Raises:
            ValueError: if `words` is neither a list nor a string.
        """
        marker = "({})".format(unk_symbol)

        def _decorate(tokens):
            return [t if t in self.vocab_dict else t + marker
                    for t in tokens]

        if isinstance(words, list):
            return _decorate(words)
        if isinstance(words, six.string_types):
            return " ".join(_decorate(words.strip().split()))
        raise ValueError("Unrecognized type: {}".format(type(words)))

    def convert_to_wordlist(self, pred_ids, bpe_decoding=True,
                            reverse_seq=True):
        """ Converts list of token ids to list of word tokens.

        The result is truncated at the first end-of-sentence symbol; a
        sequence that consists solely of that symbol yields `['']`.

        Args:
            pred_ids: A list of integers indicating token ids.
            bpe_decoding: Whether to recover from BPE. Set to
              false only when using this for displaying attention.
            reverse_seq: Whether to reverse the sequence after
              transformation. Set to false only when using this
              for displaying attention.

        Returns: A list of word tokens.
        """
        pred_tokens = [self.vocab_r_dict[i] for i in pred_ids]
        if bpe_decoding and self._bpe:
            pred_tokens = self._bpe.decode(pred_tokens)
        if Constants.SEQUENCE_END in pred_tokens:
            if len(pred_tokens) == 1:
                return ['']
            eos_pos = pred_tokens.index(Constants.SEQUENCE_END)
            pred_tokens = pred_tokens[:eos_pos]
        if reverse_seq and self._reverse_seq:
            return pred_tokens[::-1]
        return pred_tokens

    def __getitem__(self, item):
        """ Function for operator [].

        Args:
            item: A string or an integer number.

        Returns: The word token if `item` is an integer number,
          or token id if `item` is a string (UNK id for unknown strings).

        Raises:
            ValueError: if an integer `item` lies outside
              `[0, vocab_size)` or `item` has an unsupported type.
        """
        if isinstance(item, six.integer_types):
            if item >= self.vocab_size:
                raise ValueError(
                    "id {} exceeded the size of vocabulary (size={})".format(
                        item, self.vocab_size))
            if item < 0:
                # Reject negative ids explicitly rather than letting them
                # reach the raw table lookup.
                raise ValueError(
                    "id {} must be non-negative".format(item))
            return self.vocab_r_dict[item]
        if isinstance(item, six.string_types):
            return self.vocab_dict.get(item, self.unk_id)
        raise ValueError("Unrecognized type of item: %s" % str(type(item)))

    @staticmethod
    def equals(vocab1, vocab2):
        """ Compares two `Vocab` objects.

        Args:
            vocab1: A `Vocab` object.
            vocab2: A `Vocab` object.

        Returns: True if both have the same size and map every word to
          the same id, False otherwise.
        """
        if vocab1.vocab_size != vocab2.vocab_size:
            return False
        reference = vocab2.vocab_dict
        # Look words up in the raw table; going through `__getitem__`
        # would add its UNK fallback and type checks to the comparison.
        for word, idx in vocab1.vocab_dict.items():
            if word not in reference or reference[word] != idx:
                return False
        return True

    def equals_to(self, vocab):
        """ Compares `self` and `vocab`.

        Args:
            vocab: A `Vocab` object.

        Returns: True if two objects are the same, False otherwise.
        """
        assert isinstance(vocab, Vocab)
        return Vocab.equals(self, vocab)