def __init__(self, vocab_path, language="en", tokenizer=None, subtokenizer=None, subtokenizer_codes=None, glossaries=None, reverse_sequence=False, **kwargs): """ Initializes the data pipeline for text data. Args: language: The language. vocab_path: The path to the vocabulary file, or a list of word tokens. tokenizer: The tokenizer name. subtokenizer: The name of tokenizer for subword encoding. subtokenizer_codes: The subword codes. glossaries: The glossaries that will not be split by tokenizer/subtokenizer. reverse_sequence: A bool, whether to reverse the sequence. """ DataPipeline.__init__(self, vocab_path=vocab_path, language=language, tokenizer=tokenizer, subtokenizer=subtokenizer, subtokenizer_codes=subtokenizer_codes, glossaries=glossaries, reverse_sequence=reverse_sequence, **kwargs) self._language = language self._reverse_sequence = reverse_sequence self._tokenizer = build_tokenizer(tokenizer, language=language, glossaries=glossaries) self._subtokenizer = None self._subtokenizer = build_tokenizer(subtokenizer, language=language, glossaries=glossaries, vocabulary=vocab_path) if self._subtokenizer is not None: if subtokenizer_codes is None: logging.info( "No codes provided for subtokenizer: {}. " "We assume this was done on purpose.".format(subtokenizer)) else: self._subtokenizer.init_subtokenizer(subtokenizer_codes) if isinstance(vocab_path, list): tokens = Vocab.load_tokens(tokens=vocab_path) else: tokens = Vocab.load_tokens(vocab_path=vocab_path) unk_token = Vocab.get_unique(tokens, "<UNK>") bos_token = Vocab.get_unique(tokens, "<SEQ_BEG>") eos_token = Vocab.get_unique(tokens, "<SEQ_END>") assert unk_token != bos_token != eos_token Vocab.__init__(self, tokens, [unk_token, bos_token, eos_token], lowercase=False) self._eos_id = Vocab.map_token_to_id(self, eos_token) self._bos_id = Vocab.map_token_to_id(self, bos_token) self._unk_id = Vocab.map_token_to_id(self, unk_token)
def recover(self, input):
    """ Recovers one data sample.

    Args:
        input: A list of token ids, the output of neural model.

    Returns:
        A string, the recovered text.
    """
    input = [int(x) for x in input]
    if input[0] == self._bos_id:
        input = input[1:]
    try:
        eos_pos = input.index(self._eos_id)
        input = input[:eos_pos]
    except ValueError:
        pass
    token_list = Vocab.map_id_to_token(self, input)
    if self._reverse_sequence:
        token_list = token_list[::-1]
    if self._subtokenizer is None:
        output = " ".join(token_list)
    else:
        output = self._subtokenizer.detokenize(token_list, return_str=True)
    if self._tokenizer:
        output = self._tokenizer.detokenize(output, return_str=True)
    return output
def process(self, input, is_processed=False):
    """ Processes one data sample.

    Args:
        input: A text string.
        is_processed: Whether the data sample is already processed.

    Returns:
        A list of generated token IDs.
    """
    input = DataPipeline.text_pre_normalize(self, self._language, input,
                                            is_processed=False)
    if not is_processed:
        if self._tokenizer:
            input = self._tokenizer.tokenize(input)
        if self._subtokenizer:
            input = self._subtokenizer.tokenize(input, return_str=False)
    if isinstance(input, str):
        input = input.split()
    token_ids = Vocab.map_token_to_id(self, input, unknown_default=self._unk_id)
    if self._reverse_sequence:
        token_ids = token_ids[::-1]
    return token_ids + [self._eos_id]
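# A minimal usage sketch for the `process`/`recover` pair above. The class
# name `TextDataPipeline`, the vocabulary path, and the tokenizer name
# ("moses") are assumptions for illustration only; the constructor arguments
# are the ones shown in __init__. It illustrates the intended round trip:
# `process` maps text to token ids and appends the EOS id, `recover` strips
# BOS/EOS and detokenizes back to a string.
pipeline = TextDataPipeline(
    vocab_path="/path/to/vocab.txt",   # hypothetical vocabulary file
    language="en",
    tokenizer="moses",                 # hypothetical tokenizer name
    subtokenizer=None)
ids = pipeline.process("Hello world !")   # e.g. [12, 57, 9, eos_id] (ids depend on the vocabulary)
text = pipeline.recover(ids)              # detokenized back to "Hello world !"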
def __init__(self, name, language="en", vocab_path=None, tokens=None, **kwargs): """ Initializes the data pipeline for text data. Args: name: The key of the BERT model, for creating the tokenizer and loading vocabulary. language: The language. tokens: A list of word tokens. vocab_path: The path to the vocabulary file. """ if tokens is None and vocab_path is None: path = GoogleBert.download(name) if path is None: raise ValueError( f"Unknown BERT model name={name} for downloading.") vocab_path = os.path.join(path, "vocab.txt") else: if tokens is not None: vocab_path = None tokens = Vocab.load_tokens(vocab_path, tokens) vocab_path = None # to handle with customized vocabulary for spec_token in ["[UNK]", "[CLS]", "[SEP]", "[MASK]", "[PAD]"]: if spec_token not in tokens: tokens.insert(0, spec_token) assert tokens[0] == "[PAD]" Vocab.__init__(self, Vocab.load_tokens(vocab_path, tokens), lowercase=False) DataPipeline.__init__(self, name=name, language=language, tokens=self.tokens, vocab_path=None, **kwargs) self._language = language self._tokenizer = HuggingFaceTokenizer(language=language) self._tokenizer.init_subtokenizer(name) self._unk_id = Vocab.map_token_to_id(self, "[UNK]") self._pad_id = Vocab.map_token_to_id(self, "[PAD]") self._cls_id = Vocab.map_token_to_id(self, "[CLS]") self._sep_id = Vocab.map_token_to_id(self, "[SEP]") self._mask_id = Vocab.map_token_to_id(self, "[MASK]")
def test_file():
    vocab_file = tempfile.NamedTemporaryFile(delete=False)
    with open(vocab_file.name, "w") as fw:
        for t in word_tokens:
            fw.write(t + "\t100\n")
    vocab = Vocab.load_from_file(vocab_file.name, extra_tokens=["UNK", "EOS"])
    assert vocab._token_list == ["Hello", "World", "yes", "i", "I", "UNK", "EOS"]
    assert vocab.vocab_size == 7
    assert vocab.map_token_to_id(["Hello", "world", "man"],
                                 unknown_default=100) == [0, 100, 100]
    assert vocab.map_id_to_token([1, 0, 3]) == ["World", "Hello", "i"]
    vocab = Vocab.load_from_file(vocab_file.name, extra_tokens=["UNK", "EOS"],
                                 lowercase=True)
    assert vocab._token_list == ["hello", "world", "yes", "i", "UNK", "EOS"]
    assert vocab.vocab_size == 6
    assert vocab.map_token_to_id(["Hello", "world", "man", "EOS"],
                                 unknown_default=100) == [0, 1, 100, 5]
    assert vocab.map_id_to_token([1, 0, 3]) == ["world", "hello", "i"]
    os.remove(vocab_file.name)
def __init__(self, language="en", tokens=None, vocab_path=None): """ Initializes the data pipeline from OpenAI released GPT-2. Args: language: The language. tokens: A list of word tokens. vocab_path: The path to the vocabulary file. """ if tokens is None and vocab_path is None: path = OpenAIGPT2.download("117M") vocab_path = os.path.join(path, "encoder.json") Vocab.__init__(self, Vocab.load_tokens(vocab_path, tokens), lowercase=False) DataPipeline.__init__(self, language=language, tokens=self.tokens, vocab_path=None) self._language = language self._tokenizer = HuggingFaceTokenizer(language=language) self._tokenizer.init_subtokenizer("gpt2") self._eos_id = Vocab.map_token_to_id(self, "<|endoftext|>")
def _process(text):
    text = DataPipeline.text_pre_normalize(self, self._language, text,
                                           is_processed=False)
    if not is_processed:
        text = self._tokenizer.tokenize(text, return_str=False)
    elif isinstance(text, str):
        text = text.strip().split()
    token_ids = Vocab.map_token_to_id(self, text, unknown_default=self._unk_id)
    return token_ids + [self._sep_id]
def test():
    vocab = Vocab(word_tokens, extra_tokens=["UNK", "EOS"])
    assert vocab._token_list == ["Hello", "World", "yes", "i", "I", "UNK", "EOS"]
    assert vocab.vocab_size == 7
    assert vocab.map_token_to_id(["Hello", "world", "man"],
                                 unknown_default=100) == [0, 100, 100]
    assert vocab.map_id_to_token([1, 0, 3]) == ["World", "Hello", "i"]
    vocab = Vocab(word_tokens, extra_tokens=["UNK", "EOS"], lowercase=True)
    assert vocab._token_list == ["hello", "world", "yes", "i", "UNK", "EOS"]
    assert vocab.vocab_size == 6
    assert vocab.map_token_to_id(["Hello", "world", "man"],
                                 unknown_default=100) == [0, 1, 100]
    assert vocab.map_id_to_token([1, 0, 3]) == ["world", "hello", "i"]
def recover(self, input):
    """ Recovers one data sample.

    Args:
        input: A list of token ids, the output of neural model.

    Returns:
        A string, the recovered text.
    """
    try:
        eos_pos = input.index(self._eos_id)
        input = input[:eos_pos]
    except ValueError:
        pass
    output = Vocab.map_id_to_token(self, input)
    return self._tokenizer.detokenize(output, return_str=True)
def process(self, input, is_processed=False):
    """ Processes one data sample.

    Args:
        input: A text string.
        is_processed: Whether the data sample is already processed.

    Returns:
        A list of generated token IDs.
    """
    if not is_processed:
        input = self._tokenizer.tokenize(input, return_str=False)
    elif isinstance(input, str):
        input = input.strip().split()
    token_ids = [x for x in Vocab.map_token_to_id(self, input)
                 if x is not None]
    return token_ids + [self._eos_id]
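# A round-trip sketch, assuming the `process`/`recover` pair above belongs to
# the GPT-2 pipeline initialized earlier; the class name `GPT2DataPipeline`
# is a guess for illustration. `process` appends the id of "<|endoftext|>",
# and `recover` truncates at that id before detokenizing.
gpt2_pipeline = GPT2DataPipeline(language="en")  # downloads the 117M encoder.json if no vocab is given
ids = gpt2_pipeline.process("Hello world")       # BPE token ids + [eos_id]
print(gpt2_pipeline.recover(ids))                # detokenized back to "Hello world"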
def __init__(self, vocab_path, spm_model, languages, reverse_sequence=False, **kwargs):
    """ Initializes the data pipeline for text data.

    Args:
        vocab_path: The path to the vocabulary file, or a list of word tokens.
        spm_model: The path to the sentence piece model.
        languages: A list of languages. The corresponding language tags will
            be automatically appended to the vocabulary.
        reverse_sequence: A bool, whether to reverse the sequence.
    """
    DataPipeline.__init__(self, vocab_path=vocab_path, languages=languages,
                          reverse_sequence=reverse_sequence, **kwargs)
    self._reverse_sequence = reverse_sequence
    self._tokenizer = SentencePiece()
    self._tokenizer.init_subtokenizer(spm_model)
    if isinstance(vocab_path, list):
        tokens = Vocab.load_tokens(tokens=vocab_path)
    else:
        tokens = Vocab.load_tokens(vocab_path=vocab_path)
    if isinstance(languages, str):
        languages = yaml.load(languages, Loader=yaml.FullLoader)
    assert isinstance(languages, list), (
        f"`languages` must be a list of strings, but got {languages}")
    lang2tags = {}
    for lang in languages:
        lang2tags[lang] = Vocab.get_unique(tokens, "<" + lang + ">")
    unk_token = Vocab.get_unique(tokens, "<UNK>")
    bos_token = Vocab.get_unique(tokens, "<SEQ_BEG>")
    eos_token = Vocab.get_unique(tokens, "<SEQ_END>")
    assert unk_token != bos_token != eos_token
    Vocab.__init__(self, tokens,
                   [unk_token, bos_token, eos_token] + list(lang2tags.values()),
                   lowercase=False)
    self._eos_id = Vocab.map_token_to_id(self, eos_token)
    self._bos_id = Vocab.map_token_to_id(self, bos_token)
    self._unk_id = Vocab.map_token_to_id(self, unk_token)
    self._lang_ids = {lang: Vocab.map_token_to_id(self, lang2tags[lang])
                      for lang in languages}
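# A construction sketch for the multilingual pipeline above. The class name
# `MultilingualTextDataPipeline` and the file paths are placeholders; the
# snippet only exercises the arguments visible in __init__. Language tags
# such as "<en>" and "<zh>" are appended to the vocabulary and their ids are
# exposed through `self._lang_ids`.
mtl_pipeline = MultilingualTextDataPipeline(
    vocab_path="/path/to/vocab.txt",      # hypothetical vocabulary file
    spm_model="/path/to/spm.model",       # hypothetical sentencepiece model
    languages=["en", "zh"])
print(mtl_pipeline._lang_ids)             # e.g. {"en": ..., "zh": ...} (ids depend on the vocabulary)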
class SymbolsMapper(object):

    def __init__(self,
                 vocab_path=None,
                 tokens=None,
                 max_len=0,
                 lowercase=False,
                 bos_token="<SEQ_BEG>",
                 eos_token="<SEQ_END>",
                 unk_token="<UNK>",
                 delimiter=" ",
                 reverse=False):
        """ Initializes the SymbolsMapper.

        Args:
            vocab_path: The path to the vocabulary file. Only one of `vocab_path`
                and `tokens` should be provided.
            tokens: The word tokens. Only one of `vocab_path` and `tokens`
                should be provided.
            max_len: The maximum sequence length. Sequences longer than this
                will be truncated.
            lowercase: A bool, whether to lowercase the word tokens.
            bos_token: The begin-of-sentence token.
            eos_token: The end-of-sentence token.
            unk_token: The token indicating unknown word.
            delimiter: The delimiter used to join tokens or ids when a string
                is returned.
            reverse: A bool, whether to reverse the sequence or not.
        """
        if not ((vocab_path is None) ^ (tokens is None)):
            raise ValueError("Exactly one of `vocab_path` and `tokens` should be provided.")
        this_locals = copy.copy(locals())
        if tokens is None:
            with tf.io.gfile.GFile(vocab_path, "r") as fp:
                tokens = [line.strip() for line in fp]
            this_locals["tokens"] = tokens
            this_locals["vocab_path"] = None
        self._params = extract_constructor_params(this_locals, verbose=False)
        # Extract the word from each vocabulary entry: either a quoted token
        # or the first whitespace-separated field (e.g. "word<TAB>count").
        cleaned_tokens = []
        for t in tokens:
            t = t.strip()
            if ((t.startswith("'") and t.endswith("'"))
                    or (t.startswith('"') and t.endswith('"'))):
                word = t[1:-1]
            else:
                word = t.strip().split()[0].strip()
            if word:
                cleaned_tokens.append(word)
        assert unk_token, "must provide `unk_token`"
        extra_tokens = [unk_token]
        # add bos
        assert bos_token != unk_token
        extra_tokens.append(bos_token)
        # add eos, rewriting it if it collides with an in-vocabulary token
        assert eos_token != unk_token != bos_token
        while eos_token in cleaned_tokens:
            eos_token += str(random.choice(list(range(0, 10))))
        extra_tokens.append(eos_token)
        self.vocab = Vocab(tokens=cleaned_tokens, extra_tokens=extra_tokens,
                           lowercase=lowercase)
        self.max_len = max_len
        self.eos_id = self.vocab.map_token_to_id(eos_token)
        self.bos_id = self.vocab.map_token_to_id(bos_token)
        self.unk_id = self.vocab.map_token_to_id(unk_token)
        self.reverse = reverse
        self.delimiter = delimiter

    @property
    def meta_data(self):
        return {
            "vocab_size": self.vocab.vocab_size,
            "eos_id": self.eos_id,
            "bos_id": self.bos_id,
            "unk_id": self.unk_id,
            "pad_id": self.eos_id,
        }

    def get_config(self):
        return self._params

    def map_token_to_id(self, text, return_str=False, with_bos=False, with_eos=True):
        """ Maps word tokens to a list of ids.

        Args:
            text: A string or a list of string tokens.
            return_str: A bool, whether to return a `delimiter`-joined string
                instead of a list.
            with_bos: A bool, whether to prepend the bos token.
            with_eos: A bool, whether to append the eos token.

        Returns:
            A list of word ids or a `delimiter`-joined string.
        """
        if isinstance(text, str):
            text = text.strip().split()
        assert isinstance(text, list), (type(text))
        token_ids = self.vocab.map_token_to_id(text, unknown_default=self.unk_id)
        if self.reverse:
            token_ids = token_ids[::-1]
        if with_bos:
            token_ids = [self.bos_id] + token_ids
        if with_eos:
            token_ids += [self.eos_id]
        if return_str:
            return self.delimiter.join([str(x) for x in token_ids])
        return token_ids

    def map_id_to_token(self, text, return_str=False, reverse=True):
        """ Maps token ids back to token strings.

        Args:
            text: A string or a list of word token ids.
            return_str: A bool, whether to return a `delimiter`-joined string
                instead of a list.
            reverse: A bool, whether to undo the `reverse` operation applied
                in `map_token_to_id`.

        Returns:
            A `delimiter`-joined string or a list of word tokens.
        """
        if isinstance(text, str):
            text = text.strip().split()
        text = [int(x) for x in text]
        if text[0] == self.bos_id:
            text = text[1:]
        try:
            eos_pos = text.index(self.eos_id)
            text = text[:eos_pos]
        except ValueError:
            pass
        token_list = self.vocab.map_id_to_token(text)
        if reverse and self.reverse:
            token_list = token_list[::-1]
        if return_str:
            return self.delimiter.join(token_list)
        return token_list