Пример #1
0
    def __init__(self,
                 vocab_path=None,
                 tokens=None,
                 max_len=0,
                 lowercase=False,
                 bos_token="<SEQ_BEG>",
                 eos_token="<SEQ_END>",
                 unk_token="<UNK>",
                 delimiter=" ",
                 reverse=False):
        """ Initialize SymbolsMapper.

        Args:
            vocab_path: The path to the vocabulary file. Only one of `vocab_path` and `tokens` should be provided.
            tokens: The word tokens. Only one of `vocab_path` and `tokens` should be provided.
            max_len: The maximum sequence length. Sequence larger than this will be truncated.
            lowercase: A bool, whether to lowercase the word tokens.
            bos_token: The begin-of-sentence token.
            eos_token: The end-of-sentence token.
            unk_token: The token indicating unknown word.
            delimiter: The string used to join tokens/ids when a string result is requested.
            reverse: A bool, whether to reverse the sequence or not.
        """
        # Exactly one of `vocab_path` / `tokens` must be given (XOR).
        if not ((vocab_path is None) ^ (tokens is None)):
            raise ValueError("Either `vocab_path` or `tokens` should be provided.")
        # Snapshot constructor arguments BEFORE mutating them, so the
        # serialized params always carry an explicit token list.
        this_locals = copy.copy(locals())
        if tokens is None:
            # Load one token per line from the vocabulary file.
            with tf.io.gfile.GFile(vocab_path, "r") as fp:
                tokens = [line.strip() for line in fp]
            this_locals["tokens"] = tokens
            this_locals["vocab_path"] = None
        self._params = extract_constructor_params(this_locals, verbose=False)
        # Extract tokens: strip surrounding quotes, otherwise keep only the
        # first whitespace-separated field (e.g. drop frequency counts).
        cleaned_tokens = []
        for t in tokens:
            t = t.strip()
            if ((t.startswith("'") and t.endswith("'"))
                or (t.startswith('"') and t.endswith('"'))):
                word = t[1:-1]
            else:
                # Guard against blank lines, which previously raised
                # IndexError via `t.strip().split()[0]`.
                fields = t.split()
                word = fields[0] if fields else ""
            if word:
                cleaned_tokens.append(word)
        assert unk_token, "must provide `unk_token`"
        extra_tokens = [unk_token]
        # add bos
        assert bos_token != unk_token
        extra_tokens.append(bos_token)
        # add eos
        assert eos_token != unk_token != bos_token
        # Make the EOS token unique w.r.t. the real vocabulary by appending
        # random digits until there is no collision.
        while eos_token in cleaned_tokens:
            eos_token += str(random.choice(list(range(0, 10))))
        extra_tokens.append(eos_token)
        self.vocab = Vocab(tokens=cleaned_tokens, extra_tokens=extra_tokens,
                           lowercase=lowercase)
        self.max_len = max_len
        # Cache special-token ids for fast encode/decode.
        self.eos_id = self.vocab.map_token_to_id(eos_token)
        self.bos_id = self.vocab.map_token_to_id(bos_token)
        self.unk_id = self.vocab.map_token_to_id(unk_token)
        self.reverse = reverse
        self.delimiter = delimiter
Пример #2
0
    def __init__(self,
                 vocab_path,
                 language="en",
                 tokenizer=None,
                 subtokenizer=None,
                 subtokenizer_codes=None,
                 glossaries=None,
                 reverse_sequence=False,
                 **kwargs):
        """ Initializes the data pipeline for text data.

        Args:
            language: The language.
            vocab_path: The path to the vocabulary file, or a list of word tokens.
            tokenizer: The tokenizer name.
            subtokenizer: The name of tokenizer for subword encoding.
            subtokenizer_codes: The subword codes.
            glossaries: The glossaries that will not be split by tokenizer/subtokenizer.
            reverse_sequence: A bool, whether to reverse the sequence.
        """
        # Initialize the DataPipeline base first with the full constructor args.
        DataPipeline.__init__(self,
                              vocab_path=vocab_path,
                              language=language,
                              tokenizer=tokenizer,
                              subtokenizer=subtokenizer,
                              subtokenizer_codes=subtokenizer_codes,
                              glossaries=glossaries,
                              reverse_sequence=reverse_sequence,
                              **kwargs)
        self._language = language
        self._reverse_sequence = reverse_sequence
        self._tokenizer = build_tokenizer(tokenizer,
                                          language=language,
                                          glossaries=glossaries)
        # NOTE(review): this default is immediately overwritten below; kept as-is.
        self._subtokenizer = None
        self._subtokenizer = build_tokenizer(subtokenizer,
                                             language=language,
                                             glossaries=glossaries,
                                             vocabulary=vocab_path)
        if self._subtokenizer is not None:
            if subtokenizer_codes is None:
                # Missing codes are tolerated deliberately (see the log message).
                logging.info(
                    "No codes provided for subtokenizer: {}. "
                    "We assume this was done on purpose.".format(subtokenizer))
            else:
                self._subtokenizer.init_subtokenizer(subtokenizer_codes)
        # `vocab_path` may be either a token list or a path to a vocabulary file.
        if isinstance(vocab_path, list):
            tokens = Vocab.load_tokens(tokens=vocab_path)
        else:
            tokens = Vocab.load_tokens(vocab_path=vocab_path)
        # Pick special tokens guaranteed not to collide with real vocabulary entries.
        unk_token = Vocab.get_unique(tokens, "<UNK>")
        bos_token = Vocab.get_unique(tokens, "<SEQ_BEG>")
        eos_token = Vocab.get_unique(tokens, "<SEQ_END>")
        assert unk_token != bos_token != eos_token
        # Initialize the Vocab base explicitly (multiple inheritance).
        Vocab.__init__(self,
                       tokens, [unk_token, bos_token, eos_token],
                       lowercase=False)
        # Cache special-token ids.
        self._eos_id = Vocab.map_token_to_id(self, eos_token)
        self._bos_id = Vocab.map_token_to_id(self, bos_token)
        self._unk_id = Vocab.map_token_to_id(self, unk_token)
Пример #3
0
    def recover(self, input):
        """ Recover one data sample.

        Args:
            input: A list of token ids, the output of neural model.

        Returns:
            A string, the recovered text.
        """
        # Normalize ids to plain Python ints (model outputs may be tensor scalars).
        input = [int(x) for x in input]
        # Strip a leading BOS if present; the emptiness check guards against
        # an IndexError on empty predictions (bug in the original).
        if input and input[0] == self._bos_id:
            input = input[1:]
        # Truncate at the first EOS; keep the whole sequence if none is found.
        try:
            eos_pos = input.index(self._eos_id)
            input = input[:eos_pos]
        except ValueError:
            pass
        token_list = Vocab.map_id_to_token(self, input)
        if self._reverse_sequence:
            token_list = token_list[::-1]
        # Undo subword segmentation if a subtokenizer is configured.
        if self._subtokenizer is None:
            output = " ".join(token_list)
        else:
            output = self._subtokenizer.detokenize(token_list, return_str=True)
        if self._tokenizer:
            output = self._tokenizer.detokenize(output, return_str=True)
        return output
Пример #4
0
    def process(self, input, is_processed=False):
        """ Process one data sample.

        Args:
            input: A text string.
            is_processed: Whether the data sample is already processed.

        Returns:
            A list of generated token IDs.
        """
        # Pre-normalization is always applied, regardless of `is_processed`.
        text = DataPipeline.text_pre_normalize(self,
                                               self._language,
                                               input,
                                               is_processed=False)
        if not is_processed:
            if self._tokenizer:
                text = self._tokenizer.tokenize(text)
            if self._subtokenizer:
                text = self._subtokenizer.tokenize(text, return_str=False)
        words = text.split() if isinstance(text, str) else text
        # Words missing from the vocabulary map to the UNK id.
        ids = Vocab.map_token_to_id(self,
                                    words,
                                    unknown_default=self._unk_id)
        if self._reverse_sequence:
            ids = ids[::-1]
        return ids + [self._eos_id]
Пример #5
0
    def __init__(self,
                 name,
                 language="en",
                 vocab_path=None,
                 tokens=None,
                 **kwargs):
        """ Initializes the data pipeline for text data.

        Args:
            name: The key of the BERT model, for creating the tokenizer and loading vocabulary.
            language: The language.
            tokens: A list of word tokens.
            vocab_path: The path to the vocabulary file.
        """
        if tokens is None and vocab_path is None:
            # Neither supplied: fetch the pretrained vocabulary for `name`.
            path = GoogleBert.download(name)
            if path is None:
                raise ValueError(
                    f"Unknown BERT model name={name} for downloading.")
            vocab_path = os.path.join(path, "vocab.txt")
        else:
            # `tokens` takes precedence over `vocab_path` when both are given.
            if tokens is not None:
                vocab_path = None
            tokens = Vocab.load_tokens(vocab_path, tokens)
            vocab_path = None
            # to handle with customized vocabulary: prepend any missing BERT
            # special token. Each missing token is inserted at position 0, so
            # the last-inserted one ("[MASK]"/"[PAD]") ends up at the front.
            for spec_token in ["[UNK]", "[CLS]", "[SEP]", "[MASK]", "[PAD]"]:
                if spec_token not in tokens:
                    tokens.insert(0, spec_token)
            # NOTE(review): this assert only holds when "[PAD]" was missing
            # (or already first) in the customized vocabulary — verify upstream.
            assert tokens[0] == "[PAD]"
        # Initialize the Vocab base explicitly (multiple inheritance).
        Vocab.__init__(self,
                       Vocab.load_tokens(vocab_path, tokens),
                       lowercase=False)
        DataPipeline.__init__(self,
                              name=name,
                              language=language,
                              tokens=self.tokens,
                              vocab_path=None,
                              **kwargs)
        self._language = language
        # Use HuggingFace's tokenizer matching the BERT model key.
        self._tokenizer = HuggingFaceTokenizer(language=language)
        self._tokenizer.init_subtokenizer(name)
        # Cache the ids of all BERT special tokens.
        self._unk_id = Vocab.map_token_to_id(self, "[UNK]")
        self._pad_id = Vocab.map_token_to_id(self, "[PAD]")
        self._cls_id = Vocab.map_token_to_id(self, "[CLS]")
        self._sep_id = Vocab.map_token_to_id(self, "[SEP]")
        self._mask_id = Vocab.map_token_to_id(self, "[MASK]")
Пример #6
0
def test_file():
    """ Tests Vocab.load_from_file on a "token<TAB>count" vocabulary file. """
    vocab_file = tempfile.NamedTemporaryFile(delete=False)
    # try/finally ensures the temp file is removed even when an assertion
    # fails (the original leaked it on failure).
    try:
        # Write one "token<TAB>count" line per word, like a frequency-counted vocabulary.
        with open(vocab_file.name, "w") as fw:
            for t in word_tokens:
                fw.write(t + "\t100\n")
        vocab = Vocab.load_from_file(vocab_file.name,
                                     extra_tokens=["UNK", "EOS"])
        assert vocab._token_list == ["Hello", "World", "yes", "i", "I", "UNK", "EOS"]
        assert vocab.vocab_size == 7
        assert vocab.map_token_to_id(["Hello", "world", "man"],
                                     unknown_default=100) == [0, 100, 100]
        assert vocab.map_id_to_token([1, 0, 3]) == ["World", "Hello", "i"]

        # Lowercasing merges case-variants ("i"/"I") into one entry.
        vocab = Vocab.load_from_file(vocab_file.name,
                                     extra_tokens=["UNK", "EOS"], lowercase=True)
        assert vocab._token_list == ["hello", "world", "yes", "i", "UNK", "EOS"]
        assert vocab.vocab_size == 6
        assert vocab.map_token_to_id(["Hello", "world", "man", "EOS"],
                                     unknown_default=100) == [0, 1, 100, 5]
        assert vocab.map_id_to_token([1, 0, 3]) == ["world", "hello", "i"]
    finally:
        os.remove(vocab_file.name)
Пример #7
0
    def __init__(self, language="en", tokens=None, vocab_path=None):
        """ Initializes the data pipeline from OpenAI released GPT-2.

        Args:
            language: The language.
            tokens: A list of word tokens.
            vocab_path: The path to the vocabulary file.
        """
        if tokens is None and vocab_path is None:
            # Neither supplied: download the GPT-2 117M release and use its vocabulary.
            path = OpenAIGPT2.download("117M")
            vocab_path = os.path.join(path, "encoder.json")
        # Initialize the Vocab base explicitly (multiple inheritance).
        Vocab.__init__(self,
                       Vocab.load_tokens(vocab_path, tokens),
                       lowercase=False)
        DataPipeline.__init__(self,
                              language=language,
                              tokens=self.tokens,
                              vocab_path=None)
        self._language = language
        # GPT-2 uses HuggingFace's "gpt2" byte-level BPE tokenizer.
        self._tokenizer = HuggingFaceTokenizer(language=language)
        self._tokenizer.init_subtokenizer("gpt2")
        # GPT-2 marks sequence ends with a single special token.
        self._eos_id = Vocab.map_token_to_id(self, "<|endoftext|>")
Пример #8
0
 def _process(text):
     """ Maps one text sample to token ids, appending the separator id.

     NOTE(review): `self` and `is_processed` are free variables captured from
     the enclosing scope — this is a nested helper shown out of its context.
     """
     # Pre-normalization is always applied, regardless of `is_processed`.
     text = DataPipeline.text_pre_normalize(self,
                                            self._language,
                                            text,
                                            is_processed=False)
     if not is_processed:
         text = self._tokenizer.tokenize(text, return_str=False)
     elif isinstance(text, str):
         text = text.strip().split()
     # Words missing from the vocabulary fall back to the UNK id.
     token_ids = Vocab.map_token_to_id(self,
                                       text,
                                       unknown_default=self._unk_id)
     return token_ids + [self._sep_id]
Пример #9
0
def test():
    """ Checks Vocab construction, token->id and id->token mapping,
    both case-sensitive and lowercased. """
    cased = Vocab(word_tokens, extra_tokens=["UNK", "EOS"])
    assert cased._token_list == ["Hello", "World", "yes", "i", "I", "UNK", "EOS"]
    assert cased.vocab_size == 7
    assert cased.map_token_to_id(["Hello", "world", "man"],
                                 unknown_default=100) == [0, 100, 100]
    assert cased.map_id_to_token([1, 0, 3]) == ["World", "Hello", "i"]

    # Lowercasing merges case-variants ("i"/"I") into a single entry.
    lowered = Vocab(word_tokens, extra_tokens=["UNK", "EOS"], lowercase=True)
    assert lowered._token_list == ["hello", "world", "yes", "i", "UNK", "EOS"]
    assert lowered.vocab_size == 6
    assert lowered.map_token_to_id(["Hello", "world", "man"],
                                   unknown_default=100) == [0, 1, 100]
    assert lowered.map_id_to_token([1, 0, 3]) == ["world", "hello", "i"]
Пример #10
0
    def recover(self, input):
        """ Recover one data sample.

        Args:
            input: A list of token ids, the output of neural model.

        Returns:
            A string, the recovered text.
        """
        # Truncate everything from the first EOS on; keep all ids when
        # no EOS is present.
        try:
            input = input[:input.index(self._eos_id)]
        except ValueError:
            pass
        tokens = Vocab.map_id_to_token(self, input)
        return self._tokenizer.detokenize(tokens, return_str=True)
Пример #11
0
    def process(self, input, is_processed=False):
        """ Process one data sample.

        Args:
            input: A text string.
            is_processed: Whether the data sample is already processed.

        Returns:
            A list of generated token IDs.
        """
        if not is_processed:
            tokens = self._tokenizer.tokenize(input, return_str=False)
        elif isinstance(input, str):
            tokens = input.strip().split()
        else:
            tokens = input
        # Drop tokens that map to no id (None entries from the vocabulary).
        token_ids = []
        for idx in Vocab.map_token_to_id(self, tokens):
            if idx is not None:
                token_ids.append(idx)
        return token_ids + [self._eos_id]
Пример #12
0
    def __init__(self,
                 vocab_path,
                 spm_model,
                 languages,
                 reverse_sequence=False,
                 **kwargs):
        """ Initializes the data pipeline for text data.

        Args:
            vocab_path: The path to the vocabulary file, or a list of word tokens.
            spm_model: The path to the sentence piece model.
            languages: A list of languages. The corresponding language tags will automatically
                append to the vocabulary.
            reverse_sequence: A bool, whether to reverse the sequence.
        """
        # Initialize the DataPipeline base first with the constructor args.
        DataPipeline.__init__(self, vocab_path=vocab_path, languages=languages,
                              reverse_sequence=reverse_sequence, **kwargs)
        self._reverse_sequence = reverse_sequence
        self._tokenizer = SentencePiece()
        self._tokenizer.init_subtokenizer(spm_model)
        # `vocab_path` may be either a token list or a path to a vocabulary file.
        if isinstance(vocab_path, list):
            tokens = Vocab.load_tokens(tokens=vocab_path)
        else:
            tokens = Vocab.load_tokens(vocab_path=vocab_path)
        # `languages` may arrive serialized as a YAML string.
        if isinstance(languages, str):
            languages = yaml.load(languages, Loader=yaml.FullLoader)
        assert isinstance(languages, list), (
            f"`languages` must be a list of strings, but got {languages}")
        # One collision-free tag token per language, e.g. "<en>".
        lang2tags = {}
        for lang in languages:
            lang2tags[lang] = Vocab.get_unique(tokens, "<" + lang + ">")
        # Collision-free special tokens.
        unk_token = Vocab.get_unique(tokens, "<UNK>")
        bos_token = Vocab.get_unique(tokens, "<SEQ_BEG>")
        eos_token = Vocab.get_unique(tokens, "<SEQ_END>")
        assert unk_token != bos_token != eos_token
        # Initialize the Vocab base with special tokens plus language tags.
        Vocab.__init__(self, tokens, [unk_token, bos_token, eos_token] + list(lang2tags.values()),
                       lowercase=False)
        self._eos_id = Vocab.map_token_to_id(self, eos_token)
        self._bos_id = Vocab.map_token_to_id(self, bos_token)
        self._unk_id = Vocab.map_token_to_id(self, unk_token)
        # Map each language to the id of its tag token.
        self._lang_ids = {lang: Vocab.map_token_to_id(self, lang2tags[lang])
                          for lang in languages}
Пример #13
0
class SymbolsMapper(object):
    """ Maps word tokens to integer ids and back, handling BOS/EOS/UNK. """

    def __init__(self,
                 vocab_path=None,
                 tokens=None,
                 max_len=0,
                 lowercase=False,
                 bos_token="<SEQ_BEG>",
                 eos_token="<SEQ_END>",
                 unk_token="<UNK>",
                 delimiter=" ",
                 reverse=False):
        """ Initialize SymbolsMapper.

        Args:
            vocab_path: The path to the vocabulary file. Only one of `vocab_path` and `tokens` should be provided.
            tokens: The word tokens. Only one of `vocab_path` and `tokens` should be provided.
            max_len: The maximum sequence length. Sequence larger than this will be truncated.
                NOTE(review): stored but never applied in this class — confirm intended use.
            lowercase: A bool, whether to lowercase the word tokens.
            bos_token: The begin-of-sentence token.
            eos_token: The end-of-sentence token.
            unk_token: The token indicating unknown word.
            delimiter: The string used to join tokens/ids when a string result is requested.
            reverse: A bool, whether to reverse the sequence or not.
        """
        # Exactly one of `vocab_path` / `tokens` must be given (XOR).
        if not ((vocab_path is None) ^ (tokens is None)):
            raise ValueError("Either `vocab_path` or `tokens` should be provided.")
        # Snapshot constructor arguments BEFORE mutating them, so the
        # serialized params always carry an explicit token list.
        this_locals = copy.copy(locals())
        if tokens is None:
            # Load one token per line from the vocabulary file.
            with tf.io.gfile.GFile(vocab_path, "r") as fp:
                tokens = [line.strip() for line in fp]
            this_locals["tokens"] = tokens
            this_locals["vocab_path"] = None
        self._params = extract_constructor_params(this_locals, verbose=False)
        # Extract tokens: strip surrounding quotes, otherwise keep only the
        # first whitespace-separated field (e.g. drop frequency counts).
        cleaned_tokens = []
        for t in tokens:
            t = t.strip()
            if ((t.startswith("'") and t.endswith("'"))
                or (t.startswith('"') and t.endswith('"'))):
                word = t[1:-1]
            else:
                # Guard against blank lines, which previously raised
                # IndexError via `t.strip().split()[0]`.
                fields = t.split()
                word = fields[0] if fields else ""
            if word:
                cleaned_tokens.append(word)
        assert unk_token, "must provide `unk_token`"
        extra_tokens = [unk_token]
        # add bos
        assert bos_token != unk_token
        extra_tokens.append(bos_token)
        # add eos
        assert eos_token != unk_token != bos_token
        # Make the EOS token unique w.r.t. the real vocabulary by appending
        # random digits until there is no collision.
        while eos_token in cleaned_tokens:
            eos_token += str(random.choice(list(range(0, 10))))
        extra_tokens.append(eos_token)
        self.vocab = Vocab(tokens=cleaned_tokens, extra_tokens=extra_tokens,
                           lowercase=lowercase)
        self.max_len = max_len
        # Cache special-token ids for fast encode/decode.
        self.eos_id = self.vocab.map_token_to_id(eos_token)
        self.bos_id = self.vocab.map_token_to_id(bos_token)
        self.unk_id = self.vocab.map_token_to_id(unk_token)
        self.reverse = reverse
        self.delimiter = delimiter

    @property
    def meta_data(self):
        """ A dict of vocabulary meta info; note EOS doubles as the pad id. """
        return {
            "vocab_size": self.vocab.vocab_size,
            "eos_id": self.eos_id,
            "bos_id": self.bos_id,
            "unk_id": self.unk_id,
            "pad_id": self.eos_id,
        }

    def get_config(self):
        """ Returns the captured constructor parameters for re-creation. """
        return self._params

    def map_token_to_id(self, text, return_str=False,
                        with_bos=False, with_eos=True):
        """ Map word tokens to id list

        Args:
            text: a string of a list of string tokens
            return_str: a bool, whether to return a string or not (a list).
            with_bos: a bool, whether to automatically plus bos
                token at the front or not.
            with_eos: a bool, whether to automatically plus eos
                token at the end or not.

        Returns: A list of word ids or a `delimiter` joined string.
        """
        if isinstance(text, str):
            text = text.strip().split()
        assert isinstance(text, list), (type(text))
        # Unknown words fall back to the UNK id.
        token_ids = self.vocab.map_token_to_id(text, unknown_default=self.unk_id)
        if self.reverse:
            token_ids = token_ids[::-1]
        # BOS/EOS are attached AFTER the optional reversal, so they always
        # sit at the front/back of the emitted sequence.
        if with_bos:
            token_ids = [self.bos_id] + token_ids
        if with_eos:
            token_ids += [self.eos_id]
        if return_str:
            return self.delimiter.join(
                [str(x) for x in token_ids])
        return token_ids

    def map_id_to_token(self, text, return_str=False,
                        reverse=True):
        """ Map token ids to token string

        Args:
            text: a string or a list of word token ids
            return_str: a bool, whether to return a string or not (a list).
            reverse: a bool, whether to recover the 'reverse' operation
                at `map_token_to_id` method.

        Returns:
            A `delimiter` joined string or a list of word tokens.
        """
        if isinstance(text, str):
            text = text.strip().split()
        text = [int(x) for x in text]
        # Strip a leading BOS if present; the emptiness check guards
        # against an IndexError on empty input (bug in the original).
        if text and text[0] == self.bos_id:
            text = text[1:]
        # Truncate at the first EOS; keep all ids when none is found.
        try:
            eos_pos = text.index(self.eos_id)
            text = text[:eos_pos]
        except ValueError:
            pass
        token_list = self.vocab.map_id_to_token(text)
        if reverse and self.reverse:
            token_list = token_list[::-1]
        if return_str:
            return self.delimiter.join(token_list)
        return token_list