예제 #1
0
    def __init__(self,
                 vocab_path,
                 language="en",
                 tokenizer=None,
                 subtokenizer=None,
                 subtokenizer_codes=None,
                 glossaries=None,
                 reverse_sequence=False,
                 **kwargs):
        """ Initializes the data pipeline for text data.

        Args:
            vocab_path: The path to the vocabulary file, or a list of word tokens.
            language: The language.
            tokenizer: The tokenizer name.
            subtokenizer: The name of tokenizer for subword encoding.
            subtokenizer_codes: The subword codes.
            glossaries: The glossaries that will not be split by tokenizer/subtokenizer.
            reverse_sequence: A bool, whether to reverse the sequence.
            **kwargs: Extra arguments passed through to `DataPipeline.__init__`.

        Raises:
            AssertionError: If the unknown / begin-of-sequence / end-of-sequence
                tokens are not pairwise distinct.
        """
        DataPipeline.__init__(self,
                              vocab_path=vocab_path,
                              language=language,
                              tokenizer=tokenizer,
                              subtokenizer=subtokenizer,
                              subtokenizer_codes=subtokenizer_codes,
                              glossaries=glossaries,
                              reverse_sequence=reverse_sequence,
                              **kwargs)
        self._language = language
        self._reverse_sequence = reverse_sequence
        self._tokenizer = build_tokenizer(tokenizer,
                                          language=language,
                                          glossaries=glossaries)
        self._subtokenizer = build_tokenizer(subtokenizer,
                                             language=language,
                                             glossaries=glossaries,
                                             vocabulary=vocab_path)
        if self._subtokenizer is not None:
            if subtokenizer_codes is None:
                logging.info(
                    "No codes provided for subtokenizer: {}. "
                    "We assume this was done on purpose.".format(subtokenizer))
            else:
                self._subtokenizer.init_subtokenizer(subtokenizer_codes)
        # `vocab_path` doubles as an in-memory token list, so pick the matching
        # keyword of `Vocab.load_tokens`.
        if isinstance(vocab_path, list):
            tokens = Vocab.load_tokens(tokens=vocab_path)
        else:
            tokens = Vocab.load_tokens(vocab_path=vocab_path)
        unk_token = Vocab.get_unique(tokens, "<UNK>")
        bos_token = Vocab.get_unique(tokens, "<SEQ_BEG>")
        eos_token = Vocab.get_unique(tokens, "<SEQ_END>")
        # A chained `a != b != c` never compares `a` with `c`, so every pair is
        # checked explicitly.
        assert (unk_token != bos_token
                and bos_token != eos_token
                and unk_token != eos_token), (
            "Special tokens must be pairwise distinct, but got "
            f"unk={unk_token}, bos={bos_token}, eos={eos_token}")
        Vocab.__init__(self,
                       tokens, [unk_token, bos_token, eos_token],
                       lowercase=False)
        self._eos_id = Vocab.map_token_to_id(self, eos_token)
        self._bos_id = Vocab.map_token_to_id(self, bos_token)
        self._unk_id = Vocab.map_token_to_id(self, unk_token)
    def __init__(self,
                 vocab_path,
                 spm_model,
                 languages,
                 reverse_sequence=False,
                 **kwargs):
        """ Initializes the data pipeline for text data.

        Args:
            vocab_path: The path to the vocabulary file, or a list of word tokens.
            spm_model: The path to the sentence piece model.
            languages: A list of languages, or a string that is parsed as YAML
                into such a list. The corresponding language tags will
                automatically append to the vocabulary.
            reverse_sequence: A bool, whether to reverse the sequence.
            **kwargs: Extra arguments passed through to `DataPipeline.__init__`.

        Raises:
            AssertionError: If `languages` is not a list after parsing, or if the
                unknown / begin-of-sequence / end-of-sequence tokens are not
                pairwise distinct.
        """
        DataPipeline.__init__(self,
                              vocab_path=vocab_path,
                              languages=languages,
                              reverse_sequence=reverse_sequence,
                              **kwargs)
        self._reverse_sequence = reverse_sequence
        self._tokenizer = SentencePiece()
        self._tokenizer.init_subtokenizer(spm_model)
        # `vocab_path` doubles as an in-memory token list, so pick the matching
        # keyword of `Vocab.load_tokens`.
        if isinstance(vocab_path, list):
            tokens = Vocab.load_tokens(tokens=vocab_path)
        else:
            tokens = Vocab.load_tokens(vocab_path=vocab_path)
        if isinstance(languages, str):
            # NOTE(review): FullLoader is kept for backward compatibility; if
            # `languages` can come from untrusted input, `yaml.safe_load` would
            # be the safer choice -- confirm with callers before switching.
            languages = yaml.load(languages, Loader=yaml.FullLoader)
        assert isinstance(languages, list), (
            f"`languages` must be a list of strings, but got {languages}")
        # One "<lang>" tag per language, made unique against the vocabulary.
        lang2tags = {}
        for lang in languages:
            lang2tags[lang] = Vocab.get_unique(tokens, "<" + lang + ">")
        unk_token = Vocab.get_unique(tokens, "<UNK>")
        bos_token = Vocab.get_unique(tokens, "<SEQ_BEG>")
        eos_token = Vocab.get_unique(tokens, "<SEQ_END>")
        # A chained `a != b != c` never compares `a` with `c`, so every pair is
        # checked explicitly.
        assert (unk_token != bos_token
                and bos_token != eos_token
                and unk_token != eos_token), (
            "Special tokens must be pairwise distinct, but got "
            f"unk={unk_token}, bos={bos_token}, eos={eos_token}")
        Vocab.__init__(self,
                       tokens, [unk_token, bos_token, eos_token] +
                       list(lang2tags.values()),
                       lowercase=False)
        self._eos_id = Vocab.map_token_to_id(self, eos_token)
        self._bos_id = Vocab.map_token_to_id(self, bos_token)
        self._unk_id = Vocab.map_token_to_id(self, unk_token)
        self._lang_ids = {
            lang: Vocab.map_token_to_id(self, lang2tags[lang])
            for lang in languages
        }
예제 #3
0
    def __init__(self,
                 name,
                 language="en",
                 vocab_path=None,
                 tokens=None,
                 **kwargs):
        """ Sets up the text data pipeline around a BERT vocabulary.

        Args:
            name: The key of the BERT model. It is used to download the vocabulary
                when neither `tokens` nor `vocab_path` is given, and to initialize
                the tokenizer.
            language: The language.
            vocab_path: The path to the vocabulary file.
            tokens: A list of word tokens. When given, `vocab_path` is ignored.
            **kwargs: Extra arguments passed through to `DataPipeline.__init__`.

        Raises:
            ValueError: If no vocabulary is given and nothing can be downloaded
                for `name`.
        """
        if tokens is None and vocab_path is None:
            model_dir = GoogleBert.download(name)
            if model_dir is None:
                raise ValueError(
                    f"Unknown BERT model name={name} for downloading.")
            vocab_path = os.path.join(model_dir, "vocab.txt")
        else:
            # An explicit token list wins over a vocabulary file.
            source_path = vocab_path if tokens is None else None
            tokens = Vocab.load_tokens(source_path, tokens)
            vocab_path = None
            # Customized vocabulary: prepend whichever special tokens are missing.
            # "[PAD]" comes last so that, when inserted, it lands at index 0.
            for special in ("[UNK]", "[CLS]", "[SEP]", "[MASK]", "[PAD]"):
                if special not in tokens:
                    tokens.insert(0, special)
            assert tokens[0] == "[PAD]"
        vocab_tokens = Vocab.load_tokens(vocab_path, tokens)
        Vocab.__init__(self, vocab_tokens, lowercase=False)
        DataPipeline.__init__(self,
                              name=name,
                              language=language,
                              tokens=self.tokens,
                              vocab_path=None,
                              **kwargs)
        self._language = language
        self._tokenizer = HuggingFaceTokenizer(language=language)
        self._tokenizer.init_subtokenizer(name)
        # Cache the ids of the BERT special tokens.
        special_id_attrs = (
            ("_unk_id", "[UNK]"),
            ("_pad_id", "[PAD]"),
            ("_cls_id", "[CLS]"),
            ("_sep_id", "[SEP]"),
            ("_mask_id", "[MASK]"),
        )
        for attr_name, special in special_id_attrs:
            setattr(self, attr_name, Vocab.map_token_to_id(self, special))
예제 #4
0
    def __init__(self, language="en", tokens=None, vocab_path=None):
        """ Sets up the data pipeline for the GPT-2 model released by OpenAI.

        When neither `tokens` nor `vocab_path` is given, the "117M" release is
        downloaded and its "encoder.json" is used as the vocabulary file.

        Args:
            language: The language.
            tokens: A list of word tokens.
            vocab_path: The path to the vocabulary file.
        """
        no_vocab_given = vocab_path is None and tokens is None
        if no_vocab_given:
            model_dir = OpenAIGPT2.download("117M")
            vocab_path = os.path.join(model_dir, "encoder.json")
        vocab_tokens = Vocab.load_tokens(vocab_path, tokens)
        Vocab.__init__(self, vocab_tokens, lowercase=False)
        DataPipeline.__init__(self,
                              language=language,
                              tokens=self.tokens,
                              vocab_path=None)
        self._language = language
        self._tokenizer = HuggingFaceTokenizer(language=language)
        self._tokenizer.init_subtokenizer("gpt2")
        # GPT-2 uses a single end-of-text marker; cache its id.
        self._eos_id = Vocab.map_token_to_id(self, "<|endoftext|>")