Example #1: GPT2TokenizerFast.__init__ builds a byte-level BPE tokenizer from vocab and merges files using the early tokenizers bindings (.new() constructors and with_* setters).
    def __init__(self,
                 vocab_file,
                 merges_file,
                 unk_token="<|endoftext|>",
                 bos_token="<|endoftext|>",
                 eos_token="<|endoftext|>",
                 pad_to_max_length=False,
                 add_prefix_space=False,
                 max_length=None,
                 stride=0,
                 truncation_strategy="longest_first",
                 **kwargs):
        super(GPT2TokenizerFast, self).__init__(bos_token=bos_token,
                                                eos_token=eos_token,
                                                unk_token=unk_token,
                                                **kwargs)

        self._tokenizer = tk.Tokenizer(
            tk.models.BPE.from_files(vocab_file, merges_file))
        self._update_special_tokens()
        self._tokenizer.with_pre_tokenizer(
            tk.pre_tokenizers.ByteLevel.new(add_prefix_space=add_prefix_space))
        self._tokenizer.with_decoder(tk.decoders.ByteLevel.new())
        if max_length:
            self._tokenizer.with_truncation(max_length,
                                            stride=stride,
                                            strategy=truncation_strategy)
        self._tokenizer.with_padding(
            max_length=max_length if pad_to_max_length else None,
            direction=self.padding_side,
            pad_id=self.pad_token_id if self.pad_token_id is not None else 0,
            pad_type_id=self.pad_token_type_id,
            pad_token=self.pad_token if self.pad_token is not None else "",
        )
        self._decoder = tk.decoders.ByteLevel.new()
Example #2: BertTokenizerFast.__init__ builds a WordPiece tokenizer with a BERT pre-tokenizer, a WordPiece decoder, and an optional BERT post-processor, using the same early bindings.
    def __init__(
        self,
        vocab_file,
        do_lower_case=True,
        do_basic_tokenize=True,
        never_split=None,
        unk_token="[UNK]",
        sep_token="[SEP]",
        pad_token="[PAD]",
        cls_token="[CLS]",
        mask_token="[MASK]",
        tokenize_chinese_chars=True,
        max_length=None,
        pad_to_max_length=False,
        stride=0,
        truncation_strategy="longest_first",
        add_special_tokens=True,
        **kwargs
    ):
        super(BertTokenizerFast, self).__init__(
            unk_token=unk_token,
            sep_token=sep_token,
            pad_token=pad_token,
            cls_token=cls_token,
            mask_token=mask_token,
            **kwargs,
        )

        self._tokenizer = tk.Tokenizer(tk.models.WordPiece.from_files(vocab_file, unk_token=unk_token))
        self._update_special_tokens()
        self._tokenizer.with_pre_tokenizer(
            tk.pre_tokenizers.BertPreTokenizer.new(
                do_basic_tokenize=do_basic_tokenize,
                do_lower_case=do_lower_case,
                tokenize_chinese_chars=tokenize_chinese_chars,
                never_split=never_split if never_split is not None else [],
            )
        )
        self._tokenizer.with_decoder(tk.decoders.WordPiece.new())

        if add_special_tokens:
            self._tokenizer.with_post_processor(
                tk.processors.BertProcessing.new(
                    (sep_token, self._tokenizer.token_to_id(sep_token)),
                    (cls_token, self._tokenizer.token_to_id(cls_token)),
                )
            )
        if max_length is not None:
            self._tokenizer.with_truncation(max_length, stride=stride, strategy=truncation_strategy)
        self._tokenizer.with_padding(
            max_length=max_length if pad_to_max_length else None,
            direction=self.padding_side,
            pad_id=self.pad_token_id,
            pad_type_id=self.pad_token_type_id,
            pad_token=self.pad_token,
        )
        self._decoder = tk.decoders.WordPiece.new()
Example #3: a test that feeds a Sudachi dictionary pre-tokenizer into a WordLevel model; the particle "へ" is missing from the vocabulary and maps to [UNK].
    def test_works_in_simple_pipeline(self):
        pretok = self.dict.pre_tokenizer()
        vocab = {
            "[UNK]": 0,
            "京都": 1,
            "に": 2,
            "行く": 3
        }
        tok = tokenizers.Tokenizer(WordLevel(vocab, unk_token="[UNK]"))
        tok.pre_tokenizer = pretok
        res = tok.encode("京都へ行く")
        self.assertEqual(res.ids, [1, 0, 3])
Example #4: a test of a Sudachi pre-tokenizer with a custom handler that replaces each morpheme list with the first morpheme's part of speech and the morpheme count.
    def test_with_handler(self):
        def _handler(index, sentence: tokenizers.NormalizedString, ml: MorphemeList):
            return [tokenizers.NormalizedString(ml[0].part_of_speech()[0]),
                    tokenizers.NormalizedString(str(len(ml)))]

        pretok = self.dict.pre_tokenizer(sudachipy.SplitMode.A, handler=_handler)
        vocab = {
            "[UNK]": 0,
            "名詞": 6,
            "4": 7,
        }
        tok = tokenizers.Tokenizer(WordLevel(vocab, unk_token="[UNK]"))
        tok.pre_tokenizer = pretok
        res = tok.encode("外国人参政権")
        self.assertEqual(res.ids, [6, 7])
Example #5: builds a WordLevel tokenizer whose vocabulary is every k-mer over a nucleotide alphabet, for gene sequences.
def make_tokenizer(k: int):
    """
    Make tokenizer for k-mer gene sequences.
    """
    keys = extras + list("".join(token)
                         for token in itertools.product(*(nucleotides
                                                          for _ in range(k))))
    values = range(len(keys))
    vocab = dict(zip(keys, values))
    tokenizer = tokenizers.Tokenizer(
        tokenizers.models.WordLevel(vocab=vocab, unk_token=unknown))
    tokenizer.enable_padding(pad_token=padding)
    tokenizer.pre_tokenizer = tokenizers.pre_tokenizers.WhitespaceSplit()
    return tokenizer
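
The helper above relies on module-level names that the snippet does not show (extras, nucleotides, unknown, padding). A minimal usage sketch, assuming plausible definitions for those names:

import itertools
import tokenizers

nucleotides = ["A", "C", "G", "T"]    # assumed k-mer alphabet
unknown, padding = "[UNK]", "[PAD]"   # assumed special tokens
extras = [unknown, padding]

tok = make_tokenizer(k=3)             # vocabulary: 2 special tokens + 4**3 k-mers
enc = tok.encode("ATG CGT NNN")       # whitespace-separated 3-mers
print(enc.tokens)                     # ['ATG', 'CGT', '[UNK]']; 'NNN' is out of vocabulary
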
Example #6: a test of a Sudachi pre-tokenizer in SplitMode.A, which splits "外国人参政権" into shorter units before the WordLevel lookup.
    def test_works_with_different_split_mode(self):
        pretok = self.dict.pre_tokenizer(sudachipy.SplitMode.A)
        vocab = {
            "[UNK]": 0,
            "外国": 1,
            "参政": 2,
            "権": 3,
            "人": 5,
            "外国人参政権": 4
        }
        tok = tokenizers.Tokenizer(WordLevel(vocab, unk_token="[UNK]"))
        tok.pre_tokenizer = pretok
        res = tok.encode("外国人参政権")
        self.assertEqual(res.ids, [1, 5, 2, 3])
Example #7: trains a WordPiece tokenizer on the WikiText-103 test split and returns it together with a tf.data.Dataset of stripped sentences.
def train_tokenizer() -> Tuple[tokenizers.Tokenizer, tf.data.Dataset]:
    tokenizer = tokenizers.Tokenizer(models.WordPiece(unk_token="<unk>"))
    tokenizer.decoder = decoders.WordPiece()

    tokenizer.normalizer = normalizers.Sequence([
        normalizers.NFD(),  # NFD unicode normalizer
        normalizers.Lowercase(),
        normalizers.StripAccents()
    ])
    tokenizer.pre_tokenizer = tokenizers.pre_tokenizers.Sequence([
        pre_tokenizers.Whitespace(),
        pre_tokenizers.Digits(individual_digits=False)
    ])
    tokenizer.post_processor = processors.TemplateProcessing(
        single="$A </s>",
        pair="$A </s> [SEP] <s> $B:1",
        special_tokens=[("[SEP]", 1), ("<s>", 2), ("</s>", 3)])

    dataset = datasets.load_dataset("wikitext",
                                    "wikitext-103-raw-v1",
                                    split="test")

    def batch_iterator(batch_size=1000):
        for i in range(0, len(dataset), batch_size):
            yield dataset[i:i + batch_size]["text"]

    tokenizer.train_from_iterator(
        batch_iterator(),
        trainer=trainers.WordPieceTrainer(
            vocab_size=10000, special_tokens=["<unk>", "[SEP]", "<s>",
                                              "</s>"]))

    def generator():
        for record in dataset:
            if record['text'].strip() != '':
                for sentence in sent_tokenizer(record['text']):
                    yield sentence

    data = tf.data.Dataset.from_generator(generator,
                                          output_signature=(tf.TensorSpec(
                                              shape=(None), dtype=tf.string)))
    data = data.map(tf.strings.strip,
                    num_parallel_calls=tf.data.experimental.AUTOTUNE)
    return tokenizer, data
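
A small sketch of how the returned pair might be consumed: iterate the tf.data pipeline eagerly and encode a few sentences (the UTF-8 decoding step is mine, not part of the snippet).

tokenizer, data = train_tokenizer()

for sentence in data.take(2):          # scalar tf.string tensors
    enc = tokenizer.encode(sentence.numpy().decode("utf-8"))
    print(enc.tokens[:10], enc.ids[:10])
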
Example #8: the same WordPiece training run on the WikiText-103 validation split, returning the tokenizer and a generator of the non-empty texts.
def train_tokenizer() -> Tuple[tokenizers.Tokenizer, Callable]:
    tokenizer = tokenizers.Tokenizer(models.WordPiece(unk_token="<unk>"))
    tokenizer.decoder = decoders.WordPiece()

    tokenizer.normalizer = normalizers.Sequence([
        normalizers.NFD(),  # NFD unicode normalizer
        normalizers.Lowercase(),
        normalizers.StripAccents()
    ])
    tokenizer.pre_tokenizer = tokenizers.pre_tokenizers.Sequence([
        pre_tokenizers.Whitespace(),
        pre_tokenizers.Digits(individual_digits=False)
    ])
    tokenizer.post_processor = processors.TemplateProcessing(
        single="$A </s>",
        pair="$A </s> [SEP] <s> $B:1",
        special_tokens=[("[SEP]", 1), ("<s>", 2), ("</s>", 3)])

    # dataset = datasets.load_dataset("wikitext", "wikitext-103-raw-v1", split="train+test+validation")
    dataset = datasets.load_dataset("wikitext",
                                    "wikitext-103-raw-v1",
                                    split="validation")

    def batch_iterator(batch_size=1000):
        for i in range(0, len(dataset), batch_size):
            yield dataset[i:i + batch_size]["text"]

    tokenizer.train_from_iterator(
        batch_iterator(),
        trainer=trainers.WordPieceTrainer(
            vocab_size=10000, special_tokens=["<unk>", "[SEP]", "<s>",
                                              "</s>"]))

    def generator():
        for record in dataset:
            if record['text'].strip() != '':
                yield record['text']

    return tokenizer, generator
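
Unlike the previous example, this variant returns a plain generator function, so the caller can re-iterate the corpus, for instance to inspect tokenized lengths (a sketch, not from the source):

tokenizer, gen = train_tokenizer()

lengths = [len(tokenizer.encode(text).ids) for text in gen()]
print(max(lengths), sum(lengths) / len(lengths))
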
Example #9: the constructor of a WordLevel tokenizer wrapper built from a word-to-id map, with optional lowercasing, Unicode normalization, and BERT-style post-processing.
    def __init__(
        self,
        word_id_map={},
        pad_token_id=None,
        unk_token_id=None,
        unk_token="[UNK]",
        sep_token="[SEP]",
        cls_token="[CLS]",
        pad_token="[PAD]",
        lowercase: bool = False,
        unicode_normalizer=None,
    ):
        if pad_token_id is not None:
            word_id_map[pad_token] = pad_token_id
        if unk_token_id is not None:
            word_id_map[unk_token] = unk_token_id
        max_id = max(word_id_map.values())
        for idx, token in enumerate((unk_token, sep_token, cls_token, pad_token)):
            if token not in word_id_map:
                # assign missing special tokens fresh ids above the current maximum
                word_id_map[token] = max_id + idx + 1
        # HuggingFace tokenizer expects a path to a `*.json` file to read the
        # vocab from. I think this is kind of a silly constraint, but for now
        # we write the vocab to a temporary file before initialization.
        word_list_file = tempfile.NamedTemporaryFile()
        word_list_file.write(json.dumps(word_id_map).encode())
        word_list_file.flush()  # ensure the vocab is on disk before it is read back

        word_level = hf_tokenizers.models.WordLevel(
            word_list_file.name, unk_token=str(unk_token)
        )
        tokenizer = hf_tokenizers.Tokenizer(word_level)

        # Let the tokenizer know about special tokens if they are part of the vocab
        if tokenizer.token_to_id(str(unk_token)) is not None:
            tokenizer.add_special_tokens([str(unk_token)])
        if tokenizer.token_to_id(str(sep_token)) is not None:
            tokenizer.add_special_tokens([str(sep_token)])
        if tokenizer.token_to_id(str(cls_token)) is not None:
            tokenizer.add_special_tokens([str(cls_token)])
        if tokenizer.token_to_id(str(pad_token)) is not None:
            tokenizer.add_special_tokens([str(pad_token)])

        # Check for Unicode normalization first (before everything else)
        normalizers = []

        if unicode_normalizer:
            normalizers += [
                hf_tokenizers.normalizers.unicode_normalizer_from_str(
                    unicode_normalizer
                )
            ]

        if lowercase:
            normalizers += [hf_tokenizers.normalizers.Lowercase()]

        # Create the normalizer structure
        if len(normalizers) > 0:
            if len(normalizers) > 1:
                tokenizer.normalizer = hf_tokenizers.normalizers.Sequence(normalizers)
            else:
                tokenizer.normalizer = normalizers[0]

        tokenizer.pre_tokenizer = hf_tokenizers.pre_tokenizers.WhitespaceSplit()

        sep_token_id = tokenizer.token_to_id(str(sep_token))
        if sep_token_id is None:
            raise TypeError("sep_token not found in the vocabulary")
        cls_token_id = tokenizer.token_to_id(str(cls_token))
        if cls_token_id is None:
            raise TypeError("cls_token not found in the vocabulary")

        tokenizer.post_processor = hf_tokenizers.processors.BertProcessing(
            (str(sep_token), sep_token_id), (str(cls_token), cls_token_id)
        )

        parameters = {
            "model": "WordLevel",
            "unk_token": unk_token,
            "sep_token": sep_token,
            "cls_token": cls_token,
            "pad_token": pad_token,
            "lowercase": lowercase,
            "unicode_normalizer": unicode_normalizer,
        }

        self.unk_token = unk_token
        self.pad_token = pad_token

        super().__init__(tokenizer, parameters)
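
Constructing the wrapper might look roughly like this; the class name WordLevelTokenizer is hypothetical here, since the snippet only shows the constructor:

tok = WordLevelTokenizer(              # hypothetical class name for the snippet above
    word_id_map={"[PAD]": 0, "[UNK]": 1, "[SEP]": 2, "[CLS]": 3, "hello": 4, "world": 5},
    pad_token_id=0,
    unk_token_id=1,
    lowercase=True,
)
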
Example #10: a script fragment that builds a whitespace-split WordLevel tokenizer from a precomputed vocabulary and enables fixed-length padding.
    train_csv_df.to_csv(config['train_csv'], index=False, header=True)

    # Labelled test CSV file
    print("Save labelled csv for inference ", config['test_csv'])
    test_csv_df.to_csv(config['test_csv'], index=False, header=True)

print("Setup tokenizers...")

unknown_word = 'unknown_word'
full_set = set(list(count_vector.vocabulary_.keys()) + list(word_list.keys()))
#full_set = set(list(count_vector.vocabulary_.keys()))

print("Number of words : (This has to be in config)", len(full_set) + 2)

vocab = {
    w: i
    for i, w in enumerate([unknown_word, 'dumb_token'] + list(full_set))
}
tokenizer = tokenizers.Tokenizer(WordLevel(vocab, unknown_word))
tokenizer.pre_tokenizer = Whitespace()

print("Use padding length ", config['padding_length'])
tokenizer.enable_padding(length=int(config['padding_length']))

# Save tokenizer
recompute = False
if recompute:
    print("Save tokenizer ", config['token_config'])
    tokenizer.save(config['token_config'])
    tokenizer = tokenizers.Tokenizer.from_file(config['token_config'])
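
A quick sanity check of the result (a sketch; the sentence and prints are mine): because fixed-length padding is enabled, any encoding shorter than config['padding_length'] is padded up to that length, and out-of-vocabulary words fall back to unknown_word.

enc = tokenizer.encode("a short example sentence")
print(enc.tokens[:5])    # tokens; out-of-vocabulary words appear as 'unknown_word'
print(len(enc.ids))      # == config['padding_length'] for short inputs
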
Example #11: builds a WordLevel tokenizer from a word-frequency Counter capped at max_vocab_size, followed by an accuracy helper for binary classification.
    pre_tokenizer = Whitespace()
    tokenized_texts = [[w for w, _ in pre_tokenizer.pre_tokenize_str(t)]
                       for t in texts]

    c = Counter()
    for text in tokenized_texts:
        c.update(text)

    token2id = {
        word: i + 1
        for i, (word, count) in enumerate(c.most_common(max_vocab_size))
    }
    # usually, UNK is assigned index 0 or 1
    token2id[unk_token] = 0

    tokenizer = tokenizers.Tokenizer(WordLevel(token2id, unk_token))
    tokenizer.pre_tokenizer = pre_tokenizer
    return tokenizer


def accuracy(probs, targets):
    """Computes accuracy given predicted probabilities and expected labels.

    Args:
        probs: torch.FloatTensor[batch_size, 1], probabilities of a positive class
        targets: torch.LongTensor[batch_size, 1], true classes

    Returns:
        0 <= float <= 1, proportion of correct predictions
    """
    predictions = (probs >= 0.5).flatten()