def test_get_set_components(self):
        toki = Tokenizer(models.BPE())
        toki.normalizer = normalizers.NFC()
        toki.pre_tokenizer = pre_tokenizers.ByteLevel()
        toki.post_processor = processors.BertProcessing(("A", 0), ("B", 1))
        toki.decoder = decoders.ByteLevel()

        tokenizer = BaseTokenizer(toki)

        assert isinstance(tokenizer.model, models.BPE)
        assert isinstance(tokenizer.normalizer, normalizers.NFC)
        assert isinstance(tokenizer.pre_tokenizer, pre_tokenizers.ByteLevel)
        assert isinstance(tokenizer.post_processor, processors.BertProcessing)
        assert isinstance(tokenizer.decoder, decoders.ByteLevel)

        tokenizer.model = models.Unigram()
        assert isinstance(tokenizer.model, models.Unigram)
        tokenizer.normalizer = normalizers.NFD()
        assert isinstance(tokenizer.normalizer, normalizers.NFD)
        tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
        assert isinstance(tokenizer.pre_tokenizer, pre_tokenizers.Whitespace)
        tokenizer.post_processor = processors.ByteLevel()
        assert isinstance(tokenizer.post_processor, processors.ByteLevel)
        tokenizer.decoder = decoders.WordPiece()
        assert isinstance(tokenizer.decoder, decoders.WordPiece)
예제 #2
0
    def __init__(
        self,
        vocab_file: Optional[str] = None,
        add_special_tokens: bool = True,
        unk_token: str = "[UNK]",
        sep_token: str = "[SEP]",
        cls_token: str = "[CLS]",
        clean_text: bool = True,
        handle_chinese_chars: bool = True,
        strip_accents: bool = True,
        lowercase: bool = True,
        wordpieces_prefix: str = "##",
    ):

        if vocab_file is not None:
            tokenizer = Tokenizer(
                WordPiece.from_files(vocab_file, unk_token=unk_token))
        else:
            tokenizer = Tokenizer(WordPiece.empty())

        tokenizer.add_special_tokens([unk_token, sep_token, cls_token])
        tokenizer.normalizer = BertNormalizer(
            clean_text=clean_text,
            handle_chinese_chars=handle_chinese_chars,
            strip_accents=strip_accents,
            lowercase=lowercase,
        )
        tokenizer.pre_tokenizer = BertPreTokenizer()

        if add_special_tokens and vocab_file is not None:
            sep_token_id = tokenizer.token_to_id(sep_token)
            if sep_token_id is None:
                raise TypeError("sep_token not found in the vocabulary")
            cls_token_id = tokenizer.token_to_id(cls_token)
            if cls_token_id is None:
                raise TypeError("cls_token not found in the vocabulary")

            tokenizer.post_processor = BertProcessing(
                (sep_token, sep_token_id), (cls_token, cls_token_id))
        tokenizer.decoders = decoders.WordPiece(prefix=wordpieces_prefix)

        parameters = {
            "model": "BertWordPiece",
            "add_special_tokens": add_special_tokens,
            "unk_token": unk_token,
            "sep_token": sep_token,
            "cls_token": cls_token,
            "clean_text": clean_text,
            "handle_chinese_chars": handle_chinese_chars,
            "strip_accents": strip_accents,
            "lowercase": lowercase,
            "wordpieces_prefix": wordpieces_prefix,
        }

        super().__init__(tokenizer, parameters)
예제 #3
0
def tokenize_corpus(
        input_file: str,
        output_file: str,
        vocab_file: str,
        unk_token: str = '<unk>',
        control_tokens: List[str] = []):
    r"""Tokenize corpus sentences through trained **WordPiece** model.

    Arguments:
        input_file (str): Input corpus file path.
        output_file (str): Output file path.
        vocab_file (str): Trained vocabulary file path.
        unk_token (str): Unknown token in the vocabulary.
        control_tokens (list): Control tokens in the vocabulary.
    """
    # Create `WordPiece` model and add special tokens. Note that `unk_token`
    # is also a special token.normalizer and pre-tokenizer.
    tokenizer = Tokenizer(models.WordPiece(vocab_file, unk_token=unk_token))
    tokenizer.add_special_tokens([unk_token] + control_tokens)

    # Use BERT-specific normalizer, pre-tokenizer and **WordPiece** decoder.
    tokenizer.normalizer = BertNormalizer(strip_accents=False)
    tokenizer.pre_tokenizer = BertPreTokenizer()
    tokenizer.decoder = decoders.WordPiece(prefix='##')

    with open(input_file, 'r', encoding='utf-8') as src, \
            open(output_file, 'w', encoding='utf-8') as dst:
        # Count total lines in corpus.
        total_lines = 0
        for _ in src:
            total_lines += 1

        # Move the corpus file to first.
        src.seek(0)

        buffer = []
        for line in tqdm.tqdm(src,
                              desc='[*] tokenize corpus',
                              total=total_lines):
            buffer.append(line)

            # Tokenize buffered sentences and write to `output_file`.
            if len(buffer) > 10000:
                for t in tokenizer.encode_batch(buffer):
                    dst.write(' '.join(t.tokens) + '\n')
                buffer.clear()

        # Process the remained buffer.
        if buffer:
            for t in tokenizer.encode_batch(buffer):
                dst.write(' '.join(t.tokens) + '\n')
    def converted(self) -> Tokenizer:
        vocab = self.original_tokenizer.vocab
        tokenizer = Tokenizer(
            WordPiece(vocab, unk_token=str(self.original_tokenizer.unk_token)))

        # # Let the tokenizer know about special tokens if they are part of the vocab
        # if tokenizer.token_to_id(str(self.original_tokenizer.unk_token)) is not None:
        #     tokenizer.add_special_tokens([str(self.original_tokenizer.unk_token)])
        # if tokenizer.token_to_id(str(self.original_tokenizer.sep_token)) is not None:
        #     tokenizer.add_special_tokens([str(self.original_tokenizer.sep_token)])
        # if tokenizer.token_to_id(str(self.original_tokenizer.cls_token)) is not None:
        #     tokenizer.add_special_tokens([str(self.original_tokenizer.cls_token)])
        # if tokenizer.token_to_id(str(self.original_tokenizer.pad_token)) is not None:
        #     tokenizer.add_special_tokens([str(self.original_tokenizer.pad_token)])
        # if tokenizer.token_to_id(str(self.original_tokenizer.mask_token)) is not None:
        #     tokenizer.add_special_tokens([str(self.original_tokenizer.mask_token)])

        tokenize_chinese_chars = False
        strip_accents = False
        do_lower_case = False
        if hasattr(self.original_tokenizer, "basic_tokenizer"):
            tokenize_chinese_chars = self.original_tokenizer.basic_tokenizer.tokenize_chinese_chars
            strip_accents = self.original_tokenizer.basic_tokenizer.strip_accents
            do_lower_case = self.original_tokenizer.basic_tokenizer.do_lower_case

        tokenizer.normalizer = normalizers.BertNormalizer(
            clean_text=True,
            handle_chinese_chars=tokenize_chinese_chars,
            strip_accents=strip_accents,
            lowercase=do_lower_case,
        )
        tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()

        cls = str(self.original_tokenizer.cls_token)
        sep = str(self.original_tokenizer.sep_token)
        cls_token_id = self.original_tokenizer.cls_token_id
        sep_token_id = self.original_tokenizer.sep_token_id

        tokenizer.post_processor = processors.TemplateProcessing(
            single=
            f"{cls}:2 $A:0 {sep}:0",  # token_type_id is 2 for Funnel transformer
            pair=f"{cls}:2 $A:0 {sep}:0 $B:1 {sep}:1",
            special_tokens=[
                (cls, cls_token_id),
                (sep, sep_token_id),
            ],
        )
        tokenizer.decoder = decoders.WordPiece(prefix="##")

        return tokenizer
    def converted(self) -> Tokenizer:
        vocab = self.original_tokenizer.vocab
        tokenizer = Tokenizer(
            WordPiece(vocab, unk_token=str(self.original_tokenizer.unk_token)))

        tokenize_chinese_chars = False
        strip_accents = False
        do_lower_case = False
        if hasattr(self.original_tokenizer, "basic_tokenizer"):
            tokenize_chinese_chars = self.original_tokenizer.basic_tokenizer.tokenize_chinese_chars
            strip_accents = self.original_tokenizer.basic_tokenizer.strip_accents
            do_lower_case = self.original_tokenizer.basic_tokenizer.do_lower_case

        tokenizer.normalizer = normalizers.BertNormalizer(
            clean_text=True,
            handle_chinese_chars=tokenize_chinese_chars,
            strip_accents=strip_accents,
            lowercase=do_lower_case,
        )
        tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()

        cls = str(self.original_tokenizer.cls_token)
        sep = str(self.original_tokenizer.sep_token)
        question = str(self.original_tokenizer.question_token)
        dot = "."
        cls_token_id = self.original_tokenizer.cls_token_id
        sep_token_id = self.original_tokenizer.sep_token_id
        question_token_id = self.original_tokenizer.question_token_id
        dot_token_id = self.original_tokenizer.convert_tokens_to_ids(".")

        if self.original_tokenizer.padding_side == "right":
            pair = f"{cls}:0 $A:0 {question} {dot} {sep}:0 $B:1 {sep}:1"
        else:
            pair = f"{cls}:0 $A:0 {sep}:0 $B:1 {question} {dot} {sep}:1"

        tokenizer.post_processor = processors.TemplateProcessing(
            single=f"{cls}:0 $A:0 {sep}:0",
            pair=pair,
            special_tokens=[
                (cls, cls_token_id),
                (sep, sep_token_id),
                (question, question_token_id),
                (dot, dot_token_id),
            ],
        )
        tokenizer.decoder = decoders.WordPiece(prefix="##")

        return tokenizer
def create_tokenizer(sentence_list):
    filename = f'temp_{time.strftime("%Y%m%d-%H%M%S")}.txt'
    with open(filename, 'w') as f:
        for s in sentence_list:
            f.write(f'{s}\n')

    tokenizer = Tokenizer(WordPiece())
    tokenizer.pre_tokenizer = Whitespace()
    tokenizer.decoder = decoders.WordPiece()
    tokenizer.enable_padding(pad_token='[PAD]', pad_id=0)

    trainer = WordPieceTrainer(
        vocab_size=3000, special_tokens=['[PAD]', '[S]', '[/S]', '[UNK]'])
    tokenizer.train(trainer, [filename])

    os.remove(filename)

    return tokenizer
예제 #7
0
def train_tokenizer() -> Tuple[tokenizers.Tokenizer, Generator, int]:
    tokenizer = tokenizers.Tokenizer(models.WordPiece(unk_token="<unk>"))
    tokenizer.decoder = decoders.WordPiece()

    tokenizer.normalizer = normalizers.Sequence([
        normalizers.NFD(),  # NFD unicode normalizer
        normalizers.Lowercase(),
        normalizers.StripAccents()
    ])
    tokenizer.pre_tokenizer = tokenizers.pre_tokenizers.Sequence([
        pre_tokenizers.Whitespace(),
        pre_tokenizers.Digits(individual_digits=False)
    ])
    tokenizer.post_processor = processors.TemplateProcessing(
        single="$A </s>",
        pair="$A </s> [SEP] <s> $B:1",
        special_tokens=[("[SEP]", 1), ("<s>", 2), ("</s>", 3)])

    dataset = datasets.load_dataset("wikitext",
                                    "wikitext-103-raw-v1",
                                    split="test")

    def batch_iterator(batch_size=1000):
        for i in range(0, len(dataset), batch_size):
            yield dataset[i:i + batch_size]["text"]

    tokenizer.train_from_iterator(
        batch_iterator(),
        trainer=trainers.WordPieceTrainer(
            vocab_size=10000, special_tokens=["<unk>", "[SEP]", "<s>",
                                              "</s>"]))

    def generator():
        for record in dataset:
            if record['text'].strip() != '':
                for sentence in sent_tokenizer(record['text']):
                    yield sentence

    data = tf.data.Dataset.from_generator(generator,
                                          output_signature=(tf.TensorSpec(
                                              shape=(None), dtype=tf.string)))
    data = data.map(tf.strings.strip,
                    num_parallel_calls=tf.data.experimental.AUTOTUNE)
    return tokenizer, data
예제 #8
0
def train_wordpiece_bert():
    """
    Sample code from: https://huggingface.co/docs/tokenizers/python/latest/pipeline.html
    """
    from tokenizers.models import WordPiece
    bert_tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))

    from tokenizers import normalizers
    from tokenizers.normalizers import Lowercase, NFD, StripAccents
    bert_tokenizer.normalizer = normalizers.Sequence(
        [NFD(), Lowercase(), StripAccents()])

    from tokenizers.pre_tokenizers import Whitespace
    bert_tokenizer.pre_tokenizer = Whitespace()

    from tokenizers.processors import TemplateProcessing
    bert_tokenizer.post_processor = TemplateProcessing(
        single="[CLS] $A [SEP]",
        pair="[CLS] $A [SEP] $B:1 [SEP]:1",
        special_tokens=[
            ("[CLS]", 1),
            ("[SEP]", 2),
        ],
    )

    bert_tokenizer.decoder = decoders.WordPiece()

    from tokenizers.trainers import WordPieceTrainer
    trainer = WordPieceTrainer(
        vocab_size=30522,
        special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
    files = [
        DIR_DATA + os.sep + 'wikitext-103' + os.sep + 'wiki.%s.raw' % a
        for a in ["test", "train", "valid"]
    ]
    bert_tokenizer.train(files, trainer)
    bert_tokenizer.save(DIR_TOKENIZERS + os.sep + 'bert_wiki.json')

    return bert_tokenizer
예제 #9
0
def train_tokenizer() -> tokenizers.Tokenizer:
    tokenizer = tokenizers.Tokenizer(models.WordPiece(unk_token="<unk>"))
    tokenizer.decoder = decoders.WordPiece()

    tokenizer.normalizer = normalizers.Sequence([
        normalizers.NFD(),  # NFD unicode normalizer
        normalizers.Lowercase(),
        normalizers.StripAccents()
    ])
    tokenizer.pre_tokenizer = tokenizers.pre_tokenizers.Sequence([
        pre_tokenizers.Whitespace(),
        pre_tokenizers.Digits(individual_digits=False)
    ])
    tokenizer.post_processor = processors.TemplateProcessing(
        single="$A </s>",
        pair="$A </s> [SEP] <s> $B:1",
        special_tokens=[("[SEP]", 1), ("<s>", 2), ("</s>", 3)])

    # dataset = datasets.load_dataset("wikitext", "wikitext-103-raw-v1", split="train+test+validation")
    dataset = datasets.load_dataset("wikitext",
                                    "wikitext-103-raw-v1",
                                    split="validation")

    def batch_iterator(batch_size=1000):
        for i in range(0, len(dataset), batch_size):
            yield dataset[i:i + batch_size]["text"]

    tokenizer.train_from_iterator(
        batch_iterator(),
        trainer=trainers.WordPieceTrainer(
            vocab_size=10000, special_tokens=["<unk>", "[SEP]", "<s>",
                                              "</s>"]))

    def generator():
        for record in dataset:
            if record['text'].strip() != '':
                yield record['text']

    return tokenizer, generator
예제 #10
0
def train_tokenizer_vocab(dataset, style='BPE', force_retrain=True):
    """
    if force_retrain: overwrite the stored tokenizer from tokenizers dir (by retraining)
    else: load the tokenizer if it exists
    """
    assert dataset in VALID_DATASETS
    assert style in VALID_TOKENIZATIONS

    tpath_expected = default_tpath(dataset, style)

    train = True
    if not force_retrain and os.path.isfile(tpath_expected):
        tokenizer = Tokenizer.from_file(tpath_expected)
        train = False
    else:
        print('%s tokenizer file does not exist; training new tokenizer' %
              tpath_expected)

    if train:

        # load data associated with one of the valid datasets (from /data/ directory)
        datafiles = load_dataset(dataset)

        # Steps for each algo (e.g. BPE):
        # - init Tokenizer using algo
        # - specify algo specific trainer
        # - specify any pre-processing of text (will affect decoding)
        #   see: https://huggingface.co/docs/tokenizers/python/latest/components.html#decoders
        # - different training calls if its the arxiv dataset or wikitext
        #   see https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/

        if style == 'BPE':
            tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
            trainer = BpeTrainer(
                special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
            tokenizer.pre_tokenizer = ByteLevel()
            if dataset == 'arxiv':
                tokenizer.train_from_iterator(datafiles, trainer=trainer)
            else:
                tokenizer.train(datafiles, trainer=trainer)
            tokenizer.decoder = decoders.ByteLevel()

        else:
            assert style == 'WordLevel'
            tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))
            trainer = WordLevelTrainer(
                special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
            tokenizer.pre_tokenizer = Whitespace()
            if dataset == 'arxiv':
                tokenizer.train_from_iterator(datafiles, trainer=trainer)
            else:
                tokenizer.train(datafiles, trainer=trainer)
            tokenizer.decoder = decoders.WordPiece(
            )  # WordPiece seems to work (adds back spaces)

        # Save to tokenizers directory
        tokenizer.save(tpath_expected)

    # Generate vocab object based on tokenizer.decoder() method
    # ... TODO implement the same vocabulary functionality, or ensure it is present in Tokenizer and then code it elsewhere...
    # Features we need to match:
    #   from torchtext.legacy.vocab import Vocab as RetiredVocab
    #   ntokens = len(vocab.stoi) ---> ntokens = tokenizer.(...)
    #   data = [torch.tensor([vocab[token] for token in tokenizer(item)],
    #                         dtype=torch.long) for item in raw_text_iter]
    #   tokenized_text_ints = torch.tensor([vocab[token] for token in tokenized_text], dtype=torch.long)
    #   running_context_string = ' '.join([vocab.itos[src[k]] for k in range(src.shape[0])])
    #   unk_index = vocab.unk_index
    vocab = None

    return tokenizer, vocab
예제 #11
0
def train_custom_tokenizer(dataset,
                           token_model,
                           tknzr_file,
                           vocab_size,
                           vocab=None,
                           pretrain_fast=False,
                           max_input_chars_per_word=None,
                           eos_token=None,
                           bos_token=None,
                           pad_token=None,
                           mask_token=None,
                           unk_token=None):
    """
    Building a Tokenizer using HuggingFace library. The pipeline seems to be:

        - Model           : algorithm that tokenizes, it is a mandatory
                            component. There are only 4 models implemented
                            (BPE, Unigram, WordLevel, WordPiece)
        - Normalizer      : some preprocessing that could happen before, but
                            doesn't necessarily have to
        - Pre-Tokenizer   : splitting the input according to some rules
        - Post-Processing : needing to add some tokens/input after (mostly seems
                            to be eos, bos tokens)
        - Decoder         : certain previous pipeline steps need to be reversed
                            for proper decoding
        - Trainer         : The corresponding training algorithm for the model

    Note : Some pre-processing might need to happen beforehand in previous
            functions (might be easier using pandas before)

    Input
        token_model (str)        : algorithm to use for tokenization
        dataset (class)          : a python iterator that goes through the data
                                    to be used for training
        token_dir (str)          : directory with tokenizers
        vocab_size (int)         : size of the vocabulary to use
        tokenFilename (str)     : filename of particular token we want to
                                    train. Will overwrite previously save files.
        vocab (list of str)      : models other than BPE can use non-mandatory
                                    vocab as input
        max_input_chars_per_word : used for WordPiece

    Output
        tokenizer                : huggingFace Tokenizer object, our fully
                                    trainer tokenizer

    """
    special_token_lst = [
        pad_token, bos_token, eos_token, mask_token, unk_token
    ]

    # NFKC
    normalizer_lst = []
    pre_tokenizer_lst = [Whitespace, ByteLevel]
    decoder_lst = []

    bos_idx = special_token_lst.index(bos_token)
    eos_idx = special_token_lst.index(eos_token)

    if token_model == 'BPE':
        model = BPE(unk_token=unk_token)
        Trainer = BpeTrainer
    elif token_model == 'Unigram':
        model = Unigram(vocab=vocab)
        Trainer = UnigramTrainer
    elif token_model == 'WordLevel':
        model = WordLevel(unk_token=unk_token, vocab=vocab)
        Trainer = WordLevelTrainer
    elif token_model == 'WordPiece':
        model = WordPiece(unk_token=unk_token,
                          vocab=vocab,
                          max_input_chars_per_word=max_input_chars_per_word)
        Trainer = WordPieceTrainer
    else:
        error_msg = f'Error: token_model ({token_model}) not an algorithm in%s' \
                    % VALID_TOKENIZATIONS
        raise SystemExit(error_msg)

    # instantiation
    tokenizer = Tokenizer(model)

    # Select a tokenization trainer
    if vocab_size is None:
        trainer = Trainer(show_progress=True, special_tokens=special_token_lst)
    else:
        trainer = Trainer(vocab_size=vocab_size,
                          show_progress=True,
                          special_tokens=special_token_lst)

    # Set the normalizer
    tokenizer.normalizer = normalizers.Sequence(
        [fcn() for fcn in normalizer_lst])

    # Set the pre-tokenizer
    tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
        [fcn() for fcn in pre_tokenizer_lst])

    # Set the post-processing
    tokenizer.post_processor = processors.TemplateProcessing(
        single=bos_token + " $A " + eos_token,
        special_tokens=[(bos_token, bos_idx), (eos_token, eos_idx)],
        #  pair=bos_token+" $A "+eos_token" $B:1 "+eos_token+":1",
    )

    # Set the decoder
    if ByteLevel in pre_tokenizer_lst:
        tokenizer.decoder = decoders.ByteLevel()
    if Metaspace in pre_tokenizer_lst:
        tokenizer.decoder = decoders.Metaspace()
    if token_model == 'WordPiece':
        tokenizer.decoder = decoders.WordPiece()

    # creating iterator
    def batch_iterator():
        for i in np.arange(0, len(dataset)):
            yield dataset[i]

    # train call
    tokenizer.train_from_iterator(trainer=trainer,
                                  iterator=batch_iterator(),
                                  length=len(dataset))

    if Path(tknzr_file).exists():
        print(f"Warning : overwriting previously save tokenizer with\
                        same filename ( {tknzr_file} ).")
    tokenizer.save(tknzr_file)

    if pretrain_fast:
        tokenizer = PreTrainedTokenizerFast(tokenizer_file=tknzr_file)
    else:
        tokenizer = PreTrainedTokenizer(tokenizer_file=tknzr_file)
    tokenizer.pad_token = pad_token
    tokenizer.mask_token = mask_token

    return tokenizer
예제 #12
0
def preprocess_data(args):

    label_counter = Counter([])
    examples_per_file = Counter()

    print("Reading all files for labels.")
    for input_file in args.input_files:
        with xopen(input_file, "rt") as f:
            for example, labels in input_readers[args.task](f):
                examples_per_file[input_file] += 1
                label_counter.update(labels)

    if args.top_n_labels > 0:
        mlb_full = MultiLabelBinarizer(sparse_output=True)
        mlb_full = mlb_full.fit(label_counter.keys())
        label_counter = dict(label_counter.most_common(args.top_n_labels))

    mlb = MultiLabelBinarizer(sparse_output=True)
    # Passing a list in a list because that's what the function wants.
    if args.labels_in:
        labels = json.load(open(args.labels_in))
        mlb = mlb.fit([labels])
    else:
        mlb = mlb.fit([[pair for pair in label_counter]])

    # Save list of partial -> full mapping if doing top N labels.
    if args.top_n_labels > 0:

        label_mapping = np.where(np.in1d(mlb_full.classes_,
                                         mlb.classes_))[0].tolist()

        with xopen(args.label_mapping, "wt") as f:
            f.write(json.dumps(label_mapping))

        # Also save the full labels.
        with xopen(args.full_labels, "wt") as f:
            f.write(json.dumps(list(mlb_full.classes_)))

    # Save list of labels.
    with xopen(args.labels_out, "wt") as f:
        f.write(json.dumps(list(mlb.classes_)))

    # Set parallel tokenization thread count.
    os.environ["RAYON_NUM_THREADS"] = str(args.processes)

    from tokenizers import Tokenizer, decoders, trainers
    from tokenizers.models import WordPiece
    from tokenizers.normalizers import BertNormalizer
    from tokenizers.pre_tokenizers import BertPreTokenizer
    from tokenizers.processors import BertProcessing

    if args.task == 'cafa':
        # Define our custom tokenizer.
        # It is exactly the same as the default BERT tokenizer, except for max_input_chars_per_word
        # being 20000 instead of 100. This tokenizer is very slow on the long protein sequences.
        tokenizer = WordPiece.from_files(args.vocab,
                                         unk_token="[UNK]",
                                         max_input_chars_per_word=20000)
        tokenizer = Tokenizer(tokenizer)
        tokenizer.add_special_tokens(["[UNK]", "[SEP]", "[CLS]"])
        tokenizer.normalizer = BertNormalizer(lowercase=args.do_lower_case)
        tokenizer.pre_tokenizer = BertPreTokenizer()
        tokenizer.post_processor = BertProcessing(
            ("[SEP]", tokenizer.token_to_id("[SEP]")),
            ("[CLS]", tokenizer.token_to_id("[CLS]")))
        tokenizer.decoder = decoders.WordPiece(prefix='##')
    else:
        tokenizer = BertWordPieceTokenizer(args.vocab,
                                           lowercase=args.do_lower_case)

    tokenizer.enable_padding(max_length=args.seq_len)
    tokenizer.enable_truncation(max_length=args.seq_len)

    for input_file in args.input_files:
        with xopen(input_file, 'rt') as in_f:

            file_name = generate_out_filename(input_file, args)

            with xopen(file_name, "wt") as out_f:
                print("Processing to: ", file_name)

                # Write the shape as the first row, useful for the finetuning.
                if args.labels_in:
                    n_labels = len(json.load(open(args.labels_in)))
                else:
                    n_labels = len(label_counter)
                out_f.write(
                    json.dumps((examples_per_file[input_file], n_labels)) +
                    '\n')

                batch_size = min(examples_per_file[input_file],
                                 args.processes * 100)
                example_batch = []
                labels_batch = []
                doc_idx_batch = []

                with ParallelGenerator(input_readers[args.task](in_f),
                                       max_lookahead=batch_size) as g:
                    START_POS = int(args.window_start) / 100
                    for doc_idx, (example, labels) in enumerate(g):
                        #example = ' '.join(example.split(' ')[-510:])
                        example_batch.append(example)
                        labels_batch.append(labels)
                        doc_idx_batch.append(doc_idx)

                        if len(example_batch) == batch_size:
                            example_batch = tokenizer.encode_batch(
                                example_batch)
                            labels_batch = mlb.transform(labels_batch)

                            for example, labels, doc_idx in zip(
                                    example_batch, labels_batch,
                                    doc_idx_batch):
                                # Convert sparse arrays to python lists for json dumping.
                                # print(labels);input()
                                labels = labels.nonzero()[1].tolist()
                                """try:
                                    [][0]
                                    print("DOC_LEN:",len(example.overflowing)+1)
                                    mid = len(example.overflowing)//2
                                    out_f.write(json.dumps( [example.overflowing[mid].ids, labels, len(example.overflowing)+1] ) + '\n')
                                except IndexError:
                                    out_f.write(json.dumps( [example.ids, labels, len(example.overflowing)+1] ) + '\n')"""

                                if args.all_blocks or args.n_blocks > 0:
                                    blocks = [example.ids] + [
                                        blk.ids for blk in example.overflowing
                                    ]
                                    #print("BLOCKS:%d,TOKENS:%d" % (len(list(blocks)), sum([len(list(tokens)) for tokens in blocks])))
                                    for b, block in enumerate(blocks, 2):
                                        if b > args.n_blocks and args.n_blocks > 0:
                                            break
                                        out_f.write(
                                            json.dumps(
                                                [block, labels, doc_idx]) +
                                            '\n')
                                else:
                                    window = get_window(example, START_POS)
                                    assert len(window) == 512
                                    assert all(
                                        [type(y) is int for y in window])
                                    out_f.write(
                                        json.dumps([window, labels]) + '\n')

                            example_batch = []
                            labels_batch = []

                    # Write out whatever is left in the last smaller batch.
                    example_batch = tokenizer.encode_batch(example_batch)
                    labels_batch = mlb.transform(labels_batch)

                    for example, labels, doc_idx in zip(
                            example_batch, labels_batch, doc_idx_batch):
                        # Convert sparse arrays to python lists for json dumping.
                        # print(labels);input()
                        labels = labels.nonzero()[1].tolist()
                        """try:
                            [][0]
                            print("DOC_LEN:",len(example.overflowing)+1)
                            mid = len(example.overflowing)//2
                            out_f.write(json.dumps( [example.overflowing[mid].ids, labels, len(example.overflowing)+1] ) + '\n')
                        except IndexError:
                            out_f.write(json.dumps( [example.ids, labels, len(example.overflowing)+1] ) + '\n')"""

                        if args.all_blocks or args.n_blocks > 0:
                            blocks = [example.ids] + [
                                blk.ids for blk in example.overflowing
                            ]
                            #print("BLOCKS:%d,TOKENS:%d" % (len(list(blocks)), sum([len(list(tokens)) for tokens in blocks])))
                            for b, block in enumerate(blocks, 2):
                                if b > args.n_blocks and args.n_blocks > 0:
                                    break
                                out_f.write(
                                    json.dumps([block, labels, doc_idx]) +
                                    '\n')
                        else:
                            out_f.write(
                                json.dumps(
                                    [get_window(example, START_POS), labels]) +
                                '\n')
예제 #13
0
    def __init__(self,
                 vocab_file: Optional[str] = None,
                 unk_token: Union[str, AddedToken] = "<unk>",
                 sep_token: Union[str, AddedToken] = "</s>",
                 cls_token: Union[str, AddedToken] = "<s>",
                 nl_token: Union[str, AddedToken] = "<nl>",
                 pad_token: Union[str, AddedToken] = "<pad>",
                 mask_token: Union[str, AddedToken] = "<mask>",
                 clean_text: bool = True,
                 handle_chinese_chars: bool = True,
                 separate_numbers: bool = True,
                 strip_accents: bool = True,
                 lowercase: bool = True,
                 wordpieces_prefix: str = "##",
                 special_chars: str = SPECIAL_CHARS,
                 zh_norm: bool = True,
                 handle_simpl: bool = True,
                 do_postprocess: bool = False):

        if vocab_file is not None:
            tokenizer = Tokenizer(
                WordPiece(vocab_file, unk_token=str(unk_token)))
        else:
            tokenizer = Tokenizer(WordPiece())

        # Let the tokenizer know about special tokens if they are part of the vocab
        if tokenizer.token_to_id(str(unk_token)) is not None:
            tokenizer.add_special_tokens([str(unk_token)])
        if tokenizer.token_to_id(str(sep_token)) is not None:
            tokenizer.add_special_tokens([str(sep_token)])
        if tokenizer.token_to_id(str(cls_token)) is not None:
            tokenizer.add_special_tokens([str(cls_token)])
        if tokenizer.token_to_id(str(pad_token)) is not None:
            tokenizer.add_special_tokens([str(pad_token)])
        if tokenizer.token_to_id(str(nl_token)) is not None:
            tokenizer.add_special_tokens([str(nl_token)])
        if tokenizer.token_to_id(str(mask_token)) is not None:
            tokenizer.add_special_tokens([str(mask_token)])
        if tokenizer.token_to_id(str(mask_token)) is not None:
            tokenizer.add_special_tokens([str(mask_token)])

        tokenizer.normalizer = Sequence([
            NFKC(),
            BertNormalizer(clean_text=clean_text,
                           handle_chinese_chars=handle_chinese_chars,
                           separate_numbers=separate_numbers,
                           strip_accents=strip_accents,
                           lowercase=lowercase,
                           special_chars=special_chars,
                           zh_norm=zh_norm,
                           handle_simpl=handle_simpl)
        ])
        tokenizer.pre_tokenizer = BertPreTokenizer()

        if vocab_file is not None and do_postprocess:
            sep_token_id = tokenizer.token_to_id(str(sep_token))
            if sep_token_id is None:
                raise TypeError("sep_token not found in the vocabulary")
            cls_token_id = tokenizer.token_to_id(str(cls_token))
            if cls_token_id is None:
                raise TypeError("cls_token not found in the vocabulary")
            tokenizer.post_processor = BertProcessing(
                (str(sep_token), sep_token_id), (str(cls_token), cls_token_id))

        tokenizer.decoder = decoders.WordPiece(prefix=wordpieces_prefix)

        parameters = {
            "model": "BertWordPiece",
            "unk_token": unk_token,
            "sep_token": sep_token,
            "cls_token": cls_token,
            "nl_token": nl_token,
            "pad_token": pad_token,
            "mask_token": mask_token,
            "clean_text": clean_text,
            "handle_chinese_chars": handle_chinese_chars,
            "separate_numbers": separate_numbers,
            "strip_accents": strip_accents,
            "lowercase": lowercase,
            "special_chars": special_chars,
            "zh_norm": zh_norm,
            "handle_simpl": handle_simpl,
            "wordpieces_prefix": wordpieces_prefix,
        }

        super().__init__(tokenizer, parameters)