Example #1
def get_recurrent_tokenizer(vocab,
                            max_context_tokens,
                            unk_token,
                            pad_token,
                            device="cpu"):
    """
    Return a tokenizer to be used with recurrent-based models
    """
    question_tokenizer = Tokenizer(WordLevel(vocab, unk_token=unk_token))
    question_tokenizer.normalizer = Sequence(
        [StripAccents(), Lowercase(), Strip()])
    question_tokenizer.pre_tokenizer = PreSequence(
        [Whitespace(), Punctuation()])
    question_tokenizer.enable_padding(direction="right",
                                      pad_id=vocab[pad_token],
                                      pad_type_id=1,
                                      pad_token=pad_token)

    context_tokenizer = Tokenizer(WordLevel(vocab, unk_token=unk_token))
    context_tokenizer.normalizer = Sequence(
        [StripAccents(), Lowercase(), Strip()])
    context_tokenizer.pre_tokenizer = PreSequence(
        [Whitespace(), Punctuation()])
    context_tokenizer.enable_padding(
        direction="right",
        pad_id=vocab[pad_token],
        pad_type_id=1,
        pad_token=pad_token,
    )
    context_tokenizer.enable_truncation(max_context_tokens)

    return RecurrentSquadTokenizer(question_tokenizer,
                                   context_tokenizer,
                                   device=device)
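For context, a minimal self-contained sketch of the same WordLevel pipeline with right-padding; the toy vocabulary and sentences below are invented for illustration:

from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.normalizers import Sequence, StripAccents, Lowercase, Strip
from tokenizers.pre_tokenizers import Sequence as PreSequence, Whitespace, Punctuation

# Toy vocabulary (invented); real code builds this from the corpus
vocab = {"[PAD]": 0, "[UNK]": 1, "what": 2, "is": 3, "this": 4, "?": 5}

tok = Tokenizer(WordLevel(vocab, unk_token="[UNK]"))
tok.normalizer = Sequence([StripAccents(), Lowercase(), Strip()])
tok.pre_tokenizer = PreSequence([Whitespace(), Punctuation()])
tok.enable_padding(direction="right", pad_id=vocab["[PAD]"], pad_token="[PAD]")

# The shorter sequence is right-padded with [PAD] to the batch maximum
batch = tok.encode_batch(["What is this?", "This?"])
print([e.tokens for e in batch])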
Example #2
    def __init__(self, glove_fp=None, embedding=None):
        assert glove_fp or embedding
        super().__init__()

        self.pre_token = Whitespace()  # i.e. tokenize on whitespace + punct

        if glove_fp:
            self.model = GloveEmbedding(location=glove_fp)
        else:
            self.model = embedding
Example #3
 def __init__(
     self,
     load_from: str = None,
     vocab_size: int = 10000,
     max_example_len: int = 128,
     batch_size: int = 16,
     num_stopwords: int = 250,
     mask_output_len: int = 4,
 ):
     self.char_dict: Dict[str, int] = {}
     self.char_rev: Dict[int, str] = {}
     self.token_dict: Dict[str, int] = {}
     self.token_rev: Dict[int, str] = {}
     self.vocab_size = vocab_size
     self.max_example_len = max_example_len
     self.batch_size = batch_size
     self.num_stopwords = num_stopwords
     self.mask_output_len = mask_output_len
     self.tokenizer_fit = False
     self.tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
     self.tokenizer.pre_tokenizer = Whitespace()
     self.tokenizer.normalizer = Sequence(
         [NFD(), Lowercase(), StripAccents()])
     self.tok_trainer = BpeTrainer(special_tokens=["[UNK]", "[MASK]"],
                                   vocab_size=self.vocab_size)
     if load_from:
         self._load(load_from)
Example #4
def get_daily_dialog_tokenizer(tokenizer_location=None):
    '''
    Get the daily dialog tokenizer. Trains a new one if no location is provided.
    :param tokenizer_location: Path to a JSON file containing a saved tokenizer.
    :return: The tokenizer, with padding enabled.
    '''
    if tokenizer_location:
        tokenizer = Tokenizer.from_file(tokenizer_location)
        tokenizer.enable_padding()
        return tokenizer
    else:
        dataset_train = datasets.load_dataset("daily_dialog", split="train", )
        utterances = [special_tokens["sep_token"].join(dialogue["dialog"]) for dialogue in dataset_train]

        trainer = WordPieceTrainer(
            vocab_size=2048,
            special_tokens=list(token_utils.special_tokens.values())
        )

        custom_tokenizer = Tokenizer(WordPiece(unk_token=special_tokens["unk_token"]))
        custom_tokenizer.normalizer = normalizers.Sequence([NFD(), Lowercase(), StripAccents()])
        custom_tokenizer.pre_tokenizer = Whitespace()
        custom_tokenizer.train_from_iterator(utterances, trainer)
        custom_tokenizer.enable_padding()

        # Save the trained tokenizer to disk
        location = './daily_dialog/'
        custom_tokenizer.save(location + "tokenizer.json")

        return custom_tokenizer
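As a quick, self-contained check of the save-and-reload path above (toy corpus, illustrative file name), a tokenizer written with save() can be restored with Tokenizer.from_file() and have padding re-enabled:

from tokenizers import Tokenizer
from tokenizers.models import WordPiece
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import WordPieceTrainer

tok = Tokenizer(WordPiece(unk_token="[UNK]"))
tok.pre_tokenizer = Whitespace()
trainer = WordPieceTrainer(vocab_size=100, special_tokens=["[UNK]", "[PAD]"])
tok.train_from_iterator(["hello there", "hello again"], trainer)
tok.save("tokenizer.json")

# Mirrors the `if tokenizer_location:` branch above
reloaded = Tokenizer.from_file("tokenizer.json")
reloaded.enable_padding()
print(reloaded.encode("hello there").tokens)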
Example #5
 def __init__(self, path):
     self.path = path
     text_paths = [
         str(x) for x in Path("./dataset/corpus/").glob("**/*.txt")
     ]
     savedpath = "./dataset/tok_model/MALBERT-vocab.txt"
     if os.path.exists(savedpath):
         self.tokenizer = tokenizers.BertWordPieceTokenizer(
             "./dataset/tok_model/MALBERT-vocab.txt", )
     else:
         self.tokenizer = tokenizers.BertWordPieceTokenizer()
         self.tokenizer.train(
             files=text_paths,
             special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
             vocab_size=14200)
         self.tokenizer.save_model("./dataset/tok_model", "MALBERT")
     self.tokenizer.enable_truncation(max_length=512)
     self.pretokenizer = tokenizers.pre_tokenizers.Sequence(
         [Whitespace(), Digits(individual_digits=True)])
     self.vocab = self.tokenizer.get_vocab()
     self.mask_index = self.vocab.get("[MASK]")
     self.pad_index = self.vocab.get("[PAD]")
     self.eos_index = self.vocab.get("[SEP]")
     self.sos_index = self.vocab.get("[CLS]")
     self.unk_index = self.vocab.get("[UNK]")
Example #6
def train_tokenizer(lang, dataset, vocab_size):
    # Byte-pair encoding
    tokenizer = Tokenizer(BPE(unk_token='[UNK]'))

    # trainer
    trainer = BpeTrainer(
        special_tokens=['[MASK]', '[CLS]', '[SEP]', '[PAD]', '[UNK]'],
        vocab_size=vocab_size)

    # pre tokenizer with whitespace
    tokenizer.pre_tokenizer = Whitespace()

    # train
    tokenizer.train_from_iterator(dataset[lang], trainer)

    # post process start/end tokens
    tokenizer.post_processor = TemplateProcessing(
        single="[CLS] $A [SEP]",
        pair="[CLS] $A [SEP] $B:1 [SEP]:1",
        special_tokens=[
            ("[CLS]", tokenizer.token_to_id("[CLS]")),
            ("[SEP]", tokenizer.token_to_id("[SEP]")),
        ],
    )
    return tokenizer
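A self-contained sketch of what the post-processor above adds (toy corpus invented here): once TemplateProcessing is attached, single-sequence encodings come back wrapped in [CLS] ... [SEP]:

from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.processors import TemplateProcessing
from tokenizers.trainers import BpeTrainer

tok = Tokenizer(BPE(unk_token='[UNK]'))
tok.pre_tokenizer = Whitespace()
trainer = BpeTrainer(special_tokens=['[MASK]', '[CLS]', '[SEP]', '[PAD]', '[UNK]'],
                     vocab_size=100)
tok.train_from_iterator(["a tiny toy corpus", "another toy sentence"], trainer)

tok.post_processor = TemplateProcessing(
    single="[CLS] $A [SEP]",
    special_tokens=[("[CLS]", tok.token_to_id("[CLS]")),
                    ("[SEP]", tok.token_to_id("[SEP]"))],
)
print(tok.encode("a toy sentence").tokens)  # ['[CLS]', ..., '[SEP]']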
Example #7
def train():
    """Source: https://huggingface.co/docs/tokenizers/pipeline"""

    base = os.environ['DATA_ROOT']
    corpus_path = base + 'MimicIII/Encounters/Text/'

    bert_tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))

    # input to tokenizer.encode() goes through this pipeline:
    # normalization, pre-tokenization, model, post-processing
    bert_tokenizer.normalizer = normalizers.Sequence(
        [NFD(), Lowercase(), StripAccents()])
    bert_tokenizer.pre_tokenizer = Whitespace()
    bert_tokenizer.post_processor = TemplateProcessing(
        single="[CLS] $A [SEP]",
        pair="[CLS] $A [SEP] $B:1 [SEP]:1",
        special_tokens=[("[CLS]", 1), ("[SEP]", 2)])

    files = [str(file) for file in Path(corpus_path).glob('*.txt')]
    trainer = WordPieceTrainer(
        vocab_size=30522,
        show_progress=True,
        special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
    bert_tokenizer.train(files, trainer)

    os.makedirs('./Tokenizer', exist_ok=True)
    bert_tokenizer.save("Tokenizer/tokenizer.json")
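The comment above summarizes the encode() pipeline (normalization, pre-tokenization, model, post-processing); as a small illustration of the first stage in isolation, the same normalizer sequence can be applied directly to a string:

from tokenizers import normalizers
from tokenizers.normalizers import NFD, Lowercase, StripAccents

normalizer = normalizers.Sequence([NFD(), Lowercase(), StripAccents()])
print(normalizer.normalize_str("Héllò hôw are ü?"))  # -> "hello how are u?"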
Example #8
def main() -> None:
    args = parse_args()

    special_tokens = list(SPECIAL_TOKENS)

    if args.reserved < len(special_tokens):
        raise AssertionError(
            f"number of reserved tokens should be at least the number of special tokens ({len(special_tokens)})")
    for i in range(len(special_tokens), args.reserved):
        special_tokens.append(f"[unused{i:03d}]")

    all_filenames = get_all_filenames(args.input)
    # "C:\Users\demianmedich\data\wiki\20191120.en\pp_cased/"

    tokenizer = Tokenizer(get_model(args.model))
    tokenizer.normalizer = normalizers.Sequence([
        NFKC(), StripAccents(), Lowercase()
    ])

    tokenizer.pre_tokenizer = Whitespace()

    trainer = WordPieceTrainer(
        vocab_size=args.vocab_size,
        special_tokens=special_tokens)
    tokenizer.train(all_filenames, trainer)

    model_files = tokenizer.model.save()

    sys.exit(0)
Example #9
def load_janome_tokenizer(tokenizer_path) -> Tokenizer:
    tokenizer = Tokenizer.from_file(str(tokenizer_path))
    tokenizer.pre_tokenizer = Sequence([
        Whitespace(),
        PreTokenizer.custom(JanomePreTokenizer()),
    ])
    tokenizer.decoder = Decoder.custom(JanomeDecoder())
    return tokenizer
Example #10
def tokenize_cards(
        files=['./dataset/cards_train.txt', './dataset/cards_val.txt'],
        output_dir='./tokenizer'):
    tokenizer = ByteLevelBPETokenizer()
    tokenizer.pre_tokenizer = Whitespace()

    tokenizer.train(files=files, special_tokens=SPECIAL_TOKENS + OTHER_TOKENS)
    tokenizer.save_model(output_dir)
Example #11
def tokenize(dt, df):
    from tokenizers import Tokenizer
    from tokenizers.models import WordPiece
    from tokenizers.pre_tokenizers import Whitespace
    from tokenizers import normalizers
    from tokenizers.normalizers import NFD, StripAccents
    from tokenizers.processors import TemplateProcessing
    from tokenizers.trainers import WordPieceTrainer

    #print(df.head())
    #print(df.query_text.head())
    #print(df.query_text.to_list())
    #exit(0)
    data_source = get_data_source(dt)
    token_file = Path(data_dir, data_source, 'tokenizer.json')
    vocab_file = Path(data_dir, data_source, 'vocab.txt')
    corpus_file = Path(data_dir, data_source, 'corpus.txt')
    if vocab_file.is_file() and corpus_file.is_file():
        print("corpus and token files already generated")
        return 0

    bert_tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
    bert_tokenizer.normalizer = normalizers.Sequence([NFD(), StripAccents()])
    bert_tokenizer.pre_tokenizer = Whitespace()
    bert_tokenizer.post_processor = TemplateProcessing(
        single="[CLS] $A [SEP]",
        pair="[CLS] $A [SEP] $B:1 [SEP]:1",
        special_tokens=[
            ("[CLS]", 1),
            ("[SEP]", 2),
        ],
    )
    trainer = WordPieceTrainer(
        vocab_size=25000,
        min_frequency=3,
        special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
    #print(df.query_text.to_list())
    bert_tokenizer.train_from_iterator(df.query_text.to_list(), trainer)
    bert_tokenizer.save(str(token_file))
    #bert_tokenizer.save_model(directory=data_dir,name='tokenizer')
    df['range_idx'] = range(0, df.shape[0])
    df['mean_rank_group'] = df.groupby(
        ['session_id'], sort=False)['range_idx'].transform(np.mean)
    df['separate_column'] = df['range_idx'] < df['mean_rank_group']
    df = df.groupby(['session_id', 'separate_column'],
                    as_index=False,
                    sort=False)['query_text'].agg(
                        ' '.join).drop(columns='separate_column')
    #df = df.groupby('session_id').agg({'query_text':' '.join}).reset_index()
    df.query_text.to_csv(corpus_file, header=False, index=False)
    with open(token_file) as token_f:
        jdata = json.load(token_f)
        with open(vocab_file, "w") as fd:
            for k in jdata['model']['vocab'].keys():
                print(k, file=fd)
Example #12
def inference(checkpoint_path,
              hyperparameters_path,
              tokenizer_path,
              input='In 1691 Moscow established ',
              generated_length=64,
              random_selection=True):

    # Initialize tokenizer and model from files
    tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
    tokenizer.pre_tokenizer = Whitespace()

    tokenizer = Tokenizer.from_file(tokenizer_path)

    #tokenizer2 = Tokenizer(BPE(unk_token="[UNK]"))
    #tokenizer2.pre_tokenizer2 = Whitespace()
    #tokenizer2 = Tokenizer.from_file("example/tokenizer.json")

    #initialize model
    model = LMModel.load_from_checkpoint(checkpoint_path=checkpoint_path,
                                         hparams_file=hyperparameters_path)

    # Tokenize input sample
    encoded_sample = tokenizer.encode(input).ids

    for i in range(generated_length):
        input_ids = torch.unsqueeze(torch.tensor(encoded_sample,
                                                 dtype=torch.long),
                                    axis=0)

        # Inference
        output, attn = model(input_ids)
        last_word = output[0][-1]

        if not random_selection:
            # Pick highest probability token from probability distributions
            prediction = torch.argmax(output,
                                      axis=2).squeeze(axis=0).tolist()[-1]
        else:
            # Pick tokens according to their probabilities
            prediction = torch.multinomial(torch.softmax(last_word, 0)**10,
                                           1)[0]
        # Add prediction to sequence
        encoded_sample.append(prediction)

    # Detokenize output sample
    decoded_output = tokenizer.decode(encoded_sample)
    #decoded_output2 = tokenizer2.decode(encoded_sample)

    output_tokens = [tokenizer.id_to_token(int(id)) for id in encoded_sample]
    #output_tokens2 = [tokenizer2.id_to_token(int(id)) for id in encoded_sample]
    #print('\n========================\n      ORIGINAL BPE        \n========================')
    #print(output_tokens2, decoded_output2, sep='\n')
    #print('\n========================\n      MODIFIED BPE        \n========================')
    return decoded_output, output_tokens, attn
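The two token-selection strategies used in the loop above, isolated on toy logits (illustration only; the ** 10 sharpening of the softmax is kept from the original, and torch.multinomial accepts unnormalized non-negative weights):

import torch

logits = torch.tensor([0.1, 2.0, 0.5])

# Greedy: always take the highest-probability token
greedy_id = torch.argmax(logits).item()

# Sampling: draw from the (sharpened) distribution instead
weights = torch.softmax(logits, dim=0) ** 10
sampled_id = torch.multinomial(weights, 1)[0].item()
print(greedy_id, sampled_id)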
Example #13
 def _prepare_pipeline(self):
     self.tokenizer.normalizer = normalizers.Sequence(
         [NFD(), Lowercase(), StripAccents()])
     self.tokenizer.pre_tokenizer = Whitespace()
     self.tokenizer.post_processor = TemplateProcessing(
         single="[CLS] $A [SEP]",
         pair="[CLS] $A [SEP] $B:1 [SEP]:1",
         special_tokens=[
             ("[CLS]", 1),
             ("[SEP]", 2),
         ],
     )
     self.tokenizer.enable_padding(
         pad_id=self.__class__.SPECIAL_TOKENS.index("[PAD]"),
         pad_token="[PAD]")
Example #14
def main(args):
    # from tokenizers import BertWordPieceTokenizer
    from tokenizers import Tokenizer
    from tokenizers.models import WordPiece

    bert_tokenizer = Tokenizer(WordPiece())
    # bert_tokenizer = Tokenizer(MBartTokenizer())

    from tokenizers import normalizers

    from tokenizers.normalizers import Lowercase, NFD, StripAccents

    bert_tokenizer.normalizer = normalizers.Sequence(
        [NFD(), Lowercase(), StripAccents()])

    from tokenizers.pre_tokenizers import Whitespace

    bert_tokenizer.pre_tokenizer = Whitespace()

    # from tokenizers.processors import TemplateProcessing
    #
    # bert_tokenizer.post_processor = TemplateProcessing(
    #     single="[CLS] $A [SEP]",
    #     pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    #     special_tokens=[
    #         ("[CLS]", 1),
    #         ("[SEP]", 2),
    #     ],
    # )

    from tokenizers.trainers import WordPieceTrainer

    trainer = WordPieceTrainer(
        vocab_size=10000,
        special_tokens=["[UNK]", "[CLS]", "[PAD]",
                        "[MASK]"]  # "[SEP]", "[PAD]", "[MASK]"]
    )
    files = glob.glob(args.text_raw_files_pattern)
    bert_tokenizer.train(files, trainer)

    os.makedirs(args.output_dir, exist_ok=True)
    model_files = bert_tokenizer.model.save(args.output_dir,
                                            "bert-tokenizer-kr")
    bert_tokenizer.model = WordPiece.from_file(*model_files, unk_token="[UNK]")

    bert_tokenizer.save(os.path.join(args.output_dir,
                                     "bert-tokenizer-kr.json"))
Example #15
def load_tokenizer(vocab='./tokenizer/vocab.json', merges='./tokenizer/merges.txt', gpt=False, load_from=None):
    if gpt:
        if load_from:
            tokenizer = GPT2Tokenizer.from_pretrained(load_from)
        else:
            tokenizer = GPT2Tokenizer(
                vocab, merges, 
                bos_token=CARD_BEGIN, eos_token=CARD_END, sep_token=CARD_END,
                unk_token=UNK, pad_token=CARD_PAD, mask_token=CARD_MASK, padding_side="left"
            )
    else:
        tokenizer = ByteLevelBPETokenizer(vocab, merges)
        tokenizer.add_special_tokens(SPECIAL_TOKENS + OTHER_TOKENS)
        tokenizer.mask_token = CARD_MASK
    
    tokenizer.pre_tokenizer = Whitespace()
    return tokenizer
Example #16
def create_tokenizer(sentence_list):
    filename = f'temp_{time.strftime("%Y%m%d-%H%M%S")}.txt'
    with open(filename, 'w') as f:
        for s in sentence_list:
            f.write(f'{s}\n')

    tokenizer = Tokenizer(WordPiece())
    tokenizer.pre_tokenizer = Whitespace()
    tokenizer.decoder = decoders.WordPiece()
    tokenizer.enable_padding(pad_token='[PAD]', pad_id=0)

    trainer = WordPieceTrainer(
        vocab_size=3000, special_tokens=['[PAD]', '[S]', '[/S]', '[UNK]'])
    tokenizer.train([filename], trainer)

    os.remove(filename)

    return tokenizer
Example #17
class GloveTokenizer():
    """Glove Tokenizer that maps from text -> vectors.

    GloVe embeddings are not contextual (are just a lookup) so in the
    huggingface paradigm, they are actually part of the tokenizer. Doing this
    properly would involve creating a pre-trained tokenizer
    that is formed using something to the effect of:
    ```python
    import tokenizers
    from tokenizers.models import WordPiece
    tokenizer = tokenizers.Tokenizer(WordPiece())
    # add normalizer/pre_tokenizer
    # ...
    ```
    ... but this feels like overkill here. I'll make my own version here,
    but it will be quite a bit slower.
    """
    def __init__(self, glove_fp=None, embedding=None):
        assert glove_fp or embedding
        super().__init__()

        self.pre_token = Whitespace()  # i.e. tokenize on whitespace + punct

        if glove_fp:
            self.model = GloveEmbedding(location=glove_fp)
        else:
            self.model = embedding

    def normalize(self, text):
        """Lowercases text."""
        return text.lower()

    def to_words(self, text):
        """Sentence to tokens (on whitespace)"""
        tokens = self.pre_token.pre_tokenize_str(text)
        # tokens is List[Tuple] where tuple: (token, position).
        return [x for x, _ in tokens]

    def __call__(self, text: str):
        text = self.normalize(text)
        words = self.to_words(text)
        ids = self.model.words_to_ids(words)
        ids = torch.LongTensor(ids)
        return self.model(ids), words
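For reference, the Whitespace pre-tokenizer that to_words() relies on splits runs of word characters from punctuation and returns (token, (start, end)) pairs; a self-contained check:

from tokenizers.pre_tokenizers import Whitespace

pairs = Whitespace().pre_tokenize_str("it's a lookup table.")
print(pairs)                  # e.g. [('it', (0, 2)), ("'", (2, 3)), ('s', (3, 4)), ...]
print([word for word, _ in pairs])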
Example #18
def create_train_bpe_tokenizer(
        bpe_vocab_size,
        asr_text_filepath='asr.txt',
        ttx_text_filepath='ttx.txt',
        save_tokenizer=True,
        tokenizer_filename=".\\data\\tokenizer-test.json"):
    tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
    trainer = BpeTrainer(
        special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
        vocab_size=bpe_vocab_size)
    tokenizer.pre_tokenizer = Whitespace()
    files = [asr_text_filepath, ttx_text_filepath]
    files = [file for file in files if file]  # Get rid of None's
    tokenizer.train(files, trainer)

    if save_tokenizer:
        tokenizer.save(tokenizer_filename)

    return tokenizer
Example #19
def main(args):
    # copy from https://github.com/xinjli/allosaurus
    ipa0 = [
        'I', 'a', 'aː', 'ã', 'ă', 'b', 'bʲ', 'bʲj', 'bʷ', 'bʼ', 'bː', 'b̞',
        'b̤', 'b̥', 'c', 'd', 'dʒ', 'dʲ', 'dː', 'd̚', 'd̥', 'd̪', 'd̯', 'd͡z',
        'd͡ʑ', 'd͡ʒ', 'd͡ʒː', 'd͡ʒ̤', 'e', 'eː', 'e̞', 'f', 'fʲ', 'fʷ', 'fː',
        'g', 'gʲ', 'gʲj', 'gʷ', 'gː', 'h', 'hʷ', 'i', 'ij', 'iː', 'i̞', 'i̥',
        'i̯', 'j', 'k', 'kx', 'kʰ', 'kʲ', 'kʲj', 'kʷ', 'kʷʼ', 'kʼ', 'kː',
        'k̟ʲ', 'k̟̚', 'k͡p̚', 'l', 'lʲ', 'lː', 'l̪', 'm', 'mʲ', 'mʲj', 'mʷ',
        'mː', 'n', 'nj', 'nʲ', 'nː', 'n̪', 'n̺', 'o', 'oː', 'o̞', 'o̥', 'p',
        'pf', 'pʰ', 'pʲ', 'pʲj', 'pʷ', 'pʷʼ', 'pʼ', 'pː', 'p̚', 'q', 'r', 'rː',
        's', 'sʲ', 'sʼ', 'sː', 's̪', 't', 'ts', 'tsʰ', 'tɕ', 'tɕʰ', 'tʂ',
        'tʂʰ', 'tʃ', 'tʰ', 'tʲ', 'tʷʼ', 'tʼ', 'tː', 't̚', 't̪', 't̪ʰ', 't̪̚',
        't͡s', 't͡sʼ', 't͡ɕ', 't͡ɬ', 't͡ʃ', 't͡ʃʲ', 't͡ʃʼ', 't͡ʃː', 'u', 'uə',
        'uː', 'u͡w', 'v', 'vʲ', 'vʷ', 'vː', 'v̞', 'v̞ʲ', 'w', 'x', 'x̟ʲ', 'y',
        'z', 'zj', 'zʲ', 'z̪', 'ä', 'æ', 'ç', 'çj', 'ð', 'ø', 'ŋ', 'ŋ̟', 'ŋ͡m',
        'œ', 'œ̃', 'ɐ', 'ɐ̞', 'ɑ', 'ɑ̱', 'ɒ', 'ɓ', 'ɔ', 'ɔ̃', 'ɕ', 'ɕː', 'ɖ̤',
        'ɗ', 'ə', 'ɛ', 'ɛ̃', 'ɟ', 'ɡ', 'ɡʲ', 'ɡ̤', 'ɡ̥', 'ɣ', 'ɣj', 'ɤ', 'ɤɐ̞',
        'ɤ̆', 'ɥ', 'ɦ', 'ɨ', 'ɪ', 'ɫ', 'ɯ', 'ɯ̟', 'ɯ̥', 'ɰ', 'ɱ', 'ɲ', 'ɳ',
        'ɴ', 'ɵ', 'ɸ', 'ɹ', 'ɹ̩', 'ɻ', 'ɻ̩', 'ɽ', 'ɾ', 'ɾj', 'ɾʲ', 'ɾ̠', 'ʀ',
        'ʁ', 'ʁ̝', 'ʂ', 'ʃ', 'ʃʲː', 'ʃ͡ɣ', 'ʈ', 'ʉ̞', 'ʊ', 'ʋ', 'ʋʲ', 'ʌ', 'ʎ',
        'ʏ', 'ʐ', 'ʑ', 'ʒ', 'ʒ͡ɣ', 'ʔ', 'ʝ', 'ː', 'β', 'β̞', 'θ', 'χ', 'ә', 'ḁ'
    ]
    ipa1, ipa2, ipa3 = ipa0.copy(), ipa0.copy(), ipa0.copy()
    random.shuffle(ipa1)
    random.shuffle(ipa2)
    random.shuffle(ipa3)
    # randomly joined to form training data
    passage0 = ' '.join(ipa0)
    passage1 = ' '.join(ipa1)
    passage2 = ' '.join(ipa2)
    passage3 = ' '.join(ipa3)
    data = [passage0, passage1, passage2, passage3]
    # setup
    tokenizer = Tokenizer(WordLevel(unk_token="<unk>"))
    # trainer = WordLevelTrainer(vocab_size=300, special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
    trainer = WordLevelTrainer(
        vocab_size=300,
        special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"])
    tokenizer.pre_tokenizer = Whitespace()
    # train the tokenizer
    tokenizer.train_from_iterator(data, trainer=trainer)
    tokenizer.save(args.outdir + '/ipa_tokenizer.json')
Example #20
    def get_tokenizer_trainer():
        # START init_tokenizer
        from tokenizers import Tokenizer
        from tokenizers.models import BPE

        tokenizer = Tokenizer(BPE())
        # END init_tokenizer
        # START init_trainer
        from tokenizers.trainers import BpeTrainer

        trainer = BpeTrainer(
            special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
        # END init_trainer
        # START init_pretok
        from tokenizers.pre_tokenizers import Whitespace

        tokenizer.pre_tokenizer = Whitespace()
        # END init_pretok
        return tokenizer, trainer
Example #21
def train_tokenizer(sentences: List[str], serialize_path: str = "", vocab_size: int = 8000) -> Tokenizer:
    bert_tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
    bert_tokenizer.normalizer = normalizers.Sequence([NFD(), Lowercase(), StripAccents()])
    bert_tokenizer.pre_tokenizer = Whitespace()
    bert_tokenizer.post_processor = TemplateProcessing(
        single="[CLS] $A [SEP]",
        pair="[CLS] $A [SEP] $B:1 [SEP]:1",
        special_tokens=[
            ("[CLS]", 1),
            ("[SEP]", 2),
        ],
    )
    trainer = WordPieceTrainer(
        vocab_size=vocab_size,
        special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]
    )
    bert_tokenizer.train_from_iterator(sentences, trainer=trainer)
    if serialize_path:
        bert_tokenizer.save(serialize_path)
    return bert_tokenizer
Example #22
def get_french_vocab(model_name):
    root = Path(os.getcwd()).parent.parent.parent
    french_corpus = "Datasets/corpora/fr/text"
    fr_corpus_path = os.path.join(root, french_corpus)
    files = []
    for dir_ in os.listdir(fr_corpus_path):
        fr_corpus_dir = os.path.join(fr_corpus_path, dir_)
        for text_file in os.listdir(fr_corpus_dir):
            text_file = os.path.join(fr_corpus_dir, text_file)
            files.append(text_file)

    tokenizer = ByteLevelBPETokenizer(add_prefix_space=True)
    tokenizer.pre_tokenizer = Whitespace()

    tokenizer.train(files,
                    vocab_size=20000,
                    min_frequency=2,
                    show_progress=True,
                    special_tokens=["<sos>", "<pad>", "<eos>", "<unk>"])

    print(tokenizer.encode("c'est la meilleure des phrases françaises").tokens)
    tokenizer.save(model_name)
Example #23
def train_wordpiece_bert():
    """
    Sample code from: https://huggingface.co/docs/tokenizers/python/latest/pipeline.html
    """
    from tokenizers.models import WordPiece
    bert_tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))

    from tokenizers import normalizers
    from tokenizers.normalizers import Lowercase, NFD, StripAccents
    bert_tokenizer.normalizer = normalizers.Sequence(
        [NFD(), Lowercase(), StripAccents()])

    from tokenizers.pre_tokenizers import Whitespace
    bert_tokenizer.pre_tokenizer = Whitespace()

    from tokenizers.processors import TemplateProcessing
    bert_tokenizer.post_processor = TemplateProcessing(
        single="[CLS] $A [SEP]",
        pair="[CLS] $A [SEP] $B:1 [SEP]:1",
        special_tokens=[
            ("[CLS]", 1),
            ("[SEP]", 2),
        ],
    )

    bert_tokenizer.decoder = decoders.WordPiece()

    from tokenizers.trainers import WordPieceTrainer
    trainer = WordPieceTrainer(
        vocab_size=30522,
        special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
    files = [
        DIR_DATA + os.sep + 'wikitext-103' + os.sep + 'wiki.%s.raw' % a
        for a in ["test", "train", "valid"]
    ]
    bert_tokenizer.train(files, trainer)
    bert_tokenizer.save(DIR_TOKENIZERS + os.sep + 'bert_wiki.json')

    return bert_tokenizer
Example #24
def build_new_vocab():

    tokenizer = Tokenizer(BPE(unk_token="[UNK]"))

    trainer = BpeTrainer(
        special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
    tokenizer.pre_tokenizer = Whitespace()

    # files = [f"/daintlab/home/moo/NLU/biobert-pytorch/datasets/QA/BioASQ/BioASQ-{split}-factoid-7b.json" for split in ["train_split", "dev"]]
    files = "/daintlab/home/moo/NLU/biobert-pytorch/datasets/QA/BioASQ/BioASQ-train-factoid-7b.json"

    with open(files) as f:
        file = json.load(f)
    contexts = []
    for question in file['data']:
        for paragraph in question['paragraphs']:
            contexts.append(paragraph['context'])

    tokenizer.train_from_iterator(contexts, trainer)
    additional_vocab = [k for k, v in tokenizer.get_vocab().items()]

    tokenizer.save("tokenizer/tokenizer-bioasq.json")
    return additional_vocab
Example #25
    def __init__(self, tokenizer: PreTrainedTokenizerFast, cased: bool,
                 target_vocab_size: int):
        """
        Args:
            tokenizer: A Rust-based 🤗 Tokenizer
            cased: If False, ignore uppercases in corpus
            target_vocab_size: Size of augmented vocabulary

        Raises:
            ValueError: If :obj:`target_vocab_size` is smaller than or equal to the existing vocabulary of :obj:`tokenizer`
            RuntimeError: If :obj:`tokenizer` uses an unsupported tokenization model
        """
        if target_vocab_size <= tokenizer.vocab_size:
            raise ValueError(
                f"Ensure that `target_vocab_size` is larger than tokenizer's vocab size."
            )
        self.tokenizer = tokenizer
        self.cased = cased
        self.target_vocab_size = target_vocab_size
        self.model_cls: Type[
            BaseTokenizer] = tokenizer.backend_tokenizer.model.__class__

        # Instantiate rust tokenizer
        rust_tokenizer = Tokenizer(self.model_cls())
        if not cased:
            rust_tokenizer.normalizer = Lowercase()
        rust_tokenizer.pre_tokenizer = Whitespace()
        self.rust_tokenizer = rust_tokenizer

        # Instantiate the appropriate Trainer based on `self.model` (i.e. BPE, WordPiece, etc)
        trainer_cls = self.supported_trainers.get(self.model_cls, None)
        if trainer_cls is None:
            raise RuntimeError(f"{self.model_cls} is not supported")
        self.trainer = trainer_cls(
            vocab_size=self.target_vocab_size,
            special_tokens=list(self.tokenizer.special_tokens_map.values()),
        )
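The model-class-to-trainer-class pattern used above, in isolation; the supported_trainers mapping here is an invented stand-in for the class's own lookup table:

from tokenizers import Tokenizer
from tokenizers.models import BPE, WordPiece
from tokenizers.normalizers import Lowercase
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import BpeTrainer, WordPieceTrainer

# Assumed mapping, mirroring `self.supported_trainers`
supported_trainers = {BPE: BpeTrainer, WordPiece: WordPieceTrainer}

model_cls = WordPiece
rust_tokenizer = Tokenizer(model_cls())
rust_tokenizer.normalizer = Lowercase()
rust_tokenizer.pre_tokenizer = Whitespace()

trainer = supported_trainers[model_cls](vocab_size=100,
                                        special_tokens=["[UNK]", "[PAD]"])
rust_tokenizer.train_from_iterator(["augment the vocabulary",
                                    "with new domain text"], trainer)
print(rust_tokenizer.get_vocab_size())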
Example #26
    "CHEF_DO": 7,
    "MOVE_CONTENTS": 8,
}
k = len(output_vocab)
with open("../data/res2idx.json", 'r') as f:
    for w, i in json.load(f).items():
        output_vocab[w] = k
        k += 1
with open("../data/arg2idx.json", 'r') as f:
    for w, i in json.load(f).items():
        output_vocab[w.replace('-', '_')] = k
        k += 1

output_vocab = {w: i for i, w in enumerate(output_vocab)}
output_tokenizer = Tokenizer(WordLevel(output_vocab, ))
output_tokenizer.pre_tokenizer = Whitespace()

t = output_tokenizer.encode_batch(
    ["SERVE MOVE_CONTENTS", "SERVE MOVE_CONTENTS PUT"])
# print (t)

csv_file = '../data/seq2seq_4335716.csv'
input_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
input_tokenizer.bos_token = input_tokenizer.cls_token
input_tokenizer.eos_token = input_tokenizer.sep_token

val_data = load_dataset('csv', data_files=csv_file, split='train[90%:]')
train_data = load_dataset('csv', data_files=csv_file, split='train[:90%]')
# print(val_data)
# print(train_data)
Example #27
 def test_instantiate(self):
     assert Whitespace() is not None
     assert isinstance(Whitespace(), PreTokenizer)
     assert isinstance(Whitespace(), Whitespace)
     assert isinstance(pickle.loads(pickle.dumps(Whitespace())), Whitespace)
Example #28
        print(v)

    # Start vocabulary with all standard special tokens. (PAD=0!)
    vocab = {}
    for special_token in ["[PAD]", "[CLS]", "[SEP]", "[UNK]", "[MASK]", "[BOS]", "[EOS]"]:
        vocab[special_token] = len(vocab)
    # Add other words - if not already present.
    for w in words:
        if w not in vocab:
            vocab[w] = len(vocab)
    print(vocab)

    # New tokenizer.
    init_tokenizer = BertWordPieceTokenizer(vocab=vocab) 
    init_tokenizer.normalizer = Sequence([Replace("(", " ( "), Replace(")", " ) "), BertNormalizer()])
    init_tokenizer.pre_tokenizer = Whitespace()
    #init_tokenizer.pad_token_id = vocab["[PAD]"]
    #print("Created tokenizer: ", init_tokenizer)

    # Save the created tokenizer.
    init_tokenizer.save(decoder_tokenizer_path)
    print("Tokenizer saved to: ", decoder_tokenizer_path)

# Load from tokenizer file.
tokenizer = PreTrainedTokenizerFast(tokenizer_file=decoder_tokenizer_path)
tokenizer.add_special_tokens({'pad_token': '[PAD]', 'cls_token': '[CLS]', 'sep_token': '[SEP]',
    'unk_token': '[UNK]', 'mask_token': '[MASK]', 'bos_token': '[BOS]', 'eos_token': '[EOS]'
    })

print(f"\nLoaded tokenizer vocabulary ({len(tokenizer.get_vocab())}):\n" + "-"*50)
for k, v in tokenizer.get_vocab().items():
    print(k, v)
Example #29
from tokenizers import Tokenizer
from tokenizers.models import BPE, WordPiece, Unigram
from tokenizers.normalizers import Lowercase
from tokenizers.pre_tokenizers import Whitespace, Digits, Sequence
from tokenizers.trainers import BpeTrainer, WordPieceTrainer, UnigramTrainer

TRAIN_DATA_PATH = 'data/data_fusion_train.parquet'
OUTPUT_PATH = 'data/tokenizers/'

# Prepare data
train = pd.read_parquet(TRAIN_DATA_PATH, columns=['item_name'])
item_names = train.item_name.drop_duplicates().tolist()

# WordPiece tokenizer
tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
tokenizer.pre_tokenizer = Sequence([Whitespace(), Digits()])
tokenizer.normalizer = Lowercase()

trainer = WordPieceTrainer(
    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
    vocab_size=70000)
tokenizer.train_from_iterator(item_names, trainer)
tokenizer.save(os.path.join(OUTPUT_PATH, 'wordpiece_70k.json'))

# BPE tokenizer
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
tokenizer.pre_tokenizer = Sequence([Whitespace(), Digits()])
tokenizer.normalizer = Lowercase()

trainer = BpeTrainer(
    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
Example #30
from tokenizers import Tokenizer, normalizers
from tokenizers.models import WordLevel
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import WordLevelTrainer
from tokenizers.processors import TemplateProcessing

t = Tokenizer(WordLevel(unk_token="[UNK]"))
t.pre_tokenizer = Whitespace()

trainer = WordLevelTrainer(special_tokens=["[UNK]", "[PAD]", "[CLS]", "[SEP]"])
t.post_processor = TemplateProcessing(
    single="[CLS] $A [SEP]",
    # ,
    # pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[
        ("[CLS]", 2),
        ("[SEP]", 3),
    ])

files = ['tok-train-shuf-tgt.tsv']
t.train(files, trainer)

t.save("code_tokenizer.json")