def converted(self) -> Tokenizer:
    tokenizer_info_str = "#version:"
    token_suffix = "</w>"

    vocab = self.original_tokenizer.encoder
    merges = list(self.original_tokenizer.bpe_ranks.keys())
    if tokenizer_info_str in merges[0][0]:
        merges = merges[1:]

    tokenizer = Tokenizer(
        BPE(
            vocab,
            merges,
            dropout=None,
            unk_token=self.original_tokenizer.unk_token,
            end_of_word_suffix=token_suffix,
        )
    )

    tokenizer.normalizer = normalizers.BertNormalizer(lowercase=False, strip_accents=False)
    tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()
    tokenizer.decoder = decoders.BPEDecoder(suffix=token_suffix)
    tokenizer.post_processor = processors.BertProcessing(
        sep=(self.original_tokenizer.sep_token, self.original_tokenizer.sep_token_id),
        cls=(self.original_tokenizer.cls_token, self.original_tokenizer.cls_token_id),
    )

    return tokenizer
def converted(self) -> Tokenizer:
    vocab = self.original_tokenizer.vocab
    tokenizer = Tokenizer(WordPiece(vocab, unk_token=str(self.original_tokenizer.unk_token)))

    # # Let the tokenizer know about special tokens if they are part of the vocab
    # if tokenizer.token_to_id(str(self.original_tokenizer.unk_token)) is not None:
    #     tokenizer.add_special_tokens([str(self.original_tokenizer.unk_token)])
    # if tokenizer.token_to_id(str(self.original_tokenizer.sep_token)) is not None:
    #     tokenizer.add_special_tokens([str(self.original_tokenizer.sep_token)])
    # if tokenizer.token_to_id(str(self.original_tokenizer.cls_token)) is not None:
    #     tokenizer.add_special_tokens([str(self.original_tokenizer.cls_token)])
    # if tokenizer.token_to_id(str(self.original_tokenizer.pad_token)) is not None:
    #     tokenizer.add_special_tokens([str(self.original_tokenizer.pad_token)])
    # if tokenizer.token_to_id(str(self.original_tokenizer.mask_token)) is not None:
    #     tokenizer.add_special_tokens([str(self.original_tokenizer.mask_token)])

    tokenize_chinese_chars = False
    strip_accents = False
    do_lower_case = False
    if hasattr(self.original_tokenizer, "basic_tokenizer"):
        tokenize_chinese_chars = self.original_tokenizer.basic_tokenizer.tokenize_chinese_chars
        strip_accents = self.original_tokenizer.basic_tokenizer.strip_accents
        do_lower_case = self.original_tokenizer.basic_tokenizer.do_lower_case

    tokenizer.normalizer = normalizers.BertNormalizer(
        clean_text=True,
        handle_chinese_chars=tokenize_chinese_chars,
        strip_accents=strip_accents,
        lowercase=do_lower_case,
    )
    tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()

    cls = str(self.original_tokenizer.cls_token)
    sep = str(self.original_tokenizer.sep_token)
    cls_token_id = self.original_tokenizer.cls_token_id
    sep_token_id = self.original_tokenizer.sep_token_id

    tokenizer.post_processor = processors.TemplateProcessing(
        single=f"{cls}:2 $A:0 {sep}:0",  # token_type_id is 2 for Funnel transformer
        pair=f"{cls}:2 $A:0 {sep}:0 $B:1 {sep}:1",
        special_tokens=[
            (cls, cls_token_id),
            (sep, sep_token_id),
        ],
    )
    tokenizer.decoder = decoders.WordPiece(prefix="##")

    return tokenizer
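# --- Hedged illustration (not part of the converter above) -------------------
# A minimal, self-contained sketch of how TemplateProcessing assigns
# token_type_id 2 to the leading [CLS] token, mirroring the Funnel template.
# The tiny vocab and the input strings below are made up for demonstration.
from tokenizers import Tokenizer, pre_tokenizers, processors
from tokenizers.models import WordPiece

demo_vocab = {"[UNK]": 0, "[CLS]": 1, "[SEP]": 2, "hello": 3, "world": 4}
demo_tok = Tokenizer(WordPiece(demo_vocab, unk_token="[UNK]"))
demo_tok.pre_tokenizer = pre_tokenizers.BertPreTokenizer()
demo_tok.post_processor = processors.TemplateProcessing(
    single="[CLS]:2 $A:0 [SEP]:0",
    pair="[CLS]:2 $A:0 [SEP]:0 $B:1 [SEP]:1",
    special_tokens=[("[CLS]", 1), ("[SEP]", 2)],
)

demo_enc = demo_tok.encode("hello", "world")
print(demo_enc.tokens)    # ['[CLS]', 'hello', '[SEP]', 'world', '[SEP]']
print(demo_enc.type_ids)  # [2, 0, 0, 1, 1] -> [CLS] carries segment id 2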
def converted(self) -> Tokenizer: vocab = self.original_tokenizer.vocab tokenizer = Tokenizer( WordPiece(vocab, unk_token=str(self.original_tokenizer.unk_token))) tokenize_chinese_chars = False strip_accents = False do_lower_case = False if hasattr(self.original_tokenizer, "basic_tokenizer"): tokenize_chinese_chars = self.original_tokenizer.basic_tokenizer.tokenize_chinese_chars strip_accents = self.original_tokenizer.basic_tokenizer.strip_accents do_lower_case = self.original_tokenizer.basic_tokenizer.do_lower_case tokenizer.normalizer = normalizers.BertNormalizer( clean_text=True, handle_chinese_chars=tokenize_chinese_chars, strip_accents=strip_accents, lowercase=do_lower_case, ) tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer() cls = str(self.original_tokenizer.cls_token) sep = str(self.original_tokenizer.sep_token) question = str(self.original_tokenizer.question_token) dot = "." cls_token_id = self.original_tokenizer.cls_token_id sep_token_id = self.original_tokenizer.sep_token_id question_token_id = self.original_tokenizer.question_token_id dot_token_id = self.original_tokenizer.convert_tokens_to_ids(".") if self.original_tokenizer.padding_side == "right": pair = f"{cls}:0 $A:0 {question} {dot} {sep}:0 $B:1 {sep}:1" else: pair = f"{cls}:0 $A:0 {sep}:0 $B:1 {question} {dot} {sep}:1" tokenizer.post_processor = processors.TemplateProcessing( single=f"{cls}:0 $A:0 {sep}:0", pair=pair, special_tokens=[ (cls, cls_token_id), (sep, sep_token_id), (question, question_token_id), (dot, dot_token_id), ], ) tokenizer.decoder = decoders.WordPiece(prefix="##") return tokenizer
def converted(self) -> Tokenizer: vocab = self.original_tokenizer.vocab tokenizer = Tokenizer( WordPiece(vocab, unk_token=str(self.original_tokenizer.unk_token))) tokenize_chinese_chars = False strip_accents = False do_lower_case = False if hasattr(self.original_tokenizer, "basic_tokenizer"): tokenize_chinese_chars = self.original_tokenizer.basic_tokenizer.tokenize_chinese_chars strip_accents = self.original_tokenizer.basic_tokenizer.strip_accents do_lower_case = self.original_tokenizer.basic_tokenizer.do_lower_case tokenizer.normalizer = normalizers.BertNormalizer( clean_text=True, handle_chinese_chars=tokenize_chinese_chars, strip_accents=strip_accents, lowercase=do_lower_case, ) tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer() cls = str(self.original_tokenizer.cls_token) sep = str(self.original_tokenizer.sep_token) cls_token_id = self.original_tokenizer.cls_token_id sep_token_id = self.original_tokenizer.sep_token_id tokenizer.post_processor = processors.TemplateProcessing( single=f"{cls}:0 $A:0 {sep}:0", pair= f"{cls}:0 $A:0 {sep}:0 {sep}:0 $B:1 {sep}:1", # MPNet uses two [SEP] tokens special_tokens=[ (cls, cls_token_id), (sep, sep_token_id), ], ) tokenizer.decoder = decoders.WordPiece(prefix="##") return tokenizer
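# --- Hedged illustration (not part of the converter above) -------------------
# What decoders.WordPiece(prefix="##") does at decode time: continuation pieces
# marked with "##" are glued back onto the preceding token. The token list
# below is made up for demonstration.
from tokenizers import decoders

demo_decoder = decoders.WordPiece(prefix="##")
print(demo_decoder.decode(["un", "##break", "##able", "records"]))  # "unbreakable records"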
def __init__(self):
    self._pre_tokenizer = pre_tokenizers.BertPreTokenizer()

    spacy.prefer_gpu()
    spacy_tokenizer = spacy.load("en_core_web_lg")
    # spacy_tokenizer.add_pipe("merge_noun_chunks")

    # Verb phrase: verb (+ optional pronoun) followed by one or more prepositions, e.g. 'sit up'
    pattern1 = [{"POS": "VERB"}, {"POS": "PRON", "OP": "?"}, {"POS": "ADP", "OP": "+"}]
    # Preposition phrase: preposition + noun + preposition
    pattern2 = [{"POS": "ADP"}, {"POS": "NOUN"}, {"POS": "ADP"}]

    # Instantiate a Matcher and register both patterns (spaCy v3 API takes a list of patterns)
    phrase_matcher = Matcher(spacy_tokenizer.vocab)
    phrase_matcher.add("Verb Phrase", [pattern1])
    phrase_matcher.add("Preposition Phrase", [pattern2])

    # merge_entities is a registered pipeline component; the Matcher is not
    # (it returns matches rather than a Doc), so keep it on the instance instead
    # of adding it to the pipeline.
    spacy_tokenizer.add_pipe("merge_entities")
    self.phrase_matcher = phrase_matcher

    self.spacy_tokenizer = spacy_tokenizer
    self.spacy_tokenizer.tokenizer = self._custom_tokenizer
    print(self.spacy_tokenizer.pipe_names)
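# --- Hedged usage sketch (standalone, not part of the class above) -----------
# Shows how the two Matcher patterns registered in __init__ behave on a sample
# sentence. Assumes en_core_web_lg is installed; the sentence is illustrative,
# and which spans match depends on the model's POS tags.
import spacy
from spacy.matcher import Matcher

demo_nlp = spacy.load("en_core_web_lg")
demo_matcher = Matcher(demo_nlp.vocab)
demo_matcher.add("Verb Phrase", [[{"POS": "VERB"}, {"POS": "PRON", "OP": "?"}, {"POS": "ADP", "OP": "+"}]])
demo_matcher.add("Preposition Phrase", [[{"POS": "ADP"}, {"POS": "NOUN"}, {"POS": "ADP"}]])

demo_doc = demo_nlp("She ended up sitting at the edge of the stage.")
for match_id, start, end in demo_matcher(demo_doc):
    print(demo_nlp.vocab.strings[match_id], "->", demo_doc[start:end].text)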
def converted(self) -> Tokenizer:
    vocab = self.original_tokenizer.encoder
    merges = list(self.original_tokenizer.bpe_ranks.keys())
    unk_token = self.original_tokenizer.unk_token

    tokenizer = Tokenizer(
        BPE(
            vocab=vocab,
            merges=merges,
            dropout=None,
            unk_token=str(unk_token),
            end_of_word_suffix="</w>",
            fuse_unk=False,
        )
    )

    if tokenizer.token_to_id(str(unk_token)) is not None:
        tokenizer.add_special_tokens([str(unk_token)])

    tokenizer.normalizer = normalizers.BertNormalizer(lowercase=True)
    tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()
    tokenizer.decoder = decoders.BPEDecoder(suffix="</w>")

    return tokenizer
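# --- Hedged usage sketch ------------------------------------------------------
# If this converted() belongs to the OpenAIGPT converter from
# transformers.convert_slow_tokenizer (or an equivalent port), it is driven by
# constructing the converter with the slow tokenizer and calling converted().
# Requires network access to fetch the pretrained vocab; the output filename is
# illustrative.
from transformers import OpenAIGPTTokenizer
from transformers.convert_slow_tokenizer import OpenAIGPTConverter

slow_tokenizer = OpenAIGPTTokenizer.from_pretrained("openai-gpt")
fast_backend = OpenAIGPTConverter(slow_tokenizer).converted()  # -> tokenizers.Tokenizer
print(fast_backend.encode("Hello world!").tokens)
fast_backend.save("openai-gpt-tokenizer.json")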
elif args.type == "bert": print("Running Bert tokenizer") tok_p = BertTokenizer.from_pretrained(args.vocab) tok_r = Tokenizer( WordPiece.from_files(args.vocab, unk_token="[UNK]", max_input_chars_per_word=100)) tok_r.normalizer = BertNormalizer( clean_text=True, handle_chinese_chars=True, strip_accents=True, lowercase=True, ) # tok_r.pre_tokenizer = pre_tokenizers.Whitespace() tok_r.pre_tokenizer = pre_tokenizers.BertPreTokenizer() tok_r.decoder = decoders.WordPiece() tok_r.post_processor = BertProcessing( ("[SEP]", tok_r.token_to_id("[SEP]")), ("[CLS]", tok_r.token_to_id("[CLS]")), ) else: raise Exception(f"Unknown type {args.type}") def tokenize_r(): return tok_r.encode_batch(text) def tokenize_p(): return [