# The helpers `load_vocab`, `convert_by_vocab`, and the `BPE` class are assumed to be
# provided by the accompanying BERT-style tokenization and BPE utilities.
import sentencepiece as spm


class ThaiTokenizer(object):
    """Tokenizes Thai texts."""

    def __init__(self, vocab_file, spm_file):
        self.vocab = load_vocab(vocab_file)
        self.inv_vocab = {v: k for k, v in self.vocab.items()}
        self.bpe = BPE(vocab_file)
        self.s = spm.SentencePieceProcessor()
        self.s.Load(spm_file)

    def tokenize(self, text):
        bpe_tokens = self.bpe.encode(text).split(' ')
        spm_tokens = self.s.EncodeAsPieces(text)
        # Use whichever segmentation (BPE or SentencePiece) produces fewer pieces
        tokens = bpe_tokens if len(bpe_tokens) < len(spm_tokens) else spm_tokens

        split_tokens = []
        for token in tokens:
            new_token = token
            # If a piece carries the word-boundary prefix but is not in the vocab as-is,
            # split the prefix off and look up the remainder on its own.
            if token.startswith('_') and token not in self.vocab:
                split_tokens.append('_')
                new_token = token[1:]
            if new_token not in self.vocab:
                split_tokens.append('<unk>')
            else:
                split_tokens.append(new_token)
        return split_tokens

    def convert_tokens_to_ids(self, tokens):
        return convert_by_vocab(self.vocab, tokens)

    def convert_ids_to_tokens(self, ids):
        return convert_by_vocab(self.inv_vocab, ids)
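To make the piece post-processing in `tokenize` concrete, here is a minimal, self-contained sketch that replays the same loop on a made-up vocabulary and a made-up list of pieces. The `toy_vocab` entries, the `postprocess` helper, and the input pieces are purely illustrative; no real vocab or SentencePiece model files are involved.

# Illustrative only: toy vocabulary and toy pieces, mirroring the loop in ThaiTokenizer.tokenize.
toy_vocab = {'_': 0, '<unk>': 1, '_กิน': 2, 'ข้าว': 3}

def postprocess(pieces, vocab):
    split_tokens = []
    for token in pieces:
        new_token = token
        # A '_'-prefixed piece that is not in the vocab gets its prefix split off
        if token.startswith('_') and token not in vocab:
            split_tokens.append('_')
            new_token = token[1:]
        # Anything still out of vocabulary maps to '<unk>'
        split_tokens.append(new_token if new_token in vocab else '<unk>')
    return split_tokens

print(postprocess(['_กิน', '_แอปเปิ้ล', 'ข้าว'], toy_vocab))
# ['_กิน', '_', '<unk>', 'ข้าว']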
# Imports this class relies on. The exact module paths for the AllenNLP and
# transformers symbols depend on the (older) library versions this code targets,
# so treat them as assumptions. `load_vocab`, `convert_by_vocab`, `BPE`, and
# `_get_token_type_ids` are expected to come from the accompanying BERT
# tokenization, BPE, and AllenNLP wordpiece-indexer utilities.
import os
import logging
from typing import Dict, List

import torch
import sentencepiece as spm

from transformers import PreTrainedTokenizer
from transformers.tokenization_bert import VOCAB_FILES_NAMES  # path differs in newer versions
from allennlp.common.util import pad_sequence_to_length
from allennlp.data import Token, Vocabulary
from allennlp.data.token_indexers.token_indexer import IndexedTokenList  # allennlp >= 1.0

logger = logging.getLogger(__name__)


class ThaiTokenizer(PreTrainedTokenizer):
    """Tokenizes Thai texts."""

    def __init__(self,
                 vocab_file,
                 spm_file,
                 unk_token="[UNK]",
                 sep_token="[SEP]",
                 pad_token="[PAD]",
                 cls_token="[CLS]",
                 mask_token="[MASK]",
                 **kwargs):
        # Copy the super() call from the Hugging Face repo
        super(ThaiTokenizer, self).__init__(unk_token=unk_token,
                                            sep_token=sep_token,
                                            pad_token=pad_token,
                                            cls_token=cls_token,
                                            mask_token=mask_token,
                                            **kwargs)
        self.vocab = load_vocab(vocab_file)
        self.inv_vocab = {v: k for k, v in self.vocab.items()}
        self.bpe = BPE(vocab_file)
        self.s = spm.SentencePieceProcessor()
        self.s.Load(spm_file)
        self.pad_token = pad_token

        # Copied straight from the Hugging Face repo
        self.max_len = 512  # Hard-code 512 for the BERT architecture
        self.max_len_single_sentence = self.max_len - 2  # take into account special tokens
        self.max_len_sentences_pair = self.max_len - 3  # take into account special tokens

        # Added from the AllenNLP wordpiece indexer
        self._token_min_padding_length = 0  # Copied from token_indexer
        self.has_warned_for_as_padded_tensor = False  # Copied from token_indexer
        self._added_to_vocabulary = False
        namespace = "wordpiece"
        self._namespace = namespace
        self.max_pieces = 512
        self.use_starting_offsets = False
        self._truncate_long_sequences = False  # Set to False to use a sliding window
        self._warned_about_truncation = False

        separator_token = "[SEP]"
        start_tokens = ["[CLS]"]
        end_tokens = ["[SEP]"]

        # Convert start_tokens and end_tokens to wordpiece ids. (The AllenNLP original used
        # wordpiece_tokenizer(token) here; this assumes the special tokens, and the pieces
        # they split into, are present in the vocab.)
        self._start_piece_ids = [
            self.vocab[wordpiece]
            for token in (start_tokens or [])
            for wordpiece in self.tokenize(token)
        ]
        self._end_piece_ids = [
            self.vocab[wordpiece]
            for token in (end_tokens or [])
            for wordpiece in self.tokenize(token)
        ]

        # Convert separator_token to wordpiece ids
        self._separator_ids = [
            self.vocab[wordpiece] for wordpiece in self.tokenize(separator_token)
        ]

    def tokenize(self, text, add_special_tokens=None):
        bpe_tokens = self.bpe.encode(text).split(' ')
        spm_tokens = self.s.EncodeAsPieces(text)
        # Use whichever segmentation (BPE or SentencePiece) produces fewer pieces
        tokens = bpe_tokens if len(bpe_tokens) < len(spm_tokens) else spm_tokens

        split_tokens = []
        for token in tokens:
            new_token = token
            if token.startswith('_') and token not in self.vocab:
                split_tokens.append('_')
                new_token = token[1:]
            if new_token not in self.vocab:
                split_tokens.append('<unk>')
            else:
                split_tokens.append(new_token)
        return split_tokens

    # Added from AllenNLP's single_id_token_indexer
    def count_vocab_items(self, token: Token, counter: Dict[str, Dict[str, int]]):
        # If `text_id` is set on the token (e.g., if we're using some kind of hash-based word
        # encoding), we will not be using the vocab for this token.
        if getattr(token, "text_id", None) is None:
            text = token.text
            # No lowercasing is needed for Thai text. The AllenNLP original counted under
            # self.namespace; here we count under 'bert_token'.
            counter['bert_token'][text] += 1

    def _add_encoding_to_vocabulary(self, vocabulary: Vocabulary) -> None:
        for word, idx in self.vocab.items():
            vocabulary._token_to_index[self._namespace][word] = idx
            vocabulary._index_to_token[self._namespace][idx] = word

    def _warn_about_truncation(self, tokens: List[Token]) -> None:
        if not self._warned_about_truncation:
            logger.warning(
                "Too many wordpieces, truncating sequence. "
                "If you would like a sliding window, set `truncate_long_sequences` to False. "
                f"The offending input was: {str([token.text for token in tokens])}. "
                "To avoid polluting your logs we will not warn about this again.")
            self._warned_about_truncation = True

    def get_empty_token_list(self) -> IndexedTokenList:
        # Copied from the AllenNLP wordpiece indexer
        return {"input_ids": [], "offsets": [], "token_type_ids": [], "mask": []}

    def _add_start_and_end(self, wordpiece_ids: List[int]) -> List[int]:
        return self._start_piece_ids + wordpiece_ids + self._end_piece_ids

    def _extend(self, token_type_ids: List[int]) -> List[int]:
        """
        Extend the token type ids by len(start_piece_ids) on the left
        and len(end_piece_ids) on the right.
        """
        first = token_type_ids[0] if token_type_ids else 0
        last = token_type_ids[-1] if token_type_ids else 0
        return ([first for _ in self._start_piece_ids] +
                token_type_ids +
                [last for _ in self._end_piece_ids])

    # Copied from AllenNLP's single_id_token_indexer
    def get_padding_lengths(self, token: int) -> Dict[str, int]:  # pylint: disable=unused-argument
        return {}

    def get_token_min_padding_length(self) -> int:  # copied from AllenNLP's token_indexer
        """
        This method returns the minimum padding length required for this TokenIndexer.
        For example, the minimum padding length of `SingleIdTokenIndexer` is the largest
        size of filter when using `CnnEncoder`.
        """
        return self._token_min_padding_length

    def as_padded_tensor(
            self,
            tokens: Dict[str, List[int]],
            desired_num_tokens: Dict[str, int],
            padding_lengths: Dict[str, int]) -> Dict[str, torch.Tensor]:  # pylint: disable=unused-argument
        return {
            key: torch.LongTensor(pad_sequence_to_length(val, desired_num_tokens[key]))
            for key, val in tokens.items()
        }

    def convert_by_vocab_allenlp(self, vocab_allennlp, items):
        # Copied from the AllenNLP wordpiece indexer
        if not self._added_to_vocabulary:
            self._add_encoding_to_vocabulary(vocab_allennlp)
            self._added_to_vocabulary = True

        # Obtain a nested sequence of wordpieces, each represented by a list of wordpiece ids.
        # Changed from wordpiece_tokenizer to this tokenizer, hence `token.text` (the items are
        # AllenNLP Token objects).
        token_wordpiece_ids = [
            [self.vocab[wordpiece] for wordpiece in self.tokenize(token.text)]
            for token in items
        ]

        # Flattened list of wordpieces. In the end, the output of the model (e.g., BERT) should
        # have a sequence length equal to the length of this list. However, it will first be
        # split into chunks of length `self.max_pieces` so that they can be fit through the
        # model. After packing and passing through the model, it should be unpacked to represent
        # the wordpieces in this list.
        flat_wordpiece_ids = [
            wordpiece for token in token_wordpiece_ids for wordpiece in token
        ]

        # Similarly, we want to compute the token_type_ids from the flattened wordpiece ids
        # before we do the windowing; otherwise [SEP] tokens would get counted multiple times.
        flat_token_type_ids = _get_token_type_ids(flat_wordpiece_ids, self._separator_ids)

        # The code below will (possibly) pack the wordpiece sequence into multiple sub-sequences
        # by using a sliding window `window_length` that overlaps with previous windows according
        # to the `stride`. Suppose we have the following sentence: "I went to the store to buy
        # some milk". Then a sliding window of length 4 and stride of length 2 will split it up
        # into:
        # "[I went to the] [to the store to] [store to buy some] [buy some milk [PAD]]".
        # This is to ensure that the model has context of as much of the sentence as possible to
        # get accurate embeddings. Finally, the sequences will be padded with any start/end piece
        # ids, e.g., "[CLS] I went to the [SEP] [CLS] to the store to [SEP] ...".
        # The embedder should then be able to split this token sequence by the window length,
        # pass them through the model, and recombine them.

        # Specify the stride to be half of `self.max_pieces`, minus any additional start/end
        # wordpieces.
        window_length = self.max_pieces - len(self._start_piece_ids) - len(self._end_piece_ids)
        stride = window_length // 2

        # offsets[i] will give us the index into wordpiece_ids
        # for the wordpiece "corresponding to" the i-th input token.
        offsets = []

        # If we're using initial offsets, we want to start at offset = len(text_tokens)
        # so that the first offset is the index of the first wordpiece of tokens[0].
        # Otherwise, we want to start at len(text_tokens) - 1, so that the "previous"
        # offset is the last wordpiece of "tokens[-1]".
        offset = (len(self._start_piece_ids) if self.use_starting_offsets
                  else len(self._start_piece_ids) - 1)

        # Count the number of wordpieces accumulated
        pieces_accumulated = 0
        for token in token_wordpiece_ids:
            # Truncate the sequence if specified, which depends on where the offsets are
            next_offset = 1 if self.use_starting_offsets else 0
            if (self._truncate_long_sequences
                    and offset + len(token) - 1 >= window_length + next_offset):
                break

            # For initial offsets, the current value of `offset` is the start of
            # the current wordpiece, so add it to `offsets` and then increment it.
            if self.use_starting_offsets:
                offsets.append(offset)
                offset += len(token)
            # For final offsets, the current value of `offset` is the end of
            # the previous wordpiece, so increment it and then add it to `offsets`.
            else:
                offset += len(token)
                offsets.append(offset)

            pieces_accumulated += len(token)

        if len(flat_wordpiece_ids) <= window_length:
            # If all the wordpieces fit, then we don't need to do anything special
            wordpiece_windows = [self._add_start_and_end(flat_wordpiece_ids)]
            token_type_ids = self._extend(flat_token_type_ids)
        elif self._truncate_long_sequences:
            self._warn_about_truncation(items)
            wordpiece_windows = [
                self._add_start_and_end(flat_wordpiece_ids[:pieces_accumulated])
            ]
            token_type_ids = self._extend(flat_token_type_ids[:pieces_accumulated])
        else:
            # Create a sliding window of wordpieces of length `max_pieces` that advances by
            # `stride` steps and add start/end wordpieces to each window.
            # TODO: this currently does not respect word boundaries, so words may be cut in half
            # between windows. However, fixing that would increase complexity, as sequences
            # would need to be padded/unpadded in the middle.
            wordpiece_windows = [
                self._add_start_and_end(flat_wordpiece_ids[i:i + window_length])
                for i in range(0, len(flat_wordpiece_ids), stride)
            ]
            token_type_windows = [
                self._extend(flat_token_type_ids[i:i + window_length])
                for i in range(0, len(flat_token_type_ids), stride)
            ]

            # Check for overlap in the last window. Throw it away if it is redundant.
            last_window = wordpiece_windows[-1][1:]
            penultimate_window = wordpiece_windows[-2]
            if last_window == penultimate_window[-len(last_window):]:
                wordpiece_windows = wordpiece_windows[:-1]
                token_type_windows = token_type_windows[:-1]

            token_type_ids = [
                token_type for window in token_type_windows for token_type in window
            ]

        # Flatten the wordpiece windows
        wordpiece_ids = [
            wordpiece for sequence in wordpiece_windows for wordpiece in sequence
        ]

        # Our mask should correspond to the original tokens, because calling
        # util.get_text_field_mask on the "wordpiece_id" tokens will produce the wrong shape.
        # However, because of the max_pieces constraint, we may have truncated the wordpieces;
        # accordingly, we want the mask to correspond to the remaining tokens after truncation,
        # which is captured by the offsets.
        mask = [1 for _ in offsets]

        return {
            "input_ids": wordpiece_ids,
            "offsets": offsets,
            "token_type_ids": token_type_ids,
            "mask": mask,
        }

    def tokens_to_indices(self, tokens, vocabulary, indexer_name):
        # Added to use BERT in conjunction with BiDAF. `indexer_name` is a dummy argument,
        # kept only so the call signature matches what AllenNLP expects.
        return self.convert_by_vocab_allenlp(vocabulary, tokens)

    def convert_ids_to_tokens(self, ids):
        return convert_by_vocab(self.inv_vocab, ids)

    # Added from the BERT tokenizer in the Hugging Face repo
    @property
    def vocab_size(self):
        return len(self.vocab)

    def _tokenize(self, text):
        # Copied from Hugging Face's BertTokenizer. Note that `do_basic_tokenize`,
        # `basic_tokenizer`, and `wordpiece_tokenizer` are never set in this class,
        # so `tokenize` above is the entry point actually used.
        split_tokens = []
        if self.do_basic_tokenize:
            for token in self.basic_tokenizer.tokenize(text, never_split=self.all_special_tokens):
                for sub_token in self.wordpiece_tokenizer.tokenize(token):
                    split_tokens.append(sub_token)
        else:
            split_tokens = self.wordpiece_tokenizer.tokenize(text)
        return split_tokens

    def _convert_token_to_id(self, token):
        """Converts a token (str/unicode) to an id using the vocab."""
        return self.vocab.get(token, self.vocab.get(self.unk_token))

    def _convert_id_to_token(self, index):
        """Converts an index (integer) to a token (str/unicode) using the vocab."""
        # `ids_to_tokens` is populated by Hugging Face's BertTokenizer; in this class the
        # equivalent mapping is `self.inv_vocab`.
        return self.ids_to_tokens.get(index, self.unk_token)

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (strings) into a single string."""
        out_string = ' '.join(tokens).replace(' ##', '').strip()
        return out_string

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        """
        Build model inputs from a sequence or a pair of sequences for sequence classification
        tasks by concatenating and adding special tokens. A BERT sequence has the following
        format:
            single sequence: [CLS] X [SEP]
            pair of sequences: [CLS] A [SEP] B [SEP]
        """
        if token_ids_1 is None:
            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
        cls = [self.cls_token_id]
        sep = [self.sep_token_id]
        return cls + token_ids_0 + sep + token_ids_1 + sep

    def get_special_tokens_mask(self, token_ids_0, token_ids_1=None,
                                already_has_special_tokens=False):
        """
        Retrieves sequence ids from a token list that has no special tokens added. This method
        is called when adding special tokens using the tokenizer's ``prepare_for_model`` or
        ``encode_plus`` methods.

        Args:
            token_ids_0: list of ids (must not contain special tokens)
            token_ids_1: Optional list of ids (must not contain special tokens), necessary when
                fetching sequence ids for sequence pairs
            already_has_special_tokens: (default False) Set to True if the token list is already
                formatted with special tokens for the model

        Returns:
            A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
            if token_ids_1 is not None:
                raise ValueError(
                    "You should not supply a second sequence if the provided sequence of "
                    "ids is already formatted with special tokens for the model.")
            return list(
                map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0,
                    token_ids_0))

        if token_ids_1 is not None:
            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
        return [1] + ([0] * len(token_ids_0)) + [1]

    def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
        """
        Creates a mask from the two sequences passed to be used in a sequence-pair
        classification task. A BERT sequence pair mask has the following format:

            0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1
            | first sequence    | second sequence

        If token_ids_1 is None, only returns the first portion of the mask (0's).
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]
        if token_ids_1 is None:
            return len(cls + token_ids_0 + sep) * [0]
        return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]

    def save_vocabulary(self, vocab_path):
        """Save the tokenizer vocabulary to a directory or file."""
        index = 0
        if os.path.isdir(vocab_path):
            vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES['vocab_file'])
        else:
            vocab_file = vocab_path
        with open(vocab_file, "w", encoding="utf-8") as writer:
            for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
                if index != token_index:
                    logger.warning(
                        "Saving vocabulary to {}: vocabulary indices are not consecutive. "
                        "Please check that the vocabulary is not corrupted!".format(vocab_file))
                    index = token_index
                writer.write(token + u'\n')
                index += 1
        return (vocab_file,)