def as_padded_tensor_dict(
    self, tokens: IndexedTokenList, padding_lengths: Dict[str, int]
) -> Dict[str, torch.Tensor]:
    # Different transformers use different padding values for tokens, but for mask and
    # type ids, the padding value is always 0.
    tokens = tokens.copy()
    padding_lengths = padding_lengths.copy()
    offsets_tokens = tokens.pop("offsets")
    offsets_padding_lengths = padding_lengths.pop("offsets")
    tensor_dict = {
        key: torch.LongTensor(
            pad_sequence_to_length(
                val,
                padding_lengths[key],
                default_value=lambda: 0
                if "mask" in key or "type_ids" in key
                else self._tokenizer.pad_token_id,
            )
        )
        for key, val in tokens.items()
    }
    tensor_dict["offsets"] = torch.LongTensor(
        pad_sequence_to_length(
            offsets_tokens, offsets_padding_lengths, default_value=lambda: (0, 0)
        )
    )
    return tensor_dict
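# The snippets here all rely on AllenNLP's `pad_sequence_to_length` helper. Below is a
# minimal standalone sketch of its behavior for reference, assuming the signature used
# above (a sequence, a desired length, a zero-argument `default_value` factory, and
# right-padding by default); the canonical implementation lives in `allennlp.nn.util`.
from typing import Any, Callable, List, Sequence


def pad_sequence_to_length(
    sequence: Sequence,
    desired_length: int,
    default_value: Callable[[], Any] = lambda: 0,
    padding_on_right: bool = True,
) -> List:
    sequence = list(sequence)
    # Truncate from the side opposite the padding side...
    truncated = sequence[:desired_length] if padding_on_right else sequence[-desired_length:]
    # ...then pad with freshly constructed default values until the desired length is reached.
    values_to_pad = [default_value()] * (desired_length - len(truncated))
    return truncated + values_to_pad if padding_on_right else values_to_pad + truncated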
def as_padded_tensor_dict(
    self, tokens: IndexedTokenList, padding_lengths: Dict[str, int]
) -> Dict[str, torch.Tensor]:
    tensor_dict = {}
    for key, val in tokens.items():
        if key == "type_ids":
            padding_value = 0
            mktensor = torch.LongTensor
        elif key == "mask" or key == "wordpiece_mask":
            padding_value = False
            mktensor = torch.BoolTensor
        elif len(val) > 0 and isinstance(val[0], bool):
            padding_value = False
            mktensor = torch.BoolTensor
        else:
            padding_value = self._tokenizer.pad_token_id
            if padding_value is None:
                # Some tokenizers don't have padding tokens and rely on the mask only.
                padding_value = 0
            mktensor = torch.LongTensor

        tensor = mktensor(
            pad_sequence_to_length(
                val, padding_lengths[key], default_value=lambda: padding_value
            )
        )
        tensor_dict[key] = tensor
    return tensor_dict
def as_padded_tensor_dict(
    self, tokens: IndexedTokenList, padding_lengths: Dict[str, int]
) -> Dict[str, torch.Tensor]:
    tokens = tokens.copy()
    padding_lengths = padding_lengths.copy()
    offsets_tokens = tokens.pop("offsets")
    offsets_padding_lengths = padding_lengths.pop("offsets")
    tensor_dict = self._matched_indexer.as_padded_tensor_dict(tokens, padding_lengths)
    tensor_dict["offsets"] = torch.LongTensor(
        pad_sequence_to_length(
            offsets_tokens, offsets_padding_lengths, default_value=lambda: (0, 0)
        )
    )
    return tensor_dict
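# A small, hypothetical illustration of how the "offsets" entry handled above gets padded:
# each original token maps to an inclusive (start, end) span of wordpieces, shorter
# sequences are filled with (0, 0) pairs, and the result becomes a LongTensor of shape
# (num_tokens, 2). The concrete values below are made up for the example.
import torch

offsets = [(1, 1), (2, 4), (5, 5)]  # three words; the second was split into three wordpieces
desired_length = 5
padded_offsets = offsets + [(0, 0)] * (desired_length - len(offsets))
offsets_tensor = torch.LongTensor(padded_offsets)  # shape: (5, 2)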
def indices_to_tokens(
    self, indexed_tokens: IndexedTokenList, vocabulary: Vocabulary
) -> List[Token]:
    token_ids = indexed_tokens["token_ids"]
    type_ids = indexed_tokens.get("type_ids")
    return [
        Token(
            text=vocabulary.get_token_from_index(token_ids[i], self._namespace),
            text_id=token_ids[i],
            type_id=type_ids[i] if type_ids is not None else None,
        )
        for i in range(len(token_ids))
    ]
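# A dependency-free sketch of the mapping `indices_to_tokens` performs, with a plain dict
# standing in for the Vocabulary namespace; the ids and strings are invented for the example.
index_to_token = {101: "[CLS]", 2023: "this", 102: "[SEP]"}
indexed_tokens = {"token_ids": [101, 2023, 102], "type_ids": [0, 0, 0]}

reconstructed = [
    {"text": index_to_token[token_id], "text_id": token_id, "type_id": type_id}
    for token_id, type_id in zip(indexed_tokens["token_ids"], indexed_tokens["type_ids"])
]
# -> [{'text': '[CLS]', 'text_id': 101, 'type_id': 0}, ...]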
def as_padded_tensor_dict(
    self, tokens: IndexedTokenList, padding_lengths: Dict[str, int]
) -> Dict[str, torch.Tensor]:
    # Different transformers use different padding values for tokens, but for mask and
    # type ids, the padding value is always 0.
    return {
        key: torch.LongTensor(
            pad_sequence_to_length(
                val,
                padding_lengths[key],
                default_value=lambda: 0
                if "mask" in key or "type_ids" in key
                else self._tokenizer.pad_token_id,
            )
        )
        for key, val in tokens.items()
    }
def as_padded_tensor_dict(
    self,
    tokens: IndexedTokenList,
    padding_lengths: Dict[str, int],
) -> Dict[str, torch.Tensor]:
    # Different transformers use different padding values for tokens, but for mask and
    # type ids, the padding value is always 0.
    return {
        key: torch.LongTensor(
            pad_sequence_to_length(
                val,
                padding_lengths[key],
                default_value=lambda: 0
                if key in {"mask", "type_ids", "wordpiece_mask", "segment_concat_mask"}
                else self._tokenizer.pad_token_id,
            )
        )
        for key, val in tokens.items()
    }
def as_padded_tensor_dict(
    self, tokens: IndexedTokenList, padding_lengths: Dict[str, int]
) -> Dict[str, torch.Tensor]:
    # Different transformers use different padding values for tokens, but for mask and
    # type ids, the padding value is always False/0.
    tensor_dict = {}
    for key, val in tokens.items():
        if val and isinstance(val[0], bool):
            tensor = torch.BoolTensor(
                pad_sequence_to_length(val, padding_lengths[key], default_value=lambda: False)
            )
        else:
            tensor = torch.LongTensor(
                pad_sequence_to_length(
                    val,
                    padding_lengths[key],
                    default_value=lambda: 0
                    if key == "type_ids"
                    else self._tokenizer.pad_token_id,
                )
            )
        tensor_dict[key] = tensor
    return tensor_dict
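# A hypothetical end-to-end illustration of the padding behavior above: boolean fields are
# padded with False and become BoolTensors, while everything else is padded with 0 for
# "type_ids" or with the tokenizer's pad id otherwise (0 is assumed here purely for the
# example) and becomes a LongTensor.
import torch

tokens = {"token_ids": [101, 2023, 102], "mask": [True, True, True], "type_ids": [0, 0, 0]}
padding_lengths = {"token_ids": 5, "mask": 5, "type_ids": 5}
pad_token_id = 0  # assumed; the indexer reads this from self._tokenizer.pad_token_id

tensor_dict = {}
for key, val in tokens.items():
    if isinstance(val[0], bool):
        tensor_dict[key] = torch.BoolTensor(val + [False] * (padding_lengths[key] - len(val)))
    else:
        pad_value = 0 if key == "type_ids" else pad_token_id
        tensor_dict[key] = torch.LongTensor(val + [pad_value] * (padding_lengths[key] - len(val)))

# tensor_dict["mask"]      -> tensor([ True,  True,  True, False, False])
# tensor_dict["token_ids"] -> tensor([ 101, 2023,  102,    0,    0])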
def as_padded_tensor_dict(
    self, tokens: IndexedTokenList, padding_lengths: Dict[str, int]
) -> Dict[str, torch.Tensor]:
    # Different transformers use different padding values for tokens, but for mask and
    # type ids, the padding value is always 0.
    #
    # Rewritten as a loop, the padding rule is:
    #   for key, val in tokens.items():
    #       if key in {"mask", "type_ids"}:
    #           pad with 0
    #       else:
    #           pad with self._tokenizer.pad_token_id
    return {
        key: torch.LongTensor(
            pad_sequence_to_length(
                val,
                padding_lengths[key],
                default_value=lambda: 0
                if key in {"mask", "type_ids"}
                else self._tokenizer.pad_token_id,
            )
        )
        for key, val in tokens.items()
    }
def _postprocess_output(self, output: IndexedTokenList) -> IndexedTokenList:
    """
    Takes an IndexedTokenList about to be returned by `tokens_to_indices()` and adds any
    necessary postprocessing, e.g. long sequence splitting.

    The input should have a `"token_ids"` key corresponding to the token indices. They should
    have special tokens already inserted.
    """
    if self._max_length is not None:
        # We prepare long indices by converting them to (assuming max_length == 5)
        # [CLS] A B C [SEP] [CLS] D E F [SEP] ...
        # The embedder is responsible for folding this 1-d sequence to 2-d and feeding it
        # to the transformer model.
        # TODO(zhaofengw): we aren't respecting word boundaries when segmenting wordpieces.
        indices = output["token_ids"]
        type_ids = output.get("type_ids", [0] * len(indices))

        # Strip the original special tokens.
        indices = indices[
            self._num_added_start_tokens : len(indices) - self._num_added_end_tokens
        ]
        type_ids = type_ids[
            self._num_added_start_tokens : len(type_ids) - self._num_added_end_tokens
        ]

        # Fold the indices into segments of at most `self._effective_max_length`.
        folded_indices = [
            indices[i : i + self._effective_max_length]
            for i in range(0, len(indices), self._effective_max_length)
        ]
        folded_type_ids = [
            type_ids[i : i + self._effective_max_length]
            for i in range(0, len(type_ids), self._effective_max_length)
        ]

        # Add special tokens to each segment.
        folded_indices = [
            self._tokenizer.build_inputs_with_special_tokens(segment)
            for segment in folded_indices
        ]
        single_sequence_start_type_ids = [
            t.type_id for t in self._allennlp_tokenizer.single_sequence_start_tokens
        ]
        single_sequence_end_type_ids = [
            t.type_id for t in self._allennlp_tokenizer.single_sequence_end_tokens
        ]
        folded_type_ids = [
            single_sequence_start_type_ids + segment + single_sequence_end_type_ids
            for segment in folded_type_ids
        ]
        assert all(
            len(segment_indices) == len(segment_type_ids)
            for segment_indices, segment_type_ids in zip(folded_indices, folded_type_ids)
        )

        # Flatten the segments back into single sequences.
        indices = [i for segment in folded_indices for i in segment]
        type_ids = [i for segment in folded_type_ids for i in segment]

        output["token_ids"] = indices
        output["type_ids"] = type_ids
        output["segment_concat_mask"] = [True] * len(indices)

    return output
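# A toy, self-contained sketch of the long-sequence folding performed above. The special
# token ids (101 for [CLS], 102 for [SEP]) and the segment size are made-up values for
# illustration; the real method takes them from the tokenizer and from `self._max_length`.
from typing import List


def fold_long_sequence(
    token_ids: List[int],
    effective_max_length: int,
    start_ids: List[int],
    end_ids: List[int],
) -> List[int]:
    # Strip the special tokens added around the full sequence.
    inner = token_ids[len(start_ids) : len(token_ids) - len(end_ids)]
    # Fold into chunks of at most `effective_max_length` wordpieces...
    segments = [
        inner[i : i + effective_max_length]
        for i in range(0, len(inner), effective_max_length)
    ]
    # ...then re-add the special tokens around every chunk and flatten.
    return [wp for segment in segments for wp in start_ids + segment + end_ids]


# [CLS] A B C D E F [SEP] with effective_max_length=3 becomes [CLS] A B C [SEP] [CLS] D E F [SEP]:
assert fold_long_sequence([101, 1, 2, 3, 4, 5, 6, 102], 3, [101], [102]) == [
    101, 1, 2, 3, 102, 101, 4, 5, 6, 102
]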