# Imports these excerpts rely on. `pad_sequence_to_length` and `DataArray` are helpers
# from the surrounding library (in AllenNLP they live in `allennlp.common.util` and
# `allennlp.data.fields.field` respectively); adjust the paths to match your codebase.
import itertools
from typing import Dict, List

import torch

from allennlp.common.util import pad_sequence_to_length
from allennlp.data.fields.field import DataArray


def pad_token_sequence(self,
                       tokens: Dict[str, List[List[int]]],
                       desired_num_tokens: Dict[str, int],
                       padding_lengths: Dict[str, int]) -> Dict[str, List[List[int]]]:
    # Pad the tokens. `tokens` has only one key...
    key = list(tokens.keys())[0]
    padded_tokens = pad_sequence_to_length(tokens[key],
                                           desired_num_tokens[key],
                                           default_value=self.get_padding_token)

    # Pad the characters within the tokens.
    desired_token_length = padding_lengths['num_token_characters']
    longest_token: List[int] = max(tokens[key], key=len, default=[])
    padding_value = 0
    if desired_token_length > len(longest_token):
        # Since we want to pad to greater than the longest token, we add a "dummy token"
        # so we can take advantage of the fast implementation of itertools.zip_longest.
        padded_tokens.append([padding_value] * desired_token_length)
    # Pad the list of lists to the longest sublist, appending 0's.
    padded_tokens = list(zip(*itertools.zip_longest(*padded_tokens, fillvalue=padding_value)))
    if desired_token_length > len(longest_token):
        # Remove the "dummy token".
        padded_tokens.pop()
    # Truncate all the tokens to the desired length, and return the result.
    return {key: [list(token[:desired_token_length]) for token in padded_tokens]}
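# A minimal standalone sketch of the character-padding step above, under the assumption
# that character ids are plain ints and 0 is the padding id (the helper name is
# illustrative, not part of the library). itertools.zip_longest pads every character
# list to the length of the longest one, and the temporary "dummy token" forces that
# length up to `desired_token_length` when no real token is long enough.
def _pad_character_lists(char_ids: List[List[int]],
                         desired_token_length: int,
                         padding_value: int = 0) -> List[List[int]]:
    padded = list(char_ids)
    longest_token = max(char_ids, key=len, default=[])
    if desired_token_length > len(longest_token):
        padded.append([padding_value] * desired_token_length)  # dummy token
    padded = list(zip(*itertools.zip_longest(*padded, fillvalue=padding_value)))
    if desired_token_length > len(longest_token):
        padded.pop()  # drop the dummy token
    return [list(token[:desired_token_length]) for token in padded]

# _pad_character_lists([[3, 4, 5], [7]], desired_token_length=4)
# -> [[3, 4, 5, 0], [7, 0, 0, 0]]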
def pad_token_sequence(self,
                       tokens: Dict[str, List[int]],
                       desired_num_tokens: Dict[str, int],
                       padding_lengths: Dict[str, int]) -> Dict[str, List[int]]:  # pylint: disable=unused-argument
    return {key: pad_sequence_to_length(val, desired_num_tokens[key])
            for key, val in tokens.items()}
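# For reference, a minimal sketch of the padding semantics assumed above (this is not
# the library's implementation of `pad_sequence_to_length`): truncate the sequence to
# the desired length, then right-pad with `default_value()` until it reaches that length.
def _pad_to_length_sketch(sequence: List[int],
                          desired_length: int,
                          default_value=lambda: 0) -> List[int]:
    padded = list(sequence[:desired_length])
    while len(padded) < desired_length:
        padded.append(default_value())
    return padded

# _pad_to_length_sketch([5, 8, 2], 5) -> [5, 8, 2, 0, 0], mirroring how the indexer
# above pads a value like {"tokens": [5, 8, 2]} (the key name is illustrative) when
# desired_num_tokens["tokens"] == 5.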
def as_tensor(self, padding_lengths: Dict[str, int]) -> DataArray:
    padded_field_list = pad_sequence_to_length(self.field_list,
                                               padding_lengths['num_fields'],
                                               self.field_list[0].empty_field)
    # Here we're removing the scoping on the padding length keys that we added in
    # `get_padding_lengths`; see the note there for more detail.
    child_padding_lengths = {key.replace('list_', '', 1): value
                             for key, value in padding_lengths.items()
                             if key.startswith('list_')}
    padded_fields = [field.as_tensor(child_padding_lengths)
                     for field in padded_field_list]
    return self.field_list[0].batch_tensors(padded_fields)
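# Illustrative example of the key un-scoping above (the keys follow the "list_" prefix
# convention used by the method; the concrete values are made up): only the "list_"-
# prefixed entries are child padding lengths, and stripping the prefix recovers the
# key names the child fields expect.
example_padding_lengths = {'num_fields': 3, 'list_num_tokens': 7, 'list_num_token_characters': 12}
example_child_lengths = {key.replace('list_', '', 1): value
                         for key, value in example_padding_lengths.items()
                         if key.startswith('list_')}
# example_child_lengths == {'num_tokens': 7, 'num_token_characters': 12}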
def as_tensor(self, padding_lengths: Dict[str, int]) -> torch.Tensor:
    desired_num_tokens = padding_lengths['num_tokens']
    padded_tags = pad_sequence_to_length(self._indexed_labels, desired_num_tokens)
    tensor = torch.LongTensor(padded_tags)
    return tensor
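# Small usage sketch (the tag ids are made up, and a padding id of 0 is assumed as the
# default fill value): a tag sequence of length 3 padded out to num_tokens == 5 becomes
# a 1-D LongTensor with zeros in the padded positions.
example_tags = pad_sequence_to_length([2, 0, 1], 5)  # -> [2, 0, 1, 0, 0]
example_tag_tensor = torch.LongTensor(example_tags)  # tensor([2, 0, 1, 0, 0])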