def split_by_doc(self) -> List[TransformerData]:
    """Split a TransformerData that represents a batch into a list with
    one TransformerData per Doc.
    """
    flat_spans = []
    for doc_spans in self.spans:
        flat_spans.extend(doc_spans)
    token_positions = get_token_positions(flat_spans)
    outputs = []
    start = 0
    prev_tokens = 0
    for doc_spans in self.spans:
        if len(doc_spans) == 0 or len(doc_spans[0]) == 0:
            outputs.append(TransformerData.empty())
            continue
        start_i = token_positions[doc_spans[0][0]]
        end_i = token_positions[doc_spans[-1][-1]] + 1
        end = start + len(doc_spans)
        doc_tokens = self.wordpieces[start:end]
        doc_align = self.align[start_i:end_i]
        doc_align.data = doc_align.data - prev_tokens
        if self.attention:
            attn = [torch2xp(t[start:end]) for t in self.attention]
        else:
            attn = None
        outputs.append(
            TransformerData(
                wordpieces=doc_tokens,
                tensors=[torch2xp(t[start:end]) for t in self.tensors],
                align=doc_align,
                attention=attn,
            )
        )
        prev_tokens += doc_tokens.input_ids.size
        start += len(doc_spans)
    return outputs
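
# A minimal, self-contained sketch of the slicing bookkeeping above, using plain
# numpy arrays in place of the TransformerData/Ragged types (the names
# `batch_tensor` and `spans_per_doc` are illustrative, not part of the API):
# each Doc owns a contiguous block of rows in the batched tensor, so `start:end`
# advances by that Doc's span count, while `prev_tokens` re-bases the alignment
# offsets so they index into the per-doc slice instead of the whole batch.
import numpy

batch_tensor = numpy.arange(4 * 3).reshape((4, 3))  # 4 spans in the batch
spans_per_doc = [1, 3]  # doc 0 contributes 1 span, doc 1 contributes 3
start = 0
per_doc = []
for n_spans in spans_per_doc:
    end = start + n_spans
    per_doc.append(batch_tensor[start:end])
    start = end
assert [t.shape[0] for t in per_doc] == [1, 3]
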
def split_by_doc(self) -> List[TransformerData]:
    """Split a TransformerData that represents a batch into a list with
    one TransformerData per Doc.
    """
    flat_spans = []
    for doc_spans in self.spans:
        flat_spans.extend(doc_spans)
    token_positions = get_token_positions(flat_spans)
    outputs = []
    start = 0
    prev_tokens = 0
    for doc_spans in self.spans:
        if len(doc_spans) == 0 or len(doc_spans[0]) == 0:
            outputs.append(TransformerData.empty())
            token_count = 0
        else:
            start_i = token_positions[doc_spans[0][0]]
            end_i = token_positions[doc_spans[-1][-1]] + 1
            end = start + len(doc_spans)
            doc_tokens = slice_hf_tokens(self.tokens, start, end)
            doc_tensors = [torch2xp(t[start:end]) for t in self.tensors]
            doc_align = self.align[start_i:end_i]
            doc_align.data = doc_align.data - prev_tokens
            outputs.append(
                TransformerData(
                    tokens=doc_tokens,
                    tensors=doc_tensors,  # type: ignore
                    align=doc_align,
                )
            )
            token_count = sum(len(texts) for texts in doc_tokens["input_texts"])
        prev_tokens += token_count
        start += len(doc_spans)
    return outputs
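
# `slice_hf_tokens` is not shown in this snippet. A plausible standalone sketch
# (an assumption about its behaviour, not the actual helper) is that it slices
# every subscriptable field of the tokenizer output along the batch dimension:
def slice_hf_tokens_sketch(inputs, start, end):
    output = {}
    for key, value in inputs.items():
        if hasattr(value, "__getitem__"):
            output[key] = value[start:end]
        else:
            output[key] = value
    return output


batch = {
    "input_ids": [[101, 7592, 102], [101, 2088, 102]],
    "input_texts": [["[CLS]", "hello", "[SEP]"], ["[CLS]", "world", "[SEP]"]],
}
assert slice_hf_tokens_sketch(batch, 0, 1)["input_texts"] == [["[CLS]", "hello", "[SEP]"]]
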
def test_pytorch_roundtrip_conversion():
    import torch

    xp_tensor = numpy.zeros((2, 3), dtype="f")
    torch_tensor = xp2torch(xp_tensor)
    assert isinstance(torch_tensor, torch.Tensor)
    new_xp_tensor = torch2xp(torch_tensor)
    assert numpy.array_equal(xp_tensor, new_xp_tensor)

@classmethod
def from_batch_encoding(cls, token_data: BatchEncoding) -> "WordpieceBatch":
    assert isinstance(token_data, BatchEncoding) or isinstance(token_data, dict)
    pad_token = token_data.get("pad_token", "[PAD]")
    lengths = [
        len([tok for tok in tokens if tok != pad_token])
        for tokens in token_data["input_texts"]
    ]
    n_seq = len(lengths)
    return cls(
        strings=token_data["input_texts"],
        input_ids=torch2xp(token_data["input_ids"]).reshape((n_seq, -1)),
        attention_mask=torch2xp(token_data["attention_mask"]).reshape((n_seq, -1)),
        lengths=lengths,
        token_type_ids=(
            torch2xp(token_data["token_type_ids"]).reshape((n_seq, -1))
            if "token_type_ids" in token_data
            else None
        ),
    )
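
# A self-contained sketch of the length computation above: the number of real
# wordpieces per sequence is recovered by dropping the pad token from each
# sequence's token texts (the example data below is made up):
pad_token = "[PAD]"
input_texts = [
    ["[CLS]", "hello", "world", "[SEP]", "[PAD]", "[PAD]"],
    ["[CLS]", "hi", "[SEP]", "[PAD]", "[PAD]", "[PAD]"],
]
lengths = [len([tok for tok in toks if tok != pad_token]) for toks in input_texts]
assert lengths == [4, 3]
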
def split_by_doc(self) -> List[TransformerData]:
    """Split a TransformerData that represents a batch into a list with
    one TransformerData per Doc.
    """
    flat_spans = []
    for doc_spans in self.spans:
        flat_spans.extend(doc_spans)
    token_positions = get_token_positions(flat_spans)
    outputs = []
    start = 0
    prev_tokens = 0
    for doc_spans in self.spans:
        if len(doc_spans) == 0 or len(doc_spans[0]) == 0:
            outputs.append(TransformerData.empty())
            continue
        start_i = token_positions[doc_spans[0][0]]
        end_i = token_positions[doc_spans[-1][-1]] + 1
        end = start + len(doc_spans)
        doc_tokens = self.wordpieces[start:end]
        doc_align = self.align[start_i:end_i]
        doc_align.data = doc_align.data - prev_tokens
        model_output = ModelOutput()
        last_hidden_state = self.model_output.last_hidden_state
        for key, output in self.model_output.items():
            if isinstance(output, torch.Tensor):
                model_output[key] = torch2xp(output[start:end])
            elif (
                isinstance(output, tuple)
                and all(isinstance(t, torch.Tensor) for t in output)
                and all(t.shape[0] == last_hidden_state.shape[0] for t in output)
            ):
                model_output[key] = [torch2xp(t[start:end]) for t in output]
        outputs.append(
            TransformerData(
                wordpieces=doc_tokens,
                model_output=model_output,
                align=doc_align,
            )
        )
        prev_tokens += doc_tokens.input_ids.size
        start += len(doc_spans)
    return outputs
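
# A minimal sketch of the ModelOutput slicing above, assuming torch is available;
# a plain dict of tensors stands in for the Hugging Face ModelOutput. Tensors are
# sliced along the batch dimension, and tuples (e.g. per-layer hidden states) are
# sliced element-wise when their first dimension matches the batch size:
import torch

n_batch = 4
fake_output = {
    "last_hidden_state": torch.zeros((n_batch, 7, 16)),
    "hidden_states": tuple(torch.zeros((n_batch, 7, 16)) for _ in range(3)),
}
start, end = 1, 3
sliced = {}
for key, value in fake_output.items():
    if isinstance(value, torch.Tensor):
        sliced[key] = value[start:end]
    elif isinstance(value, tuple) and all(t.shape[0] == n_batch for t in value):
        sliced[key] = [t[start:end] for t in value]
assert sliced["last_hidden_state"].shape[0] == 2
assert all(t.shape[0] == 2 for t in sliced["hidden_states"])
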
def convert_transformer_outputs(model, inputs_outputs, is_train):
    """Convert the padded output of the wrapped PyTorch transformer into a list
    of unpadded per-sequence arrays, and build the callback that re-pads the
    gradients for the backward pass.
    """
    layer_inputs, torch_outputs = inputs_outputs
    torch_tokvecs: torch.Tensor = torch_outputs[0]
    # Free the memory as soon as we can
    torch_outputs = None
    lengths = list(layer_inputs.input_len)
    tokvecs: List[Floats2d] = model.ops.unpad(torch2xp(torch_tokvecs), lengths)
    # Remove the BOS and EOS markers.
    tokvecs = [arr[1:-1] for arr in tokvecs]

    def backprop(d_tokvecs: List[Floats2d]) -> ArgsKwargs:
        # Restore entries for BOS and EOS markers.
        row = model.ops.alloc2f(1, d_tokvecs[0].shape[1])
        d_tokvecs = [model.ops.xp.vstack((row, arr, row)) for arr in d_tokvecs]
        return ArgsKwargs(
            args=(torch_tokvecs,),
            kwargs={"grad_tensors": xp2torch(model.ops.pad(d_tokvecs))},
        )

    return tokvecs, backprop
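
# A standalone check of the pad/unpad symmetry that the forward and backward
# passes above rely on, using thinc's NumpyOps directly (the array contents are
# arbitrary; only the shapes matter):
import numpy
from thinc.api import NumpyOps

ops = NumpyOps()
seqs = [numpy.ones((3, 4), dtype="f"), numpy.ones((5, 4), dtype="f")]
padded = ops.pad(seqs)                # shape (2, 5, 4): padded to the longest sequence
restored = ops.unpad(padded, [3, 5])  # back to the original list of 2d arrays
assert all(numpy.array_equal(a, b) for a, b in zip(seqs, restored))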