def __call__(self, item):
    """Tokenize one or more texts from ``item["text"]`` into one flat sample.

    Each text is pushed through the parent processor individually; its
    ``segment_ids`` are overwritten with the text's position so sentences
    remain distinguishable after concatenation. The per-text tensors are
    stacked by ``SampleList`` and then flattened into single 1-D tensors.

    Args:
        item: mapping with a ``"text"`` key holding a string or a list
            of strings.

    Returns:
        dict: the stacked-and-flattened fields of the resulting SampleList.
    """
    raw = item["text"]
    # Normalize the single-string case to a one-element list.
    text_list = raw if isinstance(raw, list) else [raw]

    samples = []
    for position, text in enumerate(text_list):
        current = Sample()
        current.update(super().__call__({"text": text}))
        # Tag every token of this text with its sentence index.
        current.segment_ids.fill_(position)
        samples.append(current)

    # SampleList stacks the per-text tensors along a new leading dim.
    stacked = SampleList(samples)
    # Flatten (num_texts, seq_len) -> (num_texts * seq_len,) per field.
    for field in ("input_ids", "input_mask", "segment_ids"):
        setattr(stacked, field, getattr(stacked, field).view(-1))
    return stacked.to_dict()
def __call__(self, item: Dict[str, Any]):
    """Tokenize one or more texts from ``item["text"]``, optionally fusing.

    Every text is tokenized via ``self.tokenizer``; its ``segment_ids``
    are overwritten with the text's position in the list. The per-text
    tensors are stacked by ``SampleList``. When ``self.fusion_strategy``
    is ``"concat"``, each stacked field is additionally flattened so all
    texts form one long sequence; otherwise the stacked 2-D tensors are
    returned as-is.

    Args:
        item: mapping with a ``"text"`` key holding a string or a list
            of strings.

    Returns:
        dict: the fields of the resulting SampleList.
    """
    raw = item["text"]
    # Normalize the single-string case to a one-element list.
    text_list = raw if isinstance(raw, list) else [raw]

    samples = []
    for position, text in enumerate(text_list):
        current = Sample()
        current.update(self.tokenizer({"text": text}))
        # Tag every token of this text with its sentence index.
        current.segment_ids.fill_(position)
        samples.append(current)

    # SampleList stacks the per-text tensors along a new leading dim.
    stacked = SampleList(samples)
    if self.fusion_strategy == "concat":
        # Flatten (num_texts, seq_len) -> (num_texts * seq_len,) per field.
        for field in ("input_ids", "input_mask", "segment_ids", "lm_label_ids"):
            setattr(stacked, field, getattr(stacked, field).view(-1))
    return stacked.to_dict()
def __call__(self, batch):
    """Coerce ``batch`` into a ``SampleList`` stamped with this collator's
    dataset name and type.

    Batched iterators may already hand over a ``SampleList`` — possibly
    wrapped in a one-element list — so conversion happens only when needed.

    Args:
        batch: a list of samples, a one-element list containing a
            ``SampleList``, or a ``SampleList`` itself.

    Returns:
        SampleList: the (possibly converted) batch with ``dataset_name``
        and ``dataset_type`` attributes set.
    """
    # Check list-ness before touching batch[0] / len, since batch may
    # already be a SampleList (case of batched iterators).
    if (
        isinstance(batch, list)
        and len(batch) == 1
        and isinstance(batch[0], SampleList)
    ):
        result = batch[0]
    elif isinstance(batch, SampleList):
        result = batch
    else:
        result = SampleList(batch)

    # Rebuild when no tensor field is registered — presumably the
    # dict round-trip re-detects tensor fields; confirm in SampleList.
    if result._get_tensor_field() is None:
        result = SampleList(result.to_dict())

    result.dataset_name = self._dataset_name
    result.dataset_type = self._dataset_type
    return result