def test_default_with_dict(self):
    features = [{"label": i, "inputs": [0, 1, 2, 3, 4, 5]} for i in range(8)]
    batch = default_data_collator(features)
    self.assertTrue(batch["labels"].equal(torch.tensor(list(range(8)))))
    self.assertEqual(batch["labels"].dtype, torch.long)
    self.assertEqual(batch["inputs"].shape, torch.Size([8, 6]))

    # With label_ids
    features = [{"label_ids": [0, 1, 2], "inputs": [0, 1, 2, 3, 4, 5]} for i in range(8)]
    batch = default_data_collator(features)
    self.assertTrue(batch["labels"].equal(torch.tensor([[0, 1, 2]] * 8)))
    self.assertEqual(batch["labels"].dtype, torch.long)
    self.assertEqual(batch["inputs"].shape, torch.Size([8, 6]))

    # Features can already be tensors
    features = [{"label": i, "inputs": torch.randint(10, [10])} for i in range(8)]
    batch = default_data_collator(features)
    self.assertTrue(batch["labels"].equal(torch.tensor(list(range(8)))))
    self.assertEqual(batch["labels"].dtype, torch.long)
    self.assertEqual(batch["inputs"].shape, torch.Size([8, 10]))

    # Labels can already be tensors
    features = [{"label": torch.tensor(i), "inputs": torch.randint(10, [10])} for i in range(8)]
    batch = default_data_collator(features)
    self.assertEqual(batch["labels"].dtype, torch.long)
    self.assertTrue(batch["labels"].equal(torch.tensor(list(range(8)))))
    self.assertEqual(batch["inputs"].shape, torch.Size([8, 10]))
def test_default_with_no_labels(self):
    features = [{"label": None, "inputs": [0, 1, 2, 3, 4, 5]} for i in range(8)]
    batch = default_data_collator(features)
    self.assertTrue("labels" not in batch)
    self.assertEqual(batch["inputs"].shape, torch.Size([8, 6]))

    # With label_ids
    features = [{"label_ids": None, "inputs": [0, 1, 2, 3, 4, 5]} for i in range(8)]
    batch = default_data_collator(features)
    self.assertTrue("labels" not in batch)
    self.assertEqual(batch["inputs"].shape, torch.Size([8, 6]))
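# A minimal standalone sketch (not part of the tests above) of the behaviour the two
# tests check: `default_data_collator` stacks per-example features into batched
# tensors, maps `label`/`label_ids` to a single `labels` key, and drops labels that
# are None. The feature names below are illustrative.
from transformers import default_data_collator

features = [{"label": i, "inputs": [0, 1, 2, 3, 4, 5]} for i in range(8)]
batch = default_data_collator(features)
print(batch["labels"])        # tensor([0, 1, 2, 3, 4, 5, 6, 7]), dtype torch.int64
print(batch["inputs"].shape)  # torch.Size([8, 6])

features = [{"label": None, "inputs": [0, 1, 2, 3, 4, 5]} for _ in range(8)]
batch = default_data_collator(features)
print("labels" in batch)      # False: None labels are skipped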
def __call__(self, video_id, video_feature, text_feature):
    from transformers import default_data_collator
    if self.subsampling is not None and self.subsampling > 1:
        batch = []
        for _ in range(self.subsampling):
            centerclip_idx = random.randint(
                0, len(text_feature["start"]) - 1)
            sampled_max_text_len = random.randint(
                self.sampled_min_len, self.sampled_max_len)
            batch.append(
                self.sampling(
                    video_id,
                    video_feature,
                    text_feature,
                    centerclip_idx,
                    sampled_max_text_len,
                ))
        batch = self.batch_post_processing(batch, video_feature)
        batch = default_data_collator(batch)
    else:
        batch = self.sampling(video_id, video_feature, text_feature)
        batch = self.batch_post_processing(batch, video_feature)
    batch["video_id"] = video_id if isinstance(video_id, str) \
        else video_id[0]
    return batch
def __call__(self, batch):
    size = len(batch)
    batch = default_data_collator(batch)
    if size < self.batch_size:
        # Pad each collated tensor up to the fixed batch size.
        for k in batch.keys():
            batch[k] = self.pad_tensor(batch[k], self.padding_val_dict[k])
    return batch
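# `pad_tensor` and `padding_val_dict` are not shown above. A purely hypothetical
# sketch of such a helper, assuming it pads the batch dimension of each collated
# tensor up to a fixed batch size with a per-key padding value:
import torch

def pad_tensor(tensor: torch.Tensor, pad_value, batch_size: int) -> torch.Tensor:
    # Hypothetical helper: append rows filled with `pad_value` until the batch
    # dimension reaches `batch_size`; full batches are returned unchanged.
    missing = batch_size - tensor.shape[0]
    if missing <= 0:
        return tensor
    pad_shape = (missing,) + tuple(tensor.shape[1:])
    padding = torch.full(pad_shape, pad_value, dtype=tensor.dtype)
    return torch.cat([tensor, padding], dim=0)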
def get_data_from_features_or_inputs(
    tokenizer: BertTokenizer,
    label_list: List[str],
    feature: Optional[InputFeatures] = None,
    inputs: Optional[Dict[str, torch.Tensor]] = None,
) -> Tuple[str, str, str]:
    if feature is not None and inputs is None:
        inputs = default_data_collator([feature])
    elif feature is None and inputs is not None:
        pass
    elif feature is None and inputs is None:
        raise ValueError("Either `feature` or `inputs` must be provided.")
    elif feature is not None and inputs is not None:
        raise ValueError("Provide only one of `feature` and `inputs`, not both.")

    X, Y = decode_one_example(
        tokenizer=tokenizer, label_list=label_list, inputs=inputs, logits=None)
    premise, hypothesis = X.split("[CLS]")[1].split("[SEP]")[:2]
    return premise.strip(), hypothesis.strip(), Y
def __call__(self, video_id, video_feature, text_feature):
    from transformers import default_data_collator
    video_idx = video_id[1]
    if self.subsampling is not None and self.subsampling >= 1:
        batch = []
        for _ in range(self.subsampling):
            centerclip_idx = random.randint(
                0, len(text_feature["start"]) - 1)
            batch.append(
                self.sampling(video_idx, video_feature, text_feature,
                              centerclip_idx, self._get_text_maxlen()))
        batch = self.batch_post_processing(batch, video_feature)
        batch = default_data_collator(batch)
    else:
        raise ValueError(
            "dataset.subsampling must be >= 1 for efficient video loading."
        )
        # Unreachable fallback kept from the non-subsampling code path.
        batch = self.sampling(video_idx, video_feature, text_feature)
        batch = self.batch_post_processing(batch, video_feature)
    batch["video_id"] = video_id if isinstance(video_id, str) \
        else video_id[0]
    # e2e: make sure the frame features ("vfeats") are already a tensor.
    assert torch.is_tensor(batch["vfeats"])
    return batch
def collate(self, samples: Any) -> Tensor:
    """Override to convert a set of samples to a batch."""
    if self.padding != "max_length":
        data_collator = DataCollatorWithPadding(
            AutoTokenizer.from_pretrained(self.backbone, use_fast=True))
        return data_collator(samples)
    return default_data_collator(samples)
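# Hedged sketch of the distinction the branch above relies on: DataCollatorWithPadding
# pads variable-length tokenizer output per batch, while default_data_collator only
# stacks features that already share a common length. The checkpoint name is a placeholder.
from transformers import AutoTokenizer, DataCollatorWithPadding

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=True)
collator = DataCollatorWithPadding(tokenizer)
samples = [tokenizer("short text"), tokenizer("a somewhat longer piece of text")]
batch = collator(samples)
print(batch["input_ids"].shape)  # both rows padded to the length of the longer sample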
def sample_batch_of_heuristic(
        self,
        mode: str,
        heuristic: str,
        size: int,
        return_raw_data: bool = False) -> np.ndarray:

    if mode not in ["train", "eval"]:
        raise ValueError

    if mode == "train":
        dataset = self._hans_train_dataset
    else:
        dataset = self._hans_eval_dataset

    if dataset is None:
        raise ValueError("`dataset` is None")

    indices = self.get_indices_of_heuristic(
        mode=mode, heuristic=heuristic)
    sampled_indices = np.random.choice(
        indices, size=size, replace=False)
    sampled_data = [dataset[index] for index in sampled_indices]
    batched_data = default_data_collator(sampled_data)

    if return_raw_data is False:
        return batched_data

    return batched_data, sampled_data
def hans_data_collator(
        features: List[InputFeatures]) -> Dict[str, torch.Tensor]:
    """Data collator that removes the "pairID" key if present."""
    batch = default_data_collator(features)
    _ = batch.pop("pairID", None)
    return batch
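# Illustrative usage only (the model checkpoint and dataset names are placeholders):
# a collator like `hans_data_collator` is typically passed to `Trainer` so that
# metadata keys such as "pairID" never reach the model's forward pass.
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments

model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3)
trainer = Trainer(
    model=model,
    args=TrainingArguments(output_dir="outputs"),
    train_dataset=hans_train_dataset,  # placeholder: a dataset yielding feature dicts
    data_collator=hans_data_collator,  # strips "pairID" before batching
)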
def test_default_with_no_labels(self):
    features = [{"label": None, "inputs": [0, 1, 2, 3, 4, 5]} for i in range(8)]
    batch = default_data_collator(features, return_tensors="np")
    self.assertTrue("labels" not in batch)
    self.assertEqual(batch["inputs"].shape, (8, 6))

    # With label_ids
    features = [{"label_ids": None, "inputs": [0, 1, 2, 3, 4, 5]} for i in range(8)]
    batch = default_data_collator(features, return_tensors="np")
    self.assertTrue("labels" not in batch)
    self.assertEqual(batch["inputs"].shape, (8, 6))
def test_default_with_dict(self):
    features = [{"label": i, "inputs": [0, 1, 2, 3, 4, 5]} for i in range(8)]
    batch = default_data_collator(features, return_tensors="np")
    self.assertEqual(batch["labels"].tolist(), list(range(8)))
    self.assertEqual(batch["labels"].dtype, np.int64)
    self.assertEqual(batch["inputs"].shape, (8, 6))

    # With label_ids
    features = [{"label_ids": [0, 1, 2], "inputs": [0, 1, 2, 3, 4, 5]} for i in range(8)]
    batch = default_data_collator(features, return_tensors="np")
    self.assertEqual(batch["labels"].tolist(), [[0, 1, 2]] * 8)
    self.assertEqual(batch["labels"].dtype, np.int64)
    self.assertEqual(batch["inputs"].shape, (8, 6))

    # Features can already be tensors
    features = [{"label": i, "inputs": np.random.randint(0, 10, [10])} for i in range(8)]
    batch = default_data_collator(features, return_tensors="np")
    self.assertEqual(batch["labels"].tolist(), list(range(8)))
    self.assertEqual(batch["labels"].dtype, np.int64)
    self.assertEqual(batch["inputs"].shape, (8, 10))

    # Labels can already be tensors
    features = [{"label": np.array(i), "inputs": np.random.randint(0, 10, [10])} for i in range(8)]
    batch = default_data_collator(features, return_tensors="np")
    self.assertEqual(batch["labels"].dtype, np.int64)
    self.assertEqual(batch["labels"].tolist(), list(range(8)))
    self.assertEqual(batch["inputs"].shape, (8, 10))
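# The NumPy path exercised by the two tests above: passing `return_tensors="np"`
# makes `default_data_collator` return NumPy arrays instead of PyTorch tensors.
# A minimal sketch:
from transformers import default_data_collator

features = [{"label": i, "inputs": [0, 1, 2, 3]} for i in range(4)]
np_batch = default_data_collator(features, return_tensors="np")
print(type(np_batch["inputs"]))  # <class 'numpy.ndarray'>
print(np_batch["labels"].dtype)  # int64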
def __call__(self, sharded_video_idxs, video_features, text_features):
    from transformers import default_data_collator
    batch, video_ids = [], []
    for video_id, video_feature, text_feature in \
            zip(sharded_video_idxs, video_features, text_features):
        sub_batch = super().__call__(video_id, video_feature, text_feature)
        batch.append(sub_batch)
        if isinstance(video_id, tuple):
            video_id = video_id[0]
        video_ids.append(video_id)
    batch = default_data_collator(batch)
    batch["video_id"] = video_ids
    return batch
def meta_text_labels(self, config):
    from transformers import default_data_collator
    from ..utils import get_local_rank

    text_processor = TextProcessor(config)
    binarizer = MetaTextBinarizer(config)
    # TODO: add prompts to .yaml.
    text_labels = [label for label in self.text_labels]

    if get_local_rank() == 0:
        print(text_labels)

    outputs = []
    for text_label in text_labels:
        text_feature = text_processor(text_label)
        outputs.append(binarizer(text_feature))
    return default_data_collator(outputs)
def collate(self, samples: Any) -> Tensor:
    """Override to convert a set of samples to a batch."""
    if isinstance(samples, dict):
        samples = [samples]
    return default_data_collator(samples)
def one_experiment(
    use_parallel: bool,
    train_heuristic: str,
    eval_heuristics: List[str],
    experiment_type: str,
    hans_helper: HansHelper,
    train_dataset: CustomGlueDataset,
    task_model: torch.nn.Module,
    faiss_index: faiss_utils.FAISSIndex,
    s_test_damp: float,
    s_test_scale: float,
    s_test_num_samples: int,
    trainer: transformers.Trainer,
    version: str,
    version_2_num_datapoints: Optional[int],
    version_2_learning_rate: Optional[float],
    hans_eval_heuristic_inputs: Dict[str, Any],
    hans_eval_heuristic_raw_inputs: List[InputFeatures],
) -> Tuple[Dict[str, Any], Optional[torch.nn.Module]]:

    if task_model.device.type != "cuda":
        raise ValueError("The model is supposed to be on CUDA")
    if version_2_num_datapoints is None:
        raise ValueError
    if version_2_learning_rate is None:
        raise ValueError

    if experiment_type in ["most-harmful", "most-helpful"]:
        influences = influence_helpers.compute_influences_simplified(
            k=DEFAULT_KNN_K,
            faiss_index=faiss_index,
            model=task_model,
            inputs=hans_eval_heuristic_inputs,
            train_dataset=train_dataset,
            use_parallel=use_parallel,
            s_test_damp=s_test_damp,
            s_test_scale=s_test_scale,
            s_test_num_samples=s_test_num_samples,
            device_ids=[1, 2, 3],
            precomputed_s_test=None,
            faiss_index_use_mean_features_as_query=True,
        )
        helpful_indices, harmful_indices = misc_utils.get_helpful_harmful_indices_from_influences_dict(
            influences, n=version_2_num_datapoints)
        if experiment_type == "most-helpful":
            datapoint_indices = helpful_indices
        if experiment_type == "most-harmful":
            datapoint_indices = harmful_indices

    if experiment_type == "random":
        # s_test = None
        influences = None
        hans_eval_heuristic_inputs = None
        # Essentially shuffle the indices
        datapoint_indices = np.random.choice(
            len(train_dataset),
            size=len(train_dataset),
            replace=False)

    loss_collections = {}
    accuracy_collections = {}
    # num_datapoints = 1
    # learning_rate = 1e-4
    num_datapoints = version_2_num_datapoints
    learning_rate = version_2_learning_rate

    if version == "new-only-z":
        datapoints = [
            train_dataset[index]
            for index in datapoint_indices[:num_datapoints]
        ]
    if version == "new-only-ztest":
        datapoints = hans_eval_heuristic_raw_inputs
    if version == "new-z-and-ztest":
        datapoints = [
            train_dataset[index]
            for index in datapoint_indices[:num_datapoints]
        ] + hans_eval_heuristic_raw_inputs

    batch = default_data_collator(datapoints)
    new_model, _ = pseudo_gradient_step(
        model=task_model,
        inputs=batch,
        learning_rate=learning_rate)

    for heuristic in eval_heuristics:
        new_model_loss, new_model_accuracy = evaluate_heuristic(
            hans_helper=hans_helper,
            heuristic=heuristic,
            trainer=trainer,
            model=new_model)

        loss_collections[heuristic] = new_model_loss
        accuracy_collections[heuristic] = new_model_accuracy
        # print(f"Finished {num_datapoints}-{learning_rate}")

    output_collections = {
        # "s_test": s_test,
        "influences": influences,
        "loss": loss_collections,
        "accuracy": accuracy_collections,
        "datapoint_indices": datapoint_indices,
        "learning_rate": learning_rate,
        "num_datapoints": num_datapoints,
        "hans_eval_heuristic_inputs": hans_eval_heuristic_inputs,
    }

    # Warning: Check again whether using this `new_model` is a good idea
    return output_collections, new_model
def collate(self, samples: Any) -> Tensor:
    """Override to convert a set of samples to a batch."""
    return default_data_collator(samples)
def _glue_data_collator(features):
    """Wrap the data collator when the dataset returns a tuple."""
    # Since the dataset returns (inputs, label) pairs, we have to unzip them
    # before collating the inputs and tensorizing the labels.
    inputs, labels = zip(*features)
    return default_data_collator(inputs), torch.tensor(labels)
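# Illustrative usage (the dataset name is a placeholder): this collator expects a
# Dataset whose __getitem__ returns (encoded_inputs_dict, label), so the input dicts
# are batched by default_data_collator while the labels become a separate tensor.
from torch.utils.data import DataLoader

loader = DataLoader(
    glue_dataset,  # placeholder: items are (dict_of_features, int_label) tuples
    batch_size=32,
    collate_fn=_glue_data_collator,
)
inputs, labels = next(iter(loader))  # inputs: dict of stacked tensors; labels: shape [32]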