Exemplo n.º 1
0
    def test_default_with_dict(self):
        """Check ``default_data_collator`` on dict features (PyTorch tensors).

        Four feature layouts are exercised: integer ``label`` values,
        list-valued ``label_ids``, tensor ``inputs``, and tensor ``label``
        values. Each time the batch must expose ``labels`` with dtype
        ``torch.long`` and ``inputs`` stacked along a leading dimension of 8.
        """
        n_samples = 8
        expected_scalar_labels = torch.tensor(list(range(n_samples)))

        # Integer labels stored under "label".
        samples = [{"label": idx, "inputs": [0, 1, 2, 3, 4, 5]} for idx in range(n_samples)]
        collated = default_data_collator(samples)
        self.assertTrue(collated["labels"].equal(expected_scalar_labels))
        self.assertEqual(collated["labels"].dtype, torch.long)
        self.assertEqual(collated["inputs"].shape, torch.Size([8, 6]))

        # List-valued labels stored under "label_ids".
        samples = [{"label_ids": [0, 1, 2], "inputs": [0, 1, 2, 3, 4, 5]} for _ in range(n_samples)]
        collated = default_data_collator(samples)
        self.assertTrue(collated["labels"].equal(torch.tensor([[0, 1, 2]] * 8)))
        self.assertEqual(collated["labels"].dtype, torch.long)
        self.assertEqual(collated["inputs"].shape, torch.Size([8, 6]))

        # Inputs that are already tensors.
        samples = [{"label": idx, "inputs": torch.randint(10, [10])} for idx in range(n_samples)]
        collated = default_data_collator(samples)
        self.assertTrue(collated["labels"].equal(expected_scalar_labels))
        self.assertEqual(collated["labels"].dtype, torch.long)
        self.assertEqual(collated["inputs"].shape, torch.Size([8, 10]))

        # Labels that are already (0-dim) tensors; the dtype is checked both
        # before and after the value comparison.
        samples = [
            {"label": torch.tensor(idx), "inputs": torch.randint(10, [10])}
            for idx in range(n_samples)
        ]
        collated = default_data_collator(samples)
        self.assertEqual(collated["labels"].dtype, torch.long)
        self.assertTrue(collated["labels"].equal(expected_scalar_labels))
        self.assertEqual(collated["labels"].dtype, torch.long)
        self.assertEqual(collated["inputs"].shape, torch.Size([8, 10]))
Exemplo n.º 2
0
    def test_default_with_no_labels(self):
        """Check that ``None`` labels produce no ``labels`` entry in the batch.

        The same expectations are verified first for the ``label`` key and
        then for the ``label_ids`` key.
        """
        for label_key in ("label", "label_ids"):
            samples = [{label_key: None, "inputs": [0, 1, 2, 3, 4, 5]} for _ in range(8)]
            collated = default_data_collator(samples)
            self.assertTrue("labels" not in collated)
            self.assertEqual(collated["inputs"].shape, torch.Size([8, 6]))
Exemplo n.º 3
0
 def __call__(self, video_id, video_feature, text_feature):
     """Build the batch for one video, optionally from several random clips.

     When ``self.subsampling`` is set and greater than 1, ``self.sampling``
     is invoked that many times, each time with a random center-clip index
     (bounded by ``len(text_feature["start"])``) and a random text length
     drawn from ``[self.sampled_min_len, self.sampled_max_len]``. The
     samples are passed through ``self.batch_post_processing`` and then
     ``default_data_collator``. Otherwise a single sample is taken and only
     post-processed.

     The returned batch gets a ``"video_id"`` entry: ``video_id`` itself if
     it is a string, else ``video_id[0]``.
     """
     from transformers import default_data_collator

     multi_clip = self.subsampling is not None and self.subsampling > 1
     if multi_clip:
         samples = []
         for _ in range(self.subsampling):
             # Keep the draw order (center clip, then text length) so that a
             # seeded RNG yields the same sequence of samples.
             center_idx = random.randint(0, len(text_feature["start"]) - 1)
             text_len = random.randint(self.sampled_min_len,
                                       self.sampled_max_len)
             samples.append(
                 self.sampling(video_id, video_feature, text_feature,
                               center_idx, text_len))
         processed = self.batch_post_processing(samples, video_feature)
         batch = default_data_collator(processed)
     else:
         single = self.sampling(video_id, video_feature, text_feature)
         batch = self.batch_post_processing(single, video_feature)

     if isinstance(video_id, str):
         batch["video_id"] = video_id
     else:
         batch["video_id"] = video_id[0]
     return batch
Exemplo n.º 4
0
 def __call__(self, batch):
     """Collate ``batch`` and pad every entry when the batch is short.

     The samples are first merged with ``default_data_collator``. If fewer
     than ``self.batch_size`` samples were given, each collated entry is
     replaced by ``self.pad_tensor(entry, self.padding_val_dict[key])``.
     """
     num_samples = len(batch)
     collated = default_data_collator(batch)
     if num_samples >= self.batch_size:
         return collated
     for key in collated:
         # Only existing keys are reassigned, so iterating while updating
         # does not change the mapping's size.
         collated[key] = self.pad_tensor(collated[key],
                                         self.padding_val_dict[key])
     return collated
def get_data_from_features_or_inputs(
    tokenizer: BertTokenizer,
    label_list: List[str],
    feature: Optional[InputFeatures] = None,
    inputs: Optional[Dict[str, torch.Tensor]] = None,
) -> Tuple[str, str, str]:
    """Decode one example into ``(premise, hypothesis, label)``.

    Exactly one of ``feature`` and ``inputs`` must be supplied. A ``feature``
    is first turned into a one-element batch with ``default_data_collator``;
    ``inputs`` is used as given.

    Args:
        tokenizer: Forwarded to ``decode_one_example``.
        label_list: Forwarded to ``decode_one_example``.
        feature: A single example to collate and decode.
        inputs: An already collated batch to decode.

    Returns:
        The stripped premise and hypothesis, taken from the text after
        ``"[CLS]"`` split on ``"[SEP]"``, followed by the second value
        returned by ``decode_one_example``.

    Raises:
        ValueError: If both or neither of ``feature`` and ``inputs`` are
            given.
    """
    if (feature is None) == (inputs is None):
        raise ValueError(
            "Exactly one of `feature` and `inputs` must be provided.")

    if inputs is None:
        inputs = default_data_collator([feature])

    X, Y = decode_one_example(tokenizer=tokenizer,
                              label_list=label_list,
                              inputs=inputs,
                              logits=None)
    # Decoded text looks like "... [CLS] premise [SEP] hypothesis [SEP] ...";
    # keep the first two "[SEP]"-delimited segments after "[CLS]".
    premise, hypothesis = X.split("[CLS]")[1].split("[SEP]")[:2]
    return premise.strip(), hypothesis.strip(), Y
Exemplo n.º 6
0
    def __call__(self, video_id, video_feature, text_feature):
        """Build a collated batch of ``self.subsampling`` clips for one video.

        ``video_id[1]`` is passed to ``self.sampling`` as the video index.
        For each of the ``self.subsampling`` draws a random center-clip index
        (bounded by ``len(text_feature["start"])``) is chosen and the text
        length comes from ``self._get_text_maxlen()``. The samples are
        post-processed with ``self.batch_post_processing`` and stacked with
        ``default_data_collator``.

        The returned batch gets a ``"video_id"`` entry: ``video_id`` itself if
        it is a string, else ``video_id[0]``.

        Raises:
            ValueError: If ``self.subsampling`` is ``None`` or not ``>= 1``.
            AssertionError: If ``batch["vfeats"]`` is not a tensor.
        """
        from transformers import default_data_collator

        # NOTE(review): video_id is indexed here, so it is presumably a
        # (name, index) pair -- confirm against the caller.
        video_idx = video_id[1]

        has_subsampling = (self.subsampling is not None
                           and self.subsampling >= 1)
        if not has_subsampling:
            raise ValueError(
                "dataset.subsampling must be >= 1 for efficient video loading."
            )

        batch = []
        for _ in range(self.subsampling):
            centerclip_idx = random.randint(0,
                                            len(text_feature["start"]) - 1)
            batch.append(
                self.sampling(video_idx, video_feature, text_feature,
                              centerclip_idx, self._get_text_maxlen()))
        batch = self.batch_post_processing(batch, video_feature)
        batch = default_data_collator(batch)

        batch["video_id"] = video_id if isinstance(video_id, str) \
            else video_id[0]
        # e2e: make sure frame ids is into tensor.
        assert torch.is_tensor(batch["vfeats"])
        return batch
Exemplo n.º 7
0
 def collate(self, samples: Any) -> Tensor:
     """Override to convert a set of samples to a batch.

     With ``self.padding == "max_length"`` the samples go straight to
     ``default_data_collator``. For any other padding setting they are
     collated by a ``DataCollatorWithPadding`` built around the fast
     tokenizer loaded from ``self.backbone``.
     """
     if self.padding == "max_length":
         return default_data_collator(samples)
     # NOTE(review): ``from_pretrained`` is invoked on every call.
     tokenizer = AutoTokenizer.from_pretrained(self.backbone, use_fast=True)
     pad_and_collate = DataCollatorWithPadding(tokenizer)
     return pad_and_collate(samples)
    def sample_batch_of_heuristic(
            self,
            mode: str,
            heuristic: str,
            size: int,
            return_raw_data: bool = False) -> np.ndarray:
        """Sample ``size`` examples of one heuristic and collate them.

        The dataset is ``self._hans_train_dataset`` for ``mode == "train"``
        and ``self._hans_eval_dataset`` for ``mode == "eval"``. Candidate
        indices come from ``self.get_indices_of_heuristic`` and are drawn
        without replacement via ``np.random.choice``.

        Returns the output of ``default_data_collator`` on the sampled
        examples; unless ``return_raw_data`` is ``False``, returns a
        ``(collated, raw_examples)`` tuple instead.

        NOTE(review): the ``np.ndarray`` return annotation does not match
        either of these return values.

        Raises:
            ValueError: If ``mode`` is neither ``"train"`` nor ``"eval"``, or
                if the selected dataset is ``None``.
        """
        if mode not in ("train", "eval"):
            raise ValueError

        dataset = (self._hans_train_dataset
                   if mode == "train" else self._hans_eval_dataset)
        if dataset is None:
            raise ValueError("`dataset` is None")

        candidate_indices = self.get_indices_of_heuristic(
            mode=mode, heuristic=heuristic)
        chosen = np.random.choice(candidate_indices, size=size, replace=False)

        raw_examples = [dataset[idx] for idx in chosen]
        collated = default_data_collator(raw_examples)
        if return_raw_data is not False:
            return collated, raw_examples
        return collated
Exemplo n.º 9
0
def hans_data_collator(
        features: List[InputFeatures]) -> Dict[str, torch.Tensor]:
    """
    Collate ``features`` with ``default_data_collator`` and drop the
    "pairID" entry from the resulting batch if it is present.
    """
    collated = default_data_collator(features)
    # A default is supplied so a batch without "pairID" is not an error.
    collated.pop("pairID", None)
    return collated
    def test_default_with_no_labels(self):
        """Check that ``None`` labels produce no ``labels`` entry (NumPy).

        The same expectations are verified first for the ``label`` key and
        then for the ``label_ids`` key, with ``return_tensors="np"``.
        """
        for label_key in ("label", "label_ids"):
            samples = [
                {label_key: None, "inputs": [0, 1, 2, 3, 4, 5]}
                for _ in range(8)
            ]
            collated = default_data_collator(samples, return_tensors="np")
            self.assertTrue("labels" not in collated)
            self.assertEqual(collated["inputs"].shape, (8, 6))
    def test_default_with_dict(self):
        """Check ``default_data_collator`` on dict features (NumPy arrays).

        Four feature layouts are exercised with ``return_tensors="np"``:
        integer ``label`` values, list-valued ``label_ids``, array ``inputs``,
        and array ``label`` values. Each time the batch must expose
        ``labels`` with dtype ``np.int64`` and ``inputs`` stacked along a
        leading dimension of 8.
        """
        n_samples = 8
        expected_scalar_labels = list(range(n_samples))

        # Integer labels stored under "label".
        samples = [
            {"label": idx, "inputs": [0, 1, 2, 3, 4, 5]}
            for idx in range(n_samples)
        ]
        collated = default_data_collator(samples, return_tensors="np")
        self.assertEqual(collated["labels"].tolist(), expected_scalar_labels)
        self.assertEqual(collated["labels"].dtype, np.int64)
        self.assertEqual(collated["inputs"].shape, (8, 6))

        # List-valued labels stored under "label_ids".
        samples = [
            {"label_ids": [0, 1, 2], "inputs": [0, 1, 2, 3, 4, 5]}
            for _ in range(n_samples)
        ]
        collated = default_data_collator(samples, return_tensors="np")
        self.assertEqual(collated["labels"].tolist(), [[0, 1, 2]] * 8)
        self.assertEqual(collated["labels"].dtype, np.int64)
        self.assertEqual(collated["inputs"].shape, (8, 6))

        # Inputs that are already arrays.
        samples = [
            {"label": idx, "inputs": np.random.randint(0, 10, [10])}
            for idx in range(n_samples)
        ]
        collated = default_data_collator(samples, return_tensors="np")
        self.assertEqual(collated["labels"].tolist(), expected_scalar_labels)
        self.assertEqual(collated["labels"].dtype, np.int64)
        self.assertEqual(collated["inputs"].shape, (8, 10))

        # Labels that are already (0-dim) arrays; the dtype is checked both
        # before and after the value comparison.
        samples = [
            {"label": np.array(idx), "inputs": np.random.randint(0, 10, [10])}
            for idx in range(n_samples)
        ]
        collated = default_data_collator(samples, return_tensors="np")
        self.assertEqual(collated["labels"].dtype, np.int64)
        self.assertEqual(collated["labels"].tolist(), expected_scalar_labels)
        self.assertEqual(collated["labels"].dtype, np.int64)
        self.assertEqual(collated["inputs"].shape, (8, 10))
Exemplo n.º 12
0
 def __call__(self, sharded_video_idxs, video_features, text_features):
     """Collate the per-video batches of a shard into one batch.

     The three arguments are iterated in lockstep; each triple is handed
     to the parent class's ``__call__`` and the results are merged with
     ``default_data_collator``. The merged batch's ``"video_id"`` entry is
     the list of ids, where a tuple id contributes its first element.
     """
     from transformers import default_data_collator

     per_video_batches = []
     collected_ids = []
     triples = zip(sharded_video_idxs, video_features, text_features)
     for vid, vfeat, tfeat in triples:
         # The parent receives the id unmodified; only the recorded id is
         # reduced to its first element.
         per_video_batches.append(super().__call__(vid, vfeat, tfeat))
         collected_ids.append(vid[0] if isinstance(vid, tuple) else vid)

     merged = default_data_collator(per_video_batches)
     merged["video_id"] = collected_ids
     return merged
Exemplo n.º 13
0
    def meta_text_labels(self, config):
        """Turn ``self.text_labels`` into one collated batch.

        Each label is passed through a ``TextProcessor`` and then a
        ``MetaTextBinarizer`` (both built from ``config``); the results are
        merged with ``default_data_collator``. The label list is printed
        when ``get_local_rank()`` returns 0.
        """
        from transformers import default_data_collator
        from ..utils import get_local_rank

        processor = TextProcessor(config)
        to_binary = MetaTextBinarizer(config)
        # TODO: add prompts to .yaml.
        labels = list(self.text_labels)

        if get_local_rank() == 0:
            print(labels)

        encoded = [to_binary(processor(label)) for label in labels]
        return default_data_collator(encoded)
Exemplo n.º 14
0
 def collate(self, samples: Any) -> Tensor:
     """Override to convert a set of samples to a batch.

     A single ``dict`` sample is wrapped in a one-element list before being
     passed to ``default_data_collator``.
     """
     batch_items = [samples] if isinstance(samples, dict) else samples
     return default_data_collator(batch_items)
Exemplo n.º 15
0
def one_experiment(
    use_parallel: bool,
    train_heuristic: str,
    eval_heuristics: List[str],
    experiment_type: str,
    hans_helper: HansHelper,
    train_dataset: CustomGlueDataset,
    task_model: torch.nn.Module,
    faiss_index: faiss_utils.FAISSIndex,
    s_test_damp: float,
    s_test_scale: float,
    s_test_num_samples: int,
    trainer: transformers.Trainer,
    version: str,
    version_2_num_datapoints: Optional[int],
    version_2_learning_rate: Optional[float],
    hans_eval_heuristic_inputs: Dict[str, Any],
    hans_eval_heuristic_raw_inputs: List[InputFeatures],
) -> Tuple[Dict[str, Any], Optional[torch.nn.Module]]:
    """Take one pseudo gradient step on selected datapoints and evaluate it.

    Training datapoints are ranked according to ``experiment_type``:

    * ``"most-helpful"`` / ``"most-harmful"``: influences are computed with
      ``influence_helpers.compute_influences_simplified`` and the helpful or
      harmful indices are taken from
      ``misc_utils.get_helpful_harmful_indices_from_influences_dict``.
    * ``"random"``: a random permutation of all training indices; the
      reported ``influences`` and ``hans_eval_heuristic_inputs`` are ``None``.

    The batch for the step depends on ``version``:

    * ``"new-only-z"``: the first ``version_2_num_datapoints`` ranked
      training datapoints.
    * ``"new-only-ztest"``: ``hans_eval_heuristic_raw_inputs`` only.
    * ``"new-z-and-ztest"``: both of the above, concatenated.

    The model returned by ``pseudo_gradient_step`` is then evaluated with
    ``evaluate_heuristic`` on every entry of ``eval_heuristics``.

    Note: ``train_heuristic`` is not used in this function body.

    Returns:
        A dict with the keys ``"influences"``, ``"loss"``, ``"accuracy"``,
        ``"datapoint_indices"``, ``"learning_rate"``, ``"num_datapoints"``
        and ``"hans_eval_heuristic_inputs"``, together with the updated
        model.

    Raises:
        ValueError: If the model is not on CUDA, if
            ``version_2_num_datapoints`` or ``version_2_learning_rate`` is
            ``None``, or if ``experiment_type`` / ``version`` is not one of
            the values listed above.
    """
    if task_model.device.type != "cuda":
        raise ValueError("The model is supposed to be on CUDA")

    if version_2_num_datapoints is None:
        raise ValueError("`version_2_num_datapoints` must not be None")
    if version_2_learning_rate is None:
        raise ValueError("`version_2_learning_rate` must not be None")

    # Fail fast on unsupported modes: without these checks the variables
    # assigned in the branches below would be left unbound and the function
    # would only fail later, possibly after the influence computation.
    if experiment_type not in ("most-harmful", "most-helpful", "random"):
        raise ValueError(f"Unknown `experiment_type`: {experiment_type!r}")
    if version not in ("new-only-z", "new-only-ztest", "new-z-and-ztest"):
        raise ValueError(f"Unknown `version`: {version!r}")

    if experiment_type == "random":
        influences = None
        hans_eval_heuristic_inputs = None
        # Essentially shuffle the indices
        datapoint_indices = np.random.choice(len(train_dataset),
                                             size=len(train_dataset),
                                             replace=False)
    else:
        # NOTE(review): `device_ids` is hard-coded to GPUs 1-3.
        influences = influence_helpers.compute_influences_simplified(
            k=DEFAULT_KNN_K,
            faiss_index=faiss_index,
            model=task_model,
            inputs=hans_eval_heuristic_inputs,
            train_dataset=train_dataset,
            use_parallel=use_parallel,
            s_test_damp=s_test_damp,
            s_test_scale=s_test_scale,
            s_test_num_samples=s_test_num_samples,
            device_ids=[1, 2, 3],
            precomputed_s_test=None,
            faiss_index_use_mean_features_as_query=True,
        )
        helpful_indices, harmful_indices = misc_utils.get_helpful_harmful_indices_from_influences_dict(
            influences, n=version_2_num_datapoints)
        if experiment_type == "most-helpful":
            datapoint_indices = helpful_indices
        else:
            datapoint_indices = harmful_indices

    loss_collections = {}
    accuracy_collections = {}

    num_datapoints = version_2_num_datapoints
    learning_rate = version_2_learning_rate

    if version == "new-only-z":
        datapoints = [
            train_dataset[index]
            for index in datapoint_indices[:num_datapoints]
        ]
    elif version == "new-only-ztest":
        datapoints = hans_eval_heuristic_raw_inputs
    else:
        datapoints = [
            train_dataset[index]
            for index in datapoint_indices[:num_datapoints]
        ] + hans_eval_heuristic_raw_inputs

    batch = default_data_collator(datapoints)
    new_model, _ = pseudo_gradient_step(model=task_model,
                                        inputs=batch,
                                        learning_rate=learning_rate)

    for heuristic in eval_heuristics:
        new_model_loss, new_model_accuracy = evaluate_heuristic(
            hans_helper=hans_helper,
            heuristic=heuristic,
            trainer=trainer,
            model=new_model)

        loss_collections[heuristic] = new_model_loss
        accuracy_collections[heuristic] = new_model_accuracy

    output_collections = {
        "influences": influences,
        "loss": loss_collections,
        "accuracy": accuracy_collections,
        "datapoint_indices": datapoint_indices,
        "learning_rate": learning_rate,
        "num_datapoints": num_datapoints,
        "hans_eval_heuristic_inputs": hans_eval_heuristic_inputs,
    }

    # Warning: Check again whether using this `new_model` is a good idea
    return output_collections, new_model
Exemplo n.º 16
0
 def collate(self, samples: Any) -> Tensor:
     """Override to convert a set of samples to a batch.

     The samples are handed unchanged to ``default_data_collator`` and its
     result is returned.
     """
     batch = default_data_collator(samples)
     return batch
Exemplo n.º 17
0
def _glue_data_collator(features):
    """Collate ``(inputs, label)`` pairs produced by the dataset.

    Each feature is a two-element pair. The pairs are transposed so that all
    inputs can be collated by ``default_data_collator`` and all labels
    converted into a single ``torch.Tensor``.

    Returns:
        A ``(collated_inputs, label_tensor)`` tuple.
    """
    # Transpose [(inputs, label), ...] into (inputs, ...) and (label, ...).
    model_inputs, targets = zip(*features)
    collated_inputs = default_data_collator(model_inputs)
    target_tensor = torch.tensor(targets)
    return collated_inputs, target_tensor