def test_from_jsonl(self):
    """Round-trip a small batch through a JSONL file via ``Dataset.from_jsonl``.

    Writes three examples (columns a, b, c, d) to a JSONL file, loads them
    back, and checks the resulting column names (including the auto-added
    ``index`` column) and length.
    """
    import tempfile

    # Use an isolated temporary directory instead of a hard-coded "tmp" so
    # concurrent test runs cannot collide, and cleanup happens even when an
    # assertion fails (the original rmtree was skipped on failure).
    with tempfile.TemporaryDirectory() as tmpdir:
        json_path = os.path.join(tmpdir, "data.jsonl")

        # Write the batch as one JSON object per line.
        with jsonlines.open(json_path, "w") as writer:
            writer.write_all(
                transpose_batch({
                    "a": [1, 2, 3],
                    "b": [True, False, True],
                    "c": ["x", "y", "z"],
                    "d": [{"e": 2}, {"e": 3}, {"e": 4}],
                }))

        # Load the dataset back from disk.
        dataset = Dataset.from_jsonl(
            json_path=json_path,
            identifier=Identifier(_name="MockJSONDataset"),
        )

        self.assertEqual(set(dataset.column_names),
                         {"a", "b", "c", "d", "index"})
        self.assertEqual(len(dataset), 3)
def apply(self,
          skeleton_batches: List[Dict[str, List]],
          slice_membership: np.ndarray,
          batch: Dict[str, List],
          columns: List[str],
          *args,
          **kwargs) -> Tuple[List[Dict[str, List]], np.ndarray]:
    """Run the morphing attack over every example in the batch.

    For each example, ``self.attack.morph`` produces a perturbed input and
    the model's prediction on it. When the prediction disagrees with the
    reference (label / gold answer / reference summary), the perturbed text
    is written into ``skeleton_batches[0]``; otherwise the example's slice
    membership is zeroed out. Which columns are consumed depends on
    ``self.dataset``.

    Returns the (mutated) ``skeleton_batches`` and ``slice_membership``.
    """
    for idx, row in enumerate(transpose_batch(batch)):
        if self.dataset == "mnli":
            # Assume column order is [premise, hypothesis, label]
            premise_col, hypothesis_col, label_col = columns
            gold_text_label = self.get_NLI_text_label(row[label_col])
            morphed_premise, morphed_hypothesis, predicted_label, _ = \
                self.attack.morph(
                    row[premise_col],
                    row[hypothesis_col],
                    row[label_col],
                    constrain_pos=self.constrain_pos,
                )
            if predicted_label == gold_text_label:
                # Prediction unchanged by the morph: drop from the slice.
                slice_membership[idx, 0] = 0
            else:
                skeleton_batches[0][premise_col][idx] = morphed_premise
                skeleton_batches[0][hypothesis_col][idx] = morphed_hypothesis

        elif "squad" in self.dataset:
            # NOTE: assume first element in columns is question_col
            # Ignoring the rest since row['answers'] is another Dict
            question_col = columns[0]
            question_dict = self.prepare_question_dict(row, question_col)
            morphed_question, predicted_answer = self.attack.morph(
                question_dict,
                row["context"],
                constrain_pos=self.constrain_pos)
            if predicted_answer in row["answers"]["text"]:
                # Model still answers correctly: drop from the slice.
                slice_membership[idx, 0] = 0
            else:
                skeleton_batches[0][question_col][idx] = morphed_question

        elif self.dataset in ("cnn_dailymail", "xsum"):
            # Assume column order is [article_col, summary_col]
            article_col, summary_col = columns
            morphed_article, predicted_summary, _ = self.attack.morph(
                row[article_col],
                row[summary_col],
                constrain_pos=self.constrain_pos,
            )
            if predicted_summary == row[summary_col]:
                # Summary unchanged by the morph: drop from the slice.
                slice_membership[idx, 0] = 0
            else:
                skeleton_batches[0][article_col][idx] = morphed_article

        else:
            raise NotImplementedError

    return skeleton_batches, slice_membership
def similarity(self, batch_doc_1: List[str], batch_doc_2: List[str]):
    """Score each aligned (doc_1, doc_2) pair with ``self.metric``.

    Returns one dict per example pair, mapping each metric key to a plain
    ``{"precision": ..., "recall": ..., "fmeasure": ...}`` dict extracted
    from the metric's score objects.
    """
    # Pairwise (non-aggregated) scores between the two batches.
    # NOTE: `use_agregator` is the keyword the underlying metric actually
    # accepts (spelled this way upstream) — do not "correct" it.
    raw_scores = self.metric.compute(predictions=batch_doc_1,
                                     references=batch_doc_2,
                                     use_agregator=False)

    # Flatten each per-example score object into nested plain dicts.
    per_example_scores = []
    for example in transpose_batch(raw_scores):
        per_example_scores.append({
            key: {
                field: getattr(score, field)
                for field in ["precision", "recall", "fmeasure"]
            }
            for key, score in example.items()
        })
    return per_example_scores