예제 #1
0
    def test_from_jsonl(self):
        """Check that ``Dataset.from_jsonl`` loads a JSON-lines file.

        A column-oriented dict with columns ``a``-``d`` (three values each)
        is passed through ``transpose_batch`` and written to a scratch
        ``.jsonl`` file. The file is then loaded, and the test verifies that
        the dataset exposes the four written columns plus ``index`` and
        contains three rows.
        """
        # Imported locally so this block does not rely on a module-level
        # ``import tempfile`` being present.
        import tempfile

        # A unique scratch directory is removed on exit even when the load or
        # an assertion fails, and it can never clobber a pre-existing ./tmp.
        with tempfile.TemporaryDirectory() as tmp_dir:
            json_path = os.path.join(tmp_dir, "data.jsonl")

            # Create a jsonl file with data
            with jsonlines.open(json_path, "w") as writer:
                writer.write_all(
                    transpose_batch({
                        "a": [1, 2, 3],
                        "b": [True, False, True],
                        "c": ["x", "y", "z"],
                        "d": [{
                            "e": 2
                        }, {
                            "e": 3
                        }, {
                            "e": 4
                        }],
                    }))

            # Load the dataset
            dataset = Dataset.from_jsonl(
                json_path=json_path,
                identifier=Identifier(_name="MockJSONDataset"),
            )

            # Checked inside the ``with`` so the file still exists if the
            # dataset reads it lazily.
            self.assertEqual(set(dataset.column_names),
                             {"a", "b", "c", "d", "index"})
            self.assertEqual(len(dataset), 3)
예제 #2
0
    def apply(self, skeleton_batches: List[Dict[str, List]],
              slice_membership: np.ndarray, batch: Dict[str, List],
              columns: List[str], *args,
              **kwargs) -> Tuple[List[Dict[str, List]], np.ndarray]:
        """Attack every example in ``batch`` and record the outcome.

        Each example is handed to ``self.attack.morph`` in a way that depends
        on ``self.dataset``. When the prediction returned by the attack
        differs from the example's reference, the attacked text is written
        into ``skeleton_batches[0]`` at the example's position; otherwise
        ``slice_membership[i, 0]`` is set to 0. Both arguments are modified
        in place.

        Args:
            skeleton_batches: Output batches; only the first one is written.
            slice_membership: Membership array indexed as ``[example, 0]``.
            batch: Column-oriented input batch.
            columns: Column names; the expected order depends on the task
                (see the per-branch comments below).

        Returns:
            The same ``skeleton_batches`` and ``slice_membership`` objects.

        Raises:
            NotImplementedError: If ``self.dataset`` matches no handled task.
        """
        for row, example in enumerate(transpose_batch(batch)):
            task = self.dataset

            if task == "mnli":
                # Assume column order is [premise, hypothesis, label]
                premise_key, hypothesis_key, label_key = columns
                gold_label = self.get_NLI_text_label(example[label_key])

                # NOTE(review): the attack receives the raw label while the
                # comparison below uses the text label -- confirm which form
                # ``morph`` expects.
                morph_result = self.attack.morph(
                    example[premise_key],
                    example[hypothesis_key],
                    example[label_key],
                    constrain_pos=self.constrain_pos,
                )
                morphed_premise, morphed_hypothesis, predicted, _ = \
                    morph_result

                attack_succeeded = predicted != gold_label
                rewrites = (
                    (premise_key, morphed_premise),
                    (hypothesis_key, morphed_hypothesis),
                )
            elif "squad" in task:
                # NOTE: assume first element in columns is the question
                # column. The rest are ignored since example['answers'] is
                # another Dict.
                question_key = columns[0]
                question_payload = self.prepare_question_dict(
                    example, question_key)

                morphed_question, predicted = self.attack.morph(
                    question_payload,
                    example["context"],
                    constrain_pos=self.constrain_pos)

                attack_succeeded = (
                    predicted not in example["answers"]["text"])
                rewrites = ((question_key, morphed_question), )
            elif task == "cnn_dailymail" or task == "xsum":
                # Assume column order is [article_col, summary_col]
                article_key, summary_key = columns

                morphed_article, predicted, _ = self.attack.morph(
                    example[article_key],
                    example[summary_key],
                    constrain_pos=self.constrain_pos,
                )

                attack_succeeded = predicted != example[summary_key]
                rewrites = ((article_key, morphed_article), )
            else:
                raise NotImplementedError

            if attack_succeeded:
                # Keep the attacked text for this example.
                for key, attacked_text in rewrites:
                    skeleton_batches[0][key][row] = attacked_text
            else:
                # The prediction still matches the reference: drop the
                # example from the slice.
                slice_membership[row, 0] = 0

        return skeleton_batches, slice_membership
예제 #3
0
    def similarity(self, batch_doc_1: List[str], batch_doc_2: List[str]):
        """Score ``batch_doc_1`` against ``batch_doc_2`` with ``self.metric``.

        Args:
            batch_doc_1: Documents passed to the metric as predictions.
            batch_doc_2: Documents passed to the metric as references.

        Returns:
            A list with one dict per item yielded by ``transpose_batch``.
            Each value of that dict is replaced by a plain dict holding the
            score's ``precision``, ``recall`` and ``fmeasure`` attributes.
        """
        score_fields = ("precision", "recall", "fmeasure")

        def as_plain_dict(score):
            # Copy the reported fields off the score object into a dict.
            return {field: getattr(score, field) for field in score_fields}

        # NOTE(review): the keyword is spelled ``use_agregator`` here;
        # confirm against the metric's signature before "correcting" it.
        raw_scores = self.metric.compute(predictions=batch_doc_1,
                                         references=batch_doc_2,
                                         use_agregator=False)

        # Transpose the batch of scores, converting each entry's values.
        per_example = []
        for example in transpose_batch(raw_scores):
            per_example.append(tz.valmap(as_plain_dict, example))

        return per_example