Example #1
    def __init__(self,
                 pipeline: "nlpaug_flow.Pipeline",
                 num_transformed: int = 1,
                 identifiers: List[Identifier] = None,
                 *args,
                 **kwargs):
        assert isinstance(pipeline, nlpaug_flow.Pipeline), (
            "`pipeline` must be an nlpaug Pipeline object. Please use \n"
            "from nlpaug.flow import Sequential\n"
            "rg.NlpAugTransformation(pipeline=Sequential(flow=[...])).")

        super(NlpAugTransformation,
              self).__init__(num_transformed=num_transformed,
                             identifiers=Identifier.range(
                                 n=num_transformed,
                                 _name=self.__class__.__name__,
                                 pipeline=[
                                     Identifier(
                                         _name=augmenter.name,
                                         src=augmenter.aug_src if hasattr(
                                             augmenter, "aug_src") else None,
                                         action=augmenter.action,
                                         method=augmenter.method,
                                     ) for augmenter in pipeline
                                 ],
                             ) if not identifiers else identifiers,
                             *args,
                             **kwargs)

        # Set the pipeline
        self.pipeline = pipeline
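A minimal usage sketch for the constructor above, following its own assertion message; the SynonymAug augmenter and the num_transformed value are illustrative choices, not requirements:

import nlpaug.augmenter.word as naw
from nlpaug.flow import Sequential

import robustnessgym as rg

# Wrap a one-step nlpaug pipeline; num_transformed controls how many
# augmented copies of each example are produced
pipeline = Sequential(flow=[naw.SynonymAug(aug_src="wordnet")])
transformation = rg.NlpAugTransformation(pipeline=pipeline, num_transformed=2)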
Example #2
 def __setstate__(self, state):
     # Revive fields that were serialized as strings back into live objects
     state = dict(state)
     if "interactions" in state and isinstance(state["interactions"], str):
         state["interactions"] = self.loads_interactions(
             state["interactions"]).interactions
     if "identifier" in state and isinstance(state["identifier"], str):
         state["identifier"] = Identifier.loads(state["identifier"])
     if "_identifier" in state:
         try:
             state["_identifier"] = Identifier.loads(state["_identifier"])
         except:  # noqa
             pass
     if "lineage" in state:
         try:
             state["lineage"] = [
                 tuple(t[:1]) + (Identifier.loads(t[1]), ) +
                 (tuple(t[2:]) if len(t) > 2 else ())
                 for t in state["lineage"]
             ]
         except:  # noqa
             pass
     if "logdir" in state:
         try:
             state["logdir"] = (
                 pathlib.Path.home() /
                 f"robustnessgym/datasets/{str(state['identifier'])}")
         except:  # noqa
             state["logdir"] = (
                 pathlib.Path.home() /
                 f"robustnessgym/datasets/{str(state['_identifier'])}")
     super(Dataset, self).__setstate__(state)
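A sketch of the string round-trip that __setstate__ relies on, assuming Identifier exposes a dumps counterpart to the Identifier.loads calls above:

ident = Identifier(_name="MockDataset", version="1.0")
serialized = ident.dumps()  # assumed serializer paired with Identifier.loads
assert str(Identifier.loads(serialized)) == str(ident)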
Example #3
    @classmethod
    def from_dataset(
        cls,
        dp: DataPanel,
        input_columns: List[str],
        output_columns: List[str],
        # prediction_columns: List[str],
        # metrics: List[str],
    ) -> TestBench:
        """Create a TestBench from a dp."""
        # Define the task
        task = Task(
            # Identifier
            Identifier("Task", dp=str(dp.identifier)),
            # Input and output schemas
            *Schema.for_dataset(dp, input_columns, output_columns),
        )

        # Create the testbench
        testbench = TestBench(
            identifier=Identifier("TestBench", dp=str(dp.identifier)),
            task=task,
            slices=[dp],
        )

        # testbench.set_single_dataset_mode()
        # testbench.set_prediction_columns(prediction_columns)

        return testbench
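A hypothetical call, assuming dp is a DataPanel whose columns include the named inputs and outputs:

testbench = TestBench.from_dataset(
    dp,
    input_columns=["premise", "hypothesis"],
    output_columns=["label"],
)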
Example #4
 def setUp(self):
     self.min_identifier = Identifier(_name="MyIdentifier")
     self.identifier = Identifier(
         _name="MyIdentifier",
         _index=1,
         param="a",
         param_2="b",
     )
Example #5
    def test_eq(self):
        # Two identifiers created with the same arguments should be equal
        identifier = Identifier(_name="MyIdentifier", _index=1, param="a", param_2="b")
        self.assertEqual(self.identifier, identifier)
        self.assertNotEqual(self.min_identifier, identifier)

        # But not two identifiers created with different arguments
        identifier = Identifier(_name="MyIdentifier", _index=2, param="a", param_2="b")
        self.assertNotEqual(self.identifier, identifier)
        self.assertNotEqual(self.min_identifier, identifier)
Example #6
    def setUp(self):
        # Arrange
        self.cachedop = CachedOperation(
            apply_fn=a_single_column_apply_fn,
            identifier=Identifier(_name="TestCachedOperation"),
        )

        self.testbed = MockTestBedv0()

        self.multicol_cachedop = CachedOperation(
            apply_fn=a_multi_column_apply_fn,
            identifier=Identifier(_name="TestCachedOperation", to="multiple"),
        )
Example #7
 def __init__(self, *args, **kwargs):
     super(ConstituencySubtreeSubpopulation, self).__init__(
         intervals=[(1, 1)],
         identifiers=[Identifier(_name=self.__class__.__name__)],
         *args,
         **kwargs,
     )
Example #8
 def __init__(self):
     super(HansLocationNounsB, self).__init__(
         phrase_groups=[[
             "museum", "school", "library", "office", "laboratory"
         ]],
         identifiers=[Identifier(_name=self.__class__.__name__)],
     )
Example #9
 def __init__(self):
     super(HansUnderstoodArgumentVerbs, self).__init__(
         phrase_groups=[[
             "paid", "explored", "won", "wrote", "left", "read", "ate"
         ]],
         identifiers=[Identifier(_name=self.__class__.__name__)],
     )
Example #10
 def __init__(self):
     super(HansPastParticiples, self).__init__(
         phrase_groups=[[
             "studied", "paid", "helped", "investigated", "presented"
         ]],
         identifiers=[Identifier(_name=self.__class__.__name__)],
     )
Example #11
    def update(self, identifier: Union[str, Identifier],
               columns: List[str]) -> None:
        """Update the interaction tape with information about an interaction.

        Args:
            identifier: Identifier for the interaction used.
            columns: list of columns on which the interaction was applied.

        Returns: None. The entry is recorded in the history only if this
        (identifier, columns) pair has not been applied before.
        """
        if isinstance(identifier, str):
            identifier = Identifier(_name=identifier)
        elif isinstance(identifier, Identifier):
            pass
        else:
            raise ValueError(
                f"Parameter `identifier` should be an instance of class Identifier "
                f"or str, "
                f"not {type(identifier)}.")

        # Dump the column names to JSON
        json_columns = strings_as_json(strings=columns)

        # Check if the entry is not in the history
        if (identifier, json_columns) not in self.history:
            # Give it the next index
            self.history[(identifier, json_columns)] = len(self.history)
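How the method above behaves on repeated entries, assuming tape is an instance of the class defining update; the identifier name and columns are illustrative:

tape.update("LexicalOverlap", columns=["premise", "hypothesis"])
# An equivalent identifier with the same columns maps to the same
# (identifier, json_columns) key, so the history is left untouched
tape.update(Identifier(_name="LexicalOverlap"),
            columns=["premise", "hypothesis"])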
Example #12
    def test_from_jsonl(self):
        # Create a temporary directory
        os.makedirs("tmp", exist_ok=True)

        # Create a json file with data
        with jsonlines.open("tmp/data.jsonl", "w") as writer:
            writer.write_all(
                transpose_batch({
                    "a": [1, 2, 3],
                    "b": [True, False, True],
                    "c": ["x", "y", "z"],
                    "d": [{
                        "e": 2
                    }, {
                        "e": 3
                    }, {
                        "e": 4
                    }],
                }))

        # Load the dataset
        dataset = Dataset.from_jsonl(
            json_path="tmp/data.jsonl",
            identifier=Identifier(_name="MockJSONDataset"),
        )

        self.assertEqual(set(dataset.column_names),
                         {"a", "b", "c", "d", "index"})
        self.assertEqual(len(dataset), 3)

        # Remove the temporary directory
        shutil.rmtree("tmp")
Example #13
    def __init__(self):
        # Create a fake dataset
        self.dataset = Dataset.from_batch(
            {
                "text_a": [
                    "Before the actor slept, the senator ran.",
                    "The lawyer knew that the judges shouted.",
                    "If the actor slept, the judge saw the artist.",
                    "The lawyers resigned, or the artist slept.",
                ],
                "text_b": [
                    "The actor slept.",
                    "The judges shouted.",
                    "The actor slept.",
                    "The artist slept.",
                ],
                "label": [0, 0, 1, 1],
                "z": [1, 0, 1, 0],
                "fast": [False, True, True, False],
            },
            identifier=Identifier(_name="MockDataset", version="2.0"),
        )

        # Keep a copy of the original
        self.original_dataset = deepcopy(self.dataset)

        assert len(self.dataset) == 4
Example #14
 def __init__(self):
     super(HansConstAdv, self).__init__(
         phrase_groups=[
             ["after", "before", "because", "although", "though", "since", "while"]
         ],
         identifiers=[Identifier(_name=self.__class__.__name__)],
     )
Example #15
 def __init__(self):
     super(HansFoodWords, self).__init__(
         phrase_groups=[
             ["fruit", "salad", "broccoli", "sandwich", "rice", "corn", "ice cream"]
         ],
         identifiers=[Identifier(_name=self.__class__.__name__)],
     )
Example #16
    def test_init(self):
        # Create a simple identifier with a name
        identifier = Identifier(_name="MyIdentifier")
        self.assertEqual(str(identifier), "MyIdentifier")

        # Create an identifier with a string index
        identifier = Identifier(_name="MyIdentifier", _index="abc")
        self.assertEqual(str(identifier), "MyIdentifier-abc")

        # Create an identifier with an integer index
        identifier = Identifier(_name="MyIdentifier", _index=1)
        self.assertEqual(str(identifier), "MyIdentifier-1")

        # Create an identifier with an integer index and two parameters
        identifier = Identifier(_name="MyIdentifier", _index=1, param="a", param_2="b")
        self.assertEqual(str(identifier), "MyIdentifier-1(param=a, param_2=b)")
Example #17
    @classmethod
    def from_jsonl(
        cls,
        json_path: str,
        identifier: Identifier = None,
        dataset_fmt: str = "in_memory",
    ) -> Dataset:
        """Load a dataset from a .jsonl file on disk, where each line of the
        json file consists of a single example."""

        if dataset_fmt == "in_memory":
            # Load the .jsonl file
            with open(json_path) as f:
                data = [json.loads(line) for line in f]

            return cls(
                data,
                identifier=identifier
                if identifier else Identifier("RGDataset", jsonl=json_path),
                dataset_fmt=dataset_fmt,
            )

        elif dataset_fmt == "datasets":
            # Use jsonarrow to load the JSON directly
            return cls(
                jsonarrow.read_json(json_path),
                identifier=identifier,
                dataset_fmt=dataset_fmt,
            )
        else:
            raise NotImplementedError
Example #18
    def __init__(
        self,
        dataset: str,
        model: str,
        constrain_pos: bool = True,
        **kwargs,
    ):
        super().__init__(identifiers=[
            Identifier(self.__class__.__name__, dataset=dataset, model=model)
        ], )

        self.constrain_pos = constrain_pos

        self.dataset = dataset.lower()
        if self.dataset == "mnli":
            self.attack = morpheus.MorpheusHuggingfaceNLI(model)
        elif "squad" in self.dataset:
            is_squad2 = "2" in self.dataset
            self.attack = morpheus.MorpheusHuggingfaceQA(model,
                                                         squad2=is_squad2)
        elif self.dataset == "cnn_dailymail" or self.dataset == "xsum":
            rouge_type = kwargs.get("rouge_type", "rougeL")
            max_input_tokens = kwargs.get("max_input_tokens", 1024)
            self.attack = morpheus.MorpheusHuggingfaceQA(
                model,
                rouge_type=rouge_type,
                max_input_tokens=max_input_tokens)
        else:
            raise NotImplementedError
Example #19
 def __init__(self):
     super(HansNonEntQuotVerbs, self).__init__(
         phrase_groups=[
             ["hoped", "claimed", "thought", "believed", "said", "assumed"]
         ],
         identifiers=[Identifier(_name=self.__class__.__name__)],
     )
Example #20
 def __init__(self):
     super(HasNegation, self).__init__(
         phrase_groups=[
             [
                 "no",
                 "not",
                 "none",
                 "noone ",
                 "nobody",
                 "nothing",
                 "neither",
                 "nowhere",
                 "never",
                 "hardly",
                 "scarcely",
                 "barely",
                 "doesnt",
                 "isnt",
                 "wasnt",
                 "shouldnt",
                 "wouldnt",
                 "couldnt",
                 "wont",
                 "cant",
                 "dont",
             ]
         ],
         identifiers=[Identifier(_name=self.__class__.__name__)],
     )
Example #21
 def __init__(self):
     super(HansQuestionEmbeddingVerbs, self).__init__(
         phrase_groups=[
             ["wondered", "understood", "knew", "asked", "explained", "realized"]
         ],
         identifiers=[Identifier(_name=self.__class__.__name__)],
     )
Example #22
    def __init__(self):
        # Create a fake batch of data
        self.batch = {
            "text": [
                "The man is walking.",
                "The man is running.",
                "The woman is sprinting.",
                "The woman is resting.",
                "The hobbit is flying.",
                "The hobbit is swimming.",
            ],
            "label": [0, 0, 1, 1, 0, 0],
            "z": [1, 0, 1, 0, 1, 0],
            "fast": [False, True, True, False, False, False],
            "metadata": [
                {"source": "real"},
                {"source": "real"},
                {"source": "real"},
                {"source": "real"},
                {"source": "fictional"},
                {"source": "fictional"},
            ],
        }
        # Create a fake dataset
        self.dataset = Dataset.from_batch(
            self.batch,
            identifier=Identifier(_name="MockDataset", version="1.0"),
        )

        # Keep a copy of the original
        self.original_dataset = deepcopy(self.dataset)

        assert len(self.dataset) == 6
Example #23
 def __init__(self):
     super(HansAdverbs, self).__init__(
         phrase_groups=[
             ["quickly", "slowly", "happily", "easily", "quietly", "thoughtfully"]
         ],
         identifiers=[Identifier(_name=self.__class__.__name__)],
     )
Example #24
    def __init__(
        self,
        dataset: Dataset,
    ):
        # Call the superclass
        super(DevBench, self).__init__()

        # An identifier for the DevBench
        self.identifier = Identifier("DevBench",
                                     dataset=str(dataset.identifier))

        # Dataset that the devbench operates on
        self._dataset = dataset

        # Create the collection of slices
        self._slices = set()
        self._slice_identifiers = set()
        self._slice_table = {}

        # The devbench has aggregators
        self.aggregators = {}

        # The devbench internally tracks metrics
        self.metrics = {}

        # Add slices if any
        self.add_slices(dataset)
Example #25
 def __init__(self):
     super(HansAdvsEntailed, self).__init__(
         phrase_groups=[
             ["certainly", "definitely", "clearly", "obviously", "suddenly"]
         ],
         identifiers=[Identifier(_name=self.__class__.__name__)],
     )
Example #26
    def __init__(
            self,
            intervals: List[Tuple[int, int]],
            metric: Sequence[str] = ("rouge1", "fmeasure"),
            *args,
            **kwargs,
    ):
        assert (
            len(metric) == 2
        ), "Must pass in both rouge score and one of precision/recall/fmeasure."
        super(RougeMatrixScoreSubpopulation, self).__init__(
            intervals=intervals,
            identifiers=[
                Identifier(
                    _name=self.__class__.__name__,
                    gte=interval[0],
                    lte=interval[1],
                    metric=metric,
                ) for interval in intervals
            ],
            *args,
            **kwargs,
        )

        # Assign the metric
        self.metric = metric
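An illustrative construction; the interval bounds are arbitrary example values, interpreted via the gte/lte fields of the identifiers above:

subpopulation = RougeMatrixScoreSubpopulation(
    intervals=[(0.0, 0.3), (0.3, 0.7), (0.7, 1.0)],
    metric=("rouge1", "fmeasure"),
)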
Example #27
    def __init__(
        self, phrases=None, identifiers: List[Identifier] = None, *args, **kwargs
    ):
        # Normalize first: the identifier comprehension below iterates over
        # phrases, which would fail if it were left as None
        if phrases is None:
            phrases = []

        super(HasPhrase, self).__init__(
            # One slice per phrase
            identifiers=[
                Identifier(_name=self.__class__.__name__, phrase=phrase)
                for phrase in phrases
            ]
            if not identifiers
            else identifiers,
            *args,
            **kwargs
        )

        # This is the list of phrases that will be searched
        self.phrases = phrases

        # Create and populate Aho-Corasick automatons for words and phrases
        self.word_ahocorasick = AhoCorasick.from_phrases(
            {i: phrase for i, phrase in enumerate(self.phrases) if " " not in phrase}
        )
        self.phrase_ahocorasick = AhoCorasick.from_phrases(
            {i: phrase for i, phrase in enumerate(self.phrases) if " " in phrase}
        )
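A construction sketch: per the space-based split above, single-word phrases populate word_ahocorasick and multi-word phrases populate phrase_ahocorasick.

slicer = HasPhrase(phrases=["hobbit", "ice cream"])
# "hobbit" goes to the word automaton, "ice cream" to the phrase automaton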
Example #28
 def __init__(self):
     super(HansNPZVerbs, self).__init__(
         phrase_groups=[[
             "hid", "moved", "presented", "paid", "studied", "stopped"
         ]],
         identifiers=[Identifier(_name=self.__class__.__name__)],
     )
Example #29
    def __init__(self,
                 num_transformed=1,
                 alpha_sr=0.1,
                 alpha_ri=0.1,
                 alpha_rs=0.1,
                 p_rd=0.1):

        super(EasyDataAugmentation,
              self).__init__(identifiers=Identifier.range(
                  n=num_transformed,
                  _name=self.__class__.__name__,
                  alpha_sr=alpha_sr,
                  alpha_ri=alpha_ri,
                  alpha_rs=alpha_rs,
                  p_rd=p_rd,
              ))

        # Set the parameters
        self.alpha_sr = alpha_sr
        self.alpha_ri = alpha_ri
        self.alpha_rs = alpha_rs
        self.p_rd = p_rd

        # Download wordnet
        self._download_wordnet()
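Identifier.range, used here and in Example #1, presumably yields one indexed Identifier per transformed copy; combined with the rendering shown in Example #16, the expected strings are:

idents = Identifier.range(n=2, _name="EasyDataAugmentation", p_rd=0.1)
# Following Example #16's format:
#   EasyDataAugmentation-1(p_rd=0.1)
#   EasyDataAugmentation-2(p_rd=0.1)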
Example #30
    def __init__(self, *args, identifier: Identifier = None, **kwargs):

        if len(args) == 1 and isinstance(args[0], datasets.Dataset):
            # Create a Dataset directly from a datasets.Dataset object
            self.__dict__ = args[0].__dict__.copy()
        else:
            super(Dataset, self).__init__(*args, **kwargs)

        # Initialize the interaction tape mixin
        InteractionTapeHierarchyMixin.__init__(self)

        self.identifier = (Identifier(
            _name=self.info.builder_name,
            split=str(self.split),
            version=self.version,
        ) if not identifier else identifier)

        # Keep track of the original dataset keys
        self.original_columns = list(self.features.keys())

        # Add an index to the dataset
        dataset = self.map(self.add_index, with_indices=True)
        self.__dict__.update(dataset.__dict__)

        # TODO(karan): fix the identifier settings for Dataset
        if self.identifier is not None and not str(
                self.identifier).startswith("None"):
            self.logdir /= str(self.identifier)
            self.logdir.mkdir(parents=True, exist_ok=True)
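The add_index map above gives every row an index column, matching the column set asserted in Example #12; a sketch of the invariant, with an illustrative identifier:

dataset = Dataset.from_batch(
    {"text": ["a", "b"]},
    identifier=Identifier(_name="TinyDataset", version="0.1"),
)
assert "index" in dataset.column_names  # added by add_index in __init__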