Exemplo n.º 1
0
    def process_batch(
        self,
        dp: DataPanel,
        columns: List[str],
        **kwargs,
    ) -> tuple:
        """Run `self.similarity` on the SpacyOp sentences of two columns.

        Args:
            dp (DataPanel): DataPanel holding the columns to compare
            columns (list): names of exactly two columns
            **kwargs: optional keyword arguments (not used in this method)

        Returns:
            The value produced by `self.similarity` when called with the
            sentences retrieved for each column, in the order of `columns`

        Raises:
            AssertionError: if `columns` does not contain exactly two names
        """
        assert len(columns) == 2, "Exactly two columns required."

        # Look up the SpacyOp output for each column up front; the results are
        # discarded here and the sentences are fetched via `retrieve` below.
        # NOTE(review): presumably this makes a missing column fail early;
        # confirm against `lookup`.
        for name in columns:
            lookup(dp, SpacyOp, [name])

        # Retrieve the sentences, passing each column as its own group
        column_groups = [[name] for name in columns]
        sentences = SpacyOp.retrieve(
            batch=dp,
            columns=column_groups,
            proc_fns=SpacyOp.sentences,
        )

        per_column_sentences = [sentences[name] for name in columns]
        return self.similarity(*per_column_sentences)
    def score(
        self,
        batch: DataPanel,
        columns: List[str],
        *args,
        **kwargs,
    ) -> np.ndarray:
        """Fuzzy-match the constituency parses of two columns, row by row.

        Args:
            batch (DataPanel): batch to look the parses up in
            columns (list): names of exactly two columns
            *args: unused
            **kwargs: unused

        Returns:
            np.ndarray: one `fuzz.partial_token_set_ratio` value per pair of
            parses, computed after removing parentheses and spaces

        Raises:
            AssertionError: if `columns` does not contain exactly two names
        """
        # Exactly two keys are compared against each other
        assert len(columns) == 2, "Must specify exactly 2 keys."

        def _strip_structure(tree):
            # Remove the bracketing and all spaces from a parse string
            return tree.replace("(", "").replace(")", "").replace(" ", "")

        # Fetch the parses of every requested column
        parses = {}
        for name in columns:
            parses[name] = lookup(batch, AllenConstituencyParsingOp, [name])
        first_parses = parses[columns[0]]
        second_parses = parses[columns[1]]

        # Score each aligned pair of parses
        ratios = []
        for first, second in zip(first_parses, second_parses):
            ratios.append(
                fuzz.partial_token_set_ratio(
                    _strip_structure(first),
                    _strip_structure(second),
                )
            )
        return np.array(ratios)
    def test_apply(self):
        """Apply ActivationOp to column "i" and check the looked-up activations."""
        activation_op = ActivationOp(model=self.model, target_module="hidden")
        processed = activation_op(self.dataset, columns=["i"])

        # The looked-up result must be exactly a list
        activations = lookup(processed, activation_op, ["i"])
        self.assertEqual(type(activations), list)

        # Once stacked, every entry must be zero and the shape must match
        stacked = torch.stack(activations)
        everything_zero = torch.all(torch.eq(stacked, 0))
        self.assertTrue(everything_zero)
        self.assertEqual(list(stacked.shape), [4, 2, 10, 10])
    def score(
        self,
        batch: DataPanel,
        columns: List[str],
        *args,
        **kwargs,
    ) -> np.ndarray:
        """Score how closely the second column's parse matches a subtree of the first.

        Args:
            batch (DataPanel): batch to look the parses up in
            columns (list): names of exactly two columns; subtrees are taken
                from the first, and the second is matched against them
            *args: unused
            **kwargs: unused

        Returns:
            np.ndarray: per example, the highest `fuzz.partial_ratio` between
            the normalized second-column parse and any normalized subtree of
            the first-column parse

        Raises:
            AssertionError: if `columns` does not contain exactly two names
        """
        # Exactly two keys are compared against each other
        assert len(columns) == 2, "Must specify exactly 2 keys."

        # Fetch the parses of every requested column
        parses = {}
        for name in columns:
            parses[name] = lookup(batch, AllenConstituencyParsingOp, [name])
        host_parses = parses[columns[0]]
        query_parses = parses[columns[1]]

        # Parse the first column's strings into NLTK trees
        host_trees = [nltk.Tree.fromstring(parse) for parse in host_parses]

        # Collect the distinct normalized subtrees of each tree
        # (newlines and spaces removed, lower-cased)
        subtree_sets = [
            {
                str(subtree).replace("\n", "").replace(" ", "").lower()
                for subtree in host_tree.subtrees()
            }
            for host_tree in host_trees
        ]

        # Keep the best fuzzy score of the second column's parse against any
        # subtree of the corresponding first-column tree
        best_scores = []
        for query, candidates in zip(query_parses, subtree_sets):
            # Remove spaces and the "(..)" / "(,,)" nodes, then lower-case
            normalized_query = (
                query.replace(" ", "").replace("(..)", "").replace("(,,)", "").lower()
            )
            best_scores.append(
                max(
                    [
                        fuzz.partial_ratio(normalized_query, candidate)
                        for candidate in candidates
                    ]
                )
            )
        return np.array(best_scores)
Exemplo n.º 5
0
    def prepare_batch(
        self,
        batch: DataPanel,
        columns: List[str],
        *args,
        **kwargs,
    ) -> None:
        """Append the scores of `batch` to `self.scores`.

        Args:
            batch (DataPanel): batch to score
            columns (list): columns handed to the score function
            *args: unused
            **kwargs: unused

        Raises:
            RuntimeError: if `self.score` is neither an Operation nor a Callable
        """
        scorer = self.score

        # An Operation's scores are fetched with `lookup`
        if isinstance(scorer, Operation):
            self.scores.extend(lookup(batch, scorer, columns))
            return

        # Any other callable is invoked directly on the batch
        if isinstance(scorer, Callable):
            self.scores.extend(scorer(batch, columns))
            return

        raise RuntimeError("score function invalid.")
Exemplo n.º 6
0
    def score(
        self,
        batch: DataPanel,
        columns: List[str],
        *args,
        **kwargs,
    ) -> np.ndarray:
        """Reduce the per-column lengths of each example to a single score.

        Args:
            batch (DataPanel): batch to measure
            columns (list): columns whose lengths are measured
            *args: unused
            **kwargs: unused

        Returns:
            np.ndarray: result of `self.reduction_fn` applied along axis 0 of
            the array that holds one row of lengths per column
        """
        try:
            # Preferred: length of each looked-up SpacyOp entry
            lengths = []
            for name in columns:
                docs = lookup(batch, SpacyOp, [name])
                lengths.append([len(doc) for doc in docs])
        except AttributeError:
            # Fallback: number of whitespace-separated pieces of the raw text
            lengths = []
            for name in columns:
                lengths.append([len(text.split()) for text in batch[name]])

        # Collapse the column axis
        length_matrix = np.array(lengths)
        return self.reduction_fn(length_matrix, axis=0)
Exemplo n.º 7
0
    def test_apply(self):
        """Apply BootlegAnnotatorOp to "text" and check each output's keys."""
        # Keys that every looked-up output must contain
        expected_keys = (
            "qids",
            "probs",
            "titles",
            "cands",
            "cand_probs",
            "spans",
            "aliases",
        )

        # Build the Bootleg operation and apply it to the test dataset
        annotator = BootlegAnnotatorOp(cache_dir=self.cache_dir)
        annotated = annotator(self.testbed.dataset, columns=["text"])

        # Check every output for every expected key
        outputs = lookup(annotated, annotator, ["text"])
        for output in outputs:
            for key in expected_keys:
                assert key in output
Exemplo n.º 8
0
    def test_apply(self):
        """Apply StanzaOp to "text" and compare the lemmas with the expected ones."""
        expected_lemmas = [
            ["the", "man", "be", "walk", "."],
            ["the", "man", "be", "run", "."],
            ["the", "woman", "be", "sprint", "."],
            ["the", "woman", "be", "rest", "."],
            ["the", "hobbit", "be", "fly", "."],
            ["the", "hobbit", "be", "swim", "."],
        ]

        # Build the Stanza operation and apply it to the test dataset
        stanza_op = StanzaOp()
        processed = stanza_op(self.testbed.dataset, columns=["text"])

        # Collect the lemmas of each looked-up document and compare
        docs = lookup(processed, stanza_op, ["text"])
        observed_lemmas = [doc.get("lemma") for doc in docs]
        self.assertEqual(observed_lemmas, expected_lemmas)
Exemplo n.º 9
0
    def apply(
        self,
        batch: DataPanel,
        columns: List[str],
        slice_membership: np.ndarray = None,
        *args,
        **kwargs,
    ) -> np.ndarray:
        """Score the examples of `batch` and bin the scores.

        Args:
            batch (DataPanel): batch to score
            columns (list): columns handed to the score function
            slice_membership (np.ndarray, optional): when given, its first
                dimension must equal the number of scores. May be omitted
                (None), in which case this consistency check is skipped.
            *args: unused
            **kwargs: unused

        Returns:
            The value returned by `self.bin(scores=scores)`

        Raises:
            RuntimeError: if `self.score` is neither an Operation nor a Callable
            AssertionError: if `slice_membership` is given and its first
                dimension differs from the number of scores
        """
        # Obtain one score per example: an Operation's scores are fetched with
        # `lookup`; any other callable is invoked directly on the batch
        if isinstance(self.score, Operation):
            scores = lookup(batch, self.score, columns)
        elif isinstance(self.score, Callable):
            scores = self.score(batch, columns)
        else:
            raise RuntimeError("score function invalid.")

        # `slice_membership` defaults to None, so only cross-check the sizes
        # when it was supplied instead of failing on `None.shape`
        if slice_membership is not None:
            assert (
                len(scores) == slice_membership.shape[0]
            ), "Must have exactly one score per example."

        return self.bin(scores=scores)
    def score(
        self,
        batch: DataPanel,
        columns: List[str],
        *args,
        **kwargs,
    ) -> np.ndarray:
        """Compute the token intersection-over-union between two columns.

        Tokens are lower-cased and collected into a set per example. They are
        taken from the SpacyOp lookup; if that raises AttributeError, the raw
        text of each column is split on whitespace instead.

        Args:
            batch (DataPanel): batch to score
            columns (list): names of exactly two columns
            *args: unused
            **kwargs: unused

        Returns:
            np.ndarray: one value per example,
            `len(intersection) / len(union)` of the two token sets. When both
            sets are empty the score is 0.0 instead of a division by zero.

        Raises:
            AssertionError: if `columns` does not contain exactly two names
        """
        # Exactly two keys are compared against each other
        assert len(columns) == 2, "Must specify exactly 2 keys."

        # Look up the tokens after lower-casing and placing them into a set
        try:
            tokens = {
                col: [
                    {str(tok).lower() for tok in doc}
                    for doc in lookup(batch, SpacyOp, [col])
                ]
                for col in columns
            }
        except AttributeError:
            # Fall back to whitespace tokenization of the raw text
            tokens = {
                col: [
                    {str(tok).lower() for tok in text.split()}
                    for text in batch[col]
                ]
                for col in columns
            }

        def _iou(tokens_0, tokens_1):
            # Two empty token sets have an empty union; define their score as
            # 0.0 rather than dividing by zero
            union = tokens_0.union(tokens_1)
            if not union:
                return 0.0
            return len(tokens_0.intersection(tokens_1)) / float(len(union))

        # Compute the intersection over union score of each aligned pair
        return np.array(
            [
                _iou(tokens_0, tokens_1)
                for tokens_0, tokens_1 in zip(tokens[columns[0]], tokens[columns[1]])
            ]
        )