Example #1
    def initialize(self,
                   path: str,
                   subtasks: Optional[List[str]] = None) -> SyntaxEvalCorpora:
        """Performs the initialization for the tasks of
        Marvin & Linzen (2018)

        Arxiv link: https://arxiv.org/pdf/1808.09031.pdf

        Repo: https://github.com/BeckyMarvin/LM_syneval

        Parameters
        ----------
        path : str
            Path to the directory containing the Marvin datasets that can be
            found in the GitHub repo.
        subtasks : List[str], optional
            The downstream tasks that will be tested. If not provided, this
            defaults to the full set of conditions.

        Returns
        -------
        corpora : Dict[str, Corpus]
            Dictionary mapping a subtask to a Corpus.
        """
        subtasks: List[str] = subtasks or ENVS

        corpora: SyntaxEvalCorpora = {}

        orig_corpus = preproc_warstadt(path)

        for env in subtasks:
            # Create a separate downstream corpus for each condition.
            raw_corpus = create_downstream_corpus(orig_corpus, envs=[env])

            header = raw_corpus[0].split("\t")
            tokenize_columns = ["sen", "counter_sen"]
            fields = Corpus.create_fields(header,
                                          tokenize_columns=tokenize_columns,
                                          tokenizer=self.tokenizer)
            examples = [
                Example.fromlist(line.split("\t"), fields)
                for line in raw_corpus[1:]
            ]
            corpus = Corpus(examples, fields)

            corpora[env] = corpus

        return corpora
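
A minimal standalone sketch of the conversion performed in the loop above: a raw TSV corpus (a header line plus tab-separated rows) is split into per-column values, here with plain dicts standing in for the library's Corpus/Example classes. The column names and toy sentences are assumptions made for illustration only.

from typing import Dict, List

# Toy raw corpus in the same shape as ``raw_corpus`` above: a header line
# followed by tab-separated rows. Column names and sentences are invented
# for illustration; the real columns come from create_downstream_corpus.
raw_corpus: List[str] = [
    "sen\tcounter_sen\ttoken",
    "no athlete has ever won twice\tevery athlete has ever won twice\tever",
    "the author has not been there yet\tthe author has been there yet\tyet",
]

header = raw_corpus[0].split("\t")

# Plain-dict analogue of ``Example.fromlist(line.split("\t"), fields)``.
examples: List[Dict[str, str]] = [
    dict(zip(header, line.split("\t"))) for line in raw_corpus[1:]
]

for example in examples:
    print(example["sen"], "|", example["token"])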
Example #2
    def _create_corpus(self, path: str, condition_slice: slice) -> Corpus:
        """Attach the correct and incorrect verb form to each sentence
        in the corpus.
        """
        raw_corpus = Corpus.create_raw_corpus(path)

        # Sentences come in pairs: a sentence with the correct verb form
        # followed by one with the incorrect form. The shared prefix becomes
        # the sentence; the two final verbs become token and counter_token.
        for idx in range(0, len(raw_corpus), 2):
            token = raw_corpus[idx][0].split()[-1]
            counter_token = raw_corpus[idx + 1][0].split()[-1]
            sen = " ".join(raw_corpus[idx][0].split()[:-1])
            raw_corpus[idx] = [sen, token, counter_token]

        # Keep only the merged even-indexed entries, restricted to the
        # requested condition slice.
        raw_corpus = raw_corpus[::2][condition_slice]

        fields = Corpus.create_fields(
            ["sen", "token", "counter_token"], tokenizer=self.tokenizer
        )

        examples = Corpus.create_examples(raw_corpus, fields)

        return Corpus(examples, fields)
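
The pairing logic above can be reproduced in a small standalone sketch, with plain lists standing in for the raw corpus and the toy sentences invented for illustration:

from typing import List

# Toy raw corpus: sentences come in pairs that share a prefix and differ only
# in the final verb form (sentences invented for illustration).
raw_corpus: List[List[str]] = [
    ["the author laughs"],
    ["the author laugh"],
    ["the authors laugh"],
    ["the authors laughs"],
]

for idx in range(0, len(raw_corpus), 2):
    token = raw_corpus[idx][0].split()[-1]               # correct verb form
    counter_token = raw_corpus[idx + 1][0].split()[-1]   # incorrect verb form
    sen = " ".join(raw_corpus[idx][0].split()[:-1])      # shared prefix
    raw_corpus[idx] = [sen, token, counter_token]

# Keep only the merged even-indexed entries, as in the method above.
print(raw_corpus[::2])
# [['the author', 'laughs', 'laugh'], ['the authors', 'laugh', 'laughs']]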
Example #3
    def _create_corpus(
        self,
        items: List[RawItem],
        verb_inflections: Dict[str, str],
        items_per_subtask: Optional[int],
    ) -> Corpus:
        header = ["sen", "token", "counter_token"]
        fields = Corpus.create_fields(header, tokenizer=self.tokenizer)

        raw_examples: List[Optional[Example]] = [
            self._item_to_example(item, fields, verb_inflections)
            for item in items
        ]

        # Drop items for which no Example could be created.
        examples: List[Example] = [ex for ex in raw_examples if ex is not None]

        if items_per_subtask is not None:
            examples = examples[:items_per_subtask]

        corpus = Corpus(examples, fields)

        return corpus
Example #4
    def _initialize_subtask(self, subtask: str, corpus_path: str) -> Dict[str, Corpus]:
        corpus_dict: Dict[str, List[Sequence[str]]] = load_pickle(corpus_path)

        if "npi" in subtask:
            header = ["sen", "counter_sen", "token"]
            tokenize_columns = ["sen", "counter_sen"]
        else:
            header = ["sen", "token", "counter_token"]
            tokenize_columns = ["sen"]

        fields = Corpus.create_fields(
            header, tokenize_columns=tokenize_columns, tokenizer=self.tokenizer
        )
        subtask_corpora: Dict[str, Corpus] = {}

        for condition, sens in corpus_dict.items():
            examples = self._create_examples(subtask, sens, fields)

            corpus = Corpus(examples, fields)

            subtask_corpora[condition] = corpus

        return subtask_corpora