Пример #1
0
    def initialize(self,
                   path: str,
                   header: Optional[List[str]] = None) -> SyntaxEvalCorpora:
        """Load the corpus file(s) found at ``path``, keyed by task name.

        Parameters
        ----------
        path : str
            Either a single corpus file or a directory. For a directory,
            every entry matched by ``<path>/*`` is loaded as its own corpus.
        header : List[str], optional
            Column names handed to ``Corpus.create``. Defaults to
            ``["sen", "token", "counter_token"]``. Must contain ``"sen"``,
            ``"token"`` and at least one of ``"counter_sen"`` or
            ``"counter_token"``.

        Returns
        -------
        corpora : SyntaxEvalCorpora
            Dictionary mapping a task name to the corpus created from the
            corresponding file. The task name is the file's base name up
            to (not including) its first ``"."``.

        Raises
        ------
        AssertionError
            If ``header`` lacks one of the required column names.
        FileNotFoundError
            If ``path`` is neither an existing directory nor a file.
        """
        if header is None:
            header = ["sen", "token", "counter_token"]

        assert "sen" in header, "header must contain 'sen'"
        assert "token" in header, "header must contain 'token'"
        assert "counter_sen" in header or "counter_token" in header, (
            "header must contain 'counter_sen' or 'counter_token'")

        if os.path.isdir(path):
            # glob yields entries in arbitrary, file-system dependent order;
            # sorting keeps the ordering of the returned dict reproducible.
            files = sorted(glob.glob(os.path.join(path, "*")))
        elif os.path.isfile(path):
            files = [path]
        else:
            raise FileNotFoundError("Path to task is not found")

        corpora = {}

        for file in files:
            # os.path.basename honours the platform's path separator, unlike
            # splitting on a hard-coded "/".
            task_name = os.path.basename(file).split(".")[0]
            corpora[task_name] = Corpus.create(file,
                                               header=header,
                                               tokenizer=self.tokenizer)

        return corpora
Пример #2
0
    def _create_init_states_from_corpus(
        self,
        init_states_corpus: str,
        tokenizer: PreTrainedTokenizer,
        save_init_states_to: Optional[str] = None,
    ) -> ActivationDict:
        """Extract initial states by running this model over a corpus.

        A corpus is created from ``init_states_corpus`` and an
        ``Extractor`` is run over it for the ``"hx"`` and ``"cx"``
        activations of every layer, using ``final_sen_token`` as the
        selection function.

        Parameters
        ----------
        init_states_corpus : str
            Passed to ``Corpus.create`` as the corpus to load.
        tokenizer : PreTrainedTokenizer
            Tokenizer used to create the corpus. Must not be ``None``.
        save_init_states_to : str, optional
            Forwarded to the ``Extractor`` as ``activations_dir``.

        Returns
        -------
        init_states : ActivationDict
            The ``activation_dict`` of the extraction result.
        """
        assert (
            tokenizer is not None
        ), "Tokenizer must be provided when creating init states from corpus"

        init_corpus: Corpus = Corpus.create(init_states_corpus,
                                            tokenizer=tokenizer)

        # For each layer, first its "hx" entry and then its "cx" entry.
        state_names: ActivationNames = []
        for layer_idx in range(self.num_layers):
            for state_name in ["hx", "cx"]:
                state_names.append((layer_idx, state_name))

        return Extractor(
            self,
            init_corpus,
            state_names,
            activations_dir=save_init_states_to,
            selection_func=final_sen_token,
        ).extract().activation_dict
Пример #3
0
    def initialize(self,
                   path: str,
                   subtasks: Optional[List[str]] = None) -> SyntaxEvalCorpora:
        """Load one corpus per (subtask, condition) pair from ``path``.

        For every requested subtask and each of the conditions ``"FF"``,
        ``"FM"``, ``"MF"`` and ``"MM"``, the file
        ``<path>/<subtask>_<condition>.tsv`` is loaded with
        ``header_from_first_line=True`` and then passed through
        ``self._add_output_classes``.

        Parameters
        ----------
        path : str
            Directory containing the ``<subtask>_<condition>.tsv`` files.
        subtasks : List[str], optional
            The subtasks to load. If omitted (or empty) this defaults to
            ``["stereo", "unamb"]``.

        Returns
        -------
        corpora : SyntaxEvalCorpora
            Nested dictionary mapping a subtask to a dictionary that maps
            each condition to its Corpus.
        """
        if not subtasks:
            subtasks = ["stereo", "unamb"]

        conditions = ["FF", "FM", "MF", "MM"]
        corpora: SyntaxEvalCorpora = {}

        for subtask in subtasks:
            # Reuse the existing inner dict if a subtask is listed twice.
            subtask_corpora = corpora.setdefault(subtask, {})

            for condition in conditions:
                corpus_path = os.path.join(path, f"{subtask}_{condition}.tsv")
                corpus = Corpus.create(
                    corpus_path,
                    header_from_first_line=True,
                    tokenizer=self.tokenizer,
                )

                self._add_output_classes(corpus)

                subtask_corpora[condition] = corpus

        return corpora
Пример #4
0
    if not os.path.isdir(results_dir):
        os.mkdir(results_dir)

    results = {mn: {} for mn in MODEL_NAMES}

    for mn in MODEL_NAMES:
        config_dict["model"]["state_dict"] = os.path.join(
            model_dir, f"{mn}/40.pt")
        config_dict["tokenizer"]["path"] = os.path.join(
            model_dir, f"{mn}/vocab.txt")
        config_dict["probe"]["save_dir"] = os.path.join(results_dir, mn)
        envs = [env for env in ENVS if env == mn[:len(env)]] or ENVS

        tokenizer: PreTrainedTokenizer = create_tokenizer(
            **config_dict["tokenizer"])
        corpus: Corpus = Corpus.create(tokenizer=tokenizer,
                                       **config_dict["corpus"])
        model: LanguageModel = import_model(**config_dict["model"])
        set_init_states(model,
                        tokenizer=tokenizer,
                        **config_dict["init_states"])

        if len(envs) == len(ENVS):
            print(f"Probing {mn} on", envs)
            results[mn]["probe"] = monotonicity_probe(model,
                                                      corpus,
                                                      config_dict["probe"],
                                                      suppress_print=False)
            print(results[mn]["probe"])

        if "no_npi" not in mn:
            results[mn]["median_rank"] = median_ranks(