Example #1
from typing import Any, Dict

# import_corpus follows the path used in the other examples;
# create_iterator's module path is an assumption
from diagnnose.corpus.create_iterator import create_iterator
from diagnnose.corpus.import_corpus import import_corpus


def linzen_init(
    vocab_path: str, path: str, device: str = "cpu", **kwargs: Any
) -> Dict[str, Any]:
    # load the agreement corpus and wrap it in a batched iterator
    corpus = import_corpus(path, header_from_first_line=True, vocab_path=vocab_path)
    iterator = create_iterator(corpus, batch_size=2000, device=device, sort=True)

    return {"corpus": corpus, "iterator": iterator}
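A minimal usage sketch; the file paths below are placeholders, not taken from the source:

# hypothetical paths: a model vocabulary and the Linzen agreement corpus
env = linzen_init(vocab_path="vocab.txt", path="linzen_agreement.tsv")
for batch in env["iterator"]:
    pass  # feed each batch of up to 2000 sentences to the model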
Example #2
    def _create_init_states_from_corpus(
        self,
        init_states_corpus: str,
        vocab_path: str,
        save_init_states_to: Optional[str],
    ) -> ActivationTensors:
        corpus: Corpus = import_corpus(init_states_corpus, vocab_path=vocab_path)

        # start from a zero state, then extract the average end-of-sentence
        # activations that will serve as the model's initial states
        self.init_states = self.create_zero_state()
        extractor = Extractor(self, corpus, save_init_states_to)
        init_states = extractor.extract(
            create_avg_eos=True, only_return_avg_eos=(save_init_states_to is None)
        )
        assert init_states is not None

        return init_states
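A hedged sketch of a possible call site; the model instance and file paths are hypothetical:

# compute average end-of-sentence states once, cache them to disk, and
# keep them as the model's initial states
init_states = model._create_init_states_from_corpus(
    "init_corpus.txt", "vocab.txt", save_init_states_to="init_states.pickle"
)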
Example #3
from diagnnose.config.arg_parser import create_arg_parser
from diagnnose.config.setup import create_config_dict
from diagnnose.corpus.import_corpus import import_corpus
from diagnnose.models.import_model import import_model
from diagnnose.typedefs.corpus import Corpus
from diagnnose.typedefs.models import LanguageModel
from diagnnose.vocab import get_vocab_from_config

# the module path for CDAttention is an assumption
from diagnnose.decompositions.attention import CDAttention

if __name__ == "__main__":
    arg_groups = {
        "activations", "corpus", "decompose", "init_states", "model",
        "plot_attention", "vocab", "extract"
    }

    arg_parser, required_args = create_arg_parser(arg_groups)
    config_dict = create_config_dict(arg_parser, required_args)

    # load the model
    model: LanguageModel = import_model(config_dict)

    # load the corpus
    corpus: Corpus = import_corpus(
        vocab_path=get_vocab_from_config(config_dict), **config_dict["corpus"])

    # default fix_shapley to False if the config does not set it
    if "fix_shapley" not in config_dict["decompose"]:
        config_dict["decompose"]["fix_shapley"] = False

    # set up contextual decomposition attention over the corpus
    attention = CDAttention(
        model,
        corpus,
        cd_config=config_dict["decompose"],
        plot_config=config_dict["plot_attention"],
    )

    print("Creating example plot")
    attention.plot_by_sen_id([2], avg_decs=True)
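The script reads its settings from sections matching arg_groups; a purely illustrative sketch of the resulting config_dict, with placeholder keys and values rather than the library's actual schema:

# illustrative only: section names mirror arg_groups above
config_dict = {
    "model": {},  # model type and checkpoint location would go here
    "vocab": {"path": "vocab.txt"},
    "corpus": {"path": "corpus.tsv", "header_from_first_line": True},
    "decompose": {"fix_shapley": False},
    "plot_attention": {},
}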
Example #4
import os
from typing import Any, Dict, List, Optional

# create_iterator's module path is an assumption; winobias_descriptions is
# assumed to be defined elsewhere in this module
from diagnnose.corpus.create_iterator import create_iterator
from diagnnose.corpus.import_corpus import import_corpus


def winobias_init(
    vocab_path: str,
    path: str,
    task_activations: Optional[str] = None,
    tasks: Optional[List[str]] = None,
    device: str = "cpu",
    save_activations: bool = True,
    **kwargs: Any,
) -> Dict[str, Dict[str, Any]]:
    """ Initializes the adapted tasks of Zhao et al. (2018)

    Arxiv link: https://arxiv.org/abs/1804.06876
    Repo: https://github.com/i-machine-think/gcd4lm

    Parameters
    ----------
    vocab_path : str
        Path to vocabulary file of the Language Model.
    path : str
        Path to directory containing the datasets that can be found
        in the github repo.
    task_activations : str, optional
        Path to the folder containing the extracted activations. If not
        provided, new activations will be extracted.
    tasks : List[str], optional
        The downstream tasks that will be tested. If not provided, this
        will default to the full set of conditions.
    device : str, optional
        Torch device name on which model will be run. Defaults to cpu.
    save_activations : bool, optional
        Toggle to save the extracted activations, otherwise delete them.
        Defaults to True.

    Returns
    -------
    init_dict : Dict[str, Dict[str, Any]]
        Dictionary containing the initial task setup, mapping each task
        to its required fields.
    """

    if tasks is None:
        tasks = list(winobias_descriptions.keys())

    init_dict: Dict[str, Dict[str, Any]] = {}

    for task in tasks:
        # validate the task name before indexing winobias_descriptions
        assert (
            task in winobias_descriptions
        ), f"Provided task {task} is not recognised!"

        init_dict[task] = {}
        for condition in winobias_descriptions[task]:

            corpus = import_corpus(
                os.path.join(path, winobias_descriptions[task][condition]["path"]),
                header_from_first_line=True,
                vocab_path=vocab_path,
            )

            iterator = create_iterator(corpus, batch_size=500, device=device, sort=True)

            if task_activations is not None:
                activations_dir = os.path.join(task_activations, task, condition)
            else:
                activations_dir = None

            init_dict[task][condition] = {
                "corpus": corpus,
                "iterator": iterator,
                "activations_dir": activations_dir,
                "save_activations": save_activations,
            }

    return init_dict
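A hypothetical usage; the paths are placeholders, and leaving tasks unset runs every key of winobias_descriptions:

env = winobias_init("vocab.txt", "data/winobias", save_activations=False)
for task, conditions in env.items():
    for condition, setup in conditions.items():
        pass  # run the model over setup["iterator"]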
Example #5
import os
from typing import Any, Dict, List, Optional

# the ActivationReader and create_iterator module paths are assumptions;
# lakretz_descriptions is assumed to be defined elsewhere in this module
from diagnnose.activations.activation_reader import ActivationReader
from diagnnose.corpus.create_iterator import create_iterator
from diagnnose.corpus.import_corpus import import_corpus


def lakretz_init(
    vocab_path: str,
    path: str,
    task_activations: Optional[Dict[str, str]] = None,
    tasks: Optional[List[str]] = None,
    device: str = "cpu",
) -> Dict[str, Dict[str, Any]]:
    """ Initializes the tasks described in Lakretz et al. (2019)

    Arxiv link: https://arxiv.org/pdf/1903.07435.pdf
    Repo: https://github.com/FAIRNS/Number_and_syntax_units_in_LSTM_LMs

    Parameters
    ----------
    vocab_path : str
        Path to vocabulary file of the Language Model.
    path : str
        Path to directory containing the datasets that can be found
        in the github repo.
    task_activations : Dict[str, str], optional
        Dictionary mapping task names to the directories to which the
        Lakretz task embeddings have been extracted. If a task is not
        provided, its activations will be created during the task.
    tasks : List[str], optional
        The downstream tasks that will be tested. If not provided, this
        will default to the full set of conditions.
    device : str, optional
        Torch device name on which model will be run. Defaults to cpu.

    Returns
    -------
    init_dict : Dict[str, Dict[str, Any]]
        Dictionary containing the initial task setup, mapping each task
        to its required fields.
    """

    task_activations = task_activations or {}

    if tasks is None:
        tasks = list(lakretz_descriptions.keys())

    init_dict: Dict[str, Dict[str, Any]] = {}

    for task in tasks:
        assert task in lakretz_descriptions, f"Provided task {task} is not recognised!"

        activation_dir = task_activations.get(task, None)
        activation_reader = (
            ActivationReader(activation_dir) if activation_dir is not None else None
        )

        task_specs = lakretz_descriptions[task]
        items_per_class = task_specs["items_per_class"]

        corpus = import_corpus(
            os.path.join(path, f"{task}.txt"),
            header=["sen", "type", "correct", "idx"],
            vocab_path=vocab_path,
        )

        iterator = create_iterator(
            corpus, batch_size=(items_per_class * 2), device=device
        )

        init_dict[task] = {
            "activation_reader": activation_reader,
            "corpus": corpus,
            "iterator": iterator,
        }

    return init_dict
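A hypothetical usage; the paths are placeholders, and any explicit task names must match the keys of lakretz_descriptions:

env = lakretz_init("vocab.txt", "data/lakretz")
for task, setup in env.items():
    for batch in setup["iterator"]:
        pass  # each batch holds items_per_class * 2 sentences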
Example #6
import torch

from diagnnose.config.arg_parser import create_arg_parser
from diagnnose.config.setup import create_config_dict
from diagnnose.corpus.import_corpus import import_corpus
from diagnnose.decompositions.factory import DecomposerFactory
from diagnnose.models.import_model import import_model
from diagnnose.typedefs.corpus import Corpus
from diagnnose.typedefs.models import LanguageModel
from diagnnose.vocab import get_vocab_from_config

if __name__ == "__main__":
    arg_groups = {
        "model", "init_states", "corpus", "vocab", "activations", "decompose"
    }
    arg_parser, required_args = create_arg_parser(arg_groups)
    config_dict = create_config_dict(arg_parser, required_args)

    model: LanguageModel = import_model(config_dict)
    vocab_path = get_vocab_from_config(config_dict)
    corpus: Corpus = import_corpus(vocab_path=vocab_path,
                                   **config_dict["corpus"])

    decompose_args = {**config_dict["decompose"], **config_dict["activations"]}

    print("Initializing decomposition")

    sen_ids = slice(0, 625)

    factory = DecomposerFactory(model,
                                corpus=corpus,
                                sen_ids=sen_ids,
                                **decompose_args)

    decomposer = factory.create(sen_ids,
                                classes=torch.arange(20),
                                subsen_index=slice(0, None))
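A possible follow-up, using the start/stop keywords that Example #8 passes to the same method; the values here are illustrative:

# decompose the relevant contribution of the first two tokens
rel_dec = decomposer.decompose(start=0, stop=2)["relevant"]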
Example #7
from diagnnose.classifiers.dc_trainer import DCTrainer
from diagnnose.config.arg_parser import create_arg_parser
from diagnnose.config.setup import create_config_dict
from diagnnose.corpus.import_corpus import import_corpus
from diagnnose.typedefs.corpus import Corpus

if __name__ == "__main__":
    arg_groups = {"activations", "classify", "corpus"}
    arg_parser, required_args = create_arg_parser(arg_groups)
    config_dict = create_config_dict(arg_parser, required_args)

    corpus: Corpus = import_corpus(**config_dict["corpus"])

    dc_trainer = DCTrainer(**config_dict["activations"],
                           **config_dict["classify"],
                           corpus=corpus)

    dc_trainer.train()
Example #8
import os
from typing import Dict

import torch
from torch import Tensor

from diagnnose.corpus.import_corpus import import_corpus
from diagnnose.decompositions.factory import DecomposerFactory
from diagnnose.typedefs.corpus import Corpus
from diagnnose.typedefs.models import LanguageModel
from diagnnose.vocab import get_vocab_from_config

# TMP_DIR and create_winobias_classes are assumed to be defined elsewhere
# in this script


def calc_diff_scores(config, lm: LanguageModel) -> Dict[str, Dict[str, Tensor]]:
    scores = {}

    for corpus_type in ["unamb", "stereo"]:
        scores[corpus_type] = {}
        for condition in ["FM", "MF"]:
            corpus_name = f"{corpus_type}_{condition}.txt"
            corpus_path = os.path.join(config["corpus"]["path"], corpus_name)
            corpus: Corpus = import_corpus(
                corpus_path,
                vocab_path=get_vocab_from_config(config),
                header_from_first_line=True,
            )

            if config["activations"].get("activations_dir", None) is not None:
                activations_dir = os.path.join(
                    config["activations"]["activations_dir"],
                    corpus_type,
                    condition.lower(),
                )
            else:
                activations_dir = None

            sen_ids = slice(0, len(corpus))
            factory = DecomposerFactory(
                lm,
                activations_dir or TMP_DIR,
                create_new_activations=(activations_dir is None),
                corpus=corpus,
                sen_ids=sen_ids,
            )

            ref_types = [ex.ref_type for ex in corpus.examples]
            classes = create_winobias_classes(ref_types, corpus)

            decomposer = factory.create(sen_ids)
            lens = decomposer.final_index - 1

            final_hidden = decomposer.activation_dict[decomposer.toplayer, "hx"][
                range(len(corpus)), lens + 1
            ].unsqueeze(2)
            full_probs = torch.bmm(lm.decoder_w[classes], final_hidden).squeeze()
            full_probs += lm.decoder_b[classes]

            obj_idx_start = torch.tensor([ex.obj_idx_start - 1 for ex in corpus])
            obj_idx_end = torch.tensor([ex.obj_idx + 1 for ex in corpus])
            ranges = [
                (0, 2),
                (2, obj_idx_start),
                (obj_idx_start, obj_idx_end),
                (obj_idx_end, lens + 1),
            ]

            scores[corpus_type][condition] = torch.zeros(4)

            for i, (start, stop) in enumerate(ranges):
                config["decompose"].update({"start": start, "stop": stop})
                rel_dec = decomposer.decompose(**config["decompose"])["relevant"]
                final_rel_dec = rel_dec[range(len(corpus)), lens].unsqueeze(2)
                rel_probs = torch.bmm(lm.decoder_w[classes], final_rel_dec).squeeze()
                rel_probs /= full_probs

                prob_diffs = rel_probs[:, 0] - rel_probs[:, 1]
                scores[corpus_type][condition][i] = torch.mean(prob_diffs)
            print(corpus_type, condition, scores[corpus_type][condition])

    return scores
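A hypothetical driver for calc_diff_scores, reusing the config handling from the earlier example scripts; the argument-group set is an assumption:

from diagnnose.config.arg_parser import create_arg_parser
from diagnnose.config.setup import create_config_dict
from diagnnose.models.import_model import import_model

if __name__ == "__main__":
    arg_parser, required_args = create_arg_parser(
        {"activations", "corpus", "decompose", "model", "vocab"}
    )
    config = create_config_dict(arg_parser, required_args)

    lm: LanguageModel = import_model(config)
    scores = calc_diff_scores(config, lm)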