def linzen_init(
    vocab_path: str, path: str, device: str = "cpu", **kwargs: Any
) -> Dict[str, Any]:
    """Set up the Linzen agreement corpus and its batch iterator.

    Returns a dict with the imported corpus under ``"corpus"`` and a
    sorted iterator over it (batch size 2000) under ``"iterator"``.
    """
    linzen_corpus = import_corpus(
        path, header_from_first_line=True, vocab_path=vocab_path
    )

    return {
        "corpus": linzen_corpus,
        "iterator": create_iterator(
            linzen_corpus, batch_size=2000, device=device, sort=True
        ),
    }
def _create_init_states_from_corpus(
    self,
    init_states_corpus: str,
    vocab_path: str,
    save_init_states_to: Optional[str],
) -> ActivationTensors:
    """Derive init states from a corpus via average end-of-sentence activations.

    Resets ``self.init_states`` to zero states first, then extracts the
    average EOS activations from *init_states_corpus*. When
    *save_init_states_to* is None only the averages are returned
    (nothing is written to disk).
    """
    states_corpus: Corpus = import_corpus(init_states_corpus, vocab_path=vocab_path)

    # Extraction must start from an all-zero initial state.
    self.init_states = self.create_zero_state()

    only_avg = save_init_states_to is None
    extracted = Extractor(self, states_corpus, save_init_states_to).extract(
        create_avg_eos=True, only_return_avg_eos=only_avg
    )
    assert extracted is not None

    return extracted
from diagnnose.vocab import get_vocab_from_config

if __name__ == "__main__":
    # Build the CLI parser for every config group this script consumes.
    arg_parser, required_args = create_arg_parser(
        {
            "activations",
            "corpus",
            "decompose",
            "init_states",
            "model",
            "plot_attention",
            "vocab",
            "extract",
        }
    )
    config_dict = create_config_dict(arg_parser, required_args)

    # Load the model and corpus described by the configuration.
    model: LanguageModel = import_model(config_dict)
    corpus: Corpus = import_corpus(
        vocab_path=get_vocab_from_config(config_dict), **config_dict["corpus"]
    )

    # fix_shapley defaults to False unless set explicitly in the config.
    config_dict["decompose"].setdefault("fix_shapley", False)

    attention = CDAttention(
        model,
        corpus,
        cd_config=config_dict["decompose"],
        plot_config=config_dict["plot_attention"],
    )

    print("Creating example plot")
    attention.plot_by_sen_id([2], avg_decs=True)
def winobias_init(
    vocab_path: str,
    path: str,
    task_activations: Optional[str] = None,
    tasks: Optional[List[str]] = None,
    device: str = "cpu",
    save_activations: bool = True,
    **kwargs: Any,
) -> Dict[str, Dict[str, Any]]:
    """ Initializes the adapted tasks of Zhao et al. (2018)

    Arxiv link: https://arxiv.org/abs/1804.06876

    Repo: https://github.com/i-machine-think/gcd4lm

    Parameters
    ----------
    vocab_path : str
        Path to vocabulary file of the Language Model.
    path : str
        Path to directory containing the datasets that can be found
        in the github repo.
    task_activations : str, optional
        Path to folder containing the extracted activations. If not
        provided new activations will be extracted.
    tasks : List[str], optional
        The downstream tasks that will be tested. If not provided this
        will default to the full set of conditions.
    device : str, optional
        Torch device name on which model will be run. Defaults to cpu.
    save_activations : bool, optional
        Toggle to save the extracted activations, otherwise delete them.
        Defaults to True.

    Returns
    -------
    init_dict : Dict[str, Dict[str, Any]]
        Dictionary containing the initial task setup, mapping each task
        to the required fields.
    """
    if tasks is None:
        tasks = list(winobias_descriptions.keys())

    init_dict: Dict[str, Dict[str, Any]] = {}

    for task in tasks:
        # BUGFIX: validate the task name *before* indexing
        # winobias_descriptions[task]; previously the assert sat inside the
        # condition loop, so an unknown task raised a bare KeyError and the
        # intended error message was unreachable.
        assert (
            task in winobias_descriptions
        ), f"Provided task {task} is not recognised!"

        init_dict[task] = {}

        for condition in winobias_descriptions[task]:
            corpus = import_corpus(
                os.path.join(path, winobias_descriptions[task][condition]["path"]),
                header_from_first_line=True,
                vocab_path=vocab_path,
            )
            iterator = create_iterator(
                corpus, batch_size=500, device=device, sort=True
            )

            # Reuse pre-extracted activations when a directory is given;
            # otherwise activations will be created during the task.
            if task_activations is not None:
                activations_dir = os.path.join(task_activations, task, condition)
            else:
                activations_dir = None

            init_dict[task][condition] = {
                "corpus": corpus,
                "iterator": iterator,
                "activations_dir": activations_dir,
                "save_activations": save_activations,
            }

    return init_dict
def lakretz_init(
    vocab_path: str,
    path: str,
    task_activations: Optional[Dict[str, str]] = None,
    tasks: Optional[List[str]] = None,
    device: str = "cpu",
) -> Dict[str, Dict[str, Any]]:
    """ Initializes the tasks described in Lakretz et al. (2019)

    Arxiv link: https://arxiv.org/pdf/1903.07435.pdf

    Repo: https://github.com/FAIRNS/Number_and_syntax_units_in_LSTM_LMs

    Parameters
    ----------
    vocab_path : str
        Path to vocabulary file of the Language Model.
    path : str
        Path to directory containing the datasets that can be found
        in the github repo.
    task_activations : str, optional
        Dictionary mapping task names to directories to which the
        Lakretz task embeddings have been extracted. If a task is not
        provided the activations will be created during the task.
    tasks : List[str], optional
        The downstream tasks that will be tested. If not provided this
        will default to the full set of conditions.
    device : str, optional
        Torch device name on which model will be run. Defaults to cpu.

    Returns
    -------
    init_dict : Dict[str, Dict[str, Any]]
        Dictionary containing the initial task setup, mapping each task
        to the required fields.
    """
    task_activations = task_activations or {}

    if tasks is None:
        tasks = list(lakretz_descriptions.keys())

    init_dict: Dict[str, Dict[str, Any]] = {}

    for task in tasks:
        assert task in lakretz_descriptions, f"Provided task {task} is not recognised!"

        # Reuse pre-extracted activations when available for this task.
        activation_dir = task_activations.get(task, None)
        activation_reader = (
            ActivationReader(activation_dir) if activation_dir is not None else None
        )

        task_specs = lakretz_descriptions[task]
        items_per_class = task_specs["items_per_class"]

        corpus = import_corpus(
            os.path.join(path, f"{task}.txt"),
            header=["sen", "type", "correct", "idx"],
            vocab_path=vocab_path,
        )
        # Each batch holds one full class pair, hence items_per_class * 2.
        iterator = create_iterator(
            corpus, batch_size=(items_per_class * 2), device=device
        )

        init_dict[task] = {
            "activation_reader": activation_reader,
            "corpus": corpus,
            "iterator": iterator,
        }

    return init_dict
from diagnnose.decompositions.factory import DecomposerFactory
from diagnnose.models.import_model import import_model
from diagnnose.typedefs.corpus import Corpus
from diagnnose.typedefs.models import LanguageModel
from diagnnose.vocab import get_vocab_from_config

if __name__ == "__main__":
    # Parse the CLI configuration for all groups this script needs.
    arg_parser, required_args = create_arg_parser(
        {"model", "init_states", "corpus", "vocab", "activations", "decompose"}
    )
    config_dict = create_config_dict(arg_parser, required_args)

    model: LanguageModel = import_model(config_dict)
    vocab_path = get_vocab_from_config(config_dict)
    corpus: Corpus = import_corpus(vocab_path=vocab_path, **config_dict["corpus"])

    # Decomposition and activation settings are passed on as one kwargs dict.
    decompose_args = {**config_dict["decompose"], **config_dict["activations"]}

    print("Initializing decomposition")

    sen_ids = slice(0, 625)
    factory = DecomposerFactory(
        model, corpus=corpus, sen_ids=sen_ids, **decompose_args
    )
    decomposer = factory.create(
        sen_ids, classes=torch.arange(20), subsen_index=slice(0, None)
    )
from diagnnose.classifiers.dc_trainer import DCTrainer
from diagnnose.config.arg_parser import create_arg_parser
from diagnnose.config.setup import create_config_dict
from diagnnose.corpus.import_corpus import import_corpus
from diagnnose.typedefs.corpus import Corpus

if __name__ == "__main__":
    arg_groups = {"activations", "classify", "corpus"}
    arg_parser, required_args = create_arg_parser(arg_groups)
    # NOTE(review): create_config_dict receives arg_groups here but only two
    # arguments in the sibling scripts — confirm which signature is current.
    config_dict = create_config_dict(arg_parser, required_args, arg_groups)

    corpus: Corpus = import_corpus(**config_dict["corpus"])

    trainer = DCTrainer(
        corpus=corpus,
        **config_dict["activations"],
        **config_dict["classify"],
    )
    trainer.train()
def calc_diff_scores(config, lm: LanguageModel) -> Dict[str, Dict[str, Tensor]]:
    """Compute mean relative-probability differences per sentence region.

    For each WinoBias corpus variant ("unamb"/"stereo") and gender-order
    condition ("FM"/"MF"), decomposes the model output over four token
    ranges and averages, per range, the difference between the two
    referent classes' relative contribution to the full logit.

    Returns a nested dict: corpus_type -> condition -> tensor of 4 scores.
    """
    scores = {}
    for corpus_type in ["unamb", "stereo"]:
        scores[corpus_type] = {}
        for condition in ["FM", "MF"]:
            # Corpus files follow the "<type>_<condition>.txt" naming scheme.
            corpus_name = f"{corpus_type}_{condition}.txt"
            corpus_path = os.path.join(config["corpus"]["path"], corpus_name)
            corpus: Corpus = import_corpus(
                corpus_path,
                vocab_path=get_vocab_from_config(config),
                header_from_first_line=True,
            )
            # Reuse extracted activations when a directory is configured,
            # otherwise fall back to TMP_DIR and extract fresh ones.
            if config["activations"].get("activations_dir", None) is not None:
                activations_dir = os.path.join(
                    config["activations"]["activations_dir"],
                    corpus_type,
                    condition.lower(),
                )
            else:
                activations_dir = None
            sen_ids = slice(0, len(corpus))
            factory = DecomposerFactory(
                lm,
                activations_dir or TMP_DIR,
                create_new_activations=(activations_dir is None),
                corpus=corpus,
                sen_ids=sen_ids,
            )
            # One (class-pair) row per sentence, built from each example's
            # referent type.
            ref_types = [ex.ref_type for ex in corpus.examples]
            classes = create_winobias_classes(ref_types, corpus)
            decomposer = factory.create(sen_ids)
            lens = decomposer.final_index - 1
            # Top-layer hidden state at the final position of each sentence;
            # unsqueeze(2) makes it a column vector for the bmm below.
            final_hidden = decomposer.activation_dict[decomposer.toplayer, "hx"][
                range(len(corpus)), lens + 1
            ].unsqueeze(2)
            # Full logits for the two candidate classes: W @ h + b.
            full_probs = torch.bmm(lm.decoder_w[classes], final_hidden).squeeze()
            full_probs += lm.decoder_b[classes]
            # Per-sentence object-NP boundaries taken from corpus annotations
            # (assumes obj_idx_start/obj_idx are 1-based token indices —
            # TODO confirm against the corpus definition).
            obj_idx_start = torch.tensor([ex.obj_idx_start - 1 for ex in corpus])
            obj_idx_end = torch.tensor([ex.obj_idx + 1 for ex in corpus])
            # Four regions: sentence start, pre-object span, object NP,
            # and the remainder up to the final token.
            ranges = [
                (0, 2),
                (2, obj_idx_start),
                (obj_idx_start, obj_idx_end),
                (obj_idx_end, lens + 1),
            ]
            scores[corpus_type][condition] = torch.zeros(4)
            for i, (start, stop) in enumerate(ranges):
                # NOTE: mutates config["decompose"] in place; later iterations
                # (and callers) see the last start/stop written here.
                config["decompose"].update({"start": start, "stop": stop})
                rel_dec = decomposer.decompose(**config["decompose"])["relevant"]
                final_rel_dec = rel_dec[range(len(corpus)), lens].unsqueeze(2)
                rel_probs = torch.bmm(lm.decoder_w[classes], final_rel_dec).squeeze()
                # Normalize the region's contribution by the full logit.
                rel_probs /= full_probs
                # Difference between the two classes, averaged over sentences.
                prob_diffs = rel_probs[:, 0] - rel_probs[:, 1]
                scores[corpus_type][condition][i] = torch.mean(prob_diffs)
            print(corpus_type, condition, scores[corpus_type][condition])
    return scores