def initialize(
    self, path: str, header: Optional[List[str]] = None
) -> SyntaxEvalCorpora:
    if header is None:
        header = ["sen", "token", "counter_token"]

    # Every task corpus needs a sentence column, a target token column,
    # and either a counterfactual sentence or a counterfactual token.
    assert "sen" in header
    assert "token" in header
    assert "counter_sen" in header or "counter_token" in header

    corpora = {}

    if os.path.isdir(path):
        # Treat each file in the directory as a separate subtask.
        for file in glob.glob(os.path.join(path, "*")):
            corpus = Corpus.create(file, header=header, tokenizer=self.tokenizer)
            task_name = file.split("/")[-1].split(".")[0]
            corpora[task_name] = corpus
    elif os.path.isfile(path):
        corpus = Corpus.create(path, header=header, tokenizer=self.tokenizer)
        task_name = path.split("/")[-1].split(".")[0]
        corpora[task_name] = corpus
    else:
        raise FileNotFoundError(f"Path to task not found: {path}")

    return corpora
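# Usage sketch (illustrative, not part of the library): create a one-line
# task file whose columns follow the default header order of `initialize`
# above, then load it. The file name, sentence, and the `task` instance are
# assumptions, as is the tab-separated layout expected by `Corpus.create`.
with open("sv_agreement.tsv", "w") as f:
    # sen \t token \t counter_token
    f.write("The author that the guards like\tlaughs\tlaugh\n")

corpora = task.initialize("sv_agreement.tsv")
# -> {"sv_agreement": Corpus}, keyed by the file name minus its extension.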
def _create_init_states_from_corpus(
    self,
    init_states_corpus: str,
    tokenizer: PreTrainedTokenizer,
    save_init_states_to: Optional[str] = None,
) -> ActivationDict:
    assert (
        tokenizer is not None
    ), "Tokenizer must be provided when creating init states from corpus"

    corpus: Corpus = Corpus.create(init_states_corpus, tokenizer=tokenizer)

    activation_names: ActivationNames = [
        (layer, name) for layer in range(self.num_layers) for name in ["hx", "cx"]
    ]

    extractor = Extractor(
        self,
        corpus,
        activation_names,
        activations_dir=save_init_states_to,
        selection_func=final_sen_token,
    )
    init_states = extractor.extract().activation_dict

    return init_states
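# Usage sketch (illustrative): derive "warm" initial states from a short
# corpus file. `model`, `tokenizer`, and the file paths are assumptions;
# `_create_init_states_from_corpus` is the (private) method defined above.
init_states = model._create_init_states_from_corpus(
    "init_sentences.txt",
    tokenizer=tokenizer,
    save_init_states_to="activations/init",
)
# One (layer, "hx") and one (layer, "cx") entry per layer, taken at the
# final token of the corpus via the `final_sen_token` selection_func.
for (layer, name), activation in init_states.items():
    print(layer, name, activation.shape)  # assuming tensor-valued activations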
def initialize(
    self, path: str, subtasks: Optional[List[str]] = None
) -> SyntaxEvalCorpora:
    """
    Parameters
    ----------
    path : str
        Path to directory containing the Marvin datasets that can be
        found in the GitHub repo.
    subtasks : List[str], optional
        The downstream tasks that will be tested. If not provided this
        will default to both subtasks.

    Returns
    -------
    corpora : Dict[str, Dict[str, Corpus]]
        Dictionary mapping a subtask to a dictionary that maps each
        condition to its Corpus.
    """
    subtasks = subtasks or ["stereo", "unamb"]

    corpora: SyntaxEvalCorpora = {}

    for subtask in subtasks:
        for condition in ["FF", "FM", "MF", "MM"]:
            corpus = Corpus.create(
                os.path.join(path, f"{subtask}_{condition}.tsv"),
                header_from_first_line=True,
                tokenizer=self.tokenizer,
            )
            self._add_output_classes(corpus)
            corpora.setdefault(subtask, {})[condition] = corpus

    return corpora
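# Usage sketch (illustrative): the loader above returns a nested mapping
# {subtask: {condition: Corpus}}. The directory name and `task` instance
# are assumptions; file names follow the f"{subtask}_{condition}.tsv"
# pattern used in `initialize`.
corpora = task.initialize("marvin_data", subtasks=["stereo"])
stereo_ff = corpora["stereo"]["FF"]  # read from marvin_data/stereo_FF.tsv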
if not os.path.isdir(results_dir):
    os.mkdir(results_dir)

results = {mn: {} for mn in MODEL_NAMES}

for mn in MODEL_NAMES:
    # Point the config at this model's checkpoint, vocabulary, and results dir.
    config_dict["model"]["state_dict"] = os.path.join(model_dir, f"{mn}/40.pt")
    config_dict["tokenizer"]["path"] = os.path.join(model_dir, f"{mn}/vocab.txt")
    config_dict["probe"]["save_dir"] = os.path.join(results_dir, mn)

    # Keep the environments whose name is a prefix of the model name;
    # fall back to all environments if none match.
    envs = [env for env in ENVS if env == mn[: len(env)]] or ENVS

    tokenizer: PreTrainedTokenizer = create_tokenizer(**config_dict["tokenizer"])
    corpus: Corpus = Corpus.create(tokenizer=tokenizer, **config_dict["corpus"])
    model: LanguageModel = import_model(**config_dict["model"])
    set_init_states(model, tokenizer=tokenizer, **config_dict["init_states"])

    if len(envs) == len(ENVS):
        print(f"Probing {mn} on", envs)
        results[mn]["probe"] = monotonicity_probe(
            model, corpus, config_dict["probe"], suppress_print=False
        )
        print(results[mn]["probe"])

    if "no_npi" not in mn:
        results[mn]["median_rank"] = median_ranks(