def _get_task(name, args, data_path, scratch_path):
    """ Build or load a single task. """
    assert name in TASKS_REGISTRY, f"Task '{name:s}' not found!"
    task_cls, rel_path, task_kw = TASKS_REGISTRY[name]
    pkl_path = os.path.join(scratch_path, "tasks", f"{name:s}.{args.tokenizer:s}.pkl")
    # TODO: refactor to always read from disk, even if task is constructed
    # here. This should avoid subtle bugs from deserialization issues.
    if os.path.isfile(pkl_path) and not args.reload_tasks:
        with open(pkl_path, "rb") as f:
            task = pkl.load(f)
        log.info("\tLoaded existing task %s", name)
    else:
        log.info("\tCreating task %s from scratch.", name)
        # These tasks take an additional kwarg.
        if name == "nli-prob" or name == "nli-alt":
            # TODO: remove special case, replace with something general
            # to pass custom loader args to task.
            task_kw["probe_path"] = args["nli-prob"].probe_path
        task_src_path = os.path.join(data_path, rel_path)
        task = task_cls(
            task_src_path,
            max_seq_len=args.max_seq_len,
            name=name,
            tokenizer_name=args.tokenizer,
            **task_kw,
        )
        task.load_data()
        utils.maybe_make_dir(os.path.dirname(pkl_path))
        with open(pkl_path, "wb") as f:
            pkl.dump(task, f)
    return task
def __init__(self, args):
    super().__init__()
    self.cache_dir = FLAGS.cache_dir
    utils.maybe_make_dir(self.cache_dir)
    self.output_mode = args.transformers_output_mode
    self.input_module = args.input_module
    self.tokenizer_required = input_module_tokenizer_name(args.input_module)
    # If set, treat these special tokens as part of input segments other than A/B.
    self._SEG_ID_CLS = None
    self._SEG_ID_SEP = None
    # self.model = transformers.RobertaModel.from_pretrained(
    #     args.input_module, cache_dir=self.cache_dir, output_hidden_states=True
    # )
    if FLAGS.saved_pretrained_model_path:
        self.model = load_pretrained_model_for_SG()
    else:
        self.model = MODEL_MAPPING[FLAGS.model](finetune_stage=True)
    self.max_pos = None
    self.tokenizer = get_my_tokenizer()
    self._sep_id = self.tokenizer.sep_token_id
    self._cls_id = self.tokenizer.cls_token_id
    self._pad_id = self.tokenizer.pad_token_id
    self._unk_id = self.tokenizer.unk_token_id
    self.parameter_setup(args)
def _get_task(name: str, args: config.Params, data_path: str, scratch_path: str) -> Task:
    """Get task object from disk if available. Else construct, prepare and
    save a new task object.

    Parameters
    ----------
    name : str
        task name to load.
    args : config.Params
        param handler object.
    data_path : str
        base data directory.
    scratch_path : str
        where to save Task objects.

    Returns
    -------
    Task
        loaded task object.
    """
    assert name in TASKS_REGISTRY, f"Task '{name:s}' not found!"
    task_cls, rel_path, task_kw = TASKS_REGISTRY[name]
    pkl_path = os.path.join(scratch_path, "tasks", f"{name:s}.{args.tokenizer:s}.pkl")
    # TODO: refactor to always read from disk, even if task is constructed
    # here. This should avoid subtle bugs from deserialization issues.
    if os.path.isfile(pkl_path) and not args.reload_tasks:
        with open(pkl_path, "rb") as f:
            task = pkl.load(f)
        log.info("\tLoaded existing task %s", name)
    else:
        log.info("\tCreating task %s from scratch.", name)
        # These tasks take an additional kwarg.
        if name == "nli-prob" or name == "nli-alt":
            # TODO: remove special case, replace with something general
            # to pass custom loader args to task.
            task_kw["probe_path"] = args["nli-prob"].probe_path
        if name in ALL_SEQ2SEQ_TASKS:
            task_kw["max_targ_v_size"] = args.max_targ_word_v_size
        task_src_path = os.path.join(data_path, rel_path)
        task = task_cls(
            task_src_path,
            max_seq_len=args.max_seq_len,
            name=name,
            tokenizer_name=args.tokenizer,
            **task_kw,
        )
        # If the user requests computing the online code of an edge-probing
        # task, pass the full args through to load_data.
        if args.get("online_code_preshuffle_seed", False) and args.get(
                "online_code_data_split", False):
            log.info("\tOnline-coding flags detected; passing args to load_data.")
            task.load_data(args)
        else:
            task.load_data()
        utils.maybe_make_dir(os.path.dirname(pkl_path))
        with open(pkl_path, "wb") as f:
            pkl.dump(task, f)
    return task
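# Illustrative sketch (not jiant code): the load-or-rebuild pickle-cache
# pattern that _get_task follows above, distilled into a standalone helper.
# The names `cached_build` and `build_fn` are hypothetical placeholders.
import os
import pickle

def cached_build(cache_path, build_fn, force_rebuild=False):
    """Return the object cached at cache_path, rebuilding it via build_fn
    if the cache is missing or a rebuild is forced."""
    if os.path.isfile(cache_path) and not force_rebuild:
        with open(cache_path, "rb") as f:
            return pickle.load(f)
    obj = build_fn()
    os.makedirs(os.path.dirname(cache_path) or ".", exist_ok=True)
    with open(cache_path, "wb") as f:
        pickle.dump(obj, f)
    return obj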
def __init__(self, args):
    super(PytorchTransformersEmbedderModule, self).__init__()
    self.cache_dir = os.getenv(
        "PYTORCH_PRETRAINED_BERT_CACHE",
        os.path.join(args.exp_dir, "pytorch_transformers_cache"),
    )
    utils.maybe_make_dir(self.cache_dir)
    self.embeddings_mode = args.pytorch_transformers_output_mode
    # Integer token indices for special symbols.
    self._sep_id = None
    self._cls_id = None
    self._pad_id = None
    # If set, treat these special tokens as part of input segments other than A/B.
    self._SEG_ID_CLS = None
    self._SEG_ID_SEP = None
def __init__(self, args):
    super(HuggingfaceTransformersEmbedderModule, self).__init__()
    self.cache_dir = os.getenv(
        "HUGGINGFACE_TRANSFORMERS_CACHE",
        os.path.join(args.exp_dir, "transformers_cache"),
    )
    utils.maybe_make_dir(self.cache_dir)
    self.output_mode = args.transformers_output_mode
    self.input_module = args.input_module
    self.max_pos = None
    self.tokenizer_required = input_module_tokenizer_name(args.input_module)
    # Integer token indices for special symbols.
    self._cls_id = None
    self._sep_id = None
    self._pad_id = None
    self._unk_id = None
    # If set, treat these special tokens as part of input segments other than A/B.
    self._SEG_ID_CLS = None
    self._SEG_ID_SEP = None
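# Illustrative sketch (not jiant code): how the special-token ids left as
# None above are typically filled in by a subclass from a HuggingFace
# tokenizer, mirroring the finetune-stage __init__ earlier in this section.
# Assumes the `transformers` package; "roberta-base" is just an example.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("roberta-base")
cls_id = tokenizer.cls_token_id  # prepended classification token
sep_id = tokenizer.sep_token_id  # segment separator / end-of-sequence token
pad_id = tokenizer.pad_token_id  # padding token
unk_id = tokenizer.unk_token_id  # out-of-vocabulary token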
def build_tasks(args):
    """Main logic for preparing tasks:
    1) create / load the tasks
    2) build / load the vocabulary
    3) build / load the word vectors
    4) index each task's data
    5) initialize lazy loaders (streaming iterators)
    """
    # 1) create / load tasks
    tasks, pretrain_task_names, target_task_names = get_tasks(args)
    for task in tasks:
        task_classifier = config.get_task_attr(args, task.name, "use_classifier")
        setattr(task, "_classifier_name",
                task_classifier if task_classifier else task.name)

    tokenizer_names = {task.name: task.tokenizer_name for task in tasks}
    assert len(set(tokenizer_names.values())) == 1, (
        f"Error: mixing tasks with different tokenizers! Tokenizations: {tokenizer_names}"
    )

    # 2) build / load vocab and indexers
    indexers = build_indexers(args)
    vocab_path = os.path.join(args.exp_dir, "vocab")
    if args.reload_vocab or not os.path.exists(vocab_path):
        _build_vocab(args, tasks, vocab_path)
    # Always load vocab from file.
    vocab = Vocabulary.from_files(vocab_path)
    log.info("\tLoaded vocab from %s", vocab_path)
    for namespace, mapping in vocab._index_to_token.items():
        log.info("\tVocab namespace %s: size %d", namespace, len(mapping))
    log.info("\tFinished building vocab.")
    args.max_word_v_size = vocab.get_vocab_size("tokens")
    args.max_char_v_size = vocab.get_vocab_size("chars")

    # 3) build / load word vectors
    word_embs = None
    if args.input_module in ["glove", "fastText"]:
        emb_file = os.path.join(args.exp_dir, "embs.pkl")
        if args.reload_vocab or not os.path.exists(emb_file):
            word_embs = _build_embeddings(args, vocab, emb_file)
        else:  # load from file
            with open(emb_file, "rb") as f:
                word_embs = pkl.load(f)
        log.info("Trimmed word embeddings: %s", str(word_embs.size()))

    # 4) Index tasks using vocab (if preprocessed copy not available).
    preproc_dir = os.path.join(args.exp_dir, "preproc")
    utils.maybe_make_dir(preproc_dir)
    reindex_tasks = parse_task_list_arg(args.reindex_tasks)
    utils.assert_for_log(
        not (args.reload_indexing and not reindex_tasks),
        'Flag reload_indexing was set, but no tasks are set to reindex (use -o "args.reindex_tasks'
        ' = "task1,task2,..."")',
    )

    # Set up boundary_token_fn, which applies SOS/EOS/SEP/CLS delimiters.
    if args.input_module.startswith("bert"):
        from jiant.pytorch_transformers_interface.modules import BertEmbedderModule
        boundary_token_fn = BertEmbedderModule.apply_boundary_tokens
    elif args.input_module.startswith("xlnet"):
        from jiant.pytorch_transformers_interface.modules import XLNetEmbedderModule
        boundary_token_fn = XLNetEmbedderModule.apply_boundary_tokens
    else:
        boundary_token_fn = utils.apply_standard_boundary_tokens

    for task in tasks:
        force_reindex = args.reload_indexing and task.name in reindex_tasks
        for split in ALL_SPLITS:
            log_prefix = "\tTask '%s', split '%s'" % (task.name, split)
            relative_path = _get_serialized_record_path(task.name, split, "preproc")
            cache_found = _find_cached_file(
                args.exp_dir, args.global_ro_exp_dir, relative_path,
                log_prefix=log_prefix)
            if force_reindex or not cache_found:
                # Re-index from scratch.
                record_file = _get_serialized_record_path(task.name, split, preproc_dir)
                if os.path.exists(record_file) and os.path.islink(record_file):
                    os.remove(record_file)
                _index_split(task, split, indexers, vocab, record_file,
                             boundary_token_fn)

        # Delete in-memory data - we'll lazy-load from disk later.
        # TODO: delete task.{split}_data_text as well?
        task.train_data = None
        task.val_data = None
        task.test_data = None

    log.info("\tFinished indexing tasks")

    # 5) Initialize tasks with data iterators.
    pretrain_tasks = []
    target_tasks = []
    for task in tasks:
        # Replace lists of instances with lazy generators from disk.
        task.val_data = _get_instance_generator(task.name, "val", preproc_dir)
        task.test_data = _get_instance_generator(task.name, "test", preproc_dir)
        # When using pretrain_data_fraction, we need modified iterators for use
        # only on training datasets at pretraining time.
        if task.name in pretrain_task_names:
            log.info("\tCreating trimmed pretraining-only version of %s train.", task.name)
            task.train_data = _get_instance_generator(
                task.name, "train", preproc_dir,
                fraction=args.pretrain_data_fraction)
            pretrain_tasks.append(task)
        # When using target_train_data_fraction, we need modified iterators
        # only for training datasets at do_target_task_training time.
        if task.name in target_task_names:
            log.info("\tCreating trimmed target-only version of %s train.", task.name)
            task.train_data = _get_instance_generator(
                task.name, "train", preproc_dir,
                fraction=args.target_train_data_fraction)
            target_tasks.append(task)

    log.info("\t Training on %s", ", ".join(pretrain_task_names))
    log.info("\t Evaluating on %s", ", ".join(target_task_names))
    return pretrain_tasks, target_tasks, vocab, word_embs
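# Illustrative sketch (not jiant code): one way a fraction-limited lazy
# loader in the spirit of _get_instance_generator(..., fraction=...) could
# work. `load_records` and `total` are hypothetical stand-ins for a callable
# that streams serialized instances off disk and its known record count.
import itertools

def fraction_generator(load_records, total, fraction=1.0):
    """Lazily yield the first floor(fraction * total) records."""
    limit = int(fraction * total)
    yield from itertools.islice(load_records(), limit)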
def initial_setup(args, cl_args):
    """Set up the email hook, random seed, and CUDA settings.

    Parameters
    ----------
    args : Params object
    cl_args : list of arguments

    Returns
    -------
    args : Params object, with "auto" tokenizer and pool_type resolved
    seed : int, the random seed in use
    """
    output = io.StringIO()
    maybe_make_dir(args.project_dir)  # e.g. /nfs/jsalt/exp/$HOSTNAME
    maybe_make_dir(args.exp_dir)  # e.g. <project_dir>/jiant-demo
    maybe_make_dir(args.run_dir)  # e.g. <project_dir>/jiant-demo/sst
    log_fh = log.FileHandler(args.local_log_path)
    log_fmt = log.Formatter("%(asctime)s: %(message)s", datefmt="%m/%d %I:%M:%S %p")
    log_fh.setFormatter(log_fmt)
    log.getLogger().addHandler(log_fh)

    if cl_args.remote_log:
        from jiant.utils import gcp
        gcp.configure_remote_logging(args.remote_log_name)

    if cl_args.notify:
        from jiant.utils import emails
        global EMAIL_NOTIFIER
        log.info("Registering email notifier for %s", cl_args.notify)
        EMAIL_NOTIFIER = emails.get_notifier(cl_args.notify, args)

    if EMAIL_NOTIFIER:
        EMAIL_NOTIFIER(body="Starting run.", prefix="")

    _log_git_info()
    config_file = os.path.join(args.run_dir, "params.conf")
    config.write_params(args, config_file)

    print_args = select_relevant_print_args(args)
    log.info("Parsed args: \n%s", print_args)
    log.info("Saved config to %s", config_file)

    seed = random.randint(1, 10000) if args.random_seed < 0 else args.random_seed
    random.seed(seed)
    torch.manual_seed(seed)
    log.info("Using random seed %d", seed)
    if args.cuda >= 0:
        try:
            if not torch.cuda.is_available():
                raise EnvironmentError("CUDA is not available, or not detected by PyTorch.")
            log.info("Using GPU %d", args.cuda)
            torch.cuda.set_device(args.cuda)
            torch.cuda.manual_seed_all(seed)
        except Exception:
            log.warning(
                "GPU access failed. You might be using a CPU-only installation of PyTorch. "
                "Falling back to CPU.")
            args.cuda = -1

    if args.tokenizer == "auto":
        args.tokenizer = tokenizers.select_tokenizer(args)
    if args.pool_type == "auto":
        args.pool_type = select_pool_type(args)

    return args, seed
def build_tasks(
    args: config.Params, cuda_device: Any
) -> (List[Task], List[Task], Vocabulary, Union[np.ndarray, float]):
    """Main logic for preparing tasks:
    1. create or load the tasks
    2. configure classifiers for tasks
    3. set up indexers
    4. build and save vocab to disk
    5. load vocab from disk
    6. if specified, load word embeddings
    7. set up ModelPreprocessingInterface (MPI) to handle model-specific preprocessing
    8. index tasks using vocab and task-specific MPI, save to disk
    9. return: task data lazy-loaders in phase-specific lists w/ vocab, and word embeddings

    Parameters
    ----------
    args : Params
        config map.
    cuda_device : Any
        device spec, passed through to get_tasks.

    Returns
    -------
    List[Task]
        list of pretrain Tasks.
    List[Task]
        list of target Tasks.
    allennlp.data.Vocabulary
        vocabulary from task data.
    Union[np.ndarray, float]
        word embeddings.
    """
    # 1) create / load tasks
    tasks, pretrain_task_names, target_task_names = get_tasks(args, cuda_device)
    for task in tasks:
        task_classifier = config.get_task_attr(args, task.name, "use_classifier")
        setattr(task, "_classifier_name",
                task_classifier if task_classifier else task.name)

    tokenizer_names = {task.name: task.tokenizer_name for task in tasks}
    assert len(set(tokenizer_names.values())) <= 1, (
        f"Error: mixing tasks with different tokenizers! Tokenizations: {tokenizer_names}"
    )

    # 2) build / load vocab and indexers
    indexers = build_indexers(args)
    vocab_path = os.path.join(args.exp_dir, "vocab")
    if args.reload_vocab or not os.path.exists(vocab_path):
        _build_vocab(args, tasks, vocab_path)
    # Always load vocab from file.
    vocab = Vocabulary.from_files(vocab_path)
    log.info("\tLoaded vocab from %s", vocab_path)
    for namespace, mapping in vocab._index_to_token.items():
        log.info("\tVocab namespace %s: size %d", namespace, len(mapping))
    log.info("\tFinished building vocab.")
    args.max_word_v_size = vocab.get_vocab_size("tokens")
    args.max_char_v_size = vocab.get_vocab_size("chars")

    # 3) build / load word vectors
    word_embs = None
    if args.input_module in ["glove", "fastText"]:
        emb_file = os.path.join(args.exp_dir, "embs.pkl")
        if args.reload_vocab or not os.path.exists(emb_file):
            word_embs = _build_embeddings(args, vocab, emb_file)
        else:  # load from file
            with open(emb_file, "rb") as f:
                word_embs = pkl.load(f)
        log.info("Trimmed word embeddings: %s", str(word_embs.size()))

    # 4) Set up model_preprocessing_interface
    model_preprocessing_interface = ModelPreprocessingInterface(args)

    # 5) Index tasks using vocab (if preprocessed copy not available).
    preproc_dir = os.path.join(args.exp_dir, "preproc")
    utils.maybe_make_dir(preproc_dir)
    reindex_tasks = parse_task_list_arg(args.reindex_tasks)
    utils.assert_for_log(
        not (args.reload_indexing and not reindex_tasks),
        'Flag reload_indexing was set, but no tasks are set to reindex (use -o "args.reindex_tasks'
        ' = "task1,task2,..."")',
    )

    for task in tasks:
        force_reindex = args.reload_indexing and task.name in reindex_tasks
        for split in ALL_SPLITS:
            log_prefix = "\tTask '%s', split '%s'" % (task.name, split)
            relative_path = _get_serialized_record_path(task.name, split, "preproc")
            cache_found = _find_cached_file(
                args.exp_dir, args.global_ro_exp_dir, relative_path,
                log_prefix=log_prefix)
            if force_reindex or not cache_found:
                # Re-index from scratch.
                record_file = _get_serialized_record_path(task.name, split, preproc_dir)
                if os.path.exists(record_file) and os.path.islink(record_file):
                    os.remove(record_file)
                _index_split(task, split, indexers, vocab, record_file,
                             model_preprocessing_interface)

        # Delete in-memory data - we'll lazy-load from disk later.
        # TODO: delete task.{split}_data_text?
log.info("\tFinished indexing tasks") # 6) Initialize tasks with data iterators. pretrain_tasks = [] target_tasks = [] for task in tasks: # Replace lists of instances with lazy generators from disk. task.set_instance_iterable( split_name="val", instance_iterable=_get_instance_generator(task.name, "val", preproc_dir), ) task.set_instance_iterable( split_name="test", instance_iterable=_get_instance_generator(task.name, "test", preproc_dir), ) # When using pretrain_data_fraction, we need modified iterators for use # only on training datasets at pretraining time. if task.name in pretrain_task_names: log.info("\tCreating trimmed pretraining-only version of " + task.name + " train.") task.set_instance_iterable( split_name="train", instance_iterable=_get_instance_generator( task.name, "train", preproc_dir, fraction=args.pretrain_data_fraction), phase="pretrain", ) pretrain_tasks.append(task) # When using target_train_data_fraction, we need modified iterators # only for training datasets at do_target_task_training time. if task.name in target_task_names: log.info("\tCreating trimmed target-only version of " + task.name + " train.") task.set_instance_iterable( split_name="train", instance_iterable=_get_instance_generator( task.name, "train", preproc_dir, fraction=args.target_train_data_fraction), phase="target_train", ) target_tasks.append(task) log.info("\t Training on %s", ", ".join(pretrain_task_names)) log.info("\t Evaluating on %s", ", ".join(target_task_names)) return pretrain_tasks, target_tasks, vocab, word_embs
def initial_setup(args: config.Params, cl_args: argparse.Namespace) -> (config.Params, int):
    """Perform setup steps:
    1. create project, exp, and run dirs if they don't already exist
    2. create log formatter
    3. configure GCP remote logging
    4. set up email notifier
    5. log git info
    6. write the config out to file
    7. log diff between default and experiment's configs
    8. choose torch's and random's random seed
    9. if config specifies a single GPU, then set the GPU's random seed (doesn't cover multi-GPU)
    10. resolve "auto" settings for tokenizer and pool_type parameters

    Parameters
    ----------
    args : config.Params
        config map
    cl_args : argparse.Namespace
        mapping named arguments to parsed values

    Returns
    -------
    args : config.Params
        config map
    seed : int
        random's and pytorch's random seed
    """
    output = io.StringIO()
    maybe_make_dir(args.project_dir)  # e.g. /nfs/jsalt/exp/$HOSTNAME
    maybe_make_dir(args.exp_dir)  # e.g. <project_dir>/jiant-demo
    maybe_make_dir(args.run_dir)  # e.g. <project_dir>/jiant-demo/sst
    log_fh = log.FileHandler(args.local_log_path)
    log_fmt = log.Formatter("%(asctime)s: %(message)s", datefmt="%m/%d %I:%M:%S %p")
    log_fh.setFormatter(log_fmt)
    log.getLogger().addHandler(log_fh)

    if cl_args.remote_log:
        from jiant.utils import gcp
        gcp.configure_remote_logging(args.remote_log_name)

    if cl_args.notify:
        from jiant.utils import emails
        global EMAIL_NOTIFIER
        log.info("Registering email notifier for %s", cl_args.notify)
        EMAIL_NOTIFIER = emails.get_notifier(cl_args.notify, args)

    if EMAIL_NOTIFIER:
        EMAIL_NOTIFIER(body="Starting run.", prefix="")

    _log_git_info()
    config_file = os.path.join(args.run_dir, "params.conf")
    config.write_params(args, config_file)

    print_args = select_relevant_print_args(args)
    log.info("Parsed args: \n%s", print_args)
    log.info("Saved config to %s", config_file)

    seed = random.randint(1, 10000) if args.random_seed < 0 else args.random_seed
    random.seed(seed)
    torch.manual_seed(seed)
    log.info("Using random seed %d", seed)
    if isinstance(args.cuda, int) and args.cuda >= 0:
        # If only running on one GPU.
        try:
            if not torch.cuda.is_available():
                raise EnvironmentError("CUDA is not available, or not detected by PyTorch.")
            log.info("Using GPU %d", args.cuda)
            torch.cuda.set_device(args.cuda)
            torch.cuda.manual_seed_all(seed)
        except Exception:
            log.warning(
                "GPU access failed. You might be using a CPU-only installation of PyTorch. "
                "Falling back to CPU.")
            args.cuda = -1

    if args.tokenizer == "auto":
        args.tokenizer = tokenizers.select_tokenizer(args)
    if args.pool_type == "auto":
        args.pool_type = select_pool_type(args)

    return args, seed
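# Illustrative sketch (not jiant code): the reproducibility recipe from
# initial_setup above, distilled into a reusable helper. Assumes torch is
# installed; multi-GPU seeding is out of scope, as the docstring notes.
import random
import torch

def set_seed(seed: int, cuda_device: int = -1) -> None:
    """Seed Python's and torch's RNGs, plus CUDA if a single GPU is requested."""
    random.seed(seed)
    torch.manual_seed(seed)
    if cuda_device >= 0 and torch.cuda.is_available():
        torch.cuda.set_device(cuda_device)
        torch.cuda.manual_seed_all(seed)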
def build_model(args, vocab, pretrained_embs, tasks):
    """Build model according to args.

    Returns: a MultiTaskModel with the sentence encoder and task-specific
    modules attached.
    """
    # Build embeddings.
    if args.input_module == "gpt":
        # Note: incompatible with other embedders, but logic in preprocess.py
        # should prevent these from being enabled anyway.
        from .openai_transformer_lm.utils import OpenAIEmbedderModule
        log.info("Using OpenAI transformer model.")
        cove_layer = None
        # Here, this uses openAIEmbedder.
        embedder = OpenAIEmbedderModule(args)
        d_emb = embedder.get_output_dim()
    elif args.input_module.startswith("bert"):
        # Note: incompatible with other embedders, but logic in preprocess.py
        # should prevent these from being enabled anyway.
        from .bert.utils import BertEmbedderModule
        log.info(f"Using BERT model ({args.input_module}).")
        cove_layer = None
        # Set PYTORCH_PRETRAINED_BERT_CACHE environment variable to an existing
        # cache; see
        # https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/pytorch_pretrained_bert/file_utils.py  # noqa
        bert_cache_dir = os.getenv(
            "PYTORCH_PRETRAINED_BERT_CACHE", os.path.join(args.exp_dir, "bert_cache")
        )
        maybe_make_dir(bert_cache_dir)
        embedder = BertEmbedderModule(args, cache_dir=bert_cache_dir)
        d_emb = embedder.get_output_dim()
    else:
        # Default case, used for ELMo, CoVe, word embeddings, etc.
        d_emb, embedder, cove_layer = build_embeddings(args, vocab, tasks, pretrained_embs)

    d_sent_input = args.d_hid
    sent_encoder, d_sent_output = build_sent_encoder(
        args, vocab, d_emb, tasks, embedder, cove_layer
    )
    # d_task_input is the input dimension of the task-specific module.
    # Set skip_embs = 1 to concatenate the encoder input with the encoder
    # output before passing into the task-specific module.
    d_task_input = d_sent_output + (args.skip_embs * d_emb)

    # Build model and classifiers
    model = MultiTaskModel(args, sent_encoder, vocab)
    build_task_modules(args, tasks, model, d_task_input, d_emb, embedder, vocab)
    model = model.cuda() if args.cuda >= 0 else model
    log.info("Model specification:")
    log.info(model)

    param_count = 0
    trainable_param_count = 0
    if args.list_params:
        log.info("Model parameters:")
    for name, param in model.named_parameters():
        param_count += np.prod(param.size())
        if param.requires_grad:
            trainable_param_count += np.prod(param.size())
            if args.list_params:
                log.info(
                    "\t%s: Trainable parameter, count %d with %s",
                    name,
                    np.prod(param.size()),
                    str(param.size()),
                )
        elif args.list_params:
            log.info(
                "\t%s: Non-trainable parameter, count %d with %s",
                name,
                np.prod(param.size()),
                str(param.size()),
            )
    log.info("Total number of parameters: {ct:d} ({ct:g})".format(ct=param_count))
    log.info("Number of trainable parameters: {ct:d} ({ct:g})".format(ct=trainable_param_count))
    return model
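# Illustrative sketch (not jiant code): the parameter counts computed at the
# end of build_model, expressed with torch's numel(); equivalent to the
# np.prod(param.size()) accumulation above.
def count_parameters(model):
    """Return (total, trainable) parameter counts for a torch.nn.Module."""
    total = sum(p.numel() for p in model.parameters())
    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    return total, trainable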