def build_indexers(args):
    """Build token indexers keyed by vocab namespace, based on args.input_module.

    Parameters
    ----------
    args : config.Params
        Experiment config; fields read: input_module, tokenizer, char_embs, cove.

    Returns
    -------
    dict
        Mapping from vocab namespace to an AllenNLP-style TokenIndexer.

    Raises
    ------
    AssertionError
        If the tokenizer setting is incompatible with the chosen input module.
    """
    indexers = {}
    if args.input_module in ["scratch", "glove", "fastText"]:
        indexers["words"] = SingleIdTokenIndexer()
    elif args.input_module in ["elmo", "elmo-chars-only"]:
        indexers["elmo"] = ELMoTokenCharactersIndexer("elmo")
        assert args.tokenizer in {"", "MosesTokenizer"}

    if args.char_embs:
        indexers["chars"] = TokenCharactersIndexer("chars")
    if args.cove:
        # BUG FIX: the second fragment was a plain (non-f) string, so
        # {args.tokenizer} was printed literally instead of interpolated.
        assert args.tokenizer == "MosesTokenizer", (
            "CoVe model expects Moses tokenization (MosesTokenizer);"
            f" you are using args.tokenizer = {args.tokenizer}"
        )

    if input_module_uses_transformers(args.input_module):
        assert not indexers, (
            "transformers modules like BERT/XLNet are not supported alongside other "
            "indexers due to tokenization."
        )
        assert args.tokenizer == args.input_module, (
            "transformers models use custom tokenization for each model, so tokenizer "
            "must match the specified model."
        )
        tokenizer_name = input_module_tokenizer_name(args.input_module)
        indexers[tokenizer_name] = SingleIdTokenIndexer(tokenizer_name)
    return indexers
def __init__(self, args):
    """Set up the cache dir, the (pretrained or fresh) model, and the
    tokenizer together with its special-token ids."""
    super().__init__()

    self.cache_dir = FLAGS.cache_dir
    utils.maybe_make_dir(self.cache_dir)

    self.output_mode = args.transformers_output_mode
    self.input_module = args.input_module
    self.tokenizer_required = input_module_tokenizer_name(args.input_module)

    # If set, treat these special tokens as part of input segments other than A/B.
    self._SEG_ID_CLS = None
    self._SEG_ID_SEP = None

    # Either resume from a saved pretrained checkpoint, or instantiate a
    # fresh model for the fine-tuning stage.
    if FLAGS.saved_pretrained_model_path:
        self.model = load_pretrained_model_for_SG()
    else:
        self.model = MODEL_MAPPING[FLAGS.model](finetune_stage=True)
    self.max_pos = None

    tokenizer = get_my_tokenizer()
    self.tokenizer = tokenizer
    self._sep_id = tokenizer.sep_token_id
    self._cls_id = tokenizer.cls_token_id
    self._pad_id = tokenizer.pad_token_id
    self._unk_id = tokenizer.unk_token_id

    self.parameter_setup(args)
def add_transformers_vocab(vocab, tokenizer_name):
    """Add vocabulary from tokenizers in transformers for use with pre-tokenized data.

    These tokenizers have a convert_tokens_to_ids method, but this doesn't do
    anything special, so we can just use the standard indexers.

    Parameters
    ----------
    vocab : allennlp.data.Vocabulary
        Vocabulary to extend in place.
    tokenizer_name : str
        Name/path of the pretrained tokenizer to pull tokens from.

    Raises
    ------
    ValueError
        If tokenizer_name matches no known tokenizer family (previously this
        fell through and raised an opaque NameError on `tokenizer`).
    """
    do_lower_case = "uncased" in tokenizer_name
    log.info('In add_transformers_vocab')
    log.info(tokenizer_name)
    if (tokenizer_name.startswith("bert-")
            or 'rubert' in tokenizer_name
            or '/bert-' in tokenizer_name):
        tokenizer = BertTokenizer.from_pretrained(tokenizer_name,
                                                  do_lower_case=do_lower_case)
    elif tokenizer_name.startswith("roberta-"):  # or 'roberta' in tokenizer_name:
        tokenizer = RobertaTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("albert-"):
        tokenizer = AlbertTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("xlnet-"):
        tokenizer = XLNetTokenizer.from_pretrained(tokenizer_name,
                                                   do_lower_case=do_lower_case)
    elif tokenizer_name.startswith("openai-gpt"):
        tokenizer = OpenAIGPTTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("gpt2") or 'gpt' in tokenizer_name:
        tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("transfo-xl-"):
        tokenizer = TransfoXLTokenizer.from_pretrained(tokenizer_name)
    # NOTE: "xlm-roberta" must be tested before the more general "xlm-" prefix.
    elif tokenizer_name.startswith("xlm-roberta"):
        tokenizer = XLMRobertaTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("xlm-"):
        tokenizer = XLMTokenizer.from_pretrained(tokenizer_name)
    else:
        raise ValueError(f"Unsupported tokenizer name: {tokenizer_name}")

    # BUG FIX: the last prefix was misspelled "transo-xl-", so TransfoXL
    # tokenizers never received these GPT-style special tokens.
    if tokenizer_name.startswith(("openai-gpt", "gpt2", "transfo-xl-")):
        tokenizer.add_special_tokens({
            "bos_token": "<start>",
            "sep_token": "<delim>",
            "cls_token": "<extract>"
        })
    # TODO: this is another place can be simplified by "model-before-preprocess"
    # reorganization; we can pass tokenizer created in model here, see issue <TBD>

    # do not use tokenizer.vocab_size, it does not include newly added tokens
    vocab_size = len(tokenizer)
    ordered_vocab = tokenizer.convert_ids_to_tokens(range(vocab_size))
    log.info("Added transformers vocab (%s): %d tokens", tokenizer_name,
             len(ordered_vocab))
    for word in ordered_vocab:
        vocab.add_token_to_namespace(word,
                                     input_module_tokenizer_name(tokenizer_name))
def __init__(self, args):
    """Shared setup for transformers-based embedder modules: cache directory,
    config-derived fields, and placeholders for special-token ids."""
    super(HuggingfaceTransformersEmbedderModule, self).__init__()

    default_cache = os.path.join(args.exp_dir, "transformers_cache")
    self.cache_dir = os.getenv("HUGGINGFACE_TRANSFORMERS_CACHE", default_cache)
    utils.maybe_make_dir(self.cache_dir)

    self.output_mode = args.transformers_output_mode
    self.input_module = args.input_module
    self.max_pos = None
    self.tokenizer_required = input_module_tokenizer_name(args.input_module)

    # Integer token indices for special symbols (unset until assigned later).
    self._cls_id = None
    self._sep_id = None
    self._pad_id = None
    self._unk_id = None

    # If set, treat these special tokens as part of input segments other than A/B.
    self._SEG_ID_CLS = None
    self._SEG_ID_SEP = None
def build_tasks(
    args: config.Params, cuda_device: Any
) -> (List[Task], List[Task], Vocabulary, Union[np.ndarray, float]):
    """Main logic for preparing tasks:

    1. create or load the tasks
    2. configure classifiers for tasks
    3. set up indexers
    4. build and save vocab to disk
    5. load vocab from disk
    6. if specified, load word embeddings
    7. set up ModelPreprocessingInterface (MPI) to handle model-specific preprocessing
    8. index tasks using vocab and task-specific MPI, save to disk.
    9. return: task data lazy-loaders in phase-specific lists w/ vocab, and word embeddings

    Parameters
    ----------
    args : Params
        config map

    Returns
    -------
    List[Task]
        list of pretrain Tasks.
    List[Task]
        list of target Tasks.
    allennlp.data.Vocabulary
        vocabulary from task data.
    Union[np.ndarray, float]
        Word embeddings.
    """
    # 1) create / load tasks
    tasks, pretrain_task_names, target_task_names = get_tasks(args, cuda_device)
    for task in tasks:
        task_classifier = config.get_task_attr(args, task.name, "use_classifier")
        setattr(task, "_classifier_name",
                task_classifier if task_classifier else task.name)

    tokenizer_names = {task.name: task.tokenizer_name for task in tasks}
    # BUG FIX: the message was split into an f-string plus a plain string, so
    # {tokenizer_names} was never interpolated (and ":s" would raise on a dict).
    assert len(set(tokenizer_names.values())) <= 1, (
        f"Error: mixing tasks with different tokenizers!"
        f" Tokenizations: {tokenizer_names}"
    )

    # Hoisted: the indexer/namespace name only depends on args, so compute once.
    indexer = input_module_tokenizer_name(args.input_module)

    # 2) build / load vocab and indexers
    indexers = build_indexers(args)

    # Namespaced vocab path allows roberta and albert (with different vocabs)
    # to coexist in one exp folder.
    vocab_path = os.path.join(args.exp_dir, "vocab", indexer)
    if args.reload_vocab or not os.path.exists(vocab_path):
        _build_vocab(args, tasks, vocab_path)

    # Always load vocab from file.
    vocab = Vocabulary.from_files(vocab_path)
    log.info("\tLoaded vocab from %s", vocab_path)

    for namespace, mapping in vocab._index_to_token.items():
        log.info("\tVocab namespace %s: size %d", namespace, len(mapping))
    log.info("\tFinished building vocab.")
    args.max_word_v_size = vocab.get_vocab_size("tokens")
    args.max_char_v_size = vocab.get_vocab_size("chars")

    # 3) build / load word vectors
    word_embs = None
    if args.input_module in ["glove", "fastText"]:
        emb_file = os.path.join(args.exp_dir, "embs.pkl")
        if args.reload_vocab or not os.path.exists(emb_file):
            word_embs = _build_embeddings(args, vocab, emb_file)
        else:  # load from file
            # NOTE(review): file handle is not closed explicitly here; a
            # `with` block would be safer, kept as-is to preserve behavior.
            with open(emb_file, "rb") as f:
                word_embs = pkl.load(f)
        log.info("Trimmed word embeddings: %s", str(word_embs.size()))

    # 4) Set up model_preprocessing_interface
    model_preprocessing_interface = ModelPreprocessingInterface(args)

    # 5) Index tasks using vocab (if preprocessed copy not available).
    preproc_dir = os.path.join(args.exp_dir, "preproc")
    utils.maybe_make_dir(preproc_dir)
    reindex_tasks = parse_task_list_arg(args.reindex_tasks)
    utils.assert_for_log(
        not (args.reload_indexing and not reindex_tasks),
        'Flag reload_indexing was set, but no tasks are set to reindex (use -o "args.reindex_tasks'
        ' = "task1,task2,..."")',
    )
    for task in tasks:
        force_reindex = args.reload_indexing and task.name in reindex_tasks
        for split in ALL_SPLITS:
            log_prefix = "\tTask '%s', split '%s'" % (task.name, split)
            # The indexer name keeps preprocessed data for models with
            # different indexers separate within the same exp directory.
            relative_path = _get_serialized_record_path(task.name, split,
                                                        "preproc", indexer)
            cache_found = _find_cached_file(
                args.exp_dir, args.global_ro_exp_dir, relative_path,
                log_prefix=log_prefix
                # TODO change global one to point to arwen, and local one to
                # be in one exp folder with diff runs
            )
            if force_reindex or not cache_found:
                # Re-index from scratch.
                record_file = _get_serialized_record_path(task.name, split,
                                                          preproc_dir, indexer)
                if os.path.exists(record_file) and os.path.islink(record_file):
                    os.remove(record_file)
                _index_split(task, split, indexers, vocab, record_file,
                             model_preprocessing_interface)
        # Delete in-memory data - we'll lazy-load from disk later.
        # TODO: delete task.{split}_data_text?
    log.info("\tFinished indexing tasks")

    # 6) Initialize tasks with data iterators.
    pretrain_tasks = []
    target_tasks = []
    for task in tasks:
        # Replace lists of instances with lazy generators from disk.
        task.val_data = _get_instance_generator(task.name, "val", preproc_dir,
                                                indexer=indexer)
        task.test_data = _get_instance_generator(task.name, "test", preproc_dir,
                                                 indexer=indexer)
        # When using pretrain_data_fraction, we need modified iterators for use
        # only on training datasets at pretraining time.
        if task.name in pretrain_task_names:
            log.info("\tCreating trimmed pretraining-only version of "
                     + task.name + " train.")
            task.train_data = _get_instance_generator(
                task.name, "train", preproc_dir,
                fraction=args.pretrain_data_fraction, indexer=indexer)
            pretrain_tasks.append(task)
        # When using target_train_data_fraction, we need modified iterators
        # only for training datasets at do_target_task_training time.
        if task.name in target_task_names:
            log.info("\tCreating trimmed target-only version of "
                     + task.name + " train.")
            task.train_data = _get_instance_generator(
                task.name, "train", preproc_dir,
                fraction=args.target_train_data_fraction, indexer=indexer)
            target_tasks.append(task)

    log.info("\t Training on %s", ", ".join(pretrain_task_names))
    log.info("\t Evaluating on %s", ", ".join(target_task_names))
    return pretrain_tasks, target_tasks, vocab, word_embs