def build_indexers(args):
    indexers = {}
    if args.input_module in ["scratch", "glove", "fastText"]:
        indexers["words"] = SingleIdTokenIndexer()
    elif args.input_module in ["elmo", "elmo-chars-only"]:
        indexers["elmo"] = ELMoTokenCharactersIndexer("elmo")
        assert args.tokenizer in {"", "MosesTokenizer"}
    if args.char_embs:
        indexers["chars"] = TokenCharactersIndexer("chars")
    if args.cove:
        assert args.tokenizer == "MosesTokenizer", (
            f"CoVe model expects Moses tokenization (MosesTokenizer); "
            f"you are using args.tokenizer = {args.tokenizer}"
        )
    if input_module_uses_pytorch_transformers(args.input_module):
        assert not indexers, (
            "pytorch_transformers modules like BERT/XLNet are not supported alongside other "
            "indexers due to tokenization."
        )
        assert args.tokenizer == args.input_module, (
            "pytorch_transformers models use custom tokenization for each model, so tokenizer "
            "must match the specified model."
        )
        tokenizer_name = input_module_tokenizer_name(args.input_module)
        indexers[tokenizer_name] = SingleIdTokenIndexer(tokenizer_name)
    return indexers
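# A minimal usage sketch (not from the original source): SimpleNamespace is a
# hypothetical stand-in for the parsed jiant config. For an ELMo run,
# build_indexers() should return a single ELMoTokenCharactersIndexer under
# the "elmo" key.
from types import SimpleNamespace

_elmo_args = SimpleNamespace(
    input_module="elmo", tokenizer="MosesTokenizer", char_embs=False, cove=False
)
assert set(build_indexers(_elmo_args)) == {"elmo"}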
def select_tokenizer(args):
    """ Select a sane default tokenizer. """
    if args.tokenizer == "auto":
        if input_module_uses_pytorch_transformers(args.input_module):
            tokenizer_name = args.input_module
        else:
            tokenizer_name = "MosesTokenizer"
    else:
        tokenizer_name = args.tokenizer
    return tokenizer_name
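# A minimal sketch of how the "auto" default resolves, again with a
# hypothetical SimpleNamespace config: transformer-style input modules keep
# their model-specific tokenizer; everything else falls back to Moses.
from types import SimpleNamespace

assert select_tokenizer(
    SimpleNamespace(tokenizer="auto", input_module="bert-base-uncased")
) == "bert-base-uncased"
assert select_tokenizer(
    SimpleNamespace(tokenizer="auto", input_module="glove")
) == "MosesTokenizer"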
def __init__(self, args, sent_encoder, vocab):
    """
    Args:
        args: parsed config namespace
        sent_encoder: shared sentence encoder
        vocab: vocabulary
    """
    super(MultiTaskModel, self).__init__()
    self.sent_encoder = sent_encoder
    self.vocab = vocab
    self.utilization = Average() if args.track_batch_utilization else None
    self.elmo = args.input_module == "elmo"
    self.use_pytorch_transformers = input_module_uses_pytorch_transformers(args.input_module)
    self.project_before_pooling = not (
        self.use_pytorch_transformers and args.transfer_paradigm == "finetune"
    )  # Rough heuristic. TODO: Make this directly user-controllable.
    self.sep_embs_for_skip = args.sep_embs_for_skip
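# A minimal sketch of the project_before_pooling heuristic above, with a
# hypothetical SimpleNamespace config: a fine-tuned transformer encoder skips
# the extra projection, while a frozen one keeps it.
from types import SimpleNamespace

def _project_before_pooling(args):
    return not (
        input_module_uses_pytorch_transformers(args.input_module)
        and args.transfer_paradigm == "finetune"
    )

assert not _project_before_pooling(
    SimpleNamespace(input_module="bert-base-uncased", transfer_paradigm="finetune")
)
assert _project_before_pooling(
    SimpleNamespace(input_module="bert-base-uncased", transfer_paradigm="frozen")
)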
def _build_vocab(args, tasks, vocab_path: str):
    """ Build vocabulary from scratch, reading data from tasks. """
    # NOTE: task-specific target vocabulary should be counted in the task object
    # and provided via `task.all_labels()`. The namespace should be task-specific,
    # i.e. not something generic like "targets".
    log.info("\tBuilding vocab from scratch.")
    max_v_sizes = {"word": args.max_word_v_size, "char": args.max_char_v_size}
    word2freq, char2freq = get_words(tasks)
    vocab = get_vocab(word2freq, char2freq, max_v_sizes)
    for task in tasks:  # add custom label namespaces
        add_task_label_vocab(vocab, task)
    if args.force_include_wsj_vocabulary:
        # Add the full WSJ vocabulary for PTB F1 parsing tasks.
        add_wsj_vocab(vocab, args.data_dir)
    if input_module_uses_pytorch_transformers(args.input_module):
        # Add the pre-computed vocabulary of the corresponding tokenizer for
        # pytorch_transformers models.
        add_pytorch_transformers_vocab(vocab, args.tokenizer)

    vocab.save_to_files(vocab_path)
    log.info("\tSaved vocab to %s", vocab_path)
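# A minimal sketch (assumed, not from the source) of the task-specific label
# namespaces the NOTE above calls for, using AllenNLP's Vocabulary directly;
# "mnli_labels" is a hypothetical namespace name. Namespaces matching
# "*labels" are non-padded by default, so only the labels themselves count.
from allennlp.data import Vocabulary

_vocab = Vocabulary()
for _label in ["entailment", "neutral", "contradiction"]:
    _vocab.add_token_to_namespace(_label, namespace="mnli_labels")
assert _vocab.get_vocab_size("mnli_labels") == 3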
def build_indexers(args):
    indexers = {}
    if args.input_module in ["scratch", "glove", "fastText"]:
        indexers["words"] = SingleIdTokenIndexer()
    elif args.input_module in ["elmo", "elmo-chars-only"]:
        indexers["elmo"] = ELMoTokenCharactersIndexer("elmo")
        assert args.tokenizer in {"", "MosesTokenizer"}
    if args.char_embs:
        indexers["chars"] = TokenCharactersIndexer("chars")
    if args.cove:
        assert args.tokenizer == "MosesTokenizer", (
            f"CoVe model expects Moses tokenization (MosesTokenizer); "
            f"you are using args.tokenizer = {args.tokenizer}"
        )
    if args.input_module == "gpt":
        assert not indexers, (
            "OpenAI transformer is not supported alongside other indexers due to tokenization."
        )
        assert args.tokenizer == "OpenAI.BPE", (
            "OpenAI transformer uses custom BPE tokenization. Set tokenizer = OpenAI.BPE."
        )
        indexers["openai_bpe_pretokenized"] = SingleIdTokenIndexer("openai_bpe")
    elif input_module_uses_pytorch_transformers(args.input_module):
        assert not indexers, (
            "pytorch_transformers modules like BERT/XLNet are not supported alongside other "
            "indexers due to tokenization."
        )
        assert args.tokenizer == args.input_module, (
            "BERT/XLNet models use custom WPM tokenization for each model, so tokenizer "
            "must match the specified model."
        )
        indexers["pytorch_transformers_wpm_pretokenized"] = SingleIdTokenIndexer(args.input_module)
    return indexers
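# A minimal sketch exercising the GPT branch of this variant, again with a
# hypothetical SimpleNamespace config: the OpenAI transformer demands its own
# BPE tokenizer and registers a dedicated indexer key.
from types import SimpleNamespace

_gpt_args = SimpleNamespace(
    input_module="gpt", tokenizer="OpenAI.BPE", char_embs=False, cove=False
)
assert set(build_indexers(_gpt_args)) == {"openai_bpe_pretokenized"}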
def build_embeddings(args, vocab, tasks, pretrained_embs=None):
    """ Build embeddings according to options in args """
    d_emb, d_char = 0, args.d_char

    token_embedders = {}
    # Word embeddings
    n_token_vocab = vocab.get_vocab_size("tokens")
    if args.input_module in ["glove", "fastText"] and pretrained_embs is not None:
        word_embs = pretrained_embs
        assert word_embs.size()[0] == n_token_vocab
        d_word = word_embs.size()[1]
        log.info("\tUsing pre-trained word embeddings: %s", str(word_embs.size()))
    elif args.input_module == "scratch":
        log.info("\tTraining word embeddings from scratch.")
        d_word = args.d_word
        word_embs = nn.Embedding(n_token_vocab, d_word).weight
    else:
        assert input_module_uses_pytorch_transformers(args.input_module) or args.input_module in [
            "gpt",
            "elmo",
            "elmo-chars-only",
        ], f"'{args.input_module}' is not a valid value for input_module."
        embeddings = None
        word_embs = None

    if word_embs is not None:
        embeddings = Embedding(
            num_embeddings=n_token_vocab,
            embedding_dim=d_word,
            weight=word_embs,
            trainable=(args.embeddings_train == 1),
            padding_index=vocab.get_token_index("@@PADDING@@"),
        )
        token_embedders["words"] = embeddings
        d_emb += d_word

    # Handle CoVe
    cove_layer = None
    if args.cove:
        assert embeddings is not None
        assert args.input_module == "glove", "CoVe requires GloVe embeddings."
        assert d_word == 300, "CoVe expects 300-dimensional GloVe embeddings."
        try:
            from jiant.modules.cove.cove import MTLSTM as cove_lstm

            # Have CoVe do an internal GloVe lookup, but don't add residual.
            # We'll do this manually in modules.py; see SentenceEncoder.forward().
            cove_layer = cove_lstm(n_vocab=n_token_vocab, vectors=embeddings.weight.data)
            # Control whether CoVe is trainable.
            for param in cove_layer.parameters():
                param.requires_grad = bool(args.cove_fine_tune)
            d_emb += 600  # 300 x 2 for biLSTM activations
            log.info("\tUsing CoVe embeddings!")
        except ImportError as e:
            log.info("Failed to import CoVe!")
            raise e

    # Character embeddings
    if args.char_embs:
        log.info("\tUsing character embeddings!")
        char_embeddings = Embedding(vocab.get_vocab_size("chars"), d_char)
        filter_sizes = tuple([int(i) for i in args.char_filter_sizes.split(",")])
        char_encoder = CnnEncoder(
            d_char,
            num_filters=args.n_char_filters,
            ngram_filter_sizes=filter_sizes,
            output_dim=d_char,
        )
        char_embedder = TokenCharactersEncoder(
            char_embeddings, char_encoder, dropout=args.dropout_embs
        )
        d_emb += d_char
        token_embedders["chars"] = char_embedder
    else:
        log.info("\tNot using character embeddings!")

    # If we want separate ELMo scalar weights (i.e., a different ELMo representation
    # for each classifier), then we need to count and reliably map each classifier to
    # an index used by the AllenNLP-internal ELMo.
    if args.sep_embs_for_skip:
        # Determine a deterministic list of classifier names to use for each task.
        classifiers = sorted(set(map(lambda x: x._classifier_name, tasks)))
        # Reload the existing classifier map, if it exists.
        classifier_save_path = args.run_dir + "/classifier_task_map.json"
        if os.path.isfile(classifier_save_path):
            loaded_classifiers = json.load(open(classifier_save_path, "r"))
        else:
            # No file exists, so assume we are just starting to pretrain. If pretraining
            # is to be skipped, this assertion can be bypassed by explicitly allowing for
            # a missing classifier task map.
            assert_for_log(
                args.do_pretrain or args.allow_missing_task_map,
                "Error: {} should already exist.".format(classifier_save_path),
            )
            if args.allow_missing_task_map:
                log.warning(
                    "Warning: classifier task map not found in model"
                    " directory. Creating a new one from scratch."
                )
            # The default is always @pretrain@.
            loaded_classifiers = {"@pretrain@": 0}
        # Add the new tasks and update the map, keeping the internal ELMo index consistent.
        max_number_classifiers = max(loaded_classifiers.values())
        offset = 1
        for classifier in classifiers:
            if classifier not in loaded_classifiers:
                loaded_classifiers[classifier] = max_number_classifiers + offset
                offset += 1
        log.info("Classifiers: {}".format(loaded_classifiers))
        open(classifier_save_path, "w+").write(json.dumps(loaded_classifiers))
        # Every index in classifiers needs to correspond to a valid ELMo output
        # representation.
        num_reps = 1 + max(loaded_classifiers.values())
    else:
        # All tasks share the same scalars.
        # Not used if input_module = elmo-chars-only (i.e. no ELMo).
        loaded_classifiers = {"@pretrain@": 0}
        num_reps = 1

    if args.input_module.startswith("elmo"):
        log.info("Loading ELMo from files:")
        log.info("ELMO_OPT_PATH = %s", ELMO_OPT_PATH)
        if args.input_module == "elmo-chars-only":
            log.info("\tUsing ELMo character CNN only!")
            log.info("ELMO_WEIGHTS_PATH = %s", ELMO_WEIGHTS_PATH)
            elmo_embedder = ElmoCharacterEncoder(
                options_file=ELMO_OPT_PATH, weight_file=ELMO_WEIGHTS_PATH, requires_grad=False
            )
            d_emb += 512
        else:
            log.info("\tUsing full ELMo! (separate scalars/task)")
            if args.elmo_weight_file_path != "none":
                assert os.path.exists(args.elmo_weight_file_path), (
                    'ELMo weight file path "' + args.elmo_weight_file_path + '" does not exist.'
                )
                weight_file = args.elmo_weight_file_path
            else:
                weight_file = ELMO_WEIGHTS_PATH
            log.info("ELMO_WEIGHTS_PATH = %s", weight_file)
            elmo_embedder = ElmoTokenEmbedderWrapper(
                options_file=ELMO_OPT_PATH,
                weight_file=weight_file,
                num_output_representations=num_reps,
                # Dropout is added by the sentence encoder later.
                dropout=0.0,
            )
            d_emb += 1024
        token_embedders["elmo"] = elmo_embedder

    # Wrap ELMo and any other embedders, and concatenate the resulting
    # representations along the last (vector) dimension.
    embedder = ElmoTextFieldEmbedder(
        token_embedders,
        loaded_classifiers,
        elmo_chars_only=args.input_module == "elmo-chars-only",
        sep_embs_for_skip=args.sep_embs_for_skip,
    )

    assert d_emb, "You turned off all the embeddings, ya goof!"
    return d_emb, embedder, cove_layer
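# A minimal sketch (assumptions: a hypothetical SimpleNamespace config, an
# empty task list, and a tiny AllenNLP vocabulary) of the "scratch" path
# through build_embeddings(): with character embeddings and CoVe off, d_emb
# comes back as args.d_word and no CoVe layer is built.
from types import SimpleNamespace

from allennlp.data import Vocabulary

_vocab = Vocabulary()
for _tok in ["the", "cat", "sat"]:
    _vocab.add_token_to_namespace(_tok, namespace="tokens")

_scratch_args = SimpleNamespace(
    input_module="scratch",
    d_word=300,
    d_char=20,
    char_embs=False,
    cove=False,
    embeddings_train=1,
    sep_embs_for_skip=False,
)
_d_emb, _embedder, _cove_layer = build_embeddings(_scratch_args, _vocab, tasks=[])
assert _d_emb == 300 and _cove_layer is None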