def build_indexers(args):
    indexers = {}
    if args.input_module in ["scratch", "glove", "fastText"]:
        indexers["words"] = SingleIdTokenIndexer()
    elif args.input_module in ["elmo", "elmo-chars-only"]:
        indexers["elmo"] = ELMoTokenCharactersIndexer("elmo")
        assert args.tokenizer in {"", "MosesTokenizer"}

    if args.char_embs:
        indexers["chars"] = TokenCharactersIndexer("chars")
    if args.cove:
        assert args.tokenizer == "MosesTokenizer", (
            f"CoVe model expects Moses tokenization (MosesTokenizer);"
            " you are using args.tokenizer = {args.tokenizer}"
        )

    if input_module_uses_pytorch_transformers(args.input_module):
        assert not indexers, (
            "pytorch_transformers modules like BERT/XLNet are not supported alongside other "
            "indexers due to tokenization."
        )
        assert args.tokenizer == args.input_module, (
            "pytorch_transformers models use custom tokenization for each model, so tokenizer "
            "must match the specified model."
        )
        tokenizer_name = input_module_tokenizer_name(args.input_module)
        indexers[tokenizer_name] = SingleIdTokenIndexer(tokenizer_name)
    return indexers
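A minimal usage sketch (hypothetical args stand-in; assumes the AllenNLP 0.x API that jiant builds on): the returned dict maps namespace names to token indexers, which a TextField consumes at data-loading time.

# Sketch only: SimpleNamespace stands in for jiant's real config object.
from types import SimpleNamespace

from allennlp.data import Token
from allennlp.data.fields import TextField

args = SimpleNamespace(input_module="glove", tokenizer="MosesTokenizer", char_embs=True, cove=False)
indexers = build_indexers(args)  # {"words": SingleIdTokenIndexer, "chars": TokenCharactersIndexer}
field = TextField([Token(t) for t in "a short example sentence".split()], token_indexers=indexers)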
Example #2
def select_tokenizer(args):
    """
        Select a sane default tokenizer.
    """
    if args.tokenizer == "auto":
        if input_module_uses_pytorch_transformers(args.input_module):
            tokenizer_name = args.input_module
        else:
            tokenizer_name = "MosesTokenizer"
    else:
        tokenizer_name = args.tokenizer
    return tokenizer_name
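A quick illustration with hypothetical args objects: "auto" defers to the model's own tokenizer for pytorch_transformers inputs (assuming names like "bert-base-uncased" are recognized by input_module_uses_pytorch_transformers) and falls back to MosesTokenizer otherwise.

from types import SimpleNamespace  # hypothetical configs for illustration only

print(select_tokenizer(SimpleNamespace(tokenizer="auto", input_module="bert-base-uncased")))  # bert-base-uncased
print(select_tokenizer(SimpleNamespace(tokenizer="auto", input_module="elmo")))               # MosesTokenizer
print(select_tokenizer(SimpleNamespace(tokenizer="OpenAI.BPE", input_module="gpt")))          # OpenAI.BPE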
Example #3
def __init__(self, args, sent_encoder, vocab):
    """ Args: sentence encoder """
    super(MultiTaskModel, self).__init__()
    self.sent_encoder = sent_encoder
    self.vocab = vocab
    self.utilization = Average() if args.track_batch_utilization else None
    self.elmo = args.input_module == "elmo"
    self.use_pytorch_transformers = input_module_uses_pytorch_transformers(args.input_module)
    self.project_before_pooling = not (
        self.use_pytorch_transformers and args.transfer_paradigm == "finetune"
    )  # Rough heuristic. TODO: Make this directly user-controllable.
    self.sep_embs_for_skip = args.sep_embs_for_skip
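A small sketch (hypothetical configurations; assumes input_module_uses_pytorch_transformers recognizes standard model names such as "bert-base-uncased") showing when the heuristic above skips the pre-pooling projection:

# Illustration only (hypothetical configurations), not part of the model code.
for input_module, paradigm in [
    ("bert-base-uncased", "finetune"),  # pytorch_transformers being fine-tuned -> no projection
    ("bert-base-uncased", "frozen"),    # frozen transformer features -> keep projection
    ("elmo", "finetune"),               # non-transformer encoder -> keep projection
]:
    uses_pt = input_module_uses_pytorch_transformers(input_module)
    print(input_module, paradigm, "project_before_pooling =", not (uses_pt and paradigm == "finetune"))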
Example #4
def _build_vocab(args, tasks, vocab_path: str):
    """ Build vocabulary from scratch, reading data from tasks. """
    # NOTE: task-specific target vocabulary should be counted in the task object
    # and provided via `task.all_labels()`. The namespace should be task-specific,
    # i.e. not something generic like "targets".
    log.info("\tBuilding vocab from scratch.")
    max_v_sizes = {"word": args.max_word_v_size, "char": args.max_char_v_size}
    word2freq, char2freq = get_words(tasks)
    vocab = get_vocab(word2freq, char2freq, max_v_sizes)
    for task in tasks:  # add custom label namespaces
        add_task_label_vocab(vocab, task)
    if args.force_include_wsj_vocabulary:
        # Add WSJ full vocabulary for PTB F1 parsing tasks.
        add_wsj_vocab(vocab, args.data_dir)
    if input_module_uses_pytorch_transformers(args.input_module):
        # Add pre-computed vocabulary of corresponding tokenizer for pytorch_transformers models.
        add_pytorch_transformers_vocab(vocab, args.tokenizer)

    vocab.save_to_files(vocab_path)
    log.info("\tSaved vocab to %s", vocab_path)
Example #5
def build_indexers(args):
    indexers = {}
    if args.input_module in ["scratch", "glove", "fastText"]:
        indexers["words"] = SingleIdTokenIndexer()
    elif args.input_module in ["elmo", "elmo-chars-only"]:
        indexers["elmo"] = ELMoTokenCharactersIndexer("elmo")
        assert args.tokenizer in {"", "MosesTokenizer"}

    if args.char_embs:
        indexers["chars"] = TokenCharactersIndexer("chars")
    if args.cove:
        assert args.tokenizer == "MosesTokenizer", (
            "CoVe model expects Moses tokenization (MosesTokenizer); "
            f"you are using args.tokenizer = {args.tokenizer}"
        )

    if args.input_module == "gpt":
        assert (
            not indexers
        ), "OpenAI transformer is not supported alongside other indexers due to tokenization."
        assert (
            args.tokenizer == "OpenAI.BPE"
        ), "OpenAI transformer uses custom BPE tokenization. Set tokenizer=OpenAI.BPE."
        indexers["openai_bpe_pretokenized"] = SingleIdTokenIndexer(
            "openai_bpe")
    elif input_module_uses_pytorch_transformers(args.input_module):
        assert not indexers, (
            "pytorch_transformers modules like BERT/XLNet are not supported alongside other "
            "indexers due to tokenization."
        )
        assert args.tokenizer == args.input_module, (
            "BERT/XLNet models use custom WPM tokenization for each model, so tokenizer "
            "must match the specified model.")
        indexers["pytorch_transformers_wpm_pretokenized"] = SingleIdTokenIndexer(
            args.input_module
        )
    return indexers
Example #6
def build_embeddings(args, vocab, tasks, pretrained_embs=None):
    """ Build embeddings according to options in args """
    d_emb, d_char = 0, args.d_char

    token_embedders = {}
    # Word embeddings
    n_token_vocab = vocab.get_vocab_size("tokens")
    if args.input_module in ["glove", "fastText"] and pretrained_embs is not None:
        word_embs = pretrained_embs
        assert word_embs.size()[0] == n_token_vocab
        d_word = word_embs.size()[1]
        log.info("\tUsing pre-trained word embeddings: %s", str(word_embs.size()))
    elif args.input_module == "scratch":
        log.info("\tTraining word embeddings from scratch.")
        d_word = args.d_word
        word_embs = nn.Embedding(n_token_vocab, d_word).weight
    else:
        assert input_module_uses_pytorch_transformers(args.input_module) or args.input_module in [
            "gpt",
            "elmo",
            "elmo-chars-only",
        ], f"'{args.input_module}' is not a valid value for input_module."
        embeddings = None
        word_embs = None

    if word_embs is not None:
        embeddings = Embedding(
            num_embeddings=n_token_vocab,
            embedding_dim=d_word,
            weight=word_embs,
            trainable=(args.embeddings_train == 1),
            padding_index=vocab.get_token_index("@@PADDING@@"),
        )
        token_embedders["words"] = embeddings
        d_emb += d_word

    # Handle cove
    cove_layer = None
    if args.cove:
        assert embeddings is not None
        assert args.input_module == "glove", "CoVe requires GloVe embeddings."
        assert d_word == 300, "CoVe expects 300-dimensional GloVe embeddings."
        try:
            from jiant.modules.cove.cove import MTLSTM as cove_lstm

            # Have CoVe do an internal GloVe lookup, but don't add residual.
            # We'll do this manually in modules.py; see
            # SentenceEncoder.forward().
            cove_layer = cove_lstm(n_vocab=n_token_vocab, vectors=embeddings.weight.data)
            # Control whether CoVe is trainable.
            for param in cove_layer.parameters():
                param.requires_grad = bool(args.cove_fine_tune)
            d_emb += 600  # 300 x 2 for biLSTM activations
            log.info("\tUsing CoVe embeddings!")
        except ImportError as e:
            log.info("Failed to import CoVe!")
            raise e

    # Character embeddings
    if args.char_embs:
        log.info("\tUsing character embeddings!")
        char_embeddings = Embedding(vocab.get_vocab_size("chars"), d_char)
        filter_sizes = tuple([int(i) for i in args.char_filter_sizes.split(",")])
        char_encoder = CnnEncoder(
            d_char,
            num_filters=args.n_char_filters,
            ngram_filter_sizes=filter_sizes,
            output_dim=d_char,
        )
        char_embedder = TokenCharactersEncoder(
            char_embeddings, char_encoder, dropout=args.dropout_embs
        )
        d_emb += d_char
        token_embedders["chars"] = char_embedder
    else:
        log.info("\tNot using character embeddings!")

    # If we want separate ELMo scalar weights (a different ELMo representation for each classifier),
    # then we need to count and reliably map each classifier to an index used by
    # AllenNLP's internal ELMo.
    if args.sep_embs_for_skip:
        # Determine a deterministic list of classifier names to use for each
        # task.
        classifiers = sorted(set(map(lambda x: x._classifier_name, tasks)))
        # Reload existing classifier map, if it exists.
        classifier_save_path = args.run_dir + "/classifier_task_map.json"
        if os.path.isfile(classifier_save_path):
            with open(classifier_save_path, "r") as f:
                loaded_classifiers = json.load(f)
        else:
            # No file exists, so assume we are just starting to pretrain. If pretraining is to be
            # skipped, this assertion can be bypassed by explicitly allowing for a missing
            # classifier task map.
            assert_for_log(
                args.do_pretrain or args.allow_missing_task_map,
                "Error: {} should already exist.".format(classifier_save_path),
            )
            if args.allow_missing_task_map:
                log.warning(
                    "Warning: classifier task map not found in model"
                    " directory. Creating a new one from scratch."
                )
            # default is always @pretrain@
            loaded_classifiers = {"@pretrain@": 0}
        # Add the new tasks and update map, keeping the internal ELMo index
        # consistent.
        max_number_classifiers = max(loaded_classifiers.values())
        offset = 1
        for classifier in classifiers:
            if classifier not in loaded_classifiers:
                loaded_classifiers[classifier] = max_number_classifiers + offset
                offset += 1
        log.info("Classifiers:{}".format(loaded_classifiers))
        open(classifier_save_path, "w+").write(json.dumps(loaded_classifiers))
        # Every index in classifiers needs to correspond to a valid ELMo output
        # representation.
        num_reps = 1 + max(loaded_classifiers.values())
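        # Worked example (hypothetical values): if the map on disk is {"@pretrain@": 0, "mnli": 1}
        # and this run introduces a new classifier "sst", the loop above assigns
        # loaded_classifiers = {"@pretrain@": 0, "mnli": 1, "sst": 2}, so num_reps = 3 and the
        # ELMo wrapper allocates three independent sets of scalar mixing weights.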
    else:
        # All tasks share the same scalars.
        # Not used if input_module = elmo-chars-only (i.e. no elmo)
        loaded_classifiers = {"@pretrain@": 0}
        num_reps = 1
    if args.input_module.startswith("elmo"):
        log.info("Loading ELMo from files:")
        log.info("ELMO_OPT_PATH = %s", ELMO_OPT_PATH)
        if args.input_module == "elmo-chars-only":
            log.info("\tUsing ELMo character CNN only!")
            log.info("ELMO_WEIGHTS_PATH = %s", ELMO_WEIGHTS_PATH)
            elmo_embedder = ElmoCharacterEncoder(
                options_file=ELMO_OPT_PATH, weight_file=ELMO_WEIGHTS_PATH, requires_grad=False
            )
            d_emb += 512
        else:
            log.info("\tUsing full ELMo! (separate scalars/task)")
            if args.elmo_weight_file_path != "none":
                assert os.path.exists(args.elmo_weight_file_path), (
                    'ELMo weight file path "' + args.elmo_weight_file_path + '" does not exist.'
                )
                weight_file = args.elmo_weight_file_path
            else:
                weight_file = ELMO_WEIGHTS_PATH
            log.info("ELMO_WEIGHTS_PATH = %s", weight_file)
            elmo_embedder = ElmoTokenEmbedderWrapper(
                options_file=ELMO_OPT_PATH,
                weight_file=weight_file,
                num_output_representations=num_reps,
                # Dropout is added by the sentence encoder later.
                dropout=0.0,
            )
            d_emb += 1024

        token_embedders["elmo"] = elmo_embedder

    # Wrap ELMo and the other embedders, and concatenate the resulting
    # representations along the last (vector) dimension.
    embedder = ElmoTextFieldEmbedder(
        token_embedders,
        loaded_classifiers,
        elmo_chars_only=args.input_module == "elmo-chars-only",
        sep_embs_for_skip=args.sep_embs_for_skip,
    )

    assert d_emb, "You turned off all the embeddings, ya goof!"
    return d_emb, embedder, cove_layer
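A minimal call sketch (hypothetical wiring; args, vocab, and tasks are assumed to come from the surrounding pipeline): the returned dimension, embedder, and optional CoVe layer are what the sentence encoder is built from.

# Hypothetical wiring; pass pretrained_embs when GloVe/fastText vectors were preloaded.
d_emb, embedder, cove_layer = build_embeddings(args, vocab, tasks)
log.info("Total input embedding dimension: %d (CoVe %s)", d_emb, "on" if cove_layer else "off")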