Example #1
def add_task_label_vocab(vocab, task):
    """Add custom task labels to a separate namespace.

    If task has a 'get_all_labels' method, call that to get a list of labels
    to populate the <task_name>_labels vocabulary namespace.

    This is the recommended way to implement multiclass models: in your task's
    process_split code, make instances that use LabelFields with the task label
    namespace, e.g.:
        label_namespace = "%s_labels" % self.name
        label = LabelField(label_string, label_namespace=label_namespace)
    This will cause them to be properly indexed by the Vocabulary.

    This can then be accessed when generating Instances, either via a custom
    Indexer or by invoking the namespace when creating a LabelField.
    """
    if not hasattr(task, "get_all_labels"):
        return
    utils.assert_for_log(
        hasattr(task, "_label_namespace"),
        "Task %s is missing method `_label_namespace`!" % task.name,
    )
    namespace = task._label_namespace
    if namespace is None:
        return
    log.info("\tTask '%s': adding vocab namespace '%s'", task.name, namespace)
    for label in task.get_all_labels():
        vocab.add_token_to_namespace(label, namespace)
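A minimal usage sketch, assuming the function above (and its utils/log helpers) is importable; SimpleTask is a hypothetical stand-in rather than a real jiant Task subclass:

from allennlp.data import Vocabulary
from allennlp.data.fields import LabelField

class SimpleTask:
    # Hypothetical task exposing the two attributes add_task_label_vocab expects.
    name = "my_task"
    _label_namespace = "my_task_labels"

    def get_all_labels(self):
        return ["entailment", "neutral", "contradiction"]

vocab = Vocabulary()
task = SimpleTask()
add_task_label_vocab(vocab, task)  # populates the "my_task_labels" namespace

# In process_split, the matching LabelField would be built like this:
label = LabelField("neutral", label_namespace=task._label_namespace)
print(vocab.get_token_index("neutral", task._label_namespace))  # 1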
Example #2
    def _lm_only_lr_forward(self, batch, task):
        """Only left to right pass for LM model - non-bidirectional models.
           Used for language modeling training only in one direction.
        Args:
            batch: indexed input data
            task: (Task obejct)
        return:
            out: (dict)
                - 'logits': output layer, dimension: [batchSize * timeSteps, outputDim]
                    is output layer from forward layer
                - 'loss': size average CE loss
        """

        out = {}
        assert_for_log(
            "targs" in batch and "words" in batch["targs"], "Batch missing target words!"
        )
        pad_idx = self.vocab.get_token_index(self.vocab._padding_token, "tokens")
        b_size, seq_len = batch["targs"]["words"].size()
        # pad_idx is the index of the padding token used to pad up to max_seq_len
        n_pad = batch["targs"]["words"].eq(pad_idx).sum().item()
        # Number of examples: left-to-right only, so every non-padding position in the
        # sequence counts as a training example exactly once.
        out["n_exs"] = b_size * seq_len - n_pad
        sent, mask = self.sent_encoder(batch["input"], task)
        sent = sent.masked_fill(1 - mask.byte(), 0)
        hid2voc = getattr(self, "%s_hid2voc" % task.name)
        logits = hid2voc(sent).view(b_size * seq_len, -1)
        out["logits"] = logits
        trg_fwd = batch["targs"]["words"].view(-1)
        assert logits.size(0) == trg_fwd.size(0), "Number of logits and targets differ!"
        out["loss"] = F.cross_entropy(logits, trg_fwd, ignore_index=pad_idx)
        task.scorer1(out["loss"].item())
        return out
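The example-count and loss bookkeeping above is easiest to see on toy tensors. A self-contained sketch (toy sizes, random logits; not jiant code) of the same padding-aware arithmetic:

import torch
import torch.nn.functional as F

# Toy setup: batch of 2 sequences, 5 time steps, vocab of 7, pad index 0.
b_size, seq_len, vocab_size, pad_idx = 2, 5, 7, 0
targs = torch.tensor([[4, 2, 5, 0, 0],
                      [3, 6, 1, 2, 0]])          # 0 = padding
n_pad = targs.eq(pad_idx).sum().item()           # 3 padded positions
n_exs = b_size * seq_len - n_pad                 # 7 real prediction targets

# Flatten to [batch * time, vocab] and ignore padded targets in the loss.
logits = torch.randn(b_size * seq_len, vocab_size)
loss = F.cross_entropy(logits, targs.view(-1), ignore_index=pad_idx)
print(n_exs, loss.item())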
Example #3
def setup_target_task_training(args, target_tasks, model, strict):
    """
    Gets the model path used to restore model after each target
    task run, and saves current state if no other previous checkpoint can
    be used as the model path.
    The logic for loading the correct model state for target task training is:
    1) If load_target_train_checkpoint is used, then load the weights from that checkpoint.
    2) If we did pretraining, then load the best model from pretraining.
    3) Default case: we save untrained encoder weights.

    Parameters
    ----------------
    args: Params object
    target_tasks: list of target Task objects
    model: a MultiTaskModel object
    strict: bool

    Returns
    ----------------
    model_path: str
    """
    model_path = get_best_checkpoint_path(args, "target_train")
    if model_path is None:
        # We want to do target training without pretraining, thus
        # we need to first create a checkpoint to come back to for each of
        # the target tasks to finetune.
        if args.transfer_paradigm == "frozen":
            assert_for_log(
                args.allow_untrained_encoder_parameters,
                "No best checkpoint found to target train on. Set `allow_untrained_encoder_parameters` if you really want to use an untrained encoder.",
            )
        model_path = os.path.join(args.run_dir,
                                  "model_state_untrained_pre_target_train.th")
        torch.save(model.state_dict(), model_path)

    return model_path
Example #4
    def _lm_forward(self, batch, task, predict):
        """Forward pass for LM model
        Args:
            batch: indexed input data
            task: (Task object)
            predict: (boolean) predict mode (not supported)
        return:
            out: (dict)
                - 'logits': output layer, dimension: [batchSize * timeSteps * 2, outputDim]
                            first half: [:batchSize*timeSteps, outputDim] is output layer from
                                forward layer
                            second half: [batchSize*timeSteps:, outputDim] is output layer from
                                backward layer
                - 'loss': size average CE loss
        """
        out = {}
        sent_encoder = self.sent_encoder
        assert_for_log(
            isinstance(sent_encoder._phrase_layer, BiLMEncoder),
            "Not using LM for language modeling task!",
        )
        assert_for_log(
            "targs" in batch and "words" in batch["targs"], "Batch missing target words!"
        )
        pad_idx = self.vocab.get_token_index(self.vocab._padding_token, "tokens")
        b_size, seq_len = batch["targs"]["words"].size()
        n_pad = batch["targs"]["words"].eq(pad_idx).sum().item()
        out["n_exs"] = (b_size * seq_len - n_pad) * 2

        sent, mask = sent_encoder(batch["input"], task)
        sent = sent.masked_fill(1 - mask.byte(), 0)  # avoid NaNs

        # Split encoder outputs by direction
        split = int(self.sent_encoder._phrase_layer.get_output_dim() / 2)
        fwd, bwd = sent[:, :, :split], sent[:, :, split : split * 2]
        if split * 2 < sent.size(2):  # skip embeddings
            out_embs = sent[:, :, split * 2 :]
            fwd = torch.cat([fwd, out_embs], dim=2)
            bwd = torch.cat([bwd, out_embs], dim=2)

        # Forward and backward logits and targs
        hid2voc = getattr(self, "%s_hid2voc" % task.name)
        logits_fwd = hid2voc(fwd).view(b_size * seq_len, -1)
        logits_bwd = hid2voc(bwd).view(b_size * seq_len, -1)
        logits = torch.cat([logits_fwd, logits_bwd], dim=0)
        out["logits"] = logits
        trg_fwd = batch["targs"]["words"].view(-1)
        trg_bwd = batch["targs_b"]["words"].view(-1)
        targs = torch.cat([trg_fwd, trg_bwd], dim=0)
        assert logits.size(0) == targs.size(0), "Number of logits and targets differ!"
        out["loss"] = F.cross_entropy(logits, targs, ignore_index=pad_idx)
        task.scorer1(out["loss"].item())
        if predict:
            pass
        return out
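A sketch with toy dimensions (not the real BiLMEncoder output) of how the encoder output is split into forward/backward halves and how both directions are scored by one projection:

import torch
import torch.nn as nn

# Toy bidirectional encoder output: (batch, time, 2 * d_hid), here d_hid = 3.
b_size, seq_len, d_hid, toy_vocab = 2, 4, 3, 11
sent = torch.randn(b_size, seq_len, 2 * d_hid)

split = sent.size(2) // 2
fwd, bwd = sent[:, :, :split], sent[:, :, split:split * 2]

# A shared projection maps each direction to vocab logits; concatenating along
# the batch dimension lets a single cross_entropy call score both directions.
hid2voc = nn.Linear(d_hid, toy_vocab)
logits = torch.cat(
    [hid2voc(fwd).view(b_size * seq_len, -1),
     hid2voc(bwd).view(b_size * seq_len, -1)],
    dim=0)
print(logits.shape)  # torch.Size([16, 11])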
Example #5
def load_model_for_target_train_run(args, ckpt_path, model, strict, task,
                                    cuda_devices):
    """
        Function that reloads model if necessary and extracts trainable parts
        of the model in preparation for target_task training.
        It only reloads model after the first task is trained.

        Parameters
        -------------------
        args: config.Param object,
        ckpt_path: str: path to reload model from,
        model: MultiTaskModel object,
        strict: bool,
        task: Task object,
        cuda_devices: int device id, or list of ids for multi-GPU DataParallel

        Returns
        -------------------
        to_train: List of tuples of (name, weight) of trainable parameters

    """

    if args.transfer_paradigm == "finetune":
        load_model_state(model,
                         ckpt_path,
                         cuda_devices,
                         skip_task_models=[task.name],
                         strict=strict)
        # Train both the task specific models as well as sentence encoder.
        to_train = [(n, p) for n, p in model.named_parameters()
                    if p.requires_grad]
    else:  # args.transfer_paradigm == "frozen":
        # elmo_scalars will be empty if args.input_module != "elmo"; scalar_mix_0 always
        # holds the pretraining scalars.
        elmo_scalars = [(n, p) for n, p in model.named_parameters()
                        if "scalar_mix" in n and "scalar_mix_0" not in n]
        # Fails when sep_embs_for_skip is 0 and elmo_scalars has nonzero
        # length.
        assert_for_log(
            not elmo_scalars or args.sep_embs_for_skip,
            "Error: ELMo scalars loaded and will be updated in do_target_task_training but "
            "they should not be updated! Check sep_embs_for_skip flag or make an issue.",
        )
        # Only train task-specific module

        pred_module = get_model_attribute(model, "%s_mdl" % task.name,
                                          cuda_devices)
        to_train = [(n, p) for n, p in pred_module.named_parameters()
                    if p.requires_grad]
        to_train += elmo_scalars
    model = model.cuda() if uses_cuda(cuda_devices) else model
    if isinstance(cuda_devices, list):
        model = nn.DataParallel(model, device_ids=cuda_devices)
    return to_train
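A rough sketch of the frozen-vs-finetune parameter selection, using a toy nn.ModuleDict in place of MultiTaskModel; the "_mdl" naming mirrors the code above, everything else is illustrative:

import torch.nn as nn

# Toy model with a shared encoder and one task-specific head.
model = nn.ModuleDict({
    "sent_encoder": nn.Linear(8, 8),
    "mytask_mdl": nn.Linear(8, 3),
})

def trainable_params(model, task_name, transfer_paradigm):
    if transfer_paradigm == "finetune":
        # Encoder and task head both receive gradient updates.
        return [(n, p) for n, p in model.named_parameters() if p.requires_grad]
    # "frozen": only the task-specific head is updated.
    head = model["%s_mdl" % task_name]
    return [(n, p) for n, p in head.named_parameters() if p.requires_grad]

print([n for n, _ in trainable_params(model, "mytask", "frozen")])  # ['weight', 'bias']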
Example #6
def get_best_checkpoint_path(args, phase, task_name=None):
    """ Look in run_dir for model checkpoint to load when setting up for
    phase = target_train or phase = eval.
    Hierarchy is:
        If phase == target_train:
            1) user-specified target task checkpoint
            2) best task-specific checkpoint from pretraining stage
        If phase == eval:
            1) user-specified eval checkpoint
            2) best task-specific checkpoint for target_train, used when evaluating
            3) best pretraining checkpoint
    If all these fail, then we default to None.
    """
    checkpoint = []
    if phase == "target_train":
        if args.load_target_train_checkpoint not in ("none", ""):
            checkpoint = glob.glob(args.load_target_train_checkpoint)
            assert len(checkpoint) > 0, (
                "Specified load_target_train_checkpoint not found: %r" %
                args.load_target_train_checkpoint)
        else:
            checkpoint = glob.glob(
                os.path.join(args.run_dir,
                             "model_state_pretrain_val_*.best.th"))
    if phase == "eval":
        # In other words, if we should load_eval_checkpoint
        if args.load_eval_checkpoint not in ("none", ""):
            checkpoint = glob.glob(args.load_eval_checkpoint)
            assert len(checkpoint) > 0, (
                "Specified load_eval_checkpoint not found: %r" %
                args.load_eval_checkpoint)
        else:
            # Get the best checkpoint from the target_train phase to evaluate on.
            assert task_name is not None, "Specify a task checkpoint to evaluate from."
            checkpoint = glob.glob(
                os.path.join(args.run_dir, task_name,
                             "model_state_target_train_val_*.best.th"))
            # NOTE: This is the path we usually take: load_eval_checkpoint is "none" and the
            # target_train glob above found nothing (len(checkpoint) == 0).
            if len(checkpoint) == 0:
                checkpoint = glob.glob(
                    os.path.join(args.run_dir,
                                 "model_state_pretrain_val_*.best.th"))

    if len(checkpoint) > 0:
        assert_for_log(
            len(checkpoint) == 1,
            "Too many best checkpoints. Something is wrong.")
        return checkpoint[0]
    return None
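A self-contained sketch of the glob-based fallback to the best pretraining checkpoint, using a temporary directory; the file name follows the pattern matched above:

import glob
import os
import tempfile

# Toy run directory containing only a pretraining checkpoint.
run_dir = tempfile.mkdtemp()
open(os.path.join(run_dir, "model_state_pretrain_val_10.best.th"), "w").close()

def best_pretrain_checkpoint(run_dir):
    # Same glob pattern the function above falls back to.
    hits = glob.glob(os.path.join(run_dir, "model_state_pretrain_val_*.best.th"))
    assert len(hits) <= 1, "Too many best checkpoints. Something is wrong."
    return hits[0] if hits else None

print(best_pretrain_checkpoint(run_dir))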
Example #7
def check_arg_name(args: config.Params):
    """Check for obsolete params in config, throw exceptions if obsolete params are found.

    Parameters
    ----------
    args: config.Params
        config map

    Raises
    ------
    AssertionError
        If obsolete parameter names are present in config

    """
    # Mapping - key: old name, value: new name
    name_dict = {
        "task_patience": "lr_patience",
        "do_train": "do_pretrain",
        "train_for_eval": "do_target_task_training",
        "do_eval": "do_full_eval",
        "train_tasks": "pretrain_tasks",
        "eval_tasks": "target_tasks",
        "eval_data_fraction": "target_train_data_fraction",
        "eval_val_interval": "target_train_val_interval",
        "eval_max_vals": "target_train_max_vals",
        "eval_data_fraction": "target_train_data_fraction",
    }
    for task in task_modules.ALL_GLUE_TASKS + task_modules.ALL_SUPERGLUE_TASKS:
        assert_for_log(
            not args.regex_contains("^{}_".format(task)),
            "Error: Attempting to load old task-specific args for task %s, please refer to the "
            "master branch's default configs for the most recent task specific argument "
            "structures." % task,
        )
    for old_name, new_name in name_dict.items():
        assert_for_log(
            old_name not in args,
            "Error: Attempting to load old arg name %s, please update to new name %s."
            % (old_name, name_dict[old_name]),
        )
    old_input_module_vals = [
        "elmo",
        "elmo_chars_only",
        "bert_model_name",
        "openai_transformer",
        "word_embs",
    ]
    for input_type in old_input_module_vals:
        assert_for_log(
            input_type not in args,
            "Error: Attempting to load old arg name %s, please use input_module config "
            "parameter and refer to master branch's default configs for current way to specify %s."
            % (input_type, input_type),
        )
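A stripped-down sketch of the same rename check run against a plain dict instead of config.Params (the dict entries and task names are illustrative):

# Mapping - key: old name, value: new name (subset of the table above).
name_dict = {"train_tasks": "pretrain_tasks", "eval_tasks": "target_tasks"}

def check_for_old_names(args: dict):
    for old_name, new_name in name_dict.items():
        assert old_name not in args, (
            "Attempting to load old arg name %s, please update to new name %s."
            % (old_name, new_name))

check_for_old_names({"pretrain_tasks": "sst", "target_tasks": "rte"})  # passes
# check_for_old_names({"train_tasks": "sst"})  # would raise AssertionError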
Example #8
def build_tasks(args):
    """Main logic for preparing tasks, doing so by
    1) creating / loading the tasks
    2) building / loading the vocabulary
    3) building / loading the word vectors
    4) indexing each task's data
    5) initializing lazy loaders (streaming iterators)
    """

    # 1) create / load tasks
    tasks, pretrain_task_names, target_task_names = get_tasks(args)
    for task in tasks:
        task_classifier = config.get_task_attr(args, task.name,
                                               "use_classifier")
        setattr(task, "_classifier_name",
                task_classifier if task_classifier else task.name)

    tokenizer_names = {task.name: task.tokenizer_name for task in tasks}
    assert len(set(tokenizer_names.values())) == 1, (
        f"Error: mixing tasks with different tokenizers!"
        " Tokenizations: {tokenizer_names:s}")

    # 2) build / load vocab and indexers
    indexers = build_indexers(args)

    vocab_path = os.path.join(args.exp_dir, "vocab")
    if args.reload_vocab or not os.path.exists(vocab_path):
        _build_vocab(args, tasks, vocab_path)

    # Always load vocab from file.
    vocab = Vocabulary.from_files(vocab_path)
    log.info("\tLoaded vocab from %s", vocab_path)

    for namespace, mapping in vocab._index_to_token.items():
        log.info("\tVocab namespace %s: size %d", namespace, len(mapping))
    log.info("\tFinished building vocab.")
    args.max_word_v_size = vocab.get_vocab_size("tokens")
    args.max_char_v_size = vocab.get_vocab_size("chars")

    # 3) build / load word vectors
    word_embs = None
    if args.input_module in ["glove", "fastText"]:
        emb_file = os.path.join(args.exp_dir, "embs.pkl")
        if args.reload_vocab or not os.path.exists(emb_file):
            word_embs = _build_embeddings(args, vocab, emb_file)
        else:  # load from file
            word_embs = pkl.load(open(emb_file, "rb"))
        log.info("Trimmed word embeddings: %s", str(word_embs.size()))

    # 4) Index tasks using vocab (if preprocessed copy not available).
    preproc_dir = os.path.join(args.exp_dir, "preproc")
    utils.maybe_make_dir(preproc_dir)
    reindex_tasks = parse_task_list_arg(args.reindex_tasks)
    utils.assert_for_log(
        not (args.reload_indexing and not reindex_tasks),
        'Flag reload_indexing was set, but no tasks are set to reindex (use -o "args.reindex_tasks'
        ' = "task1,task2,..."")',
    )

    # Set up boundary_token_fn, which applies SOS/EOS/SEP/CLS delimiters
    if args.input_module.startswith("bert"):
        from jiant.pytorch_transformers_interface.modules import BertEmbedderModule

        boundary_token_fn = BertEmbedderModule.apply_boundary_tokens
    elif args.input_module.startswith("xlnet"):
        from jiant.pytorch_transformers_interface.modules import XLNetEmbedderModule

        boundary_token_fn = XLNetEmbedderModule.apply_boundary_tokens
    else:
        boundary_token_fn = utils.apply_standard_boundary_tokens

    for task in tasks:
        force_reindex = args.reload_indexing and task.name in reindex_tasks
        for split in ALL_SPLITS:
            log_prefix = "\tTask '%s', split '%s'" % (task.name, split)
            relative_path = _get_serialized_record_path(
                task.name, split, "preproc")
            cache_found = _find_cached_file(args.exp_dir,
                                            args.global_ro_exp_dir,
                                            relative_path,
                                            log_prefix=log_prefix)
            if force_reindex or not cache_found:
                # Re-index from scratch.
                record_file = _get_serialized_record_path(
                    task.name, split, preproc_dir)
                if os.path.exists(record_file) and os.path.islink(record_file):
                    os.remove(record_file)

                _index_split(task, split, indexers, vocab, record_file,
                             boundary_token_fn)

        # Delete in-memory data - we'll lazy-load from disk later.
        # TODO: delete task.{split}_data_text as well?
        task.train_data = None
        task.val_data = None
        task.test_data = None

    log.info("\tFinished indexing tasks")

    # 5) Initialize tasks with data iterators.
    pretrain_tasks = []
    target_tasks = []
    for task in tasks:
        # Replace lists of instances with lazy generators from disk.
        task.val_data = _get_instance_generator(task.name, "val", preproc_dir)
        task.test_data = _get_instance_generator(task.name, "test",
                                                 preproc_dir)
        # When using pretrain_data_fraction, we need modified iterators for use
        # only on training datasets at pretraining time.
        if task.name in pretrain_task_names:
            log.info("\tCreating trimmed pretraining-only version of " +
                     task.name + " train.")
            task.train_data = _get_instance_generator(
                task.name,
                "train",
                preproc_dir,
                fraction=args.pretrain_data_fraction)
            pretrain_tasks.append(task)
        # When using target_train_data_fraction, we need modified iterators
        # only for training datasets at do_target_task_training time.
        if task.name in target_task_names:
            log.info("\tCreating trimmed target-only version of " + task.name +
                     " train.")
            task.train_data = _get_instance_generator(
                task.name,
                "train",
                preproc_dir,
                fraction=args.target_train_data_fraction)
            target_tasks.append(task)

    log.info("\t  Training on %s", ", ".join(pretrain_task_names))
    log.info("\t  Evaluating on %s", ", ".join(target_task_names))
    return pretrain_tasks, target_tasks, vocab, word_embs
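A small sketch of the boundary_token_fn selection above; the token functions here are illustrative stand-ins, not the real jiant helpers:

def apply_bert_boundary_tokens(s1):
    # BERT-style delimiters.
    return ["[CLS]"] + s1 + ["[SEP]"]

def apply_standard_boundary_tokens(s1):
    # Default SOS/EOS delimiters.
    return ["<SOS>"] + s1 + ["<EOS>"]

def pick_boundary_token_fn(input_module):
    if input_module.startswith("bert"):
        return apply_bert_boundary_tokens
    return apply_standard_boundary_tokens

fn = pick_boundary_token_fn("bert-base-uncased")
print(fn(["a", "short", "sentence"]))  # ['[CLS]', 'a', 'short', 'sentence', '[SEP]']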
Example #9
def check_configurations(args, pretrain_tasks, target_tasks):
    """
    Checks the configuration for obvious logical flaws
    and verifies that the necessary parameters are set for each step;
    raises an assertion error and exits if a problem is found.

    Parameters
    ----------------
    args: Params object
    pretrain_tasks: list of pretraining Task objects
    target_tasks: list of target task training Task objects

    Returns
    ----------------
    None
    """
    steps_log = io.StringIO()
    if any([t.val_metric_decreases for t in pretrain_tasks]) and any(
        [not t.val_metric_decreases for t in pretrain_tasks]):
        log.warn(
            "\tMixing training tasks with increasing and decreasing val metrics!"
        )

    if args.load_target_train_checkpoint != "none":
        assert_for_log(
            not args.do_pretrain,
            "Error: Attempting to train a model and then replace that model with one from "
            "a checkpoint.",
        )
        steps_log.write("Loading model from path: %s \n" %
                        args.load_target_train_checkpoint)

    assert_for_log(
        args.transfer_paradigm in ["finetune", "frozen"],
        "Transfer paradigm %s not supported!" % args.transfer_paradigm,
    )

    if args.do_pretrain:
        assert_for_log(
            args.pretrain_tasks != "none",
            "Error: Must specify at least one pretraining task: [%s]" %
            args.pretrain_tasks,
        )
        steps_log.write("Training model on tasks: %s \n" % args.pretrain_tasks)

    if args.do_target_task_training:
        assert_for_log(
            args.target_tasks != "none",
            "Error: Must specify at least one target task: [%s]" %
            args.target_tasks,
        )
        steps_log.write("Re-training model for individual target tasks \n")
        assert_for_log(
            len(set(pretrain_tasks).intersection(target_tasks)) == 0
            or args.allow_reuse_of_pretraining_parameters
            or args.do_pretrain == 0,
            "If you're pretraining on a task you plan to reuse as a target task, set\n"
            "allow_reuse_of_pretraining_parameters = 1 (risky), or train in two steps:\n"
            "train with do_pretrain = 1, do_target_task_training = 0, stop, and restart with\n"
            "do_pretrain = 0 and do_target_task_training = 1.",
        )
    if args.do_full_eval:
        assert_for_log(
            args.target_tasks != "none",
            "Error: Must specify at least one target task: [%s]" %
            args.target_tasks,
        )
        if not args.do_target_task_training:
            untrained_tasks = set(
                config.get_task_attr(
                    args, task.name, "use_classifier", default=task.name)
                for task in target_tasks)
            if args.do_pretrain:
                untrained_tasks -= set(
                    config.get_task_attr(
                        args, task.name, "use_classifier", default=task.name)
                    for task in pretrain_tasks)
            if len(untrained_tasks) > 0:
                assert (
                    args.load_model
                    or args.load_target_train_checkpoint not in ["none", ""]
                    or args.allow_untrained_encoder_parameters
                ), f"Evaluating a target task model on tasks {untrained_tasks} "
                "without training it on this run or loading a checkpoint. "
                "Set `allow_untrained_encoder_parameters` if you really want to use "
                "an untrained task model."
                log.warning(
                    f"Evauluating a target task model on tasks {untrained_tasks} without training "
                    "it in this run. It's up to you to ensure that you are loading parameters "
                    "that were sufficiently trained for this task.")
        steps_log.write("Evaluating model on tasks: %s \n" % args.target_tasks)

    log.info("Will run the following steps for this experiment:\n%s",
             steps_log.getvalue())
    steps_log.close()
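A plain-set sketch of the untrained-classifier bookkeeping above: a target task only counts as untrained if its classifier was not also trained during pretraining (names are illustrative):

target_classifiers = {"rte", "sst", "shared_clf"}
pretrain_classifiers = {"shared_clf"}

untrained = target_classifiers - pretrain_classifiers
print(untrained)  # {'rte', 'sst'} (order may vary) -> needs a checkpoint to evaluate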
Example #10
def build_tasks(
    args: config.Params, cuda_device: Any
) -> (List[Task], List[Task], Vocabulary, Union[np.ndarray, float]):
    """Main logic for preparing tasks:

    1. create or load the tasks
    2. configure classifiers for tasks
    3. set up indexers
    4. build and save vocab to disk
    5. load vocab from disk
    6. if specified, load word embeddings
    7. set up ModelPreprocessingInterface (MPI) to handle model-specific preprocessing
    8. index tasks using vocab and task-specific MPI, save to disk.
    9. return: task data lazy-loaders in phase-specific lists w/ vocab, and word embeddings

    Parameters
    ----------
    args : Params
        config map

    Returns
    -------
    List[Task]
        list of pretrain Tasks.
    List[Task]
        list of target Tasks.
    allennlp.data.Vocabulary
        vocabulary from task data.
    Union[np.ndarray, float]
        Word embeddings.

    """
    # 1) create / load tasks
    tasks, pretrain_task_names, target_task_names = get_tasks(
        args, cuda_device)
    for task in tasks:
        task_classifier = config.get_task_attr(args, task.name,
                                               "use_classifier")
        setattr(task, "_classifier_name",
                task_classifier if task_classifier else task.name)

    tokenizer_names = {task.name: task.tokenizer_name for task in tasks}
    assert not len(set(tokenizer_names.values())) > 1, (
        f"Error: mixing tasks with different tokenizers!"
        " Tokenizations: {tokenizer_names:s}")

    # 2) build / load vocab and indexers
    indexers = build_indexers(args)

    vocab_path = os.path.join(args.exp_dir, "vocab")
    if args.reload_vocab or not os.path.exists(vocab_path):
        _build_vocab(args, tasks, vocab_path)

    # Always load vocab from file.
    vocab = Vocabulary.from_files(vocab_path)
    log.info("\tLoaded vocab from %s", vocab_path)

    for namespace, mapping in vocab._index_to_token.items():
        log.info("\tVocab namespace %s: size %d", namespace, len(mapping))
    log.info("\tFinished building vocab.")
    args.max_word_v_size = vocab.get_vocab_size("tokens")
    args.max_char_v_size = vocab.get_vocab_size("chars")

    # 3) build / load word vectors
    word_embs = None
    if args.input_module in ["glove", "fastText"]:
        emb_file = os.path.join(args.exp_dir, "embs.pkl")
        if args.reload_vocab or not os.path.exists(emb_file):
            word_embs = _build_embeddings(args, vocab, emb_file)
        else:  # load from file
            word_embs = pkl.load(open(emb_file, "rb"))
        log.info("Trimmed word embeddings: %s", str(word_embs.size()))

    # 4) Set up model_preprocessing_interface
    model_preprocessing_interface = ModelPreprocessingInterface(args)

    # 5) Index tasks using vocab (if preprocessed copy not available).
    preproc_dir = os.path.join(args.exp_dir, "preproc")
    utils.maybe_make_dir(preproc_dir)
    reindex_tasks = parse_task_list_arg(args.reindex_tasks)
    utils.assert_for_log(
        not (args.reload_indexing and not reindex_tasks),
        'Flag reload_indexing was set, but no tasks are set to reindex (use -o "args.reindex_tasks'
        ' = "task1,task2,..."")',
    )

    for task in tasks:
        force_reindex = args.reload_indexing and task.name in reindex_tasks
        for split in ALL_SPLITS:
            log_prefix = "\tTask '%s', split '%s'" % (task.name, split)
            relative_path = _get_serialized_record_path(
                task.name, split, "preproc")
            cache_found = _find_cached_file(args.exp_dir,
                                            args.global_ro_exp_dir,
                                            relative_path,
                                            log_prefix=log_prefix)
            if force_reindex or not cache_found:
                # Re-index from scratch.
                record_file = _get_serialized_record_path(
                    task.name, split, preproc_dir)
                if os.path.exists(record_file) and os.path.islink(record_file):
                    os.remove(record_file)

                _index_split(task, split, indexers, vocab, record_file,
                             model_preprocessing_interface)

        # Delete in-memory data - we'll lazy-load from disk later.
        # TODO: delete task.{split}_data_text?

    log.info("\tFinished indexing tasks")

    # 6) Initialize tasks with data iterators.
    pretrain_tasks = []
    target_tasks = []
    for task in tasks:
        # Replace lists of instances with lazy generators from disk.
        task.set_instance_iterable(
            split_name="val",
            instance_iterable=_get_instance_generator(task.name, "val",
                                                      preproc_dir),
        )
        task.set_instance_iterable(
            split_name="test",
            instance_iterable=_get_instance_generator(task.name, "test",
                                                      preproc_dir),
        )
        # When using pretrain_data_fraction, we need modified iterators for use
        # only on training datasets at pretraining time.
        if task.name in pretrain_task_names:
            log.info("\tCreating trimmed pretraining-only version of " +
                     task.name + " train.")
            task.set_instance_iterable(
                split_name="train",
                instance_iterable=_get_instance_generator(
                    task.name,
                    "train",
                    preproc_dir,
                    fraction=args.pretrain_data_fraction),
                phase="pretrain",
            )
            pretrain_tasks.append(task)
        # When using target_train_data_fraction, we need modified iterators
        # only for training datasets at do_target_task_training time.
        if task.name in target_task_names:
            log.info("\tCreating trimmed target-only version of " + task.name +
                     " train.")
            task.set_instance_iterable(
                split_name="train",
                instance_iterable=_get_instance_generator(
                    task.name,
                    "train",
                    preproc_dir,
                    fraction=args.target_train_data_fraction),
                phase="target_train",
            )
            target_tasks.append(task)

    log.info("\t  Training on %s", ", ".join(pretrain_task_names))
    log.info("\t  Evaluating on %s", ", ".join(target_task_names))
    return pretrain_tasks, target_tasks, vocab, word_embs
Example #11
    def forward(self, sent, task, reset=True):
        # pylint: disable=arguments-differ
        """
        Args:
            - sent (Dict[str, torch.LongTensor]): From a ``TextField``.
            - task (Task): Used by the _text_field_embedder to pick the correct output
                           ELMo representation.
            - reset (Bool): if True, manually reset the states of any ELMo LSTMs present
                (if using BiLM or ELMo embeddings). Set to False to preserve statefulness.
        Returns:
            - sent_enc (torch.FloatTensor): (b_size, seq_len, d_emb)
                the padded values in sent_enc are set to 0
            - sent_mask (torch.FloatTensor): (b_size, seq_len, 1); all 0/1s
        """
        if reset:
            self.reset_states()

        # General sentence embeddings (for sentence encoder).
        # Skip this for probing runs that don't need it.
        if not isinstance(self._phrase_layer, NullPhraseLayer):
            word_embs_in_context = self._highway_layer(
                self._text_field_embedder(sent))
        else:
            word_embs_in_context = None

        # Task-specific sentence embeddings (e.g. custom ELMo weights).
        # Skip computing this if it won't be used.
        if self.sep_embs_for_skip:
            task_word_embs_in_context = self._highway_layer(
                self._text_field_embedder(sent, task._classifier_name))
        else:
            task_word_embs_in_context = None

        # Make sure we're embedding /something/
        assert (word_embs_in_context is not None) or (task_word_embs_in_context
                                                      is not None)

        if self._cove_layer is not None:
            # Slightly wasteful as this repeats the GloVe lookup internally,
            # but this allows CoVe to be used alongside other embedding models
            # if we want to.
            sent_lens = torch.ne(sent["words"],
                                 self.pad_idx).long().sum(dim=-1).data
            # CoVe doesn't use <SOS> or <EOS>, so strip these before running.
            # Note that we need to also drop the last column so that CoVe returns
            # the right shape. If all inputs have <EOS> then this will be the
            # only thing clipped.
            sent_cove_embs_raw = self._cove_layer(sent["words"][:, 1:-1],
                                                  sent_lens - 2)
            pad_col = torch.zeros(
                sent_cove_embs_raw.size()[0],
                1,
                sent_cove_embs_raw.size()[2],
                dtype=sent_cove_embs_raw.dtype,
                device=sent_cove_embs_raw.device,
            )
            sent_cove_embs = torch.cat([pad_col, sent_cove_embs_raw, pad_col],
                                       dim=1)
            if word_embs_in_context is not None:
                word_embs_in_context = torch.cat(
                    [word_embs_in_context, sent_cove_embs], dim=-1)
            if task_word_embs_in_context is not None:
                task_word_embs_in_context = torch.cat(
                    [task_word_embs_in_context, sent_cove_embs], dim=-1)

        if word_embs_in_context is not None:
            word_embs_in_context = self._dropout(word_embs_in_context)
        if task_word_embs_in_context is not None:
            task_word_embs_in_context = self._dropout(
                task_word_embs_in_context)

        # The rest of the model
        sent_mask = util.get_text_field_mask(sent).float()
        sent_lstm_mask = sent_mask if self._mask_lstms else None
        if word_embs_in_context is not None:
            if isinstance(self._phrase_layer, ONLSTMStack) or isinstance(
                    self._phrase_layer, PRPN):
                # The ONLSTMStack or PRPN takes the raw words as input and computes
                # embeddings separately.
                sent_enc, _ = self._phrase_layer(
                    torch.transpose(sent["words"], 0, 1), sent_lstm_mask)
                sent_enc = torch.transpose(sent_enc, 0, 1)
            else:
                sent_enc = self._phrase_layer(word_embs_in_context,
                                              sent_lstm_mask)
        else:
            sent_enc = None

        # ELMoLSTM returns all layers, we just want to use the top layer
        sent_enc = sent_enc[-1] if isinstance(self._phrase_layer,
                                              BiLMEncoder) else sent_enc
        sent_enc = self._dropout(
            sent_enc) if sent_enc is not None else sent_enc
        if self.skip_embs:
            # Use skip connection with original sentence embs or task sentence
            # embs
            skip_vec = task_word_embs_in_context if self.sep_embs_for_skip else word_embs_in_context
            utils.assert_for_log(
                skip_vec is not None,
                "skip_vec is none - perhaps embeddings are not configured properly?",
            )
            if isinstance(self._phrase_layer, NullPhraseLayer):
                sent_enc = skip_vec
            else:
                sent_enc = torch.cat([sent_enc, skip_vec], dim=-1)

        sent_mask = sent_mask.unsqueeze(dim=-1)
        pad_mask = sent_mask == 0

        assert sent_enc is not None
        sent_enc = sent_enc.masked_fill(pad_mask, 0)
        return sent_enc, sent_mask
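A toy-tensor sketch of the CoVe re-padding trick above: the <SOS>/<EOS> columns are stripped before CoVe runs, so a zero column is concatenated back on each side to restore the original sequence length:

import torch

# Toy shapes: (batch, seq_len, dim) where seq_len includes <SOS>/<EOS>.
b_size, seq_len, d = 2, 6, 4
cove_out = torch.randn(b_size, seq_len - 2, d)   # CoVe saw only tokens [1:-1]

pad_col = torch.zeros(b_size, 1, d, dtype=cove_out.dtype)
restored = torch.cat([pad_col, cove_out, pad_col], dim=1)
print(restored.shape)  # torch.Size([2, 6, 4]) - lines up with the word embeddings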
Example #12
def build_sent_encoder(args, vocab, d_emb, tasks, embedder, cove_layer):
    # Build single sentence encoder: the main component of interest
    # Need special handling for language modeling
    # Note: sent_enc is expected to apply dropout to its input _and_ output if
    # needed.
    rnn_params = Params(
        {
            "input_size": d_emb,
            "bidirectional": True,
            "hidden_size": args.d_hid,
            "num_layers": args.n_layers_enc,
        }
    )
    if args.sent_enc == "onlstm":
        onlayer = ONLSTMPhraseLayer(
            vocab,
            args.d_word,
            args.d_hid,
            args.n_layers_enc,
            args.onlstm_chunk_size,
            args.onlstm_dropconnect,
            args.onlstm_dropouti,
            args.dropout,
            args.onlstm_dropouth,
            embedder,
            args.batch_size,
        )
        # The 'onlayer' acts as a phrase layer module for the larger SentenceEncoder module.
        sent_encoder = SentenceEncoder(
            vocab,
            embedder,
            args.n_layers_highway,
            onlayer.onlayer,
            skip_embs=args.skip_embs,
            dropout=args.dropout,
            sep_embs_for_skip=args.sep_embs_for_skip,
            cove_layer=cove_layer,
        )
        d_sent = args.d_word
        log.info("Using ON-LSTM sentence encoder!")
    elif args.sent_enc == "prpn":
        prpnlayer = PRPNPhraseLayer(
            vocab,
            args.d_word,
            args.d_hid,
            args.n_layers_enc,
            args.n_slots,
            args.n_lookback,
            args.resolution,
            args.dropout,
            args.idropout,
            args.rdropout,
            args.res,
            embedder,
            args.batch_size,
        )
        # The 'prpn' acts as a phrase layer module for the larger SentenceEncoder module.
        sent_encoder = SentenceEncoder(
            vocab,
            embedder,
            args.n_layers_highway,
            prpnlayer.prpnlayer,
            skip_embs=args.skip_embs,
            dropout=args.dropout,
            sep_embs_for_skip=args.sep_embs_for_skip,
            cove_layer=cove_layer,
        )
        d_sent = args.d_word
        log.info("Using PRPN sentence encoder!")
    elif any(isinstance(task, LanguageModelingTask) for task in tasks) or args.sent_enc == "bilm":
        assert_for_log(args.sent_enc in ["rnn", "bilm"], "Only RNNLM supported!")
        assert_for_log(
            args.input_module != "elmo" and not args.input_module.startswith("bert"),
            "LM with full ELMo and BERT not supported",
        )
        bilm = BiLMEncoder(d_emb, args.d_hid, args.d_hid, args.n_layers_enc)
        sent_encoder = SentenceEncoder(
            vocab,
            embedder,
            args.n_layers_highway,
            bilm,
            skip_embs=args.skip_embs,
            dropout=args.dropout,
            sep_embs_for_skip=args.sep_embs_for_skip,
            cove_layer=cove_layer,
        )
        d_sent = 2 * args.d_hid
    elif args.sent_enc == "bow":
        sent_encoder = BoWSentEncoder(vocab, embedder)
        assert_for_log(
            not args.skip_embs, "Skip connection not currently supported with `bow` encoder."
        )
        d_sent = d_emb
    elif args.sent_enc == "rnn":
        sent_rnn = s2s_e.by_name("lstm").from_params(copy.deepcopy(rnn_params))
        sent_encoder = SentenceEncoder(
            vocab,
            embedder,
            args.n_layers_highway,
            sent_rnn,
            skip_embs=args.skip_embs,
            dropout=args.dropout,
            sep_embs_for_skip=args.sep_embs_for_skip,
            cove_layer=cove_layer,
        )
        d_sent = 2 * args.d_hid
    elif args.sent_enc == "none":
        # Expose word representation layer (GloVe, ELMo, etc.) directly.
        assert_for_log(args.skip_embs, f"skip_embs must be set for " "'{args.sent_enc}' encoder")
        phrase_layer = NullPhraseLayer(rnn_params["input_size"])
        sent_encoder = SentenceEncoder(
            vocab,
            embedder,
            args.n_layers_highway,
            phrase_layer,
            skip_embs=args.skip_embs,
            dropout=args.dropout,
            sep_embs_for_skip=args.sep_embs_for_skip,
            cove_layer=cove_layer,
        )
        d_sent = 0  # skip connection added below
        log.info("No shared encoder (just using word embeddings)!")
    else:
        assert_for_log(False, "No valid sentence encoder specified.")
    return sent_encoder, d_sent
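A compact sketch summarizing how the returned d_sent depends on the encoder choice in the branches above (the default sizes are illustrative):

def expected_d_sent(sent_enc, d_emb=300, d_hid=512, d_word=300):
    # Values taken from the branches of build_sent_encoder above.
    return {
        "onlstm": d_word,
        "prpn": d_word,
        "bilm": 2 * d_hid,
        "rnn": 2 * d_hid,
        "bow": d_emb,
        "none": 0,      # the skip connection supplies the representation
    }[sent_enc]

print(expected_d_sent("rnn"))   # 1024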
Example #13
def build_embeddings(args, vocab, tasks, pretrained_embs=None):
    """ Build embeddings according to options in args """
    d_emb, d_char = 0, args.d_char

    token_embedders = {}
    # Word embeddings
    n_token_vocab = vocab.get_vocab_size("tokens")
    if args.input_module in ["glove", "fastText"] and pretrained_embs is not None:
        word_embs = pretrained_embs
        assert word_embs.size()[0] == n_token_vocab
        d_word = word_embs.size()[1]
        log.info("\tUsing pre-trained word embeddings: %s", str(word_embs.size()))
    elif args.input_module == "scratch":
        log.info("\tTraining word embeddings from scratch.")
        d_word = args.d_word
        word_embs = nn.Embedding(n_token_vocab, d_word).weight
    else:
        assert args.input_module.startswith("bert") or args.input_module in [
            "gpt",
            "elmo",
            "elmo-chars-only",
        ], "You do not have a valid value for input_module."
        embeddings = None
        word_embs = None

    if word_embs is not None:
        embeddings = Embedding(
            num_embeddings=n_token_vocab,
            embedding_dim=d_word,
            weight=word_embs,
            trainable=(args.embeddings_train == 1),
            padding_index=vocab.get_token_index("@@PADDING@@"),
        )
        token_embedders["words"] = embeddings
        d_emb += d_word

    # Handle cove
    cove_layer = None
    if args.cove:
        assert embeddings is not None
        assert args.input_module == "glove", "CoVe requires GloVe embeddings."
        assert d_word == 300, "CoVe expects 300-dimensional GloVe embeddings."
        try:
            from jiant.modules.cove.cove import MTLSTM as cove_lstm

            # Have CoVe do an internal GloVe lookup, but don't add residual.
            # We'll do this manually in modules.py; see
            # SentenceEncoder.forward().
            cove_layer = cove_lstm(n_vocab=n_token_vocab, vectors=embeddings.weight.data)
            # Control whether CoVe is trainable.
            for param in cove_layer.parameters():
                param.requires_grad = bool(args.cove_fine_tune)
            d_emb += 600  # 300 x 2 for biLSTM activations
            log.info("\tUsing CoVe embeddings!")
        except ImportError as e:
            log.info("Failed to import CoVe!")
            raise e

    # Character embeddings
    if args.char_embs:
        log.info("\tUsing character embeddings!")
        char_embeddings = Embedding(vocab.get_vocab_size("chars"), d_char)
        filter_sizes = tuple([int(i) for i in args.char_filter_sizes.split(",")])
        char_encoder = CnnEncoder(
            d_char,
            num_filters=args.n_char_filters,
            ngram_filter_sizes=filter_sizes,
            output_dim=d_char,
        )
        char_embedder = TokenCharactersEncoder(
            char_embeddings, char_encoder, dropout=args.dropout_embs
        )
        d_emb += d_char
        token_embedders["chars"] = char_embedder
    else:
        log.info("\tNot using character embeddings!")

    # If we want separate ELMo scalar weights (a different ELMo representation for each
    # classifier), then we need to count and reliably map each classifier to an index used
    # by allennlp's internal ELMo.
    if args.sep_embs_for_skip:
        # Determine a deterministic list of classifier names to use for each
        # task.
        classifiers = sorted(set(map(lambda x: x._classifier_name, tasks)))
        # Reload existing classifier map, if it exists.
        classifier_save_path = args.run_dir + "/classifier_task_map.json"
        if os.path.isfile(classifier_save_path):
            loaded_classifiers = json.load(open(classifier_save_path, "r"))
        else:
            # No file exists, so assuming we are just starting to pretrain. If pretrain is to be
            # skipped, then there's a way to bypass this assertion by explicitly allowing for
            # a missing classifier task map.
            assert_for_log(
                args.do_pretrain or args.allow_missing_task_map,
                "Error: {} should already exist.".format(classifier_save_path),
            )
            if args.allow_missing_task_map:
                log.warning(
                    "Warning: classifier task map not found in model"
                    " directory. Creating a new one from scratch."
                )
            # default is always @pretrain@
            loaded_classifiers = {"@pretrain@": 0}
        # Add the new tasks and update map, keeping the internal ELMo index
        # consistent.
        max_number_classifiers = max(loaded_classifiers.values())
        offset = 1
        for classifier in classifiers:
            if classifier not in loaded_classifiers:
                loaded_classifiers[classifier] = max_number_classifiers + offset
                offset += 1
        log.info("Classifiers:{}".format(loaded_classifiers))
        open(classifier_save_path, "w+").write(json.dumps(loaded_classifiers))
        # Every index in classifiers needs to correspond to a valid ELMo output
        # representation.
        num_reps = 1 + max(loaded_classifiers.values())
    else:
        # All tasks share the same scalars.
        # Not used if input_module = elmo-chars-only (i.e. no elmo)
        loaded_classifiers = {"@pretrain@": 0}
        num_reps = 1
    if args.input_module.startswith("elmo"):
        log.info("Loading ELMo from files:")
        log.info("ELMO_OPT_PATH = %s", ELMO_OPT_PATH)
        if args.input_module == "elmo-chars-only":
            log.info("\tUsing ELMo character CNN only!")
            log.info("ELMO_WEIGHTS_PATH = %s", ELMO_WEIGHTS_PATH)
            elmo_embedder = ElmoCharacterEncoder(
                options_file=ELMO_OPT_PATH, weight_file=ELMO_WEIGHTS_PATH, requires_grad=False
            )
            d_emb += 512
        else:
            log.info("\tUsing full ELMo! (separate scalars/task)")
            if args.elmo_weight_file_path != "none":
                assert os.path.exists(args.elmo_weight_file_path), (
                    'ELMo weight file path "' + args.elmo_weight_file_path + '" does not exist.'
                )
                weight_file = args.elmo_weight_file_path
            else:
                weight_file = ELMO_WEIGHTS_PATH
            log.info("ELMO_WEIGHTS_PATH = %s", weight_file)
            elmo_embedder = ElmoTokenEmbedderWrapper(
                options_file=ELMO_OPT_PATH,
                weight_file=weight_file,
                num_output_representations=num_reps,
                # Dropout is added by the sentence encoder later.
                dropout=0.0,
            )
            d_emb += 1024

        token_embedders["elmo"] = elmo_embedder

    # Wrap ELMo and the other embedders, and concatenate the resulting
    # representations along the last (vector) dimension.
    embedder = ElmoTextFieldEmbedder(
        token_embedders,
        loaded_classifiers,
        elmo_chars_only=args.input_module == "elmo-chars-only",
        sep_embs_for_skip=args.sep_embs_for_skip,
    )

    assert d_emb, "You turned off all the embeddings, ya goof!"
    return d_emb, embedder, cove_layer
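A standalone sketch of the classifier-to-scalar-index bookkeeping above, using a temporary directory and plain dicts (classifier names are illustrative):

import json
import os
import tempfile

run_dir = tempfile.mkdtemp()
path = os.path.join(run_dir, "classifier_task_map.json")

loaded = json.load(open(path)) if os.path.isfile(path) else {"@pretrain@": 0}
for clf in sorted({"rte", "sst"}):               # new classifier names this run
    if clf not in loaded:
        loaded[clf] = max(loaded.values()) + 1   # keep existing indices stable
with open(path, "w") as f:
    f.write(json.dumps(loaded))

num_reps = 1 + max(loaded.values())              # ELMo output representations needed
print(loaded, num_reps)  # {'@pretrain@': 0, 'rte': 1, 'sst': 2} 3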
Example #14
def build_sent_encoder(args, vocab, d_emb, tasks, embedder, cove_layer):
    # Build single sentence encoder: the main component of interest
    # Need special handling for language modeling
    # Note: sent_enc is expected to apply dropout to its input _and_ output if
    # needed.
    rnn_params = Params(
        {
            "input_size": d_emb,
            "bidirectional": True,
            "hidden_size": args.d_hid,
            "num_layers": args.n_layers_enc,
        }
    )
    if args.sent_enc == "onlstm":
        onlayer = ONLSTMPhraseLayer(
            vocab,
            args.d_word,
            args.d_hid,
            args.n_layers_enc,
            args.onlstm_chunk_size,
            args.onlstm_dropconnect,
            args.onlstm_dropouti,
            args.dropout,
            args.onlstm_dropouth,
            embedder,
            args.batch_size,
        )
        # The 'onlayer' acts as a phrase layer module for the larger SentenceEncoder module.
        sent_encoder = SentenceEncoder(
            vocab,
            embedder,
            args.n_layers_highway,
            onlayer.onlayer,
            skip_embs=args.skip_embs,
            dropout=args.dropout,
            sep_embs_for_skip=args.sep_embs_for_skip,
            cove_layer=cove_layer,
        )
        d_sent = args.d_word
        log.info("Using ON-LSTM sentence encoder!")
    elif args.sent_enc == "prpn":
        prpnlayer = PRPNPhraseLayer(
            vocab,
            args.d_word,
            args.d_hid,
            args.n_layers_enc,
            args.n_slots,
            args.n_lookback,
            args.resolution,
            args.dropout,
            args.idropout,
            args.rdropout,
            args.res,
            embedder,
            args.batch_size,
        )
        # The 'prpn' acts as a phrase layer module for the larger SentenceEncoder module.
        sent_encoder = SentenceEncoder(
            vocab,
            embedder,
            args.n_layers_highway,
            prpnlayer.prpnlayer,
            skip_embs=args.skip_embs,
            dropout=args.dropout,
            sep_embs_for_skip=args.sep_embs_for_skip,
            cove_layer=cove_layer,
        )
        d_sent = args.d_word
        log.info("Using PRPN sentence encoder!")
    elif any(isinstance(task, LanguageModelingTask) for task in tasks) or args.sent_enc == "bilm":
        assert_for_log(args.sent_enc in ["rnn", "bilm"], "Only RNNLM supported!")
        assert_for_log(
            not (
                args.input_module == "elmo"
                or args.input_module.startswith("bert")
                or args.input_module.startswith("xlnet")
            ),
            f"Using input_module = {args.input_module} for language modeling is probably not a "
            "good idea, since it allows the language model to use information from the right-hand "
            "context.",
        )
        bilm = BiLMEncoder(d_emb, args.d_hid, args.d_hid, args.n_layers_enc)
        sent_encoder = SentenceEncoder(
            vocab,
            embedder,
            args.n_layers_highway,
            bilm,
            skip_embs=args.skip_embs,
            dropout=args.dropout,
            sep_embs_for_skip=args.sep_embs_for_skip,
            cove_layer=cove_layer,
        )
        d_sent = 2 * args.d_hid
    elif args.sent_enc == "bow":
        sent_encoder = BoWSentEncoder(vocab, embedder)
        assert_for_log(
            not args.skip_embs, "Skip connection not currently supported with `bow` encoder."
        )
        d_sent = d_emb
    elif args.sent_enc == "rnn":
        sent_rnn = s2s_e.by_name("lstm").from_params(copy.deepcopy(rnn_params))
        sent_encoder = SentenceEncoder(
            vocab,
            embedder,
            args.n_layers_highway,
            sent_rnn,
            skip_embs=args.skip_embs,
            dropout=args.dropout,
            sep_embs_for_skip=args.sep_embs_for_skip,
            cove_layer=cove_layer,
        )
        d_sent = 2 * args.d_hid
    elif args.sent_enc == "none":
        # Expose word representation layer (GloVe, ELMo, etc.) directly.
        assert_for_log(
            args.skip_embs,
            "skip_embs is false and sent_enc is none, "
            "which means that your token representations are zero-dimensional. Consider setting skip_embs.",
        )
        phrase_layer = NullPhraseLayer(rnn_params["input_size"])
        sent_encoder = SentenceEncoder(
            vocab,
            embedder,
            args.n_layers_highway,
            phrase_layer,
            skip_embs=args.skip_embs,
            dropout=args.dropout,
            sep_embs_for_skip=args.sep_embs_for_skip,
            cove_layer=cove_layer,
        )
        d_sent = 0
    else:
        assert_for_log(
            False, f"Shared encoder layer specification `{args.sent_enc}` not recognized."
        )
    return sent_encoder, d_sent