Example #1
    def test_tied_src_trg_softmax(self):

        # test source embedding, target embedding, and softmax tying
        torch.manual_seed(self.seed)
        cfg = copy.deepcopy(self.cfg)

        cfg["model"]["decoder"]["type"] = "transformer"
        cfg["model"]["tied_embeddings"] = True
        cfg["model"]["tied_softmax"] = True
        cfg["model"]["decoder"]["embeddings"]["embedding_dim"] = 64
        cfg["model"]["encoder"]["embeddings"]["embedding_dim"] = 64

        src_vocab = trg_vocab = self.vocab
        model = build_model(cfg["model"],
                            src_vocab=src_vocab,
                            trg_vocab=trg_vocab)

        src_weight = model.src_embed.lut.weight
        trg_weight = model.trg_embed.lut.weight
        output_weight = model.decoder.output_layer.weight

        self.assertTensorEqual(src_weight, trg_weight)
        self.assertTensorEqual(src_weight, output_weight)
        self.assertEqual(src_weight.shape, trg_weight.shape)
        self.assertEqual(trg_weight.shape, output_weight.shape)

        output_weight.data.fill_(3.)
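        # 6528 = 3 * vocab_size * embedding_dim, which implies a vocabulary
        # of 34 entries here (3 * 34 * 64 = 6528)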
        self.assertEqual(output_weight.sum().item(), 6528)
        self.assertEqual(output_weight.sum().item(), src_weight.sum().item())
        self.assertEqual(output_weight.sum().item(), trg_weight.sum().item())
        self.assertEqual(src_weight.sum().item(), trg_weight.sum().item())
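
The three-way tying above boils down to three modules sharing one Parameter
object. A minimal standalone sketch of that mechanism, assuming the vocabulary
size of 34 implied by the magic number (3 * 34 * 64 = 6528):

import torch.nn as nn

vocab_size, emb_dim = 34, 64  # 34 inferred from 6528 / (3 * 64)
embed = nn.Embedding(vocab_size, emb_dim)
output_layer = nn.Linear(emb_dim, vocab_size, bias=False)
# tying means sharing the very same Parameter, so in-place updates to one
# (like fill_ in the test above) are visible through all of them
output_layer.weight = embed.weight
assert output_layer.weight is embed.weight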
Example #2
    def test_tied_embeddings(self):

        torch.manual_seed(self.seed)
        cfg = copy.deepcopy(self.cfg)
        cfg["model"]["tied_embeddings"] = True
        cfg["model"]["tied_softmax"] = False

        src_vocab = trg_vocab = self.vocab

        model = build_model(cfg["model"],
                            src_vocab=src_vocab,
                            trg_vocab=trg_vocab)

        self.assertEqual(src_vocab.itos, trg_vocab.itos)
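        # nn.Module does not override __eq__, so this asserts that src_embed
        # and trg_embed are the very same module object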
        self.assertEqual(model.src_embed, model.trg_embed)
        self.assertTensorEqual(model.src_embed.lut.weight,
                               model.trg_embed.lut.weight)
        self.assertEqual(model.src_embed.lut.weight.shape,
                         model.trg_embed.lut.weight.shape)
Example #3
    def test_tied_softmax(self):

        torch.manual_seed(self.seed)
        cfg = copy.deepcopy(self.cfg)
        cfg["model"]["decoder"]["type"] = "transformer"
        cfg["model"]["tied_embeddings"] = False
        cfg["model"]["tied_softmax"] = True
        cfg["model"]["decoder"]["embeddings"]["embedding_dim"] = 64

        src_vocab = trg_vocab = self.vocab

        model = build_model(cfg["model"],
                            src_vocab=src_vocab,
                            trg_vocab=trg_vocab)

        self.assertEqual(model.trg_embed.lut.weight.shape,
                         model.decoder.output_layer.weight.shape)

        self.assertTensorEqual(model.trg_embed.lut.weight,
                               model.decoder.output_layer.weight)
Example #4
    def test_transformer_layer_norm_init(self):
        torch.manual_seed(self.seed)
        cfg = copy.deepcopy(self.cfg)

        src_vocab = trg_vocab = self.vocab

        model = build_model(cfg["model"],
                            src_vocab=src_vocab,
                            trg_vocab=trg_vocab)

        def check_layer_norm(m: nn.Module):
            for name, child in m.named_children():
                if isinstance(child, nn.LayerNorm):
                    self.assertTensorEqual(child.weight,
                                           torch.ones([self.hidden_size]))
                    self.assertTensorEqual(child.bias,
                                           torch.zeros([self.hidden_size]))
                else:
                    check_layer_norm(child)

        check_layer_norm(model)
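
The test leans on PyTorch's default LayerNorm initialization (weight = ones,
bias = zeros when elementwise_affine is enabled, which it is by default). A
standalone check of that default, assuming a hidden size of 64:

import torch
import torch.nn as nn

ln = nn.LayerNorm(64)  # elementwise_affine=True by default
assert torch.equal(ln.weight, torch.ones(64))
assert torch.equal(ln.bias, torch.zeros(64))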
Example #5
def train(cfg_file: str) -> None:
    """
    Main training function. After training, also evaluates on the test set if
    one is given.

    :param cfg_file: path to configuration yaml file
    """
    cfg = load_config(cfg_file)

    # set the random seed
    set_seed(seed=cfg["training"].get("random_seed", 42))

    # load the data
    if cfg.get("speech", True):
        train_data, dev_data, test_data, src_vocab, trg_vocab = load_audio_data(
            cfg=cfg)
    else:
        train_data, dev_data, test_data, src_vocab, trg_vocab = load_data(
            data_cfg=cfg["data"])

    # build an encoder-decoder model
    if cfg.get("speech", True):
        model = build_speech_model(cfg["model"],
                                   src_vocab=src_vocab,
                                   trg_vocab=trg_vocab)
    else:
        model = build_model(cfg["model"],
                            src_vocab=src_vocab,
                            trg_vocab=trg_vocab)

    # for training management, e.g. early stopping and model selection
    trainer = TrainManager(model=model, config=cfg)

    # store copy of original training config in model dir
    shutil.copy2(cfg_file, trainer.model_dir + "/config.yaml")

    # log all entries of config
    log_cfg(cfg, trainer.logger)

    log_data_info(train_data=train_data,
                  valid_data=dev_data,
                  test_data=test_data,
                  src_vocab=src_vocab,
                  trg_vocab=trg_vocab,
                  logging_function=trainer.logger.info)

    trainer.logger.info(str(model))

    # store the vocabs
    src_vocab_file = "{}/src_vocab.txt".format(cfg["training"]["model_dir"])
    src_vocab.to_file(src_vocab_file)
    trg_vocab_file = "{}/trg_vocab.txt".format(cfg["training"]["model_dir"])
    trg_vocab.to_file(trg_vocab_file)

    # train the model
    trainer.train_and_validate(train_data=train_data, valid_data=dev_data)

    # predict with the best model on validation and test
    # (if test data is available)
    ckpt = "{}/{}.ckpt".format(trainer.model_dir, trainer.best_ckpt_iteration)
    output_name = "{:08d}.hyps".format(trainer.best_ckpt_iteration)
    output_path = os.path.join(trainer.model_dir, output_name)
    test(cfg_file, ckpt=ckpt, output_path=output_path, logger=trainer.logger)
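
A minimal sketch of how train might be invoked from an entry-point script; the
config path below is a hypothetical example, not a file shipped with the
project:

if __name__ == "__main__":
    # "configs/small.yaml" is an assumed path to a training config
    train(cfg_file="configs/small.yaml")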
Example #6
def filter_noise(cfg_file,
                 ckpt: str,
                 output_path: str = None,
                 logger: Logger = None) -> None:
    """
    Computes perplexities on the training data with a trained model (e.g. to
    filter out noisy training examples) and writes them to a file.

    :param cfg_file: path to configuration file
    :param ckpt: path to checkpoint to load
    :param output_path: path to output directory
    :param logger: log output to this logger (creates new logger if not set)
    """

    if logger is None:
        logger = make_logger()

    cfg = load_config(cfg_file)

    # when checkpoint is not specified, take latest (best) from model dir
    if ckpt is None:
        model_dir = cfg["training"]["model_dir"]
        ckpt = get_latest_checkpoint(model_dir)
        if ckpt is None:
            raise FileNotFoundError(
                "No checkpoint found in directory {}.".format(model_dir))
        try:
            step = ckpt.split(model_dir + "/")[1].split(".ckpt")[0]
        except IndexError:
            step = "best"

    use_cuda = cfg["training"].get("use_cuda", False)
    max_output_length = cfg["training"].get("max_output_length", None)

    # load the data
    if cfg.get("speech", True):
        train_data, _, _, src_vocab, trg_vocab = load_audio_data(cfg=cfg)
    else:
        train_data, _, _, src_vocab, trg_vocab = load_data(
            data_cfg=cfg["data"])

    data_to_predict = ("train", train_data)

    # load model state from disk
    model_checkpoint = load_checkpoint(ckpt, use_cuda=use_cuda)

    # build model and load parameters into it
    if cfg.get("speech", True):
        model = build_speech_model(cfg["model"],
                                   src_vocab=src_vocab,
                                   trg_vocab=trg_vocab)
    else:
        model = build_model(cfg["model"],
                            src_vocab=src_vocab,
                            trg_vocab=trg_vocab)
    model.load_state_dict(model_checkpoint["model_state"])

    if use_cuda:
        model.cuda()

    pad_index = model.pad_index
    label_smoothing = 0.0
    loss_function = XentLoss(pad_index=pad_index, smoothing=label_smoothing)

    data_set_name, data_set = data_to_predict

    #pylint: disable=unused-variable
    ppls_list = generate_perplexities_on_data(
        model,
        data=data_set,
        max_output_length=max_output_length,
        use_cuda=use_cuda,
        loss_function=loss_function,
        logger=logger)
    #pylint: enable=unused-variable

    if output_path is None:
        raise ValueError("Output path must be specified.")

    if not os.path.isdir(output_path):
        os.makedirs(output_path)
    output_path_set = os.path.join(output_path,
                                   data_set_name + "_perplexities.txt")
    with open(output_path_set, "w") as outfile:
        outfile.write("\n".join(str(ppl) for ppl in ppls_list))

    logger.info("Perplexities saved to: %s", output_path_set)
Example #7
def translate(cfg_file, ckpt: str, output_path: str = None) -> None:
    """
    Interactive translation function.
    Loads a model from checkpoint and either translates input read from stdin
    or prompts interactively for sentences to translate.
    The input has to be pre-processed according to the data that the model
    was trained on, i.e. tokenized or split into subwords.
    Translations are printed to stdout.

    :param cfg_file: path to configuration file
    :param ckpt: path to checkpoint to load
    :param output_path: path to output file
    """
    def _load_line_as_data(line):
        """ Create a dataset from one line via a temporary file. """
        # write src input to temporary file
        tmp_name = "tmp"
        tmp_suffix = ".src"
        tmp_filename = tmp_name + tmp_suffix
        with open(tmp_filename, "w") as tmp_file:
            tmp_file.write("{}\n".format(line))

        test_data = MonoDataset(path=tmp_name, ext=tmp_suffix, field=src_field)

        # remove temporary file
        if os.path.exists(tmp_filename):
            os.remove(tmp_filename)

        return test_data

    cfg = load_config(cfg_file)
    speech_mode = cfg.get("speech", True)
    if speech_mode:
        raise NotImplementedError(
            "Translation mode isn't implemented for speech processing yet.")

    logger = make_logger()

    def _translate_data(test_data):
        """ Translates given dataset, using parameters from outer scope. """
        # pylint: disable=unused-variable
        score, loss, ppl, sources, sources_raw, references, hypotheses, \
            hypotheses_raw, attention_scores = validate_on_data(
                model, data=test_data, batch_size=batch_size,
                batch_type=batch_type, level=level,
                max_output_length=max_output_length, eval_metric="",
                use_cuda=use_cuda, loss_function=None, beam_size=beam_size,
                beam_alpha=beam_alpha, logger=logger)
        return hypotheses

    # when checkpoint is not specified, take latest (best) from model dir
    if ckpt is None:
        model_dir = cfg["training"]["model_dir"]
        ckpt = get_latest_checkpoint(model_dir)

    batch_size = cfg["training"].get("eval_batch_size",
                                     cfg["training"].get("batch_size", 1))
    batch_type = cfg["training"].get(
        "eval_batch_type", cfg["training"].get("batch_type", "sentence"))
    use_cuda = cfg["training"].get("use_cuda", False)
    level = cfg["data"]["level"]
    max_output_length = cfg["training"].get("max_output_length", None)

    # read vocabs
    src_vocab_file = cfg["data"].get(
        "src_vocab", cfg["training"]["model_dir"] + "/src_vocab.txt")
    trg_vocab_file = cfg["data"].get(
        "trg_vocab", cfg["training"]["model_dir"] + "/trg_vocab.txt")
    src_vocab = Vocabulary(file=src_vocab_file)
    trg_vocab = Vocabulary(file=trg_vocab_file)

    data_cfg = cfg["data"]
    lowercase = data_cfg["lowercase"]

    def tok_fun(s):
        return list(s) if level == "char" else s.split()

    src_field = Field(init_token=None,
                      eos_token=EOS_TOKEN,
                      pad_token=PAD_TOKEN,
                      tokenize=tok_fun,
                      batch_first=True,
                      lower=lowercase,
                      unk_token=UNK_TOKEN,
                      include_lengths=True)
    src_field.vocab = src_vocab

    # load model state from disk
    model_checkpoint = load_checkpoint(ckpt, use_cuda=use_cuda)

    # build model and load parameters into it
    model = build_model(cfg["model"], src_vocab=src_vocab, trg_vocab=trg_vocab)
    model.load_state_dict(model_checkpoint["model_state"])

    if use_cuda:
        model.cuda()

    # whether to use beam search for decoding, <2: greedy decoding
    if "testing" in cfg.keys():
        beam_size = cfg["testing"].get("beam_size", 1)
        beam_alpha = cfg["testing"].get("alpha", -1)
    else:
        beam_size = 1
        beam_alpha = -1

    if not sys.stdin.isatty():
        # input file given
        test_data = MonoDataset(path=sys.stdin, ext="", field=src_field)
        hypotheses = _translate_data(test_data)

        if output_path is not None:
            # write to output file if given
            output_path_set = output_path
            with open(output_path_set, mode="w", encoding="utf-8") as out_file:
                for hyp in hypotheses:
                    out_file.write(hyp + "\n")
            logger.info("Translations saved to: %s.", output_path_set)
        else:
            # print to stdout
            for hyp in hypotheses:
                print(hyp)

    else:
        # enter interactive mode
        batch_size = 1
        batch_type = "sentence"
        while True:
            try:
                src_input = input("\nPlease enter a source sentence "
                                  "(pre-processed): \n")
                if not src_input.strip():
                    break

                # every line has to be made into dataset
                test_data = _load_line_as_data(line=src_input)

                hypotheses = _translate_data(test_data)
                print("JoeyNMT: {}".format(hypotheses[0]))

            except (KeyboardInterrupt, EOFError):
                print("\nBye.")
                break
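
Because translate switches on sys.stdin.isatty(), the same function covers
both modes. A sketch of the non-interactive use, with assumed paths; running
a hypothetical wrapper script as "python translate.py < test.tok.src" feeds
the file through stdin:

# hypothetical call; ckpt=None falls back to the latest checkpoint
translate(cfg_file="configs/small.yaml",
          ckpt=None,
          output_path="out/hyps.txt")  # omit to print translations to stdout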
Example #8
def test(cfg_file,
         ckpt: str,
         output_path: str = None,
         save_attention: bool = False,
         logger: Logger = None) -> None:
    """
    Main test function. Handles loading a model from checkpoint, generating
    translations, and storing them along with attention plots.

    :param cfg_file: path to configuration file
    :param ckpt: path to checkpoint to load
    :param output_path: path to output
    :param save_attention: whether to save the computed attention weights
    :param logger: log output to this logger (creates new logger if not set)
    """

    if logger is None:
        logger = make_logger()

    cfg = load_config(cfg_file)

    if "test" not in cfg["data"].keys():
        raise ValueError("Test data must be specified in config.")

    # when checkpoint is not specified, take latest (best) from model dir
    model_dir = cfg["training"]["model_dir"]
    if ckpt is None:
        ckpt = get_latest_checkpoint(model_dir)
        if ckpt is None:
            raise FileNotFoundError(
                "No checkpoint found in directory {}.".format(model_dir))
    # derive the step label here so that `step` and `model_dir` are also
    # defined when a checkpoint path is passed in (needed by save_attention)
    try:
        step = ckpt.split(model_dir + "/")[1].split(".ckpt")[0]
    except IndexError:
        step = "best"

    batch_size = cfg["training"].get("eval_batch_size",
                                     cfg["training"]["batch_size"])
    batch_type = cfg["training"].get(
        "eval_batch_type", cfg["training"].get("batch_type", "sentence"))
    use_cuda = cfg["training"].get("use_cuda", False)
    level = cfg["data"]["level"]
    eval_metric = cfg["training"]["eval_metric"]
    max_output_length = cfg["training"].get("max_output_length", None)

    # load the data
    if cfg.get("speech", True):
        _, dev_data, test_data, src_vocab, trg_vocab = load_audio_data(cfg=cfg)
    else:
        _, dev_data, test_data, src_vocab, trg_vocab = load_data(
            data_cfg=cfg["data"])

    data_to_predict = {"dev": dev_data, "test": test_data}

    # load model state from disk
    model_checkpoint = load_checkpoint(ckpt, use_cuda=use_cuda)

    # build model and load parameters into it
    if cfg.get("speech", True):
        model = build_speech_model(cfg["model"],
                                   src_vocab=src_vocab,
                                   trg_vocab=trg_vocab)
    else:
        model = build_model(cfg["model"],
                            src_vocab=src_vocab,
                            trg_vocab=trg_vocab)
    model.load_state_dict(model_checkpoint["model_state"])

    if use_cuda:
        model.cuda()

    # whether to use beam search for decoding, <2: greedy decoding
    if "testing" in cfg.keys():
        beam_size = cfg["testing"].get("beam_size", 1)
        beam_alpha = cfg["testing"].get("alpha", -1)
    else:
        beam_size = 1
        beam_alpha = -1

    for data_set_name, data_set in data_to_predict.items():

        #pylint: disable=unused-variable
        score, loss, ppl, sources, sources_raw, references, hypotheses, \
            hypotheses_raw, attention_scores = validate_on_data(
                model, data=data_set, batch_size=batch_size,
                batch_type=batch_type, level=level,
                max_output_length=max_output_length, eval_metric=eval_metric,
                use_cuda=use_cuda, loss_function=None, beam_size=beam_size,
                beam_alpha=beam_alpha, logger=logger)
        #pylint: enable=unused-variable

        if "trg" in data_set.fields:
            decoding_description = "Greedy decoding" if beam_size < 2 else \
                "Beam search decoding with beam size = {} and alpha = {}".\
                format(beam_size, beam_alpha)
            logger.info("%4s %s: %6.4f [%s]", data_set_name, eval_metric,
                        score, decoding_description)
        else:
            logger.info("No references given for %s -> no evaluation.",
                        data_set_name)

        if save_attention:
            if attention_scores:
                attention_name = "{}.{}.att".format(data_set_name, step)
                attention_path = os.path.join(model_dir, attention_name)
                logger.info(
                    "Saving attention plots. This might take a while..")
                store_attention_plots(attentions=attention_scores,
                                      targets=hypotheses_raw,
                                      sources=data_set.src,
                                      indices=range(len(hypotheses)),
                                      output_prefix=attention_path)
                logger.info("Attention plots saved to: %s", attention_path)
            else:
                logger.warning("Attention scores could not be saved. "
                               "Note that attention scores are not available "
                               "when using beam search. "
                               "Set beam_size to 1 for greedy decoding.")

        if output_path is not None:
            output_path_set = "{}.{}".format(output_path, data_set_name)
            with open(output_path_set, mode="w", encoding="utf-8") as out_file:
                for hyp in hypotheses:
                    out_file.write(hyp + "\n")
            logger.info("Translations saved to: %s", output_path_set)