Example #1
    def testRandomSubset(self):
        # only a random subset should be selected for training
        current_cfg = self.data_cfg.copy()
        current_cfg["random_train_subset"] = -1

        # load the data
        train_data, dev_data, test_data, src_vocab, trg_vocab = \
            load_data(current_cfg)
        assert len(train_data) == 382

        current_cfg["random_train_subset"] = 10
        train_data, dev_data, test_data, src_vocab, trg_vocab = \
            load_data(current_cfg)
        assert len(train_data) == 10
Example #2
    def testIteratorBatchType(self):

        current_cfg = self.data_cfg.copy()

        # load toy data
        train_data, dev_data, test_data, src_vocab, trg_vocab = \
            load_data(current_cfg)

        # make batches by number of sentences
        train_iter = iter(
            make_data_iter(train_data, batch_size=10, batch_type="sentence"))
        batch = next(train_iter)

        self.assertEqual(batch.src[0].shape[0], 10)
        self.assertEqual(batch.trg[0].shape[0], 10)

        # make batches by number of tokens
        train_iter = iter(
            make_data_iter(train_data, batch_size=100, batch_type="token"))
        _ = next(train_iter)  # skip a batch
        _ = next(train_iter)  # skip another batch
        batch = next(train_iter)

        self.assertEqual(batch.src[0].shape[0], 8)
        self.assertEqual(np.prod(batch.src[0].shape), 88)
        self.assertLessEqual(np.prod(batch.src[0].shape), 100)
Example #3
    def setUp(self):
        self.train_path = "test/data/toy/train"
        self.dev_path = "test/data/toy/dev"
        self.test_path = "test/data/toy/test"
        self.levels = ["char", "word"]  # bpe is processed the same way as word
        self.max_sent_length = 20

        # minimal data config
        self.data_cfg = {
            "src": "de",
            "trg": "en",
            "train": self.train_path,
            "dev": self.dev_path,
            "level": "char",
            "lowercase": True,
            "max_sent_length": self.max_sent_length
        }

        # load the data
        self.train_data, self.dev_data, self.test_data, src_vocab, trg_vocab = \
            load_data(self.data_cfg)
        self.pad_index = trg_vocab.stoi[PAD_TOKEN]
        # random seeds
        seed = 42
        torch.manual_seed(seed)
        random.seed(seed)
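The test snippets in these examples omit their imports. A minimal sketch of what the test module is assumed to pull in, based on the names used above and below; the JoeyNMT-style module paths are assumptions and may differ in this repository:

import random
import unittest

import numpy as np
import torch
from torchtext.datasets import TranslationDataset

# assumed package layout (JoeyNMT-style); adjust to the actual repository
from joeynmt.constants import PAD_TOKEN
from joeynmt.data import MonoDataset, load_data, make_data_iter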
Example #4
def train(cfg_file: str) -> None:
    """
    Main training function. After training, also test on test data if given.

    :param cfg_file: path to configuration yaml file
    """
    cfg = load_config(cfg_file)

    # set the random seed
    set_seed(seed=cfg["training"].get("random_seed", 42))

    # load the data
    if cfg.get("speech", True):
        train_data, dev_data, test_data, src_vocab, trg_vocab = load_audio_data(
            cfg=cfg)
    else:
        train_data, dev_data, test_data, src_vocab, trg_vocab = load_data(
            data_cfg=cfg["data"])

    # build an encoder-decoder model
    if cfg.get("speech", True):
        model = build_speech_model(cfg["model"],
                                   src_vocab=src_vocab,
                                   trg_vocab=trg_vocab)
    else:
        model = build_model(cfg["model"],
                            src_vocab=src_vocab,
                            trg_vocab=trg_vocab)

    # for training management, e.g. early stopping and model selection
    trainer = TrainManager(model=model, config=cfg)

    # store copy of original training config in model dir
    shutil.copy2(cfg_file, trainer.model_dir + "/config.yaml")

    # log all entries of config
    log_cfg(cfg, trainer.logger)

    log_data_info(train_data=train_data,
                  valid_data=dev_data,
                  test_data=test_data,
                  src_vocab=src_vocab,
                  trg_vocab=trg_vocab,
                  logging_function=trainer.logger.info)

    trainer.logger.info(str(model))

    # store the vocabs
    src_vocab_file = "{}/src_vocab.txt".format(cfg["training"]["model_dir"])
    src_vocab.to_file(src_vocab_file)
    trg_vocab_file = "{}/trg_vocab.txt".format(cfg["training"]["model_dir"])
    trg_vocab.to_file(trg_vocab_file)

    # train the model
    trainer.train_and_validate(train_data=train_data, valid_data=dev_data)

    # predict with the best model on validation and test
    # (if test data is available)
    ckpt = "{}/{}.ckpt".format(trainer.model_dir, trainer.best_ckpt_iteration)
    output_name = "{:08d}.hyps".format(trainer.best_ckpt_iteration)
    output_path = os.path.join(trainer.model_dir, output_name)
    test(cfg_file, ckpt=ckpt, output_path=output_path, logger=trainer.logger)
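train() receives only the path to a YAML file, but the body above (and test() in Example #7) reads a number of keys from the loaded config. A minimal sketch of that structure, written as the Python dict load_config is assumed to return; all values are placeholders, and TrainManager plus the model builders will require additional settings omitted here:

# hypothetical minimal configuration mirroring the keys accessed in these examples
cfg = {
    "speech": False,  # False -> text pipeline (load_data / build_model)
    "data": {
        "src": "de",
        "trg": "en",
        "train": "test/data/toy/train",
        "dev": "test/data/toy/dev",
        "test": "test/data/toy/test",
        "level": "word",
        "lowercase": True,
        "max_sent_length": 20,
    },
    "training": {
        "model_dir": "models/toy_run",  # config copy, vocabs and checkpoints go here
        "random_seed": 42,
        "batch_size": 10,
        "eval_metric": "bleu",
        "use_cuda": False,
    },
    "model": {},  # encoder/decoder settings for build_model / build_speech_model
    "testing": {"beam_size": 1, "alpha": -1},
}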
Example #5
    def testDataLoading(self):
        # test all combinations of configuration settings
        for test_path in [None, self.test_path]:
            for level in self.levels:
                for lowercase in [True, False]:
                    current_cfg = self.data_cfg.copy()
                    current_cfg["level"] = level
                    current_cfg["lowercase"] = lowercase
                    if test_path is not None:
                        current_cfg["test"] = test_path

                    # load the data
                    train_data, dev_data, test_data, src_vocab, trg_vocab = \
                        load_data(current_cfg)

                    self.assertIs(type(train_data), TranslationDataset)
                    self.assertIs(type(dev_data), TranslationDataset)
                    if test_path is not None:
                        # test has no target side
                        self.assertIs(type(test_data), MonoDataset)

                    # check the number of examples loaded
                    if level == "char":
                        # training set is filtered to max_sent_length
                        expected_train_len = 5
                    else:
                        expected_train_len = 382
                    expected_testdev_len = 20  # dev and test have the same len
                    self.assertEqual(len(train_data), expected_train_len)
                    self.assertEqual(len(dev_data), expected_testdev_len)
                    if test_path is None:
                        self.assertIsNone(test_data)
                    else:
                        self.assertEqual(len(test_data), expected_testdev_len)

                    # check the segmentation: src and trg attributes are lists
                    self.assertIs(type(train_data.examples[0].src), list)
                    self.assertIs(type(train_data.examples[0].trg), list)
                    self.assertIs(type(dev_data.examples[0].src), list)
                    self.assertIs(type(dev_data.examples[0].trg), list)
                    if test_path is not None:
                        self.assertIs(type(test_data.examples[0].src), list)
                        self.assertFalse(hasattr(test_data.examples[0], "trg"))

                    # check the length filtering of the training examples
                    self.assertFalse(
                        any([
                            len(ex.src) > self.max_sent_length
                            for ex in train_data.examples
                        ]))
                    self.assertFalse(
                        any([
                            len(ex.trg) > self.max_sent_length
                            for ex in train_data.examples
                        ]))

                    # check the lowercasing
                    if lowercase:
                        self.assertTrue(
                            all([
                                " ".join(ex.src).lower() == " ".join(ex.src)
                                for ex in train_data.examples
                            ]))
                        self.assertTrue(
                            all([
                                " ".join(ex.src).lower() == " ".join(ex.src)
                                for ex in dev_data.examples
                            ]))
                        self.assertTrue(
                            all([
                                " ".join(ex.trg).lower() == " ".join(ex.trg)
                                for ex in train_data.examples
                            ]))
                        self.assertTrue(
                            all([
                                " ".join(ex.trg).lower() == " ".join(ex.trg)
                                for ex in dev_data.examples
                            ]))
                        if test_path is not None:
                            self.assertTrue(
                                all([
                                    " ".join(ex.src).lower() == " ".join(
                                        ex.src) for ex in test_data.examples
                                ]))

                    # check the first example from the training set
                    expected_srcs = {
                        "char":
                        "Danke.",
                        "word":
                        "David Gallo: Das ist Bill Lange."
                        " Ich bin Dave Gallo."
                    }
                    expected_trgs = {
                        "char":
                        "Thank you.",
                        "word":
                        "David Gallo: This is Bill Lange. "
                        "I'm Dave Gallo."
                    }
                    if level == "char":
                        if lowercase:
                            comparison_src = list(expected_srcs[level].lower())
                            comparison_trg = list(expected_trgs[level].lower())
                        else:
                            comparison_src = list(expected_srcs[level])
                            comparison_trg = list(expected_trgs[level])
                    else:
                        if lowercase:
                            comparison_src = expected_srcs[level].lower().\
                                split()
                            comparison_trg = expected_trgs[level].lower(). \
                                split()
                        else:
                            comparison_src = expected_srcs[level].split()
                            comparison_trg = expected_trgs[level].split()
                    self.assertEqual(train_data.examples[0].src,
                                     comparison_src)
                    self.assertEqual(train_data.examples[0].trg,
                                     comparison_trg)
Example #6
def filter_noise(cfg_file,
                 ckpt: str,
                 output_path: str = None,
                 logger: Logger = None) -> None:
    """
    Main test function. Handles loading a model from checkpoint, generating
    translations and storing them and attention plots.

    :param cfg_file: path to configuration file
    :param ckpt: path to checkpoint to load
    :param output_path: path to output
    :param logger: log output to this logger (creates new logger if not set)
    """

    if logger is None:
        logger = make_logger()

    cfg = load_config(cfg_file)

    # when checkpoint is not specified, take latest (best) from model dir
    if ckpt is None:
        model_dir = cfg["training"]["model_dir"]
        ckpt = get_latest_checkpoint(model_dir)
        if ckpt is None:
            raise FileNotFoundError(
                "No checkpoint found in directory {}.".format(model_dir))
        try:
            step = ckpt.split(model_dir + "/")[1].split(".ckpt")[0]
        except IndexError:
            step = "best"

    use_cuda = cfg["training"].get("use_cuda", False)
    max_output_length = cfg["training"].get("max_output_length", None)

    # load the data
    if cfg.get("speech", True):
        train_data, _, _, src_vocab, trg_vocab = load_audio_data(cfg=cfg)
    else:
        train_data, _, _, src_vocab, trg_vocab = load_data(
            data_cfg=cfg["data"])

    data_to_predict = ("train", train_data)

    # load model state from disk
    model_checkpoint = load_checkpoint(ckpt, use_cuda=use_cuda)

    # build model and load parameters into it
    if cfg.get("speech", True):
        model = build_speech_model(cfg["model"],
                                   src_vocab=src_vocab,
                                   trg_vocab=trg_vocab)
    else:
        model = build_model(cfg["model"],
                            src_vocab=src_vocab,
                            trg_vocab=trg_vocab)
    model.load_state_dict(model_checkpoint["model_state"])

    if use_cuda:
        model.cuda()

    pad_index = model.pad_index
    label_smoothing = 0.0
    loss_function = XentLoss(pad_index=pad_index, smoothing=label_smoothing)

    data_set_name, data_set = data_to_predict

    #pylint: disable=unused-variable
    ppls_list = generate_perplexities_on_data(
        model,
        data=data_set,
        max_output_length=max_output_length,
        use_cuda=use_cuda,
        loss_function=loss_function,
        logger=logger)
    #pylint: enable=unused-variable

    if output_path is None:
        raise ValueError("Output path must be specified")

    else:
        if not os.path.isdir(output_path):
            os.makedirs(output_path)
        output_path_set = os.path.join(output_path,
                                       data_set_name + "_perplexities.txt")
        with open(output_path_set, "w") as outfile:
            first_iteration = True
            for ppls in ppls_list:
                if not first_iteration:
                    outfile.write("\n")
                outfile.write(str(ppls))
                first_iteration = False

        logger.info("Perplexities saved to: %s", output_path_set)
Example #7
def test(cfg_file,
         ckpt: str,
         output_path: str = None,
         save_attention: bool = False,
         logger: Logger = None) -> None:
    """
    Main test function. Handles loading a model from checkpoint, generating
    translations and storing them and attention plots.

    :param cfg_file: path to configuration file
    :param ckpt: path to checkpoint to load
    :param output_path: path to output
    :param save_attention: whether to save the computed attention weights
    :param logger: log output to this logger (creates new logger if not set)
    """

    if logger is None:
        logger = make_logger()

    cfg = load_config(cfg_file)

    if "test" not in cfg["data"].keys():
        raise ValueError("Test data must be specified in config.")

    # when checkpoint is not specified, take latest (best) from model dir
    model_dir = cfg["training"]["model_dir"]
    if ckpt is None:
        ckpt = get_latest_checkpoint(model_dir)
        if ckpt is None:
            raise FileNotFoundError(
                "No checkpoint found in directory {}.".format(model_dir))

    # infer the training step from the checkpoint name
    # (model_dir and step are also needed below when storing attention plots)
    try:
        step = ckpt.split(model_dir + "/")[1].split(".ckpt")[0]
    except IndexError:
        step = "best"

    batch_size = cfg["training"].get("eval_batch_size",
                                     cfg["training"]["batch_size"])
    batch_type = cfg["training"].get(
        "eval_batch_type", cfg["training"].get("batch_type", "sentence"))
    use_cuda = cfg["training"].get("use_cuda", False)
    level = cfg["data"]["level"]
    eval_metric = cfg["training"]["eval_metric"]
    max_output_length = cfg["training"].get("max_output_length", None)

    # load the data
    if cfg.get("speech", True):
        _, dev_data, test_data, src_vocab, trg_vocab = load_audio_data(cfg=cfg)
    else:
        _, dev_data, test_data, src_vocab, trg_vocab = load_data(
            data_cfg=cfg["data"])

    data_to_predict = {"dev": dev_data, "test": test_data}

    # load model state from disk
    model_checkpoint = load_checkpoint(ckpt, use_cuda=use_cuda)

    # build model and load parameters into it
    if cfg.get("speech", True):
        model = build_speech_model(cfg["model"],
                                   src_vocab=src_vocab,
                                   trg_vocab=trg_vocab)
    else:
        model = build_model(cfg["model"],
                            src_vocab=src_vocab,
                            trg_vocab=trg_vocab)
    model.load_state_dict(model_checkpoint["model_state"])

    if use_cuda:
        model.cuda()

    # whether to use beam search for decoding; beam_size < 2 means greedy decoding
    if "testing" in cfg.keys():
        beam_size = cfg["testing"].get("beam_size", 1)
        beam_alpha = cfg["testing"].get("alpha", -1)
    else:
        beam_size = 1
        beam_alpha = -1

    for data_set_name, data_set in data_to_predict.items():

        #pylint: disable=unused-variable
        score, loss, ppl, sources, sources_raw, references, hypotheses, \
            hypotheses_raw, attention_scores = validate_on_data(
                model, data=data_set, batch_size=batch_size,
                batch_type=batch_type, level=level,
                max_output_length=max_output_length, eval_metric=eval_metric,
                use_cuda=use_cuda, loss_function=None, beam_size=beam_size,
                beam_alpha=beam_alpha, logger=logger)
        #pylint: enable=unused-variable

        if "trg" in data_set.fields:
            decoding_description = "Greedy decoding" if beam_size < 2 else \
                "Beam search decoding with beam size = {} and alpha = {}".\
                format(beam_size, beam_alpha)
            logger.info("%4s %s: %6.4f [%s]", data_set_name, eval_metric,
                        score, decoding_description)
        else:
            logger.info("No references given for %s -> no evaluation.",
                        data_set_name)

        if save_attention:
            if attention_scores:
                attention_name = "{}.{}.att".format(data_set_name, step)
                attention_path = os.path.join(model_dir, attention_name)
                logger.info(
                    "Saving attention plots. This might take a while..")
                store_attention_plots(attentions=attention_scores,
                                      targets=hypotheses_raw,
                                      sources=data_set.src,
                                      indices=range(len(hypotheses)),
                                      output_prefix=attention_path)
                logger.info("Attention plots saved to: %s", attention_path)
            else:
                logger.warning("Attention scores could not be saved. "
                               "Note that attention scores are not available "
                               "when using beam search. "
                               "Set beam_size to 1 for greedy decoding.")

        if output_path is not None:
            output_path_set = "{}.{}".format(output_path, data_set_name)
            with open(output_path_set, mode="w", encoding="utf-8") as out_file:
                for hyp in hypotheses:
                    out_file.write(hyp + "\n")
            logger.info("Translations saved to: %s", output_path_set)