Example No. 1
    def setup_task(cls, args, **kwargs):
        """Setup the task (e.g., load dictionaries).

        Args:
            args (argparse.Namespace): parsed command-line arguments
        """
        args.left_pad_source = utils.eval_bool(args.left_pad_source)
        args.left_pad_target = utils.eval_bool(args.left_pad_target)

        paths = utils.split_paths(args.data)
        assert len(paths) > 0

        # set language pair
        args.source_lang = "char"
        args.target_lang = "label"

        # load dictionaries
        src_dict = cls.load_dictionary(
            os.path.join(paths[0], "dict.{}.txt".format(args.source_lang))
        )
        tgt_dict = cls.load_dictionary(
            os.path.join(paths[0], "dict.{}.txt".format(args.target_lang))
        )
        assert src_dict.pad() == tgt_dict.pad()
        assert src_dict.eos() == tgt_dict.eos()
        assert src_dict.unk() == tgt_dict.unk()
        logger.info("[{}] dictionary: {} types".format(args.source_lang, len(src_dict)))
        logger.info("[{}] dictionary: {} types".format(args.target_lang, len(tgt_dict)))

        return cls(args, src_dict, tgt_dict)
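
Several of these snippets convert string-valued flags such as --left-pad-source with utils.eval_bool. The stand-in below is only a sketch of the behavior this code relies on, not fairseq's exact helper, which may differ in its error handling:

def eval_bool(x, default=False):
    # Turn command-line strings like "True"/"False"/"1"/"0" into booleans.
    if x is None:
        return default
    try:
        return bool(eval(str(x)))
    except (NameError, SyntaxError, TypeError):
        return default

assert eval_bool("True") is True
assert eval_bool("False") is False
assert eval_bool(None, default=True) is True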
Example No. 2
    def add_args(parser):
        """Add model-specific arguments to the parser."""
        # fmt: off
        parser.add_argument("--dropout", type=float, metavar="D",
                            help="dropout probability")
        parser.add_argument("--encoder-conv-channels", type=str, metavar="EXPR",
                            help="list of encoder convolution's out channels")
        parser.add_argument("--encoder-conv-kernel-sizes", type=str, metavar="EXPR",
                            help="list of encoder convolution's kernel sizes")
        parser.add_argument("--encoder-conv-strides", type=str, metavar="EXPR",
                            help="list of encoder convolution's strides")
        parser.add_argument("--encoder-rnn-hidden-size", type=int, metavar="N",
                            help="encoder rnn's hidden size")
        parser.add_argument("--encoder-rnn-layers", type=int, metavar="N",
                            help="number of rnn encoder layers")
        parser.add_argument("--encoder-rnn-bidirectional",
                            type=lambda x: utils.eval_bool(x),
                            help="make all rnn layers of encoder bidirectional")
        parser.add_argument("--encoder-rnn-residual",
                            type=lambda x: utils.eval_bool(x),
                            help="create residual connections for rnn encoder "
                            "layers (starting from the 2nd layer), i.e., the actual "
                            "output of such layer is the sum of its input and output")
        parser.add_argument("--encoder-multilayer-rnn-as-single-module",
                            type=lambda x: utils.eval_bool(x),
                            help="if True use a single nn.Module.LSTM for multilayer LSTMs "
                            "(faster and may fix a possible cuDNN error); otherwise use "
                            "nn.ModuleList(for back-compatibility). Note: if True then "
                            "encoder_rnn_residual is set to False")

        # Granular dropout settings (if not specified these default to --dropout)
        parser.add_argument("--encoder-rnn-dropout-in", type=float, metavar="D",
                            help="dropout probability for encoder rnn's input")
        parser.add_argument("--encoder-rnn-dropout-out", type=float, metavar="D",
                            help="dropout probability for encoder rnn's output")
Example No. 3
    def setup_task(cls, args, **kwargs):
        """Setup the task (e.g., load dictionaries).

        Args:
            args (argparse.Namespace): parsed command-line arguments
        """

        args.left_pad_source = utils.eval_bool(args.left_pad_source)
        args.left_pad_target = utils.eval_bool(args.left_pad_target)

        paths = utils.split_paths(args.data)
        assert len(paths) > 0
        # find language pair automatically
        if args.source_lang is None or args.target_lang is None:
            args.source_lang, args.target_lang = data_utils.infer_language_pair(
                paths[0])
        if args.source_lang is None or args.target_lang is None:
            raise Exception(
                "Could not infer language pair, please provide it explicitly")

        dictionary = cls.load_dictionary(os.path.join(paths[0], "dict.txt"))

        # e.g., args.langs = "en-zh,my-en"
        logger.info("args.add_lang_token: {}".format(args.add_lang_token))
        if args.add_lang_token and len(args.langs) > 0:
            languages = args.langs.split(",")
            for lang_pair in languages:
                if lang_pair == "-":
                    continue
                logger.info("{} was added to the dictionary".format(lang_pair))
                lang = lang_pair.split("-")
                dictionary.add_symbol("[{}]".format(lang[0]))
                dictionary.add_symbol("[{}]".format(lang[1]))
        return cls(args, dictionary, dictionary)
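
The loop above appends bracketed language tokens such as [en] to a shared dictionary. Below is a plain-Python sketch of the same parsing, with a set standing in for the fairseq Dictionary (which likewise de-duplicates symbols):

langs = "en-zh,my-en"
symbols = set()
for lang_pair in langs.split(","):
    if lang_pair == "-":
        continue
    src, tgt = lang_pair.split("-")
    symbols.add("[{}]".format(src))
    symbols.add("[{}]".format(tgt))
print(sorted(symbols))  # ['[en]', '[my]', '[zh]']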
Example No. 4
    def setup_task(cls, args, **kwargs):
        """Setup the task (e.g., load dictionaries).

        Args:
            args (argparse.Namespace): parsed command-line arguments
        """
        args.left_pad_source = utils.eval_bool(args.left_pad_source)
        args.left_pad_target = utils.eval_bool(args.left_pad_target)

        paths = utils.split_paths(args.data)
        assert len(paths) > 0
        # find language pair automatically
        if args.source_lang is None or args.target_lang is None:
            args.source_lang, args.target_lang = data_utils.infer_language_pair(
                paths[0])
        if args.source_lang is None or args.target_lang is None:
            raise Exception(
                'Could not infer language pair, please provide it explicitly')

        # load dictionaries
        src_dict = cls.load_dictionary(
            os.path.join(paths[0], 'dict.{}.txt'.format(args.source_lang)))
        tgt_dict = cls.load_dictionary(
            os.path.join(paths[0], 'dict.{}.txt'.format(args.target_lang)))
        assert src_dict.pad() == tgt_dict.pad()
        assert src_dict.eos() == tgt_dict.eos()
        assert src_dict.unk() == tgt_dict.unk()
        logger.info('[{}] dictionary: {} types'.format(args.source_lang,
                                                       len(src_dict)))
        logger.info('[{}] dictionary: {} types'.format(args.target_lang,
                                                       len(tgt_dict)))

        return cls(args, src_dict, tgt_dict)
Example No. 5
class LSTMLanguageModelEspressoConfig(FairseqDataclass):
    dropout: float = field(default=0.1, metadata={"help": "dropout probability"})
    decoder_embed_dim: int = field(
        default=48, metadata={"help": "decoder embedding dimension"}
    )
    decoder_embed_path: Optional[str] = field(
        default=None, metadata={"help": "path to pre-trained decoder embedding"}
    )
    decoder_freeze_embed: bool = field(
        default=False, metadata={"help": "freeze decoder embeddings"}
    )
    decoder_hidden_size: int = field(
        default=650, metadata={"help": "decoder hidden size"}
    )
    decoder_layers: int = field(
        default=2, metadata={"help": "number of decoder layers"}
    )
    decoder_out_embed_dim: int = field(
        default=650, metadata={"help": "decoder output embedding dimension"}
    )
    decoder_rnn_residual: bool = field(
        default=False,
        metadata={
            "help": "create residual connections for rnn decoder layers "
            "(starting from the 2nd layer), i.e., the actual output of such "
            "layer is the sum of its input and output"
        },
    )
    adaptive_softmax_cutoff: Optional[str] = field(
        default=None,
        metadata={
            "help": "comma separated list of adaptive softmax cutoff points. "
            "Must be used with adaptive_loss criterion"
        },
    )
    share_embed: bool = field(
        default=False, metadata={"help": "share input and output embeddings"}
    )
    is_wordlm: bool = field(
        default=False,
        metadata={
            "help": "whether it is word LM or subword LM. Only relevant for ASR decoding "
            "with LM, and it determines how the underlying decoder instance gets the "
            "dictionary from the task instance when calling cls.build_model()"
        },
    )
    decoder_dropout_in: float = field(
        default=0.1,
        metadata={"help": "dropout probability for decoder input embedding"}
    )
    decoder_dropout_out: float = field(
        default=0.1,
        metadata={"help": "dropout probability for decoder output"}
    )
    # TODO: common vars below should be moved to a parent config
    add_bos_token: bool = II("task.add_bos_token")
    tokens_per_sample: int = II("task.tokens_per_sample")
    max_target_positions: Optional[int] = II("task.max_target_positions")
    tpu: bool = II("params.common.tpu")
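
A hedged usage sketch of the dataclass above, assuming it carries the usual @dataclass decorator and its imports are in scope. Fields declared with II("task....") keep their "${task....}" interpolation strings until Hydra/OmegaConf composes the full configuration; the plain defaults can be overridden directly:

# Construct the config with a few overrides; interpolated fields stay unresolved.
cfg = LSTMLanguageModelEspressoConfig(
    dropout=0.2,
    decoder_layers=3,
    decoder_rnn_residual=True,
    share_embed=True,
)
print(cfg.decoder_hidden_size)  # 650, the default declared above
print(cfg.add_bos_token)        # "${task.add_bos_token}" until resolved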
Example No. 6
    def prepare(cls, load_dictionary, args, **kargs):
        args.left_pad_source = utils.eval_bool(args.left_pad_source)
        args.left_pad_target = utils.eval_bool(args.left_pad_target)

        if not hasattr(args, "shuffle_instance"):
            args.shuffle_instance = False
        if args.langtoks is None:
            args.langtoks = {}
        if "main" not in args.langtoks:
            src_langtok_spec = args.encoder_langtok if args.encoder_langtok else None
            tgt_langtok_spec = "tgt" if args.decoder_langtok else None
            args.langtoks["main"] = (src_langtok_spec, tgt_langtok_spec)

        def check_langs(langs, pairs):
            messages = []
            for src, tgt in pairs:
                if src not in langs or tgt not in langs:
                    messages.append(
                        f"language pair {src}-{tgt} contains languages "
                        "that are not in the language dictionary")
            if len(messages) > 0:
                raise ValueError(" ".join(messages) + f"; langs: {langs}")

        if args.lang_pairs is None:
            raise ValueError(
                "--lang-pairs is required. List all the language pairs in the training objective."
            )
        if isinstance(args.lang_pairs, str):
            args.lang_pairs = args.lang_pairs.split(",")
        if args.source_lang is not None or args.target_lang is not None:
            training = False
        else:
            training = True
        language_list = cls.load_langs(args, **kargs)
        check_langs(
            language_list,
            ([p.split("-")
              for p in args.lang_pairs] if training else [(args.source_lang,
                                                           args.target_lang)]),
        )

        def load_dictionary_and_postproc(path):
            d = load_dictionary(path)
            augment_dictionary(
                dictionary=d,
                language_list=language_list,
                lang_tok_style=args.lang_tok_style,
                langtoks_specs=args.langtoks_specs,
                extra_data=args.extra_data,
            )
            return d

        dicts = cls.load_all_dictionaries(args, language_list,
                                          load_dictionary_and_postproc,
                                          training)
        return language_list, dicts, training
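
To make the failure mode concrete, here is a standalone restatement of the validation performed by check_langs: offending pairs are collected first and reported together in a single ValueError:

def check_langs(langs, pairs):
    bad = [f"{src}-{tgt}" for src, tgt in pairs
           if src not in langs or tgt not in langs]
    if bad:
        raise ValueError(
            " ".join(f"language pair {p} contains languages "
                     "that are not in the language dictionary" for p in bad)
            + f"; langs: {langs}")

check_langs(["en", "de", "zh"], [("en", "de"), ("de", "zh")])   # passes silently
try:
    check_langs(["en", "de"], [("en", "zh"), ("fr", "de")])
except ValueError as err:
    print(err)   # mentions both en-zh and fr-de, then the language list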
Example No. 7
    def update_args(cls, args):
        args.left_pad_source = utils.eval_bool(args.left_pad_source)
        args.left_pad_target = utils.eval_bool(args.left_pad_target)

        if args.lang_pairs is None:
            raise ValueError(
                "--lang-pairs is required. List all the language pairs in the training objective."
            )
        if isinstance(args.lang_pairs, str):
            args.lang_pairs = args.lang_pairs.split(",")
Example No. 8
    def add_args(parser):
        """Add model-specific arguments to the parser."""
        # fmt: off
        parser.add_argument("--dropout",
                            type=float,
                            metavar="D",
                            help="dropout probability")
        parser.add_argument("--encoder-conv-channels",
                            type=str,
                            metavar="EXPR",
                            help="list of encoder convolution's out channels")
        parser.add_argument("--encoder-conv-kernel-sizes",
                            type=str,
                            metavar="EXPR",
                            help="list of encoder convolution's kernel sizes")
        parser.add_argument("--encoder-conv-strides",
                            type=str,
                            metavar="EXPR",
                            help="list of encoder convolution's strides")
        parser.add_argument("--encoder-rnn-hidden-size",
                            type=int,
                            metavar="N",
                            help="encoder rnn's hidden size")
        parser.add_argument("--encoder-rnn-layers",
                            type=int,
                            metavar="N",
                            help="number of rnn encoder layers")
        parser.add_argument(
            "--encoder-rnn-bidirectional",
            type=lambda x: utils.eval_bool(x),
            help="make all rnn layers of encoder bidirectional")
        parser.add_argument(
            "--encoder-rnn-residual",
            type=lambda x: utils.eval_bool(x),
            help="create residual connections for rnn encoder "
            "layers (starting from the 2nd layer), i.e., the actual "
            "output of such layer is the sum of its input and output")

        # Granular dropout settings (if not specified these default to --dropout)
        parser.add_argument("--encoder-rnn-dropout-in",
                            type=float,
                            metavar="D",
                            help="dropout probability for encoder rnn's input")
        parser.add_argument(
            "--encoder-rnn-dropout-out",
            type=float,
            metavar="D",
            help="dropout probability for encoder rnn's output")
Example No. 9
    def add_args(parser):
        """Add model-specific arguments to the parser."""
        # fmt: off
        parser.add_argument("--dropout",
                            type=float,
                            metavar="D",
                            help="dropout probability")
        parser.add_argument("--decoder-embed-dim",
                            type=int,
                            metavar="N",
                            help="decoder embedding dimension")
        parser.add_argument("--decoder-embed-path",
                            type=str,
                            metavar="STR",
                            help="path to pre-trained decoder embedding")
        parser.add_argument("--decoder-freeze-embed",
                            action="store_true",
                            help="freeze decoder embeddings")
        parser.add_argument("--decoder-hidden-size",
                            type=int,
                            metavar="N",
                            help="decoder hidden size")
        parser.add_argument("--decoder-layers",
                            type=int,
                            metavar="N",
                            help="number of decoder layers")
        parser.add_argument("--decoder-out-embed-dim",
                            type=int,
                            metavar="N",
                            help="decoder output embedding dimension")
        parser.add_argument(
            "--adaptive-softmax-cutoff",
            metavar="EXPR",
            help="comma separated list of adaptive softmax cutoff points. "
            "Must be used with adaptive_loss criterion")
        parser.add_argument("--share-embed",
                            type=lambda x: utils.eval_bool(x),
                            help="share input and output embeddings")
        parser.add_argument(
            "--is-wordlm",
            action="store_true",
            help="whether it is word LM or subword LM. Only "
            "relevant for ASR decoding with LM, and it determines "
            "how the underlying decoder instance gets the dictionary "
            "from the task instance when calling cls.build_model()")

        # Granular dropout settings (if not specified these default to --dropout)
        parser.add_argument(
            "--decoder-dropout-in",
            type=float,
            metavar="D",
            help="dropout probability for decoder input embedding")
        parser.add_argument("--decoder-dropout-out",
                            type=float,
                            metavar="D",
                            help="dropout probability for decoder output")
Example No. 10
    def prepare(cls, args, **kargs):
        args.left_pad_source = utils.eval_bool(args.left_pad_source)
        args.left_pad_target = utils.eval_bool(args.left_pad_target)

        if args.lang_pairs is None:
            raise ValueError(
                '--lang-pairs is required. List all the language pairs in the training objective.'
            )
        if isinstance(args.lang_pairs, str):
            args.lang_pairs = args.lang_pairs.split(',')
        sorted_langs = sorted(
            list({
                x
                for lang_pair in args.lang_pairs for x in lang_pair.split('-')
            }))
        if args.source_lang is not None or args.target_lang is not None:
            training = False
        else:
            training = True

        # load dictionaries
        dicts = OrderedDict()
        for lang in sorted_langs:
            paths = utils.split_paths(args.data)
            assert len(paths) > 0
            dicts[lang] = cls.load_dictionary(
                os.path.join(paths[0], 'dict.{}.txt'.format(lang)))
            if len(dicts) > 0:
                assert dicts[lang].pad() == dicts[sorted_langs[0]].pad()
                assert dicts[lang].eos() == dicts[sorted_langs[0]].eos()
                assert dicts[lang].unk() == dicts[sorted_langs[0]].unk()
            if args.encoder_langtok is not None or args.decoder_langtok:
                for lang_to_add in sorted_langs:
                    dicts[lang].add_symbol(_lang_token(lang_to_add))
            logger.info('[{}] dictionary: {} types'.format(
                lang, len(dicts[lang])))
        return dicts, training
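
An illustration of the dict.{lang}.txt layout these load_dictionary calls assume (the standard fairseq dictionary format: one "token count" pair per line; the special symbols are added by the Dictionary class itself, which is why the pad/eos/unk assertions can hold across languages):

from pathlib import Path

# Two toy dictionaries with only regular tokens; <pad>, </s>, <unk>, <s> are
# appended by the Dictionary class at identical indices in both.
Path("dict.en.txt").write_text("hello 1204\nworld 998\n")
Path("dict.de.txt").write_text("hallo 1187\nwelt 953\n")
# cls.load_dictionary("dict.en.txt") and cls.load_dictionary("dict.de.txt")
# would then agree on pad()/eos()/unk(), satisfying the assertions above.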
Example No. 11
    def setup_task(cls, args, **kwargs):
        """Setup the task (e.g., load dictionaries).

        Args:
            args (argparse.Namespace): parsed command-line arguments
        """

        # get padding...
        args.left_pad_source = utils.eval_bool(args.left_pad_source)
        args.left_pad_target = utils.eval_bool(args.left_pad_target)
        paths = utils.split_paths(args.data)
        assert len(paths) > 0
        # find language pair automatically
        if args.source_lang is None or args.target_lang is None:
            args.source_lang, args.target_lang = data_utils.infer_language_pair(
                paths[0]
            )

        print("path:",os.path.join(paths[0], "/Dicts/dict.txt"))
        dictionary = cls.load_dictionary(
            os.path.join(paths[0]+"/Dicts/", "dict.txt")
        )

        return cls(args, dictionary,paths)
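
The dictionary path is assembled with os.path.join(paths[0], "Dicts", "dict.txt") rather than by joining a component that starts with a slash, because os.path.join silently discards every earlier component once it meets an absolute one:

import os

print(os.path.join("/data/bin", "Dicts", "dict.txt"))   # /data/bin/Dicts/dict.txt
print(os.path.join("/data/bin", "/Dicts/dict.txt"))     # /Dicts/dict.txt -- paths[0] is dropped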
Example No. 12
    def add_args(parser):
        """Add model-specific arguments to the parser."""
        # fmt: off
        parser.add_argument("--dropout",
                            type=float,
                            metavar="D",
                            help="dropout probability")
        parser.add_argument("--hidden-sizes",
                            type=str,
                            metavar="EXPR",
                            help="list of hidden sizes for all Tdnn layers")
        parser.add_argument("--kernel-sizes",
                            type=str,
                            metavar="EXPR",
                            help="list of all Tdnn layer\'s kernel sizes")
        parser.add_argument("--strides",
                            type=str,
                            metavar="EXPR",
                            help="list of all Tdnn layer\'s strides")
        parser.add_argument("--dilations",
                            type=str,
                            metavar="EXPR",
                            help="list of all Tdnn layer\'s dilations")
        parser.add_argument("--num-layers",
                            type=int,
                            metavar="N",
                            help="number of Tdnn layers")
        parser.add_argument(
            "--residual",
            type=lambda x: utils.eval_bool(x),
            help="create residual connections for rnn encoder "
            "layers (starting from the 2nd layer), i.e., the actual "
            "output of such layer is the sum of its input and output")

        # Granular dropout settings (if not specified these default to --dropout)
        parser.add_argument("--dropout-in",
                            type=float,
                            metavar="D",
                            help="dropout probability for encoder\'s input")
        parser.add_argument(
            "--dropout-out",
            type=float,
            metavar="D",
            help="dropout probability for Tdnn layers\' output")
Example No. 13
    def prepare(cls, load_dictionary, args, **kargs):
        args.left_pad_source = utils.eval_bool(args.left_pad_source)
        args.left_pad_target = utils.eval_bool(args.left_pad_target)

        if not hasattr(args, "shuffle_instance"):
            args.shuffle_instance = False
        if args.langtoks is None:
            args.langtoks = {}
        if "main" not in args.langtoks:
            src_langtok_spec = args.encoder_langtok if args.encoder_langtok else None
            tgt_langtok_spec = "tgt" if args.decoder_langtok else None
            args.langtoks["main"] = (src_langtok_spec, tgt_langtok_spec)

        def check_langs(langs, pairs):
            messages = []
            for src, tgt in pairs:
                if src not in langs or tgt not in langs:
                    messages.append(
                        f"language pair {src}-{tgt} contains languages "
                        "that are not in the language dictionary"
                    )
            if len(messages) > 0:
                raise ValueError(" ".join(messages) + f"; langs: {langs}")

        if args.lang_pairs is None:
            raise ValueError(
                "--lang-pairs is required. List all the language pairs in the training objective."
            )
        if isinstance(args.lang_pairs, str):
            args.lang_pairs = args.lang_pairs.split(",")
        if args.source_lang is not None or args.target_lang is not None:
            training = False
        else:
            training = True
        language_list = cls.load_langs(args, **kargs)
        check_langs(
            language_list,
            (
                [p.split("-") for p in args.lang_pairs]
                if training
                else [(args.source_lang, args.target_lang)]
            ),
        )

        # load dictionaries
        if training:
            extra_lang_pairs = (
                list(
                    {p for _, v in args.extra_lang_pairs.items() for p in v.split(",")}
                )
                if args.extra_lang_pairs
                else []
            )
            langs_to_load_dicts = sorted(
                {x for p in args.lang_pairs + extra_lang_pairs for x in p.split("-")}
            )
        else:
            langs_to_load_dicts = sorted([args.source_lang, args.target_lang])

        dicts = OrderedDict()
        paths = utils.split_paths(args.data)
        assert len(paths) > 0
        for lang in langs_to_load_dicts:
            dicts[lang] = load_dictionary(
                os.path.join(paths[0], "dict.{}.txt".format(lang))
            )
            augment_dictionary(
                dictionary=dicts[lang],
                language_list=language_list,
                lang_tok_style=args.lang_tok_style,
                langtoks_specs=args.langtoks_specs,
                extra_data=args.extra_data,
            )
            if len(dicts) > 0:
                assert dicts[lang].pad() == dicts[langs_to_load_dicts[0]].pad()
                assert dicts[lang].eos() == dicts[langs_to_load_dicts[0]].eos()
                assert dicts[lang].unk() == dicts[langs_to_load_dicts[0]].unk()
            logger.info("[{}] dictionary: {} types".format(lang, len(dicts[lang])))
        return language_list, dicts, training
Example No. 14
    def build_model(cls, args, task):
        """Build a new model instance."""
        # make sure that all args are properly defaulted (in case there are any new ones)
        base_architecture(args)

        if args.encoder_layers != args.decoder_layers:
            raise ValueError("--encoder-layers must match --decoder-layers")

        max_source_positions = getattr(
            args, "max_source_positions", DEFAULT_MAX_SOURCE_POSITIONS
        )
        max_target_positions = getattr(
            args, "max_target_positions", DEFAULT_MAX_TARGET_POSITIONS
        )

        def load_pretrained_embedding_from_file(embed_path, dictionary, embed_dim):
            num_embeddings = len(dictionary)
            padding_idx = dictionary.pad()
            embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx)
            embed_dict = utils.parse_embedding(embed_path)
            utils.print_embed_overlap(embed_dict, dictionary)
            return utils.load_embedding(embed_dict, dictionary, embed_tokens)

        if args.encoder_embed_path:
            pretrained_encoder_embed = load_pretrained_embedding_from_file(
                args.encoder_embed_path, task.source_dictionary, args.encoder_embed_dim
            )
        else:
            num_embeddings = len(task.source_dictionary)
            pretrained_encoder_embed = Embedding(
                num_embeddings, args.encoder_embed_dim, task.source_dictionary.pad()
            )

        if args.share_all_embeddings:
            # double check all parameters combinations are valid
            if task.source_dictionary != task.target_dictionary:
                raise ValueError("--share-all-embeddings requires a joint dictionary")
            if args.decoder_embed_path and (
                args.decoder_embed_path != args.encoder_embed_path
            ):
                raise ValueError(
                    "--share-all-embed not compatible with --decoder-embed-path"
                )
            if args.encoder_embed_dim != args.decoder_embed_dim:
                raise ValueError(
                    "--share-all-embeddings requires --encoder-embed-dim to "
                    "match --decoder-embed-dim"
                )
            pretrained_decoder_embed = pretrained_encoder_embed
            args.share_decoder_input_output_embed = True
        else:
            # separate decoder input embeddings
            pretrained_decoder_embed = None
            if args.decoder_embed_path:
                pretrained_decoder_embed = load_pretrained_embedding_from_file(
                    args.decoder_embed_path,
                    task.target_dictionary,
                    args.decoder_embed_dim,
                )
        # one last double check of parameter combinations
        if args.share_decoder_input_output_embed and (
            args.decoder_embed_dim != args.decoder_out_embed_dim
        ):
            raise ValueError(
                "--share-decoder-input-output-embeddings requires "
                "--decoder-embed-dim to match --decoder-out-embed-dim"
            )

        if args.encoder_freeze_embed:
            pretrained_encoder_embed.weight.requires_grad = False
        if args.decoder_freeze_embed:
            pretrained_decoder_embed.weight.requires_grad = False

        encoder = LSTMEncoder(
            dictionary=task.source_dictionary,
            embed_dim=args.encoder_embed_dim,
            hidden_size=args.encoder_hidden_size,
            num_layers=args.encoder_layers,
            dropout_in=args.encoder_dropout_in,
            dropout_out=args.encoder_dropout_out,
            bidirectional=args.encoder_bidirectional,
            pretrained_embed=pretrained_encoder_embed,
            max_source_positions=max_source_positions,
        )
        decoder = LSTMDecoder(
            dictionary=task.target_dictionary,
            embed_dim=args.decoder_embed_dim,
            hidden_size=args.decoder_hidden_size,
            out_embed_dim=args.decoder_out_embed_dim,
            num_layers=args.decoder_layers,
            dropout_in=args.decoder_dropout_in,
            dropout_out=args.decoder_dropout_out,
            attention=utils.eval_bool(args.decoder_attention),
            encoder_output_units=encoder.output_units,
            pretrained_embed=pretrained_decoder_embed,
            share_input_output_embed=args.share_decoder_input_output_embed,
            adaptive_softmax_cutoff=(
                utils.eval_str_list(args.adaptive_softmax_cutoff, type=int)
                if args.criterion == "adaptive_loss"
                else None
            ),
            max_target_positions=max_target_positions,
            residuals=False,
        )
        return cls(encoder, decoder)
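
build_model depends on an Embedding factory defined elsewhere in the module. The sketch below is an assumption about what such a factory typically does in fairseq's LSTM model; the exact initialization may differ:

import torch.nn as nn

def Embedding(num_embeddings, embed_dim, padding_idx):
    # Build an embedding table and zero out the padding row.
    m = nn.Embedding(num_embeddings, embed_dim, padding_idx=padding_idx)
    nn.init.uniform_(m.weight, -0.1, 0.1)
    nn.init.constant_(m.weight[padding_idx], 0)
    return m

emb = Embedding(num_embeddings=1000, embed_dim=512, padding_idx=1)
print(emb.weight[1].abs().sum().item())  # 0.0 -- the padding row stays zero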
Example No. 15
    def load_dataset_only(self,
                          split,
                          lang_pairs,
                          do_mask=True,
                          epoch=1,
                          combine=False):
        paths = utils.split_paths(self.args.data)
        assert len(paths) > 0
        data_path = paths[(epoch - 1) % len(paths)]

        # TODO unk token will be considered as first word too, though it might be an unknown phoneme within a word
        # get_whole_word_mask returns a tensor (size V by 1 ) to indicate if a token is a word start token
        mask_whole_src_words = gen_whole_word_mask(self.args, self.src_dict)
        language_without_segmentations = self.args.no_whole_word_mask_langs.split(
            ",")
        lang_datasets = []
        eos_bos = []
        lang_pairs = lang_pairs.split(",") if lang_pairs != "" else []
        assert len(lang_pairs) > 0
        for lp in lang_pairs:
            src, tgt = lp.split("-")
            lang_mask_whole_src_words = (mask_whole_src_words if src
                                         not in language_without_segmentations
                                         else None)

            end_token = (self.source_dictionary.index(
                PairedDenoisingTask.LANG_TAG_TEMPLATE.format(src))
                         if self.args.add_src_lang_token else None)
            bos_token = (self.target_dictionary.index(
                PairedDenoisingTask.LANG_TAG_TEMPLATE.format(tgt))
                         if self.args.add_tgt_lang_token else None)
            src_lang_id = None

            if self.args.add_src_lang_token or self.args.add_tgt_lang_token:
                eos_bos.append((end_token, bos_token))

            dataset = PairedDenoisingTask.language_pair_denoising_dataset(
                data_path,
                do_mask,
                split,
                src,
                self.source_dictionary,
                tgt,
                self.target_dictionary,
                self.mask_idx,
                lang_mask_whole_src_words,
                self.args.seed,
                self.args,
                self.args.dataset_impl,
                combine=combine,
                left_pad_source=utils.eval_bool(self.args.left_pad_source),
                left_pad_target=utils.eval_bool(self.args.left_pad_target),
                max_source_positions=self.args.max_source_positions,
                max_target_positions=self.args.max_target_positions,
                src_lang_id=src_lang_id,
            )

            lang_datasets.append(dataset)

        if len(lang_datasets) == 0:
            return
        elif len(lang_datasets) == 1:
            dataset = lang_datasets[0]
            if self.args.add_src_lang_token or self.args.add_tgt_lang_token:
                end_token, bos_token = eos_bos[0]
                dataset = TransformEosLangPairDataset(
                    dataset,
                    src_eos=self.source_dictionary.eos(),
                    new_src_eos=end_token,
                    tgt_bos=self.target_dictionary.eos(),
                    new_tgt_bos=bos_token,
                )
        else:
            end_tokens = [item[0] for item in eos_bos if item[0] is not None]
            bos_tokens = [item[1] for item in eos_bos if item[1] is not None]
            lang_datasets = self.resample_datasets(lang_datasets, lang_pairs,
                                                   epoch)
            dataset = TransformEosConcatLangPairDataset(
                lang_datasets,
                self.source_dictionary.eos(),
                self.target_dictionary.eos(),
                new_src_eos=end_tokens,
                new_tgt_bos=bos_tokens,
            )
        return dataset
Example No. 16
    def prepare(cls, load_dictionary, args, **kargs):
        args.left_pad_source = utils.eval_bool(args.left_pad_source)
        args.left_pad_target = utils.eval_bool(args.left_pad_target)

        if not hasattr(args, "shuffle_instance"):
            args.shuffle_instance = False
        if args.langtoks is None:
            args.langtoks = {}
        if "main" not in args.langtoks:
            src_langtok_spec = args.encoder_langtok if args.encoder_langtok else None
            tgt_langtok_spec = "tgt" if args.decoder_langtok else None
            args.langtoks["main"] = (src_langtok_spec, tgt_langtok_spec)

        def check_langs(langs, pairs):
            messages = []
            for src, tgt in pairs:
                if src not in langs or tgt not in langs:
                    messages.append(
                        f"language pair {src}-{tgt} contains languages "
                        "that are not in the language dictionary"
                    )
            if len(messages) > 0:
                raise ValueError(" ".join(messages) + f"; langs: {langs}")

        if args.lang_pairs is None:
            raise ValueError(
                "--lang-pairs is required. List all the language pairs in the training objective."
            )
        if isinstance(args.lang_pairs, str):
            args.lang_pairs = args.lang_pairs.split(",")
        if args.source_lang is not None or args.target_lang is not None:
            training = False
        else:
            training = True
        sorted_langs = cls.load_langs(args, **kargs)
        check_langs(
            sorted_langs,
            (
                [p.split("-") for p in args.lang_pairs]
                if training
                else [(args.source_lang, args.target_lang)]
            ),
        )

        # load dictionaries
        if training:
            extra_lang_pairs = (
                list(
                    {p for _, v in args.extra_lang_pairs.items() for p in v.split(",")}
                )
                if args.extra_lang_pairs
                else []
            )
            langs_to_load_dicts = sorted(
                {x for p in args.lang_pairs + extra_lang_pairs for x in p.split("-")}
            )
        else:
            langs_to_load_dicts = sorted([args.source_lang, args.target_lang])

        dicts = OrderedDict()
        supported_langtok_specs = args.langtoks_specs
        for lang in langs_to_load_dicts:
            paths = utils.split_paths(args.data)
            assert len(paths) > 0
            dicts[lang] = load_dictionary(
                os.path.join(paths[0], "dict.{}.txt".format(lang))
            )
            if len(dicts) > 0:
                assert dicts[lang].pad() == dicts[langs_to_load_dicts[0]].pad()
                assert dicts[lang].eos() == dicts[langs_to_load_dicts[0]].eos()
                assert dicts[lang].unk() == dicts[langs_to_load_dicts[0]].unk()

            # keep the langs consistent for all experiments with the same lang dict
            # for finetuning regardless of whether lang_tok is required or not just add the tokens to the dicts
            for spec in supported_langtok_specs:
                for lang_to_add in sorted_langs:
                    dicts[lang].add_symbol(
                        MultilingualDatasetManager.get_lang_tok(lang_to_add, args, spec)
                    )
            if args.lang_tok_style == "mbart" or (
                args.extra_data and "mono_dae" in args.extra_data
            ):
                dicts[lang].add_symbol("<mask>")
            logger.info("[{}] dictionary: {} types".format(lang, len(dicts[lang])))
        return sorted_langs, dicts, training
Example No. 17
    def setup_task(cls, args, **kwargs):
        """Setup the task (e.g., load dictionaries).

        Args:
            args (argparse.Namespace): parsed command-line arguments
        """
        args.left_pad_source = utils.eval_bool(args.left_pad_source)
        args.left_pad_target = utils.eval_bool(args.left_pad_target)

        paths = utils.split_paths(args.data)
        assert len(paths) > 0
        # find language pair automatically
        if args.source_lang is None or args.target_lang is None:
            args.source_lang, args.target_lang = data_utils.infer_language_pair(
                paths[0])
        if args.source_lang is None or args.target_lang is None:
            raise Exception(
                'Could not infer language pair, please provide it explicitly')

        # load dictionaries
        tgt_first = bool(args.use_bert_model)
        if tgt_first:
            tgt_dict = cls.load_dictionary(os.path.join(
                paths[0], 'dict.{}.txt'.format(args.target_lang)),
                                           custom_bos=args.bos,
                                           custom_pad=args.pad,
                                           custom_eos=args.eos,
                                           custom_unk=args.unk,
                                           add_sentence_limit_words_after=True)
            bos_id_tgt = tgt_dict.bos()
            pad_id_tgt = tgt_dict.pad()
            eos_id_tgt = tgt_dict.eos()
            unk_id_tgt = tgt_dict.unk()
            src_dict = cls.load_dictionary(os.path.join(
                paths[0], 'dict.{}.txt'.format(args.source_lang)),
                                           custom_bos=args.bos,
                                           custom_pad=args.pad,
                                           custom_eos=args.eos,
                                           custom_unk=args.unk,
                                           add_sentence_limit_words_after=True,
                                           tgt_first=tgt_first,
                                           bos_id_tgt=bos_id_tgt,
                                           pad_id_tgt=pad_id_tgt,
                                           eos_id_tgt=eos_id_tgt,
                                           unk_id_tgt=unk_id_tgt)
        else:
            src_dict = cls.load_dictionary(
                os.path.join(paths[0], 'dict.{}.txt'.format(args.source_lang)))
            tgt_dict = cls.load_dictionary(
                os.path.join(paths[0], 'dict.{}.txt'.format(args.target_lang)))

        # print(src_dict.pad(), '', tgt_dict.pad())
        # print(src_dict.bos(), '', tgt_dict.bos())
        # print(src_dict.eos(), '', tgt_dict.eos())
        # print(src_dict.unk(), '', tgt_dict.unk())
        assert src_dict.pad() == tgt_dict.pad()
        assert src_dict.eos() == tgt_dict.eos()
        assert src_dict.unk() == tgt_dict.unk()
        logger.info('[{}] dictionary: {} types'.format(args.source_lang,
                                                       len(src_dict)))
        logger.info('[{}] dictionary: {} types'.format(args.target_lang,
                                                       len(tgt_dict)))

        return cls(args, src_dict, tgt_dict)
Example No. 18
    def add_args(parser):
        """Add model-specific arguments to the parser."""
        # fmt: off
        parser.add_argument("--dropout", type=float, metavar="D",
                            help="dropout probability")
        parser.add_argument("--encoder-conv-channels", type=str, metavar="EXPR",
                            help="list of encoder convolution\'s out channels")
        parser.add_argument("--encoder-conv-kernel-sizes", type=str, metavar="EXPR",
                            help="list of encoder convolution\'s kernel sizes")
        parser.add_argument("--encoder-conv-strides", type=str, metavar="EXPR",
                            help="list of encoder convolution\'s strides")
        parser.add_argument("--encoder-rnn-hidden-size", type=int, metavar="N",
                            help="encoder rnn\'s hidden size")
        parser.add_argument("--encoder-rnn-layers", type=int, metavar="N",
                            help="number of rnn encoder layers")
        parser.add_argument("--encoder-rnn-bidirectional",
                            type=lambda x: utils.eval_bool(x),
                            help="make all rnn layers of encoder bidirectional")
        parser.add_argument("--encoder-rnn-residual",
                            type=lambda x: utils.eval_bool(x),
                            help="create residual connections for rnn encoder "
                            "layers (starting from the 2nd layer), i.e., the actual "
                            "output of such layer is the sum of its input and output")
        parser.add_argument("--decoder-embed-dim", type=int, metavar="N",
                            help="decoder embedding dimension")
        parser.add_argument("--decoder-embed-path", type=str, metavar="STR",
                            help="path to pre-trained decoder embedding")
        parser.add_argument("--decoder-freeze-embed", action="store_true",
                            help="freeze decoder embeddings")
        parser.add_argument("--decoder-hidden-size", type=int, metavar="N",
                            help="decoder hidden size")
        parser.add_argument("--decoder-layers", type=int, metavar="N",
                            help="number of decoder layers")
        parser.add_argument("--decoder-out-embed-dim", type=int, metavar="N",
                            help="decoder output embedding dimension")
        parser.add_argument("--decoder-rnn-residual",
                            type=lambda x: utils.eval_bool(x),
                            help="create residual connections for rnn decoder "
                            "layers (starting from the 2nd layer), i.e., the actual "
                            "output of such layer is the sum of its input and output")
        parser.add_argument("--attention-type", type=str, metavar="STR",
                            choices=["bahdanau", "luong"],
                            help="attention type")
        parser.add_argument("--attention-dim", type=int, metavar="N",
                            help="attention dimension")
        parser.add_argument("--need-attention", action="store_true",
                            help="need to return attention tensor for the caller")
        parser.add_argument("--adaptive-softmax-cutoff", metavar="EXPR",
                            help="comma separated list of adaptive softmax cutoff points. "
                                 "Must be used with adaptive_loss criterion")
        parser.add_argument("--share-decoder-input-output-embed",
                            type=lambda x: utils.eval_bool(x),
                            help="share decoder input and output embeddings")
        parser.add_argument("--pretrained-lm-checkpoint", type=str, metavar="STR",
                            help="path to load checkpoint from pretrained language model(LM), "
                            "which will be present and kept fixed during training.")

        # Granular dropout settings (if not specified these default to --dropout)
        parser.add_argument("--encoder-rnn-dropout-in", type=float, metavar="D",
                            help="dropout probability for encoder rnn\'s input")
        parser.add_argument("--encoder-rnn-dropout-out", type=float, metavar="D",
                            help="dropout probability for encoder rnn\'s output")
        parser.add_argument("--decoder-dropout-in", type=float, metavar="D",
                            help="dropout probability for decoder input embedding")
        parser.add_argument("--decoder-dropout-out", type=float, metavar="D",
                            help="dropout probability for decoder output")

        # Scheduled sampling options
        parser.add_argument("--scheduled-sampling-probs", type=lambda p: utils.eval_str_list(p),
                            metavar="P_1,P_2,...,P_N", default=[1.0],
                            help="scheduled sampling probabilities of sampling the truth "
                            "labels for N epochs starting from --start-schedule-sampling-epoch; "
                            "all later epochs using P_N")
        parser.add_argument("--start-scheduled-sampling-epoch", type=int,
                            metavar="N", default=1,
                            help="start scheduled sampling from the specified epoch")