Example #1

The `add_args` helper below registers, on an `argparse` parser, every command-line option consumed by a multilingual translation task (it appears to come from fairseq's multilingual dataset manager).
    def add_args(parser):
        parser.add_argument(
            "data",
            help="colon separated path to data directories list, \
                            will be iterated upon during epochs in round-robin manner",
            action=FileContentsAction,
        )
        parser.add_argument(
            "--langs",
            default=None,
            type=csv_str_list,
            help="a list of languages comma sperated languages which can appear in lang-pairs; "
            "note that the ordering determines language token IDs",
            action=FileContentsAction,
        )
        parser.add_argument(
            "--lang-dict",
            default=None,
            type=str,
            help="an external file which contains a list of "
            "languages which can appear in lang-pairs; "
            "note that the ordering determines language token IDs; "
            "--langs and --lang-dict are two exclusive options",
        )
        parser.add_argument(
            "--lang-tok-style",
            default=LangTokStyle.multilingual.value,
            type=str,
            choices=[LangTokStyle.multilingual.value, LangTokStyle.mbart.value],
            help="language token styles",
        )

        parser.add_argument(
            "--load-alignments",
            action="store_true",
            help="load the binarized alignments",
        )
        parser.add_argument(
            "--left-pad-source",
            default="True",
            type=str,
            metavar="BOOL",
            help="pad the source on the left",
        )
        parser.add_argument(
            "--left-pad-target",
            default="False",
            type=str,
            metavar="BOOL",
            help="pad the target on the left",
        )
        parser.add_argument(
            "--max-source-positions",
            default=1024,
            type=int,
            metavar="N",
            help="max number of tokens in the source sequence",
        )
        parser.add_argument(
            "--max-target-positions",
            default=1024,
            type=int,
            metavar="N",
            help="max number of tokens in the target sequence",
        )
        parser.add_argument(
            "--upsample-primary",
            default=1,
            type=int,
            help="amount to upsample primary dataset",
        )
        parser.add_argument(
            "--truncate-source",
            action="store_true",
            default=False,
            help="truncate source to max-source-positions",
        )
        parser.add_argument(
            "--encoder-langtok",
            default=None,
            type=str,
            choices=[EncoderLangtok.src.value, EncoderLangtok.tgt.value],
            metavar="SRCTGT",
            help="prepend to the beginning of source sentence the source or target "
            "language token. (src/tgt)",
        )
        parser.add_argument(
            "--decoder-langtok",
            action="store_true",
            help="prepend to the beginning of target sentence the target language token",
        )
        parser.add_argument(
            "--lang-tok-replacing-bos-eos", action="store_true", default=False
        )
        parser.add_argument(
            "--enable-lang-ids",
            default=False,
            action="store_true",
            help="whether to include language IDs in samples",
        )
        parser.add_argument(
            "--enable-reservsed-directions-shared-datasets",
            default=False,
            action="store_true",
            help="whether to allow datasets be used in reversed directions",
        )

        parser.add_argument(
            "--extra-data",
            help='a dictionary of data name to its path, \
                            e.g. {"mined": path_to_mined_data, "denoised": path_to_denoised_data}',
            type=lambda uf: eval_str_dict(uf, type=str),
            default=None,
        )
        parser.add_argument(
            "--extra-lang-pairs",
            help='a dictionary of data name to the language pairs it serves, \
                            e.g. {"mined": comma-separated-lang-pairs, "denoised": comma-separated-lang-pairs}',
            type=lambda uf: eval_str_dict(uf, type=str),
            default=None,
        )
        parser.add_argument(
            "--langtoks-specs",
            help='a comma-separated list of data types for which a set of language tokens \
                            will be specialized, e.g. "main,dae,mined". A set of language tokens is added to the \
                            vocab to distinguish languages in different training data types. If not specified, \
                            default language tokens per language will be added',
            default=LangTokSpec.main.value,
            type=csv_str_list,
        )
        parser.add_argument(
            "--langtoks",
            help='a dictionary of how to add language tokens, \
                            e.g. {"mined": (None, "tgt"), "mono_dae": ("src.dae", "tgt"), "main": \
                            ("src", "tgt")}, or {"mined": ("src.mined", "tgt")}',
            default=None,
            type=lambda uf: eval_str_dict(uf, type=str),
        )
        parser.add_argument(
            "--sampling-weights-from-file",
            help='a file containing a python dictionary of how to sample data sets, \
                                e.g. { "main:en_XX-es_XX": 0.2, "mined:en_XX-pt_XX": 0.5, \
                                    "mono_dae:es_XX-es_XX": 0.3, "main:en_XX-fr_XX": 0.8 }',
            default=None,
            type=str,
        )
        parser.add_argument(
            "--sampling-weights",
            help='a dictionary of how to sample data sets, \
                            e.g. { "main:en_XX-es_XX": 0.2, "mined:en_XX-pt_XX": 0.5, \
                                   "mono_dae:es_XX-es_XX": 0.3, "main:en_XX-fr_XX": 0.8 }',
            default=None,
            type=lambda uf: eval_str_dict(uf, type=str),
        )
        parser.add_argument(
            "--virtual-epoch-size",
            default=1000000,
            type=int,
            help="virtual epoch size to speed up data loading",
        )
        parser.add_argument(
            "--virtual-data-size",
            default=None,
            type=int,
            help="virtual data size of the whole joint dataset to speed"
            "up data loading and have specific dynamic sampling strategy interval",
        )
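
The function above relies on two small parsing helpers, csv_str_list and eval_str_dict, that the snippet does not show (in fairseq they come from the library's utility modules). A minimal sketch of what they are assumed to do:

    import ast

    def csv_str_list(x):
        # "en,fr,de" -> ["en", "fr", "de"]
        return x.split(",")

    def eval_str_dict(x, type=str):
        # Parse a Python-dict-literal string into a dict, pass dicts through;
        # ast.literal_eval is a safe stand-in for however fairseq evaluates it.
        if x is None:
            return None
        if isinstance(x, str):
            x = ast.literal_eval(x)
        return x

    print(csv_str_list("en,fr,de"))                    # ['en', 'fr', 'de']
    print(eval_str_dict('{"main:en_XX-es_XX": 0.2}'))  # {'main:en_XX-es_XX': 0.2}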
Example #2

A second version of the same method, formatted differently and using literal strings in place of the enum constants:

    def add_args(parser):
        parser.add_argument(
            'data',
            help='colon-separated list of data directory paths, \
                            iterated over in round-robin manner across epochs'
        )
        parser.add_argument(
            '--langs',
            default=None,
            type=csv_str_list,
            help='a comma-separated list of languages that can appear in lang-pairs; '
            'note that the ordering determines language token IDs',
        )
        parser.add_argument(
            '--lang-dict',
            default=None,
            type=str,
            help='an external file which contains a list of '
            'languages which can appear in lang-pairs; '
            'note that the ordering determines language token IDs; '
            '--langs and --lang-dict are mutually exclusive options')
        parser.add_argument('--lang-tok-style',
                            default='multilingual',
                            type=str,
                            choices=['multilingual', 'mbart'],
                            help='language token styles')

        parser.add_argument('--load-alignments',
                            action='store_true',
                            help='load the binarized alignments')
        parser.add_argument('--left-pad-source',
                            default='True',
                            type=str,
                            metavar='BOOL',
                            help='pad the source on the left')
        parser.add_argument('--left-pad-target',
                            default='False',
                            type=str,
                            metavar='BOOL',
                            help='pad the target on the left')
        parser.add_argument('--max-source-positions',
                            default=1024,
                            type=int,
                            metavar='N',
                            help='max number of tokens in the source sequence')
        parser.add_argument('--max-target-positions',
                            default=1024,
                            type=int,
                            metavar='N',
                            help='max number of tokens in the target sequence')
        parser.add_argument('--upsample-primary',
                            default=1,
                            type=int,
                            help='amount to upsample primary dataset')
        parser.add_argument('--truncate-source',
                            action='store_true',
                            default=False,
                            help='truncate source to max-source-positions')
        parser.add_argument(
            '--encoder-langtok',
            default=None,
            type=str,
            choices=['src', 'tgt'],
            metavar='SRCTGT',
            help='prepend the source or target language token to the beginning '
            'of the source sentence (src/tgt)')
        parser.add_argument(
            '--decoder-langtok',
            action='store_true',
            help='prepend the target language token to the beginning of the target sentence'
        )
        parser.add_argument('--lang-tok-replacing-bos-eos',
                            action='store_true',
                            default=False)
        parser.add_argument('--enable-lang-ids',
                            default=False,
                            action='store_true',
                            help='whether to include language IDs in samples')
        parser.add_argument(
            '--enable-reservsed-directions-shared-datasets',
            default=False,
            action='store_true',
            help='whether to allow datasets to be used in reversed directions')

        parser.add_argument('--extra-data',
                            help='a dictionary of data name to its path, \
                            e.g. {"mined": path_to_mined_data, "denoised": path_to_denoised_data}',
                            type=lambda uf: eval_str_dict(uf, type=str),
                            default=None)
        parser.add_argument(
            '--extra-lang-pairs',
            help='a dictionary of data name to the language pairs it serves, \
                            e.g. {"mined": comma-separated-lang-pairs, "denoised": comma-separated-lang-pairs}',
            type=lambda uf: eval_str_dict(uf, type=str),
            default=None)
        parser.add_argument(
            '--langtoks-specs',
            help='a comma-separated list of data types for which a set of language tokens \
                            will be specialized, e.g. "main,dae,mined". A set of language tokens is added to the \
                            vocab to distinguish languages in different training data types. If not specified, \
                            default language tokens per language will be added',
            default='main',
            type=csv_str_list,
        )
        parser.add_argument(
            '--langtoks',
            help='a dictionary of how to add language tokens, \
                            e.g. {"mined": (None, "tgt"), "mono_dae": ("src.dae", "tgt"), "main": \
                            ("src", "tgt")}, or {"mined": ("src.mined", "tgt")}',
            default=None,
            type=lambda uf: eval_str_dict(uf, type=str),
        )
        parser.add_argument(
            '--sampling-weights-from-file',
            help='a file containing a python dictionary of how to sample data sets, \
                                e.g. { "main:en_XX-es_XX": 0.2, "mined:en_XX-pt_XX": 0.5, \
                                    "mono_dae:es_XX-es_XX": 0.3, "main:en_XX-fr_XX": 0.8 }',
            default=None,
            type=str,
        )
        parser.add_argument(
            '--sampling-weights',
            help='a dictionary of how to sample data sets, \
                            e.g. { "main:en_XX-es_XX": 0.2, "mined:en_XX-pt_XX": 0.5, \
                                   "mono_dae:es_XX-es_XX": 0.3, "main:en_XX-fr_XX": 0.8 }',
            default=None,
            type=lambda uf: eval_str_dict(uf, type=str),
        )
        parser.add_argument('--virtual-epoch-size',
                            default=1000000,
                            type=int,
                            help='virtual epoch size to speed up data loading')
        parser.add_argument(
            '--virtual-data-size',
            default=None,
            type=int,
            help='virtual data size of the whole joint dataset to speed '
            'up data loading and have a specific dynamic sampling strategy interval'
        )
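
A minimal driver sketch for the version above, assuming add_args is reachable as a plain function and that the helper sketches shown earlier (csv_str_list, eval_str_dict) are in scope; in fairseq itself, add_args appears to be a static method on the dataset-manager class:

    import argparse

    parser = argparse.ArgumentParser()
    add_args(parser)  # registers "data" plus all the optional flags above

    args = parser.parse_args([
        "/data/bin1:/data/bin2",          # positional "data": colon-separated dirs
        "--langs", "en,fr,de",            # csv_str_list -> ['en', 'fr', 'de']
        "--encoder-langtok", "src",
        "--sampling-weights", '{"main:en_XX-es_XX": 0.2}',
    ])
    print(args.langs)             # ['en', 'fr', 'de']
    print(args.encoder_langtok)   # 'src'
    print(args.sampling_weights)  # {'main:en_XX-es_XX': 0.2}
    print(args.left_pad_source)   # 'True' (kept as a string, per metavar BOOL)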