def add_optimization_args(parser):
    group = parser.add_argument_group('Optimization')
    group.add_argument('--max-epoch', '--me', default=0, type=int, metavar='N',
                       help='force stop training at specified epoch')
    group.add_argument('--max-update', '--mu', default=0, type=int, metavar='N',
                       help='force stop training at specified update')
    group.add_argument('--target-bleu', default=0.0, type=float, metavar='TARGET',
                       help='force stop training after reaching target bleu')
    group.add_argument('--clip-norm', default=25, type=float, metavar='NORM',
                       help='clip threshold of gradients')
    group.add_argument('--sentence-avg', action='store_true',
                       help='normalize gradients by the number of sentences in a batch'
                            ' (default is to normalize by number of tokens)')
    group.add_argument('--update-freq', default='1', metavar='N',
                       help='update parameters every N_i batches, when in epoch i')
    # Optimizer definitions can be found under fairseq/optim/
    group.add_argument('--optimizer', default='nag', metavar='OPT',
                       choices=OPTIMIZER_REGISTRY.keys(),
                       help='optimizer: {} (default: nag)'.format(', '.join(OPTIMIZER_REGISTRY.keys())))
    group.add_argument('--lr', '--learning-rate', default='0.25', metavar='LR_1,LR_2,...,LR_N',
                       help='learning rate for the first N epochs; all epochs >N using LR_N'
                            ' (note: this may be interpreted differently depending on --lr-scheduler)')
    group.add_argument('--momentum', default=0.99, type=float, metavar='M',
                       help='momentum factor')
    group.add_argument('--weight-decay', '--wd', default=0.0, type=float, metavar='WD',
                       help='weight decay')
    # Learning rate schedulers can be found under fairseq/optim/lr_scheduler/
    group.add_argument('--lr-scheduler', default='reduce_lr_on_plateau',
                       help='learning rate scheduler: {} (default: reduce_lr_on_plateau)'.format(
                           ', '.join(LR_SCHEDULER_REGISTRY.keys())))
    group.add_argument('--lr-shrink', default=0.1, type=float, metavar='LS',
                       help='learning rate shrink factor for annealing, lr_new = (lr * lr_shrink)')
    group.add_argument('--min-lr', default=1e-5, type=float, metavar='LR',
                       help='minimum learning rate')
    group.add_argument('--min-loss-scale', default=1e-4, type=float, metavar='D',
                       help='minimum loss scale (for FP16 training)')

    # Criterion args
    parser.add_argument('--label-smoothing', default=0., type=float, metavar='D',
                        help='epsilon for label smoothing, 0 means no label smoothing')

    # Parallel backward + all-reduce optimization
    group.add_argument('--enable-parallel-backward-allred-opt', action='store_true',
                       help='enable all-reduce of w-gradients in parallel with backward propagation (only for FP16 training)')
    group.add_argument('--parallel-backward-allred-opt-threshold', type=int, default=0, metavar='N',
                       help='min num of contiguous gradient elements before all-reduce is triggered')
    group.add_argument('--enable-parallel-backward-allred-opt-correctness-check', action='store_true',
                       help='compare w-gradient values obtained doing all-reduce in parallel vs. at the end')

    return group
def add_optimization_args(parser):
    group = parser.add_argument_group('Optimization')
    # fmt: off
    group.add_argument('--max-epoch', '--me', default=0, type=int, metavar='N',
                       help='force stop training at specified epoch')
    group.add_argument('--max-update', '--mu', default=0, type=int, metavar='N',
                       help='force stop training at specified update')
    group.add_argument('--clip-norm', default=25, type=float, metavar='NORM',
                       help='clip threshold of gradients')
    group.add_argument('--sentence-avg', action='store_true',
                       help='normalize gradients by the number of sentences in a batch'
                            ' (default is to normalize by number of tokens)')
    group.add_argument('--update-freq', default='1', metavar='N1,N2,...,N_K',
                       type=lambda uf: eval_str_list(uf, type=int),
                       help='update parameters every N_i batches, when in epoch i')
    # Optimizer definitions can be found under fairseq/optim/
    group.add_argument('--optimizer', default='nag', metavar='OPT',
                       choices=OPTIMIZER_REGISTRY.keys(),
                       help='Optimizer')
    group.add_argument('--lr', '--learning-rate', default='0.25', type=eval_str_list,
                       metavar='LR_1,LR_2,...,LR_N',
                       help='learning rate for the first N epochs; all epochs >N using LR_N'
                            ' (note: this may be interpreted differently depending on --lr-scheduler)')
    # Learning rate schedulers can be found under fairseq/optim/lr_scheduler/
    group.add_argument('--lr-scheduler', default='fixed',
                       choices=LR_SCHEDULER_REGISTRY.keys(),
                       help='Learning Rate Scheduler')
    group.add_argument('--min-lr', default=-1, type=float, metavar='LR',
                       help='stop training when the learning rate reaches this minimum')
    # fmt: on
    return group
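# The variant above parses --lr and --update-freq through an `eval_str_list`
# helper that is not shown in this listing (in fairseq it lives in
# fairseq/options.py). A minimal sketch of what such a helper could look like,
# assuming it only needs to turn a comma-separated string such as '0.25,0.1'
# into a typed list; the real implementation may handle more input forms:
def eval_str_list(x, type=float):
    """Parse '0.25' or '0.25,0.1,0.05' (or an already-built list) into a list of `type`."""
    if x is None:
        return None
    if isinstance(x, str):
        x = x.strip('[]() ').split(',')
    try:
        return [type(v) for v in x]
    except TypeError:
        # a single scalar was passed in
        return [type(x)]

# Example: eval_str_list('0.25,0.1', type=float) -> [0.25, 0.1]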
def add_optimization_args(parser):
    group = parser.add_argument_group('Optimization')
    group.add_argument('--max-epoch', '--me', default=0, type=int, metavar='N',
                       help='force stop training at specified epoch')
    group.add_argument('--max-update', '--mu', default=0, type=int, metavar='N',
                       help='force stop training at specified update')
    group.add_argument('--clip-norm', default=25, type=float, metavar='NORM',
                       help='clip threshold of gradients')
    group.add_argument('--sentence-avg', action='store_true',
                       help='normalize gradients by the number of sentences in a batch'
                            ' (default is to normalize by number of tokens)')
    group.add_argument('--update-freq', default='1', metavar='N',
                       help='update parameters every N_i batches, when in epoch i')
    # Optimizer definitions can be found under fairseq/optim/
    group.add_argument('--optimizer', default='nag', metavar='OPT',
                       choices=OPTIMIZER_REGISTRY.keys(),
                       help='optimizer: {} (default: nag)'.format(', '.join(OPTIMIZER_REGISTRY.keys())))
    group.add_argument('--lr', '--learning-rate', default='0.25', metavar='LR_1,LR_2,...,LR_N',
                       help='learning rate for the first N epochs; all epochs >N using LR_N'
                            ' (note: this may be interpreted differently depending on --lr-scheduler)')
    group.add_argument('--momentum', default=0.99, type=float, metavar='M',
                       help='momentum factor')
    group.add_argument('--weight-decay', '--wd', default=0.0, type=float, metavar='WD',
                       help='weight decay')
    # Learning rate schedulers can be found under fairseq/optim/lr_scheduler/
    group.add_argument('--lr-scheduler', default='reduce_lr_on_plateau',
                       help='learning rate scheduler: {} (default: reduce_lr_on_plateau)'.format(
                           ', '.join(LR_SCHEDULER_REGISTRY.keys())))
    group.add_argument('--lr-shrink', default=0.1, type=float, metavar='LS',
                       help='learning rate shrink factor for annealing, lr_new = (lr * lr_shrink)')
    group.add_argument('--min-lr', default=1e-5, type=float, metavar='LR',
                       help='minimum learning rate')
    group.add_argument('--min-loss-scale', default=1e-4, type=float, metavar='D',
                       help='minimum loss scale (for FP16 training)')
    # Adversarial loss hyper-parameters
    group.add_argument('--adv_bias', default=8000, type=int, metavar='N',
                       help='threshold for rare-pop words')
    group.add_argument('--adv_lambda', default=0.0, type=float, metavar='D',
                       help='weight of adversarial loss')
    group.add_argument('--adv_lr', default=0.01, type=float, metavar='D',
                       help='learning rate for adversarial updates')
    group.add_argument('--adv_wdecay', default=0.0, type=float, metavar='D',
                       help='weight decay for adversarial updates')
    group.add_argument('--adv_updates', default=1, type=int, metavar='N',
                       help='number of adversarial updates')
    return group
def add_optimization_args(parser):
    group = parser.add_argument_group('Optimization')
    # fmt: off
    group.add_argument('--max-epoch', '--me', default=0, type=int, metavar='N',
                       help='force stop training at specified epoch')
    group.add_argument('--max-update', '--mu', default=0, type=int, metavar='N',
                       help='force stop training at specified update')
    group.add_argument('--clip-norm', default=25, type=float, metavar='NORM',
                       help='clip threshold of gradients')
    group.add_argument('--sentence-avg', action='store_true',
                       help='normalize gradients by the number of sentences in a batch'
                            ' (default is to normalize by number of tokens)')
    group.add_argument('--update-freq', default='1', metavar='N1,N2,...,N_K',
                       type=lambda uf: eval_str_list(uf, type=int),
                       help='update parameters every N_i batches, when in epoch i')
    group.add_argument('--ema-decay', default=0.9999, type=float, metavar='D',
                       help='exponential moving average decay')
    group.add_argument('--no-ema', action='store_true',
                       help='disable exponential moving average')
    # Optimizer definitions can be found under fairseq/optim/
    group.add_argument('--optimizer', default='nag', metavar='OPT',
                       choices=OPTIMIZER_REGISTRY.keys(),
                       help='Optimizer')
    group.add_argument('--lr', '--learning-rate', default='0.25', type=eval_str_list,
                       metavar='LR_1,LR_2,...,LR_N',
                       help='learning rate for the first N epochs; all epochs >N using LR_N'
                            ' (note: this may be interpreted differently depending on --lr-scheduler)')
    group.add_argument('--momentum', default=0.99, type=float, metavar='M',
                       help='momentum factor')
    group.add_argument('--weight-decay', '--wd', default=0.0, type=float, metavar='WD',
                       help='weight decay')
    # Learning rate schedulers can be found under fairseq/optim/lr_scheduler/
    group.add_argument('--lr-scheduler', default='reduce_lr_on_plateau',
                       choices=LR_SCHEDULER_REGISTRY.keys(),
                       help='Learning Rate Scheduler')
    group.add_argument('--lr-shrink', default=0.1, type=float, metavar='LS',
                       help='learning rate shrink factor for annealing, lr_new = (lr * lr_shrink)')
    group.add_argument('--min-lr', default=1e-5, type=float, metavar='LR',
                       help='minimum learning rate')
    # fmt: on
    return group
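# The --ema-decay / --no-ema flags in the variant above control an exponential
# moving average of the model parameters. The sketch below shows how such an EMA
# is typically maintained after each optimizer step; it is illustrative only and
# not necessarily how this particular fork applies the flag.
import torch


def ema_update(ema_model, model, decay=0.9999):
    """In-place update: ema_param = decay * ema_param + (1 - decay) * param."""
    with torch.no_grad():
        for ema_p, p in zip(ema_model.parameters(), model.parameters()):
            ema_p.mul_(decay).add_(p, alpha=1.0 - decay)

# Typical usage, assuming `model` and `args` exist and args.no_ema is False:
#   ema_model = copy.deepcopy(model)
#   ... after every optimizer.step(): ema_update(ema_model, model, decay=args.ema_decay)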
def add_optimization_args(parser):
    group = parser.add_argument_group('Optimization')
    group.add_argument('--max-epoch', '--me', default=0, type=int, metavar='N',
                       help='force stop training at specified epoch')
    group.add_argument('--max-update', '--mu', default=0, type=int, metavar='N',
                       help='force stop training at specified update')
    group.add_argument('--clip-norm', default=25, type=float, metavar='NORM',
                       help='clip threshold of gradients')
    group.add_argument('--sentence-avg', action='store_true',
                       help='normalize gradients by the number of sentences in a batch'
                            ' (default is to normalize by number of tokens)')
    group.add_argument('--update-freq', default='1', metavar='N',
                       help='update parameters every N_i batches, when in epoch i')
    # Optimizer definitions can be found under fairseq/optim/
    group.add_argument('--optimizer', default='nag', metavar='OPT',
                       choices=OPTIMIZER_REGISTRY.keys(),
                       help='Optimizer')
    group.add_argument('--lr', '--learning-rate', default='0.25', metavar='LR_1,LR_2,...,LR_N',
                       help='learning rate for the first N epochs; all epochs >N using LR_N'
                            ' (note: this may be interpreted differently depending on --lr-scheduler)')
    group.add_argument('--momentum', default=0.99, type=float, metavar='M',
                       help='momentum factor')
    group.add_argument('--weight-decay', '--wd', default=0.0, type=float, metavar='WD',
                       help='weight decay')
    # Learning rate schedulers can be found under fairseq/optim/lr_scheduler/
    group.add_argument('--lr-scheduler', default='reduce_lr_on_plateau',
                       choices=LR_SCHEDULER_REGISTRY.keys(),
                       help='Learning Rate Scheduler')
    group.add_argument('--lr-shrink', default=0.1, type=float, metavar='LS',
                       help='learning rate shrink factor for annealing, lr_new = (lr * lr_shrink)')
    group.add_argument('--min-lr', default=1e-5, type=float, metavar='LR',
                       help='minimum learning rate')
    group.add_argument('--min-loss-scale', default=1e-4, type=float, metavar='D',
                       help='minimum loss scale (for FP16 training)')
    return group
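# In all of these variants --update-freq delays the optimizer step so that
# gradients from N consecutive batches are accumulated before the parameters are
# updated, emulating a larger effective batch size. A minimal sketch of that
# accumulation loop, assuming an ordinary PyTorch model/optimizer/criterion
# (the actual trainers in these forks are considerably more involved):
def train_epoch(model, optimizer, criterion, batches, update_freq=1):
    """Step the optimizer only every `update_freq` batches."""
    optimizer.zero_grad()
    for i, (inputs, targets) in enumerate(batches, start=1):
        loss = criterion(model(inputs), targets)
        # scale so the accumulated gradient matches one large batch
        (loss / update_freq).backward()
        if i % update_freq == 0:
            optimizer.step()
            optimizer.zero_grad()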
def add_optimization_args(parser):
    group = parser.add_argument_group('Optimization')
    group.add_argument('--max-epoch', '--me', default=0, type=int, metavar='N',
                       help='force stop training at specified epoch')
    group.add_argument('--max-update', '--mu', default=0, type=int, metavar='N',
                       help='force stop training at specified update')
    group.add_argument('--clip-norm', default=25, type=float, metavar='NORM',
                       help='clip threshold of gradients')
    group.add_argument('--sentence-avg', action='store_true',
                       help='normalize gradients by the number of sentences in a batch'
                            ' (default is to normalize by number of tokens)')
    # Optimizer definitions can be found under fairseq/optim/
    group.add_argument('--optimizer', default='nag', metavar='OPT',
                       choices=OPTIMIZER_REGISTRY.keys(),
                       help='optimizer: {} (default: nag)'.format(', '.join(OPTIMIZER_REGISTRY.keys())))
    group.add_argument('--lr', '--learning-rate', default='0.25', metavar='LR_1,LR_2,...,LR_N',
                       help='learning rate for the first N epochs; all epochs >N using LR_N'
                            ' (note: this may be interpreted differently depending on --lr-scheduler)')
    group.add_argument('--momentum', default=0.99, type=float, metavar='M',
                       help='momentum factor')
    group.add_argument('--weight-decay', '--wd', default=0.0, type=float, metavar='WD',
                       help='weight decay')
    # Learning rate schedulers can be found under fairseq/optim/lr_scheduler/
    group.add_argument('--lr-scheduler', default='reduce_lr_on_plateau',
                       help='learning rate scheduler: {} (default: reduce_lr_on_plateau)'.format(
                           ', '.join(LR_SCHEDULER_REGISTRY.keys())))
    group.add_argument('--lr-shrink', default=0.1, type=float, metavar='LS',
                       help='learning rate shrink factor for annealing, lr_new = (lr * lr_shrink)')
    group.add_argument('--min-lr', default=1e-5, type=float, metavar='LR',
                       help='minimum learning rate')
    group.add_argument('--sample-without-replacement', default=0, type=int, metavar='N',
                       help='If bigger than 0, use that number of mini-batches for each epoch,'
                            ' where each sample is drawn randomly without replacement from the'
                            ' dataset')
    group.add_argument('--curriculum', default=0, type=int, metavar='N',
                       help='sort batches by source length for first N epochs')
    return group
def add_optimization_args(parser):
    group = parser.add_argument_group('Optimization')
    group.add_argument('--max-epoch', '--me', default=0, type=int, metavar='N',
                       help='force stop training at specified epoch')
    group.add_argument('--max-update', '--mu', default=0, type=int, metavar='N',
                       help='force stop training at specified update')
    group.add_argument('--clip-norm', default=25, type=float, metavar='NORM',
                       help='clip threshold of gradients')
    group.add_argument('--sentence-avg', action='store_true',
                       help='normalize gradients by the number of sentences in a batch'
                            ' (default is to normalize by number of tokens)')
    group.add_argument('--update-freq', default='1', metavar='N',
                       help='update parameters every N_i batches, when in epoch i')
    group.add_argument('--assistant', action='store_true',
                       help='whether to use the assistant as batch sampler')
    group.add_argument('--batch_method', type=str, default='sentences',
                       help='the method to distribute instances [sentences, bins]')
    group.add_argument('--use-tfidf', action='store_true',
                       help='whether to use the tf-idf version of the assistant')
    group.add_argument('--spl', action='store_true',
                       help='whether to use spl')
    # Optimizer definitions can be found under fairseq/optim/
    group.add_argument('--optimizer', default='nag', metavar='OPT',
                       choices=OPTIMIZER_REGISTRY.keys(),
                       help='Optimizer')
    group.add_argument('--lr', '--learning-rate', default='0.25', metavar='LR_1,LR_2,...,LR_N',
                       help='learning rate for the first N epochs; all epochs >N using LR_N'
                            ' (note: this may be interpreted differently depending on --lr-scheduler)')
    group.add_argument('--momentum', default=0.99, type=float, metavar='M',
                       help='momentum factor')
    group.add_argument('--weight-decay', '--wd', default=0.0, type=float, metavar='WD',
                       help='weight decay')
    # Learning rate schedulers can be found under fairseq/optim/lr_scheduler/
    group.add_argument('--lr-scheduler', default='reduce_lr_on_plateau',
                       choices=LR_SCHEDULER_REGISTRY.keys(),
                       help='Learning Rate Scheduler')
    group.add_argument('--lr-shrink', default=0.1, type=float, metavar='LS',
                       help='learning rate shrink factor for annealing, lr_new = (lr * lr_shrink)')
    group.add_argument('--min-lr', default=1e-5, type=float, metavar='LR',
                       help='minimum learning rate')
    group.add_argument('--min-loss-scale', default=1e-4, type=float, metavar='D',
                       help='minimum loss scale (for FP16 training)')
    return group
def add_optimization_args(parser):
    group = parser.add_argument_group('Optimization')
    group.add_argument('--max-epoch', '--me', default=0, type=int, metavar='N',
                       help='force stop training at specified epoch')
    group.add_argument('--max-update', '--mu', default=0, type=int, metavar='N',
                       help='force stop training at specified update')
    group.add_argument('--target-bleu', default=0.0, type=float, metavar='TARGET',
                       help='force stop training after reaching target bleu')
    group.add_argument('--clip-norm', default=25, type=float, metavar='NORM',
                       help='clip threshold of gradients')
    group.add_argument('--update-freq', default=[1], nargs='+', type=int,
                       help='update parameters every N_i batches, when in epoch i')
    # Optimizer definitions can be found under fairseq/optim/
    group.add_argument('--optimizer', default='nag', metavar='OPT',
                       choices=OPTIMIZER_REGISTRY.keys(),
                       help='optimizer: {} (default: nag)'.format(', '.join(OPTIMIZER_REGISTRY.keys())))
    group.add_argument('--lr', '--learning-rate', default=[0.25], nargs='+', type=float,
                       help='learning rate for the first N epochs; all epochs >N using LR_N'
                            ' (note: this may be interpreted differently depending on --lr-scheduler)')
    group.add_argument('--momentum', default=0.99, type=float, metavar='M',
                       help='momentum factor')
    group.add_argument('--weight-decay', '--wd', default=0.0, type=float, metavar='WD',
                       help='weight decay')
    # Learning rate schedulers can be found under fairseq/optim/lr_scheduler/
    group.add_argument('--lr-scheduler', default='reduce_lr_on_plateau',
                       help='learning rate scheduler: {} (default: reduce_lr_on_plateau)'.format(
                           ', '.join(LR_SCHEDULER_REGISTRY.keys())))
    group.add_argument('--lr-shrink', default=0.1, type=float, metavar='LS',
                       help='learning rate shrink factor for annealing, lr_new = (lr * lr_shrink)')
    group.add_argument('--min-lr', default=1e-5, type=float, metavar='LR',
                       help='minimum learning rate')
    # Criterion args
    parser.add_argument('--label-smoothing', default=0., type=float, metavar='D',
                        help='epsilon for label smoothing, 0 means no label smoothing')
    return group
def add_optimization_args(parser):
    group = parser.add_argument_group('Optimization')
    group.add_argument('--max-epoch', '--me', default=-1, type=int, metavar='N',
                       help='force stop training at specified epoch')
    group.add_argument('--max-update', '--mu', default=0, type=int, metavar='N',
                       help='force stop training at specified update')
    group.add_argument('--target-bleu', default=0.0, type=float, metavar='TARGET',
                       help='force stop training after reaching target bleu')
    group.add_argument('--clip-norm', default=25, type=float, metavar='NORM',
                       help='clip threshold of gradients')
    group.add_argument('--sentence-avg', action='store_true',
                       help='normalize gradients by the number of sentences in a batch'
                            ' (default is to normalize by number of tokens)')
    group.add_argument('--update-freq', default='1', metavar='N',
                       help='update parameters every N_i batches, when in epoch i')
    # Optimizer definitions can be found under fairseq/optim/
    group.add_argument('--optimizer', default='nag', metavar='OPT',
                       choices=OPTIMIZER_REGISTRY.keys(),
                       help='optimizer: {} (default: nag)'.format(', '.join(OPTIMIZER_REGISTRY.keys())))
    group.add_argument('--lr', '--learning-rate', default='0.25', metavar='LR_1,LR_2,...,LR_N',
                       help='learning rate for the first N epochs; all epochs >N using LR_N'
                            ' (note: this may be interpreted differently depending on --lr-scheduler)')
    group.add_argument('--momentum', default=0.99, type=float, metavar='M',
                       help='momentum factor')
    group.add_argument('--weight-decay', '--wd', default=0.0, type=float, metavar='WD',
                       help='weight decay')
    # Distributed weight update parameters
    group.add_argument('--distributed-weight-update', '--dwu', default=0, type=int, metavar='DWU',
                       help='select distributed weight update strategy')
    group.add_argument('--dwu-group-size', '--dwugs', default=0, type=int, metavar='DWUGS',
                       help='distributed weight update group size. If arg is 0, defaults to one node')
    group.add_argument('--dwu-num-blocks', '--dwunb', default=8, type=int, metavar='DWUNB',
                       help='number of blocks in dwu scheme')
    group.add_argument('--dwu-num-chunks', '--dwunc', default=8, type=int, metavar='DWUNC',
                       help='number of chunks in dwu scheme')
    group.add_argument('--dwu-num-rs-pg', '--dwurspg', default=2, type=int, metavar='DWURSPG',
                       help='number of reduce-scatter streams in dwu scheme')
    group.add_argument('--dwu-num-ar-pg', '--dwuarpg', default=4, type=int, metavar='DWUARPG',
                       help='number of all-reduce streams in dwu scheme')
    group.add_argument('--dwu-num-ag-pg', '--dwuagpg', default=2, type=int, metavar='DWUAGPG',
                       help='number of all-gather streams in dwu scheme')
    group.add_argument('--dwu-full-pipeline', action='store_true',
                       help='whether to do full or partial pipeline')
    group.add_argument('--dwu-overlap-reductions', action='store_true',
                       help='whether to overlap reductions with backprop')
    group.add_argument('--dwu-compute-L2-grad-norm', action='store_true',
                       help='whether to compute L2 grad norm')
    group.add_argument('--dwu-flat-mt', action='store_true',
                       help='whether to flatten gradients with multi tensor scale')
    group.add_argument('--dwu-e5m2-allgather', action='store_true',
                       help='do allgather with e5m2 floats')
    group.add_argument('--dwu-do-not-flatten-model', action='store_true',
                       help='disallow flattening of model parameters')
    # Learning rate schedulers can be found under fairseq/optim/lr_scheduler/
    group.add_argument('--lr-scheduler', default='reduce_lr_on_plateau',
                       help='learning rate scheduler: {} (default: reduce_lr_on_plateau)'.format(
                           ', '.join(LR_SCHEDULER_REGISTRY.keys())))
    group.add_argument('--lr-shrink', default=0.1, type=float, metavar='LS',
                       help='learning rate shrink factor for annealing, lr_new = (lr * lr_shrink)')
    group.add_argument('--min-lr', default=1e-5, type=float, metavar='LR',
                       help='minimum learning rate')
    group.add_argument('--min-loss-scale', default=1e-4, type=float, metavar='D',
                       help='minimum loss scale (for FP16 training)')
    # Parallel backward + all-reduce optimization
    group.add_argument('--enable-parallel-backward-allred-opt', action='store_true',
                       help='enable all-reduce of w-gradients in parallel with backward propagation (only for FP16 training)')
    group.add_argument('--parallel-backward-allred-cuda-nstreams', type=int, default=1, metavar='N',
                       help='num of CUDA streams used for parallel all-reduce')
    group.add_argument('--parallel-backward-allred-opt-threshold', type=int, default=0, metavar='N',
                       help='min num of contiguous gradient elements before all-reduce is triggered')
    group.add_argument('--enable-parallel-backward-allred-opt-correctness-check', action='store_true',
                       help='compare w-gradient values obtained doing all-reduce in parallel vs. at the end')
    group.add_argument('--dataloader-num-workers', type=int, default=1, metavar='N',
                       help='num subprocesses for train data loader')
    group.add_argument('--enable-dataloader-pin-memory', action='store_true',
                       help='enable pin_memory for train data loader')
    return group
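# The --enable-parallel-backward-allred-opt family of flags above overlaps
# gradient all-reduce with back-propagation instead of reducing everything at
# the end. A rough per-parameter sketch of the idea using gradient hooks; the
# real implementation buckets contiguous gradient elements (see
# --parallel-backward-allred-opt-threshold) and uses dedicated CUDA streams,
# which is not reproduced here:
import torch.distributed as dist


def attach_overlapped_allreduce(model, threshold=0):
    """Kick off an async all-reduce for each gradient as soon as backward produces it."""
    handles = []

    def make_hook():
        def hook(grad):
            if grad.numel() >= threshold:
                handles.append(dist.all_reduce(grad, async_op=True))
            return grad
        return hook

    for p in model.parameters():
        if p.requires_grad:
            p.register_hook(make_hook())
    return handles  # wait on these handles before optimizer.step()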
def add_optimization_args(parser):
    group = parser.add_argument_group('Optimization')
    group.add_argument('--max-epoch', '--me', default=0, type=int, metavar='N',
                       help='force stop training at specified epoch')
    group.add_argument('--max-update', '--mu', default=0, type=int, metavar='N',
                       help='force stop training at specified update')
    group.add_argument('--clip-norm', default=25, type=float, metavar='NORM',
                       help='clip threshold of gradients')
    group.add_argument('--sentence-avg', action='store_true',
                       help='normalize gradients by the number of sentences in a batch'
                            ' (default is to normalize by number of tokens)')
    group.add_argument('--update-freq', default='1', metavar='N',
                       help='update parameters every N_i batches, when in epoch i')
    # Optimizer definitions can be found under fairseq/optim/
    group.add_argument('--optimizer', default='nag', metavar='OPT',
                       choices=OPTIMIZER_REGISTRY.keys(),
                       help='optimizer: {} (default: nag)'.format(', '.join(OPTIMIZER_REGISTRY.keys())))
    group.add_argument('--lr', '--learning-rate', default='0.25', metavar='LR_1,LR_2,...,LR_N',
                       help='learning rate for the first N epochs; all epochs >N using LR_N'
                            ' (note: this may be interpreted differently depending on --lr-scheduler)')
    group.add_argument('--momentum', default=0.99, type=float, metavar='M',
                       help='momentum factor')
    group.add_argument('--weight-decay', '--wd', default=0.0, type=float, metavar='WD',
                       help='weight decay')
    # Learning rate schedulers can be found under fairseq/optim/lr_scheduler/
    group.add_argument('--lr-scheduler', default='reduce_lr_on_plateau',
                       help='learning rate scheduler: {} (default: reduce_lr_on_plateau)'.format(
                           ', '.join(LR_SCHEDULER_REGISTRY.keys())))
    group.add_argument('--lr-shrink', default=0.1, type=float, metavar='LS',
                       help='learning rate shrink factor for annealing, lr_new = (lr * lr_shrink)')
    group.add_argument('--min-lr', default=1e-5, type=float, metavar='LR',
                       help='minimum learning rate')
    group.add_argument('--min-loss-scale', default=1e-4, type=float, metavar='D',
                       help='minimum loss scale (for FP16 training)')
    # RL training specification
    group.add_argument('--multinomial-sample-train', default=False, action='store_true',
                       help='use multinomial sampling instead of beam search to draw RL training samples')
    group.add_argument('--delta-reward', default=False, action='store_true',
                       help='use delta reward instead of total reward, default is total reward')
    group.add_argument('--mle-weight', default=0.0, type=float,
                       help='weight of the MLE loss when combining MLE and RL losses')
    group.add_argument('--rl-weight', default=1.0, type=float,
                       help='weight of the RL loss when combining MLE and RL losses')
    group.add_argument('--max-order', default=4, type=int,
                       help='max order for gleu')
    group.add_argument('--sample-beam', default=5, type=int,
                       help='number of translation sentences generated per source sentence in the v2 loss')
    group.add_argument('--gram', default=0, type=int,
                       help="if not 0, only count grams with length 'gram' in GLEU computation")
    group.add_argument('--modgleu', default=False, action='store_true',
                       help='use modified version of GLEU computation')
    return group
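# Minimal example of wiring the last variant above into an argparse parser.
# OPTIMIZER_REGISTRY and LR_SCHEDULER_REGISTRY are assumed to be dicts populated
# elsewhere (in fairseq they are filled by registering classes under
# fairseq/optim/); they are stubbed here purely so the snippet runs standalone.
import argparse

OPTIMIZER_REGISTRY = {'nag': None, 'adam': None, 'sgd': None}
LR_SCHEDULER_REGISTRY = {'fixed': None, 'reduce_lr_on_plateau': None, 'inverse_sqrt': None}

parser = argparse.ArgumentParser('train')
add_optimization_args(parser)
args = parser.parse_args(['--optimizer', 'adam', '--lr', '0.25,0.1', '--max-update', '100000'])
print(args.optimizer, args.lr, args.max_update)  # adam 0.25,0.1 100000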