Example #1

import argparse


def parse_args():
    """
    Parse commandline arguments.
    """
    parser = argparse.ArgumentParser(
        description='GNMT training',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # dataset
    dataset = parser.add_argument_group('dataset setup')
    dataset.add_argument(
        '--dataset-dir',
        default=None,
        required=True,
        help='path to directory with training/validation data')
    dataset.add_argument('--max-size',
                         default=None,
                         type=int,
                         help='use at most MAX_SIZE elements from training \
                        dataset (useful for benchmarking), by default \
                        uses entire dataset')

    # results
    results = parser.add_argument_group('results setup')
    results.add_argument('--results-dir',
                         default='results',
                         help='path to directory with results, it will be \
                        automatically created if it does not exist')
    results.add_argument('--save',
                         default='gnmt_wmt16',
                         help='defines subdirectory within RESULTS_DIR for \
                        results from this training run')
    results.add_argument('--print-freq',
                         default=10,
                         type=int,
                         help='print log every PRINT_FREQ batches')

    # model
    model = parser.add_argument_group('model setup')
    model.add_argument('--model-config',
                       default="{'hidden_size': 1024,'num_layers': 4, \
                        'dropout': 0.2, 'share_embedding': True}",
                       help='GNMT architecture configuration')
    model.add_argument('--smoothing',
                       default=0.1,
                       type=float,
                       help='label smoothing, if equal to zero the model \
                        will use CrossEntropyLoss, otherwise it will be \
                        trained with label smoothing loss')

    # setup
    general = parser.add_argument_group('general setup')
    general.add_argument('--math',
                         default='fp16',
                         choices=['fp32', 'fp16'],
                         help='arithmetic type')
    general.add_argument('--seed',
                         default=None,
                         type=int,
                         help='set random number generator seed')
    general.add_argument('--disable-eval',
                         action='store_true',
                         default=False,
                         help='disables validation after every epoch')
    general.add_argument('--workers',
                         default=0,
                         type=int,
                         help='number of workers for data loading')

    cuda_parser = general.add_mutually_exclusive_group(required=False)
    cuda_parser.add_argument(
        '--cuda',
        dest='cuda',
        action='store_true',
        help='enables cuda (use \'--no-cuda\' to disable)')
    cuda_parser.add_argument('--no-cuda',
                             dest='cuda',
                             action='store_false',
                             help=argparse.SUPPRESS)
    cuda_parser.set_defaults(cuda=True)

    cudnn_parser = general.add_mutually_exclusive_group(required=False)
    cudnn_parser.add_argument(
        '--cudnn',
        dest='cudnn',
        action='store_true',
        help='enables cudnn (use \'--no-cudnn\' to disable)')
    cudnn_parser.add_argument('--no-cudnn',
                              dest='cudnn',
                              action='store_false',
                              help=argparse.SUPPRESS)
    cudnn_parser.set_defaults(cudnn=True)

    # training
    training = parser.add_argument_group('training setup')
    training.add_argument('--batch-size',
                          default=128,
                          type=int,
                          help='batch size for training')
    training.add_argument('--epochs',
                          default=8,
                          type=int,
                          help='number of total epochs to run')
    training.add_argument('--optimization-config',
                          default="{'optimizer': 'Adam', 'lr': 5e-4}",
                          type=str,
                          help='optimizer config')
    training.add_argument('--grad-clip',
                          default=5.0,
                          type=float,
                          help='enables gradient clipping and sets maximum \
                        gradient norm value')
    training.add_argument('--max-length-train',
                          default=50,
                          type=int,
                          help='maximum sequence length for training')
    training.add_argument('--min-length-train',
                          default=0,
                          type=int,
                          help='minimum sequence length for training')

    bucketing_parser = training.add_mutually_exclusive_group(required=False)
    bucketing_parser.add_argument(
        '--bucketing',
        dest='bucketing',
        action='store_true',
        help='enables bucketing (use \'--no-bucketing\' to disable)')
    bucketing_parser.add_argument('--no-bucketing',
                                  dest='bucketing',
                                  action='store_false',
                                  help=argparse.SUPPRESS)
    bucketing_parser.set_defaults(bucketing=True)

    # validation
    validation = parser.add_argument_group('validation setup')
    validation.add_argument('--val-batch-size',
                            default=128,
                            type=int,
                            help='batch size for validation')
    validation.add_argument('--max-length-val',
                            default=80,
                            type=int,
                            help='maximum sequence length for validation')
    validation.add_argument('--min-length-val',
                            default=0,
                            type=int,
                            help='minimum sequence length for validation')

    # test
    test = parser.add_argument_group('test setup')
    test.add_argument('--test-batch-size',
                      default=128,
                      type=int,
                      help='batch size for test')
    test.add_argument('--max-length-test',
                      default=150,
                      type=int,
                      help='maximum sequence length for test')
    test.add_argument('--min-length-test',
                      default=0,
                      type=int,
                      help='minimum sequence length for test')
    test.add_argument('--beam-size', default=5, type=int, help='beam size')
    test.add_argument('--len-norm-factor',
                      default=0.6,
                      type=float,
                      help='length normalization factor')
    test.add_argument('--cov-penalty-factor',
                      default=0.1,
                      type=float,
                      help='coverage penalty factor')
    test.add_argument('--len-norm-const',
                      default=5.0,
                      type=float,
                      help='length normalization constant')
    test.add_argument('--target-bleu',
                      default=None,
                      type=float,
                      help='target accuracy')
    test.add_argument('--intra-epoch-eval',
                      default=0,
                      type=int,
                      help='evaluate within epoch')

    # checkpointing
    checkpoint = parser.add_argument_group('checkpointing setup')
    checkpoint.add_argument('--start-epoch',
                            default=0,
                            type=int,
                            help='manually set initial epoch counter')
    checkpoint.add_argument('--resume',
                            default=None,
                            type=str,
                            metavar='PATH',
                            help='resumes training from checkpoint from PATH')
    checkpoint.add_argument('--save-all',
                            action='store_true',
                            default=False,
                            help='saves checkpoint after every epoch')
    checkpoint.add_argument('--save-freq',
                            default=5000,
                            type=int,
                            help='save checkpoint every SAVE_FREQ batches')
    checkpoint.add_argument(
        '--keep-checkpoints',
        default=0,
        type=int,
        help='keep only last KEEP_CHECKPOINTS checkpoints, \
                        affects only checkpoints controlled by --save-freq \
                        option')

    # distributed support
    distributed = parser.add_argument_group('distributed setup')
    distributed.add_argument(
        '--rank',
        default=0,
        type=int,
        help='rank of the process, do not set! Done by multiproc module')
    distributed.add_argument(
        '--world-size',
        default=1,
        type=int,
        help='number of processes, do not set! Done by multiproc module')
    distributed.add_argument('--dist-url',
                             default='tcp://localhost:23456',
                             type=str,
                             help='url used to set up distributed training')

    return parser.parse_args()
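
A minimal sketch of how the paired --cuda/--no-cuda flags above behave; this is a standalone demonstration, not part of the original script:

import argparse

parser = argparse.ArgumentParser()
cuda_parser = parser.add_mutually_exclusive_group(required=False)
cuda_parser.add_argument('--cuda', dest='cuda', action='store_true')
cuda_parser.add_argument('--no-cuda', dest='cuda', action='store_false',
                         help=argparse.SUPPRESS)
cuda_parser.set_defaults(cuda=True)

print(parser.parse_args([]).cuda)             # True (the default)
print(parser.parse_args(['--no-cuda']).cuda)  # False
# parse_args(['--cuda', '--no-cuda']) exits with a usage error,
# because both flags live in one mutually exclusive group.
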
Example #2

import argparse
from ast import literal_eval


def parse_args():
    """
    Parse commandline arguments.
    """
    def exclusive_group(group, name, default, help):
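        # Registers a --NAME / --no-NAME pair of flags that both write to a
        # single boolean destination; the --no-NAME variant is hidden from
        # the --help output via argparse.SUPPRESS.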
        destname = name.replace('-', '_')
        subgroup = group.add_mutually_exclusive_group(required=False)
        subgroup.add_argument(f'--{name}',
                              dest=f'{destname}',
                              action='store_true',
                              help=f'{help} (use \'--no-{name}\' to disable)')
        subgroup.add_argument(f'--no-{name}',
                              dest=f'{destname}',
                              action='store_false',
                              help=argparse.SUPPRESS)
        subgroup.set_defaults(**{destname: default})

    parser = argparse.ArgumentParser(
        description='GNMT training',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # dataset
    dataset = parser.add_argument_group('dataset setup')
    dataset.add_argument('--dataset-dir',
                         default='data/wmt16_de_en',
                         help='path to the directory with training/test data')
    dataset.add_argument('--max-size',
                         default=None,
                         type=int,
                         help='use at most MAX_SIZE elements from training \
                         dataset (useful for benchmarking), by default \
                         uses entire dataset')

    # results
    results = parser.add_argument_group('results setup')
    results.add_argument('--results-dir',
                         default='results',
                         help='path to directory with results, it will be \
                         automatically created if it does not exist')
    results.add_argument('--save',
                         default='gnmt',
                         help='defines subdirectory within RESULTS_DIR for \
                         results from this training run')
    results.add_argument('--print-freq',
                         default=10,
                         type=int,
                         help='print log every PRINT_FREQ batches')

    # model
    model = parser.add_argument_group('model setup')
    model.add_argument('--hidden-size',
                       default=1024,
                       type=int,
                       help='model hidden size')
    model.add_argument('--num-layers',
                       default=4,
                       type=int,
                       help='number of RNN layers')
    model.add_argument('--dropout', default=0.2, type=float, help='dropout')

    exclusive_group(group=model,
                    name='share-embedding',
                    default=True,
                    help='share embedding')

    model.add_argument('--smoothing',
                       default=0.1,
                       type=float,
                       help='label smoothing, if equal to zero the model \
                       will use CrossEntropyLoss, otherwise it will be \
                       trained with label smoothing loss')

    # setup
    general = parser.add_argument_group('general setup')
    general.add_argument('--math',
                         default='fp16',
                         choices=['fp16', 'fp32'],
                         help='arithmetic type')
    general.add_argument('--seed',
                         default=None,
                         type=int,
                         help='set random number generator seed')

    exclusive_group(group=general,
                    name='eval',
                    default=True,
                    help='run validation and test after every epoch')
    exclusive_group(group=general,
                    name='env',
                    default=True,
                    help='print info about execution env')
    exclusive_group(group=general,
                    name='cuda',
                    default=True,
                    help='enables cuda')
    exclusive_group(group=general,
                    name='cudnn',
                    default=True,
                    help='enables cudnn')

    # training
    training = parser.add_argument_group('training setup')
    training.add_argument('--train-batch-size',
                          default=128,
                          type=int,
                          help='training batch size per worker')
    training.add_argument('--train-global-batch-size',
                          default=None,
                          type=int,
                          help='global training batch size')
    training.add_argument('--train-iter-size',
                          default=1,
                          type=int,
                          help='training iter size')
    training.add_argument('--epochs',
                          default=6,
                          type=int,
                          help='number of training epochs')

    training.add_argument('--grad-clip',
                          default=5.0,
                          type=float,
                          help='enables gradient clipping and sets maximum \
                          gradient norm value')
    training.add_argument('--max-length-train',
                          default=50,
                          type=int,
                          help='maximum sequence length for training')
    training.add_argument('--min-length-train',
                          default=0,
                          type=int,
                          help='minimum sequence length for training')
    training.add_argument('--train-loader-workers',
                          default=2,
                          type=int,
                          help='number of workers for training data loading')
    training.add_argument('--batching',
                          default='sharding',
                          type=str,
                          choices=['random', 'sharding', 'bucketing'],
                          help='select batching algorithm')
    training.add_argument('--shard-size',
                          default=80,
                          type=int,
                          help='shard size for "sharding" batching algorithm, \
                          in multiples of global batch size')
    training.add_argument('--num-buckets',
                          default=5,
                          type=int,
                          help='number of buckets for "bucketing" batching \
                          algorithm')

    # optimizer
    optimizer = parser.add_argument_group('optimizer setup')
    optimizer.add_argument('--optimizer',
                           type=str,
                           default='Adam',
                           help='training optimizer')
    optimizer.add_argument('--lr',
                           type=float,
                           default=1.00e-3,
                           help='learning rate')
    optimizer.add_argument('--optimizer-extra',
                           type=str,
                           default="{}",
                           help='extra options for the optimizer')

    # scheduler
    scheduler = parser.add_argument_group('learning rate scheduler setup')
    scheduler.add_argument('--warmup-steps',
                           type=str,
                           default='200',
                           help='number of learning rate warmup iterations')
    scheduler.add_argument('--remain-steps',
                           type=str,
                           default='0.666',
                           help='starting iteration for learning rate decay')
    scheduler.add_argument('--decay-interval',
                           type=str,
                           default='None',
                           help='interval between learning rate decay steps')
    scheduler.add_argument('--decay-steps',
                           type=int,
                           default=4,
                           help='max number of learning rate decay steps')
    scheduler.add_argument('--decay-factor',
                           type=float,
                           default=0.5,
                           help='learning rate decay factor')

    # validation
    val = parser.add_argument_group('validation setup')
    val.add_argument('--val-batch-size',
                     default=128,
                     type=int,
                     help='batch size for validation')
    val.add_argument('--max-length-val',
                     default=80,
                     type=int,
                     help='maximum sequence length for validation')
    val.add_argument('--min-length-val',
                     default=0,
                     type=int,
                     help='minimum sequence length for validation')
    val.add_argument('--val-loader-workers',
                     default=0,
                     type=int,
                     help='number of workers for validation data loading')

    # test
    test = parser.add_argument_group('test setup')
    test.add_argument('--test-batch-size',
                      default=128,
                      type=int,
                      help='batch size for test')
    test.add_argument('--max-length-test',
                      default=150,
                      type=int,
                      help='maximum sequence length for test')
    test.add_argument('--min-length-test',
                      default=0,
                      type=int,
                      help='minimum sequence length for test')
    test.add_argument('--beam-size', default=5, type=int, help='beam size')
    test.add_argument('--len-norm-factor',
                      default=0.6,
                      type=float,
                      help='length normalization factor')
    test.add_argument('--cov-penalty-factor',
                      default=0.1,
                      type=float,
                      help='coverage penalty factor')
    test.add_argument('--len-norm-const',
                      default=5.0,
                      type=float,
                      help='length normalization constant')
    test.add_argument('--intra-epoch-eval',
                      default=0,
                      type=int,
                      help='evaluate within epoch')
    test.add_argument('--test-loader-workers',
                      default=0,
                      type=int,
                      help='number of workers for test data loading')

    # checkpointing
    chkpt = parser.add_argument_group('checkpointing setup')
    chkpt.add_argument('--start-epoch',
                       default=0,
                       type=int,
                       help='manually set initial epoch counter')
    chkpt.add_argument('--resume',
                       default=None,
                       type=str,
                       metavar='PATH',
                       help='resumes training from checkpoint from PATH')
    chkpt.add_argument('--save-all',
                       action='store_true',
                       default=False,
                       help='saves checkpoint after every epoch')
    chkpt.add_argument('--save-freq',
                       default=5000,
                       type=int,
                       help='save checkpoint every SAVE_FREQ batches')
    chkpt.add_argument('--keep-checkpoints',
                       default=0,
                       type=int,
                       help='keep only last KEEP_CHECKPOINTS checkpoints, \
                       affects only checkpoints controlled by --save-freq \
                       option')

    # benchmarking
    benchmark = parser.add_argument_group('benchmark setup')
    benchmark.add_argument('--target-perf',
                           default=None,
                           type=float,
                           help='target training performance (in tokens \
                           per second)')
    benchmark.add_argument('--target-bleu',
                           default=None,
                           type=float,
                           help='target accuracy')

    # distributed
    distributed = parser.add_argument_group('distributed setup')
    distributed.add_argument('--rank',
                             default=0,
                             type=int,
                             help='global rank of the process, do not set!')
    distributed.add_argument('--local_rank',
                             default=0,
                             type=int,
                             help='local rank of the process, do not set!')

    args = parser.parse_args()

    args.warmup_steps = literal_eval(args.warmup_steps)
    args.remain_steps = literal_eval(args.remain_steps)
    args.decay_interval = literal_eval(args.decay_interval)

    return args
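
Example #2 declares the scheduler options --warmup-steps, --remain-steps and --decay-interval as strings and converts them with ast.literal_eval after parsing, so a single flag can carry an int, a float fraction, or None. A standalone sketch of that conversion (the values below are the defaults above):

from ast import literal_eval

for raw in ('200', '0.666', 'None'):
    value = literal_eval(raw)
    print(repr(value), type(value).__name__)
# 200 int
# 0.666 float
# None NoneType
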
Example #3

import argparse
import os
from ast import literal_eval


def parse_args():
    """
    Parse commandline arguments.
    """
    def exclusive_group(group, name, default, help):
        destname = name.replace('-', '_')
        subgroup = group.add_mutually_exclusive_group(required=False)
        subgroup.add_argument(f'--{name}',
                              dest=f'{destname}',
                              action='store_true',
                              help=f'{help} (use \'--no-{name}\' to disable)')
        subgroup.add_argument(f'--no-{name}',
                              dest=f'{destname}',
                              action='store_false',
                              help=argparse.SUPPRESS)
        subgroup.set_defaults(**{destname: default})

    parser = argparse.ArgumentParser(
        description='GNMT training',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # dataset
    dataset = parser.add_argument_group('dataset setup')
    dataset.add_argument('--dataset-dir',
                         default='data/wmt16_de_en',
                         help='path to the directory with training/test data')

    dataset.add_argument('--src-lang', default='en', help='source language')
    dataset.add_argument('--tgt-lang', default='de', help='target language')

    dataset.add_argument('--vocab',
                         default='vocab.bpe.32000',
                         help='path to the vocabulary file \
                         (relative to DATASET_DIR directory)')
    dataset.add_argument('-bpe',
                         '--bpe-codes',
                         default='bpe.32000',
                         help='path to the file with bpe codes \
                         (relative to DATASET_DIR directory)')

    dataset.add_argument('--train-src',
                         default='train.tok.clean.bpe.32000.en',
                         help='path to the training source data file \
                         (relative to DATASET_DIR directory)')
    dataset.add_argument('--train-tgt',
                         default='train.tok.clean.bpe.32000.de',
                         help='path to the training target data file \
                         (relative to DATASET_DIR directory)')

    dataset.add_argument('--val-src',
                         default='newstest_dev.tok.clean.bpe.32000.en',
                         help='path to the validation source data file \
                         (relative to DATASET_DIR directory)')
    dataset.add_argument('--val-tgt',
                         default='newstest_dev.tok.clean.bpe.32000.de',
                         help='path to the validation target data file \
                         (relative to DATASET_DIR directory)')

    dataset.add_argument('--test-src',
                         default='newstest2014.tok.bpe.32000.en',
                         help='path to the test source data file \
                         (relative to DATASET_DIR directory)')
    dataset.add_argument('--test-tgt',
                         default='newstest2014.de',
                         help='path to the test target data file \
                         (relative to DATASET_DIR directory)')

    # results
    results = parser.add_argument_group('results setup')
    results.add_argument('--results-dir',
                         default='results',
                         help='path to directory with results, it will be \
                         automatically created if it does not exist')
    results.add_argument('--save-dir',
                         default='gnmt',
                         help='defines subdirectory within RESULTS_DIR for \
                         results from this training run')
    results.add_argument('--print-freq',
                         default=10,
                         type=int,
                         help='print log every PRINT_FREQ batches')

    # model
    model = parser.add_argument_group('model setup')
    model.add_argument('--hidden-size',
                       default=1024,
                       type=int,
                       help='hidden size of the model')
    model.add_argument('--num-layers',
                       default=4,
                       type=int,
                       help='number of RNN layers in encoder and in decoder')
    model.add_argument('--dropout',
                       default=0.2,
                       type=float,
                       help='dropout applied to input of RNN cells')

    exclusive_group(group=model,
                    name='share-embedding',
                    default=True,
                    help='use shared embeddings for encoder and decoder')

    model.add_argument('--smoothing',
                       default=0.1,
                       type=float,
                       help='label smoothing, if equal to zero the model \
                       will use CrossEntropyLoss, otherwise it will be \
                       trained with label smoothing loss')

    # setup
    general = parser.add_argument_group('general setup')
    general.add_argument('--math',
                         default='fp16',
                         choices=['fp16', 'fp32', 'manual_fp16'],
                         help='precision')
    general.add_argument('--seed',
                         default=None,
                         type=int,
                         help='master seed for random number generators, if \
                         "seed" is undefined then the master seed will be \
                         sampled from random.SystemRandom()')
    general.add_argument('--prealloc-mode',
                         default='always',
                         type=str,
                         choices=['off', 'once', 'always'],
                         help='controls preallocation')

    exclusive_group(group=general,
                    name='eval',
                    default=True,
                    help='run validation and test after every epoch')
    exclusive_group(group=general,
                    name='env',
                    default=True,
                    help='print info about execution env')
    exclusive_group(group=general,
                    name='cuda',
                    default=True,
                    help='enables cuda')
    exclusive_group(group=general,
                    name='cudnn',
                    default=True,
                    help='enables cudnn')
    exclusive_group(group=general,
                    name='log-all-ranks',
                    default=True,
                    help='enables logging from all distributed ranks, if \
                    disabled then only logs from rank 0 are reported')

    # training
    training = parser.add_argument_group('training setup')
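    # note: the next option is registered on the `dataset` group even though
    # it is listed in the training section below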
    dataset.add_argument('--train-max-size',
                         default=None,
                         type=int,
                         help='use at most TRAIN_MAX_SIZE elements from \
                         training dataset (useful for benchmarking), by \
                         default uses entire dataset')
    training.add_argument('--train-batch-size',
                          default=128,
                          type=int,
                          help='training batch size per worker')
    training.add_argument('--train-global-batch-size',
                          default=None,
                          type=int,
                          help='global training batch size, this argument \
                          does not have to be defined, if it is defined it \
                          will be used to automatically \
                          compute train_iter_size \
                          using the equation: train_iter_size = \
                          train_global_batch_size // (train_batch_size * \
                          world_size)')
    training.add_argument('--train-iter-size',
                          metavar='N',
                          default=1,
                          type=int,
                          help='training iter size, training loop will \
                          accumulate gradients over N iterations and execute \
                          optimizer every N steps')
    training.add_argument('--epochs',
                          default=6,
                          type=int,
                          help='max number of training epochs')

    training.add_argument('--grad-clip',
                          default=5.0,
                          type=float,
                          help='enables gradient clipping and sets maximum \
                          norm of gradients')
    training.add_argument('--train-max-length',
                          default=50,
                          type=int,
                          help='maximum sequence length for training \
                          (including special BOS and EOS tokens)')
    training.add_argument('--train-min-length',
                          default=0,
                          type=int,
                          help='minimum sequence length for training \
                          (including special BOS and EOS tokens)')
    training.add_argument('--train-loader-workers',
                          default=2,
                          type=int,
                          help='number of workers for training data loading')
    training.add_argument('--batching',
                          default='bucketing',
                          type=str,
                          choices=['random', 'sharding', 'bucketing'],
                          help='select batching algorithm')
    training.add_argument('--shard-size',
                          default=80,
                          type=int,
                          help='shard size for "sharding" batching algorithm, \
                          in multiples of global batch size')
    training.add_argument('--num-buckets',
                          default=5,
                          type=int,
                          help='number of buckets for "bucketing" batching \
                          algorithm')

    # optimizer
    optimizer = parser.add_argument_group('optimizer setup')
    optimizer.add_argument('--optimizer',
                           type=str,
                           default='Adam',
                           help='training optimizer')
    optimizer.add_argument('--lr',
                           type=float,
                           default=2.00e-3,
                           help='learning rate')
    optimizer.add_argument('--optimizer-extra',
                           type=str,
                           default="{}",
                           help='extra options for the optimizer')

    # mixed precision loss scaling
    loss_scaling = parser.add_argument_group(
        'mixed precision loss scaling setup')
    loss_scaling.add_argument('--init-scale',
                              type=float,
                              default=8192,
                              help='initial loss scale')
    loss_scaling.add_argument('--upscale-interval',
                              type=float,
                              default=128,
                              help='loss upscaling interval')

    # scheduler
    scheduler = parser.add_argument_group('learning rate scheduler setup')
    scheduler.add_argument('--warmup-steps',
                           type=str,
                           default='200',
                           help='number of learning rate warmup iterations')
    scheduler.add_argument('--remain-steps',
                           type=str,
                           default='0.666',
                           help='starting iteration for learning rate decay')
    scheduler.add_argument('--decay-interval',
                           type=str,
                           default='None',
                           help='interval between learning rate decay steps')
    scheduler.add_argument('--decay-steps',
                           type=int,
                           default=4,
                           help='max number of learning rate decay steps')
    scheduler.add_argument('--decay-factor',
                           type=float,
                           default=0.5,
                           help='learning rate decay factor')

    # validation
    val = parser.add_argument_group('validation setup')
    val.add_argument('--val-batch-size',
                     default=64,
                     type=int,
                     help='batch size for validation')
    val.add_argument('--val-max-length',
                     default=125,
                     type=int,
                     help='maximum sequence length for validation \
                     (including special BOS and EOS tokens)')
    val.add_argument('--val-min-length',
                     default=0,
                     type=int,
                     help='minimum sequence length for validation \
                     (including special BOS and EOS tokens)')
    val.add_argument('--val-loader-workers',
                     default=0,
                     type=int,
                     help='number of workers for validation data loading')

    # test
    test = parser.add_argument_group('test setup')
    test.add_argument('--test-batch-size',
                      default=128,
                      type=int,
                      help='batch size for test')
    test.add_argument('--test-max-length',
                      default=150,
                      type=int,
                      help='maximum sequence length for test \
                      (including special BOS and EOS tokens)')
    test.add_argument('--test-min-length',
                      default=0,
                      type=int,
                      help='minimum sequence length for test \
                      (including special BOS and EOS tokens)')
    test.add_argument('--beam-size', default=5, type=int, help='beam size')
    test.add_argument('--len-norm-factor',
                      default=0.6,
                      type=float,
                      help='length normalization factor')
    test.add_argument('--cov-penalty-factor',
                      default=0.1,
                      type=float,
                      help='coverage penalty factor')
    test.add_argument('--len-norm-const',
                      default=5.0,
                      type=float,
                      help='length normalization constant')
    test.add_argument('--intra-epoch-eval',
                      metavar='N',
                      default=0,
                      type=int,
                      help='evaluate within training epoch, this option will \
                      enable extra N equally spaced evaluations executed \
                      during each training epoch')
    test.add_argument('--test-loader-workers',
                      default=0,
                      type=int,
                      help='number of workers for test data loading')

    # checkpointing
    chkpt = parser.add_argument_group('checkpointing setup')
    chkpt.add_argument('--start-epoch',
                       default=0,
                       type=int,
                       help='manually set initial epoch counter')
    chkpt.add_argument('--resume',
                       default=None,
                       type=str,
                       metavar='PATH',
                       help='resumes training from checkpoint from PATH')
    chkpt.add_argument('--save-all',
                       action='store_true',
                       default=False,
                       help='saves checkpoint after every epoch')
    chkpt.add_argument('--save-freq',
                       default=5000,
                       type=int,
                       help='save checkpoint every SAVE_FREQ batches')
    chkpt.add_argument('--keep-checkpoints',
                       default=0,
                       type=int,
                       help='keep only last KEEP_CHECKPOINTS checkpoints, \
                       affects only checkpoints controlled by --save-freq \
                       option')

    # benchmarking
    benchmark = parser.add_argument_group('benchmark setup')
    benchmark.add_argument('--target-perf',
                           default=None,
                           type=float,
                           help='target training performance (in tokens \
                           per second)')
    benchmark.add_argument('--target-bleu',
                           default=None,
                           type=float,
                           help='target accuracy')

    # distributed
    distributed = parser.add_argument_group('distributed setup')
    distributed.add_argument('--rank',
                             default=0,
                             type=int,
                             help='global rank of the process, do not set!')
    distributed.add_argument('--local_rank',
                             default=0,
                             type=int,
                             help='local rank of the process, do not set!')

    args = parser.parse_args()

    args.lang = {'src': args.src_lang, 'tgt': args.tgt_lang}

    args.save_dir = os.path.join(args.results_dir, args.save_dir)
    args.vocab = os.path.join(args.dataset_dir, args.vocab)
    args.bpe_codes = os.path.join(args.dataset_dir, args.bpe_codes)
    args.train_src = os.path.join(args.dataset_dir, args.train_src)
    args.train_tgt = os.path.join(args.dataset_dir, args.train_tgt)
    args.val_src = os.path.join(args.dataset_dir, args.val_src)
    args.val_tgt = os.path.join(args.dataset_dir, args.val_tgt)
    args.test_src = os.path.join(args.dataset_dir, args.test_src)
    args.test_tgt = os.path.join(args.dataset_dir, args.test_tgt)

    args.warmup_steps = literal_eval(args.warmup_steps)
    args.remain_steps = literal_eval(args.remain_steps)
    args.decay_interval = literal_eval(args.decay_interval)

    return args
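
The help string for --train-global-batch-size above spells out how train_iter_size would be derived; a worked sketch of that equation with illustrative numbers (not taken from the script):

# train_iter_size = train_global_batch_size // (train_batch_size * world_size)
train_global_batch_size = 2048
train_batch_size = 128   # per-worker batch size
world_size = 8           # number of distributed processes

train_iter_size = train_global_batch_size // (train_batch_size * world_size)
print(train_iter_size)   # 2: accumulate gradients over 2 iterations per step
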
Example #4

import argparse
import os
from ast import literal_eval


def parse_args():
    """
    Parse commandline arguments.
    """
    def exclusive_group(group, name, default, help):
        destname = name.replace('-', '_')
        subgroup = group.add_mutually_exclusive_group(required=False)
        subgroup.add_argument(f'--{name}', dest=f'{destname}',
                              action='store_true',
                              help=f'{help} (use \'--no-{name}\' to disable)')
        subgroup.add_argument(f'--no-{name}', dest=f'{destname}',
                              action='store_false', help=argparse.SUPPRESS)
        subgroup.set_defaults(**{destname: default})

    parser = argparse.ArgumentParser(
        description='GNMT training',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # dataset
    dataset = parser.add_argument_group('dataset setup')
    dataset.add_argument('--dataset-dir', default='data/wmt16_de_en',
                         help='path to the directory with training/test data')
    dataset.add_argument('--max-size', default=None, type=int,
                         help='use at most MAX_SIZE elements from training \
                         dataset (useful for benchmarking), by default \
                         uses entire dataset')
    dataset.add_argument('--preproc-data-dir', default='/tmp/preprocessed',
                         help='path to the directory with preprocessed \
                         training/test data')
    exclusive_group(group=dataset, name='use-preproc-data', default=True,
                    help='use preprocessed dataset')

    # results
    results = parser.add_argument_group('results setup')
    results.add_argument('--results-dir', default='results',
                         help='path to directory with results, it will be \
                         automatically created if it does not exist')
    results.add_argument('--save', default='gnmt',
                         help='defines subdirectory within RESULTS_DIR for \
                         results from this training run')
    results.add_argument('--print-freq', default=10, type=int,
                         help='print log every PRINT_FREQ batches')

    # model
    model = parser.add_argument_group('model setup')
    model.add_argument('--hidden-size', default=1024, type=int,
                       help='model hidden size')
    model.add_argument('--num-layers', default=4, type=int,
                       help='number of RNN layers in encoder and in decoder')
    model.add_argument('--dropout', default=0.2, type=float,
                       help='dropout applied to input of RNN cells')

    exclusive_group(group=model, name='share-embedding', default=True,
                    help='use shared embeddings for encoder and decoder')

    model.add_argument('--smoothing', default=0.1, type=float,
                       help='label smoothing, if equal to zero the model \
                       will use CrossEntropyLoss, otherwise it will be \
                       trained with label smoothing loss')

    # setup
    general = parser.add_argument_group('general setup')
    general.add_argument('--math', default='fp16',
                         choices=['fp32', 'fp16'],
                         help='arithmetic type')
    general.add_argument('--seed', default=None, type=int,
                         help='master seed for random number generators, if \
                         "seed" is undefined then the master seed will be \
                         sampled from random.SystemRandom()')
    general.add_argument('--prealloc-mode', default='always', type=str,
                         choices=['off', 'once', 'always'],
                         help='controls preallocation')

    exclusive_group(group=general, name='eval', default=True,
                    help='run validation and test after every epoch')
    exclusive_group(group=general, name='env', default=False,
                    help='print info about execution env')
    exclusive_group(group=general, name='cuda', default=True,
                    help='enables cuda')
    exclusive_group(group=general, name='cudnn', default=True,
                    help='enables cudnn')
    exclusive_group(group=general, name='log-all-ranks', default=True,
                    help='enables logging from all distributed ranks, if \
                    disabled then only logs from rank 0 are reported')
    exclusive_group(group=general, name='fused-attention', default=False,
                    help='enables fused attention')
    exclusive_group(group=general, name='fused-xentropy', default=True,
                    help='enables fused cross entropy with label \
                    smoothing')

    # training
    training = parser.add_argument_group('training setup')
    training.add_argument('--train-batch-size', default=128, type=int,
                          help='training batch size per worker')
    training.add_argument('--train-global-batch-size', default=None, type=int,
                          help='global training batch size, this argument \
                          does not have to be defined, if it is defined it \
                          will be used to automatically \
                          compute train_iter_size \
                          using the equation: train_iter_size = \
                          train_global_batch_size // (train_batch_size * \
                          world_size)')
    training.add_argument('--train-iter-size', metavar='N', default=1,
                          type=int,
                          help='training iter size, training loop will \
                          accumulate gradients over N iterations and execute \
                          optimizer every N steps')
    training.add_argument('--epochs', default=8, type=int,
                          help='max number of training epochs')

    training.add_argument('--grad-clip', default=5.0, type=float,
                          help='enables gradient clipping and sets maximum \
                          norm of gradients')
    training.add_argument('--max-length-train', default=50, type=int,
                          help='maximum sequence length for training \
                          (including special BOS and EOS tokens)')
    training.add_argument('--min-length-train', default=0, type=int,
                          help='minimum sequence length for training \
                          (including special BOS and EOS tokens)')
    training.add_argument('--train-loader-workers', default=1, type=int,
                          help='number of workers for training data loading')
    training.add_argument('--batching', default='bucketing', type=str,
                          choices=['random', 'sharding', 'bucketing'],
                          help='select batching algorithm')
    training.add_argument('--shard-size', default=80, type=int,
                          help='shard size for "sharding" batching algorithm, \
                          in multiples of global batch size')
    training.add_argument('--num-buckets', default=5, type=int,
                          help='number of buckets for "bucketing" batching \
                          algorithm')

    # optimizer
    optimizer = parser.add_argument_group('optimizer setup')
    optimizer.add_argument('--optimizer', type=str, default='Adam',
                           help='training optimizer')
    optimizer.add_argument('--lr', type=float, default=1.00e-3,
                           help='learning rate')
    optimizer.add_argument('--optimizer-extra', type=str,
                           default="{}",
                           help='extra options for the optimizer')

    # mixed precision loss scaling
    loss_scaling = parser.add_argument_group(
        'mixed precision loss scaling setup')
    loss_scaling.add_argument('--init-scale', type=float, default=1024,
                              help='initial loss scale')
    loss_scaling.add_argument('--upscale-interval', type=float, default=128,
                              help='loss upscaling interval')

    # scheduler
    scheduler = parser.add_argument_group('learning rate scheduler setup')
    scheduler.add_argument('--warmup-steps', type=str, default='200',
                           help='number of learning rate warmup iterations')
    scheduler.add_argument('--remain-steps', type=str, default='0.666',
                           help='starting iteration for learning rate decay')
    scheduler.add_argument('--decay-interval', type=str, default='None',
                           help='interval between learning rate decay steps')
    scheduler.add_argument('--decay-steps', type=int, default=4,
                           help='max number of learning rate decay steps')
    scheduler.add_argument('--decay-factor', type=float, default=0.5,
                           help='learning rate decay factor')

    # test
    test = parser.add_argument_group('test setup')
    test.add_argument('--test-batch-size', default=128, type=int,
                      help='batch size for test')
    test.add_argument('--max-length-test', default=150, type=int,
                      help='maximum sequence length for test \
                      (including special BOS and EOS tokens)')
    test.add_argument('--min-length-test', default=0, type=int,
                      help='minimum sequence length for test \
                      (including special BOS and EOS tokens)')
    test.add_argument('--beam-size', default=5, type=int,
                      help='beam size')
    test.add_argument('--len-norm-factor', default=0.6, type=float,
                      help='length normalization factor')
    test.add_argument('--cov-penalty-factor', default=0.1, type=float,
                      help='coverage penalty factor')
    test.add_argument('--len-norm-const', default=5.0, type=float,
                      help='length normalization constant')
    test.add_argument('--intra-epoch-eval', metavar='N', default=0, type=int,
                      help='evaluate within training epoch, this option will \
                      enable extra N equally spaced evaluations executed \
                      during each training epoch')
    test.add_argument('--test-loader-workers', default=0, type=int,
                      help='number of workers for test data loading')

    # checkpointing
    chkpt = parser.add_argument_group('checkpointing setup')
    chkpt.add_argument('--start-epoch', default=0, type=int,
                       help='manually set initial epoch counter')
    chkpt.add_argument('--resume', default=None, type=str, metavar='PATH',
                       help='resumes training from checkpoint from PATH')
    chkpt.add_argument('--save-all', action='store_true', default=False,
                       help='saves checkpoint after every epoch')
    chkpt.add_argument('--save-freq', default=5000, type=int,
                       help='save checkpoint every SAVE_FREQ batches')
    chkpt.add_argument('--keep-checkpoints', default=0, type=int,
                       help='keep only last KEEP_CHECKPOINTS checkpoints, \
                       affects only checkpoints controlled by --save-freq \
                       option')

    # benchmarking
    benchmark = parser.add_argument_group('benchmark setup')
    benchmark.add_argument('--target-bleu', default=24.0, type=float,
                           help='target accuracy, training will be stopped \
                           when the target is achieved')

    # distributed
    distributed = parser.add_argument_group('distributed setup')
    distributed.add_argument('--rank', default=0, type=int,
                             help='global rank of the process, do not set!')
    distributed.add_argument('--local_rank', type=int,
                             default=int(os.getenv('LOCAL_RANK', 0)),
                             help='local rank of the process, do not set!')
    distributed.add_argument('--enable-apex-allreduce-overlap',
                             action='store_true', default=False,
                             help='enable overlap of allreduce communication \
                             with bprop')
    distributed.add_argument('--apex-num-allreduce-streams',
                             default=1, type=int,
                             help='num. allreduce streams')
    distributed.add_argument('--apex-message-size', default=int(1e7), type=int,
                             help='min. number of elements in communication \
                             bucket')

    # distributed weight update
    dwu_group = parser.add_argument_group('distributed weight update setup')
    dwu_group.add_argument('--distributed-weight-update', '--dwu',
                           default=0, type=int, metavar='DWU',
                           help='select distributed weight update strategy')
    dwu_group.add_argument('--dwu-group-size', '--dwugs',
                           default=0, type=int, metavar='DWUGS',
                           help='distributed weight update group size, if set \
                           to 0 it defaults to one node')
    dwu_group.add_argument('--dwu-num-blocks', '--dwunb',
                           default=8, type=int, metavar='DWUNB',
                           help='number of blocks in dwu scheme')
    dwu_group.add_argument('--dwu-num-chunks', '--dwuchks',
                           default=4, type=int,
                           help='number of chunks of each parameter block')
    dwu_group.add_argument('--dwu-num-rs-pg', '--dwurspg',
                           default=2, type=int, metavar='DWURSPG',
                           help='number of reduce-scatter streams in dwu \
                           scheme')
    dwu_group.add_argument('--dwu-num-ar-pg', '--dwuarpg',
                           default=4, type=int, metavar='DWUARPG',
                           help='number of all-reduce streams in dwu scheme')
    dwu_group.add_argument('--dwu-num-ag-pg', '--dwuagpg',
                           default=2, type=int, metavar='DWUAGPG',
                           help='number of all-gather streams in dwu scheme')
    dwu_group.add_argument('--dwu-full-pipeline', action='store_true',
                           help='whether to do full or partial pipeline')
    dwu_group.add_argument('--dwu-overlap-reductions', action='store_true',
                           help='whether to overlap reductions with backprop')
    dwu_group.add_argument('--dwu-grad-norm', action='store_true',
                           help='whether to compute L2 grad norm')
    dwu_group.add_argument('--dwu-e5m2-allgather', action='store_true',
                           help='whether to use e5m2 allgather')

    args = parser.parse_args()

    args.warmup_steps = literal_eval(args.warmup_steps)
    args.remain_steps = literal_eval(args.remain_steps)
    args.decay_interval = literal_eval(args.decay_interval)

    return args
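
The exclusive_group helper shared by Examples #2-#4 can be exercised on its own; a minimal standalone sketch (hypothetical parser, with a flag name borrowed for illustration):

import argparse

def exclusive_group(group, name, default, help):
    destname = name.replace('-', '_')
    subgroup = group.add_mutually_exclusive_group(required=False)
    subgroup.add_argument(f'--{name}', dest=destname, action='store_true',
                          help=f"{help} (use '--no-{name}' to disable)")
    subgroup.add_argument(f'--no-{name}', dest=destname,
                          action='store_false', help=argparse.SUPPRESS)
    subgroup.set_defaults(**{destname: default})

parser = argparse.ArgumentParser()
exclusive_group(parser, 'bucketing', default=True, help='enables bucketing')

print(parser.parse_args([]).bucketing)                  # True (the default)
print(parser.parse_args(['--no-bucketing']).bucketing)  # False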