Example No. 1
def get_grid(args):
    return [
        hyperparam('--save-interval', 1),
        hyperparam('--no-epoch-checkpoints'),
        hyperparam('--warmup', 0.1),
        hyperparam('--arch',
                   'finetuning_sentence_pair_classifier',
                   save_dir_key=lambda val: val),
        hyperparam('--task', 'sentence_pair_classification'),
        hyperparam('--max-update', [max_update],
                   save_dir_key=lambda val: f'mxup{val}'),
        hyperparam('--optimizer', 'bert_adam', save_dir_key=lambda val: val),
        hyperparam('--lr', [3e-05, 2e-05],
                   save_dir_key=lambda val: f'lr{val}'),
        hyperparam('--t-total', max_update),
        hyperparam('--bert-path',
                   '/checkpoint/yinhanliu/hf_bert_implement_512.pt',
                   save_dir_key=lambda val: 'bert'),
        hyperparam('--min-lr', 1e-9),
        hyperparam('--criterion', ['cross_entropy'],
                   save_dir_key=lambda val: 'crs_ent'),
        hyperparam('--sentence-avg', True, binary_flag=True),
        hyperparam('--num-labels', 2),
        hyperparam('--max-tokens', [2000, 4000],
                   save_dir_key=lambda val: f'mxtk{val}'),
        hyperparam('--seed', [4, 5], save_dir_key=lambda val: f'seed{val}'),
        hyperparam('--skip-invalid-size-inputs-valid-test'),
        hyperparam('--log-format', 'json'),
        hyperparam('--log-interval', [500]),
        hyperparam('--model-dim', 768),
        hyperparam('--final-dropout', [0.1],
                   save_dir_key=lambda val: f'f_drp{val}'),
    ]
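All of these snippets are `get_grid` functions from fairseq hyperparameter sweep scripts. Names such as `hyperparam`, `max_update`, `MODEL`, `tasks`, and `get_save_dir_key` are module-level definitions in the original scripts and are not shown in these excerpts. A grid is a list of `hyperparam` entries, and list-valued entries define sweep axes: Example No. 1 expands to 2 learning rates × 2 max-token budgets × 2 seeds = 8 runs, each saved under a directory name built from the `save_dir_key` results. The sketch below is a minimal reconstruction of those semantics, not the actual fairseq helper:

# Minimal sketch (not the real fairseq sweep helper) of how `hyperparam`
# entries and the grid cross-product could work.
import itertools


class hyperparam:
    def __init__(self, name, values=None, binary_flag=False,
                 save_dir_key=None, positional_arg=False):
        self.name = name
        # a list of values defines a sweep axis; a scalar is a fixed setting
        self.values = values if isinstance(values, list) else [values]
        self.binary_flag = binary_flag      # flag passed without a value
        self.save_dir_key = save_dir_key    # contributes to the save dir name
        self.positional_arg = positional_arg


def expand_grid(grid):
    """Yield (config, save_dir_name) for each point in the cross-product."""
    axes = [[(hp, v) for v in hp.values] for hp in grid]
    for combo in itertools.product(*axes):
        config = {hp.name: v for hp, v in combo}
        keys = [hp.save_dir_key(v) for hp, v in combo
                if hp.save_dir_key is not None]
        yield config, '.'.join(keys)

Applied to the grid above, `expand_grid` would yield eight configurations whose save directories concatenate keys such as `lr3e-05`, `mxtk2000`, and `seed4`.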
Example No. 2
def get_grid(args):
    grid = []

    total_num_updates = 20000
    warmup_updates = 500
    num_data_loaders = 4
    arch = 'bart_large'
    task = 'translation'
    criterion = 'label_smoothed_cross_entropy'

    adam_eps = 1e-08
    weight_decay = 0.01

    update_freq = 4 if args.num_nodes == 1 else 1
    grid += [
        hyperparam(
            "--restore-file",
            "/private/home/namangoyal/src/fairseq_denoising_codepush/fairseq-py/bart.large/model.pt"
        )
    ]

    # model settings
    grid += [
        hyperparam("--arch", arch, save_dir_key=lambda val: val),
        hyperparam("--task", task),
        hyperparam('--criterion', criterion),
        hyperparam('--source-lang', 'source'),
        hyperparam('--target-lang', 'target'),
        hyperparam('--truncate-source'),
        hyperparam('--label-smoothing',
                   0.1,
                   save_dir_key=lambda val: f"ls{val}"),
    ]

    grid += [
        hyperparam("--max-tokens", 2048, save_dir_key=lambda val: f"mt{val}"),
        hyperparam("--update-freq",
                   update_freq,
                   save_dir_key=lambda val: f"uf{val}"),
        hyperparam("--max-update",
                   total_num_updates,
                   save_dir_key=lambda val: f"mu{val}"),
        hyperparam("--required-batch-size-multiple", 1),
    ]
    # regularization
    grid += [
        hyperparam("--dropout", 0.1, save_dir_key=lambda val: f"dr{val}"),
        hyperparam("--attention-dropout",
                   0.1,
                   save_dir_key=lambda val: f"atdr{val}"),
        hyperparam("--relu-dropout",
                   0.0,
                   save_dir_key=lambda val: f"actdr{val}"),
        hyperparam("--weight-decay",
                   weight_decay,
                   save_dir_key=lambda val: f"wd{val}"),
    ]

    # optimization settings
    grid += [
        hyperparam("--optimizer", "adam", save_dir_key=lambda val: val),
        hyperparam("--adam-betas",
                   "(0.9, 0.999)",
                   save_dir_key=lambda val: "beta9999"),
        hyperparam("--adam-eps",
                   adam_eps,
                   save_dir_key=lambda val: f"eps{val}"),
        hyperparam("--clip-norm", 0.1, save_dir_key=lambda val: f"clip{val}"),
    ]

    # lr scheduler
    grid += [
        hyperparam("--lr-scheduler", "polynomial_decay"),
        hyperparam("--lr", 3e-05, save_dir_key=lambda val: f"lr{val}"),
        hyperparam("--total-num-update", total_num_udpates),
        hyperparam("--warmup-updates",
                   warmup_updates,
                   save_dir_key=lambda val: f"warm{val}"),
    ]
    grid += [
        hyperparam("--fp16", save_dir_key=lambda val: "fp16"),
    ]

    # data loading settings
    grid += [
        hyperparam("--num-workers", num_data_loaders),
    ]

    # validation and checkpoint settings
    grid += [
        # hyperparam("--no-save"),
        hyperparam("--no-epoch-checkpoints"),
        hyperparam('--reset-meters'),
        hyperparam('--reset-optimizer')
    ]

    grid += [
        hyperparam('--share-all-embeddings'),
        hyperparam('--layernorm-embedding'),
        hyperparam('--share-decoder-input-output-embed'),
    ]

    # logging settings
    grid += [
        hyperparam("--skip-invalid-size-inputs-valid-test"),
        hyperparam("--log-format", "json"),
        hyperparam("--log-interval", 10),
    ]

    if args.local:
        grid += [
            hyperparam("--log-format", "json"),
            hyperparam("--log-interval", 1),
        ]
    return grid
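The `update_freq = 4 if args.num_nodes == 1 else 1` line keeps the effective batch size constant across node counts by accumulating gradients over several steps: the effective batch in tokens is roughly max_tokens × n_gpus × update_freq. A quick illustration (the 8-GPUs-per-node figure is an assumption, not stated in the source):

# Effective batch size in tokens ~= max_tokens * n_gpus * update_freq.
# Assumes 8 GPUs per node (not stated in the source).
max_tokens = 2048
for num_nodes in (1, 4):
    update_freq = 4 if num_nodes == 1 else 1
    n_gpus = 8 * num_nodes
    print(num_nodes, max_tokens * n_gpus * update_freq)
# -> 1 65536
# -> 4 65536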
Example No. 3
def get_grid(args):
    return [
        hyperparam('--save-interval', 1),
        hyperparam('--no-epoch-checkpoints'),
        hyperparam('--warmup', 0.1),
        hyperparam('--arch', 'finetuning_squad', save_dir_key=lambda val: val),
        hyperparam('--task', 'squad'),
        hyperparam('--max-update', [max_update],
                   save_dir_key=lambda val: f'mxup{val}'),
        hyperparam('--data-file',
                   '/private/home/yinhanliu/data/squad_bert/dev-v1.1.json'),
        hyperparam('--optimizer', 'bert_adam', save_dir_key=lambda val: val),
        hyperparam('--lr', [3e-05, 2e-05],
                   save_dir_key=lambda val: f'lr{val}'),
        hyperparam('--t-total', max_update),
        hyperparam(
            '--bert-path',
            '/checkpoint/yinhanliu/20190322/hf_bert_implement/bert512.eps-06_0.0002/checkpoint_best.pt',
            save_dir_key=lambda val: 'bert'),
        hyperparam('--criterion', ['squad'],
                   save_dir_key=lambda val: 'crs_ent'),
        hyperparam('--max-tokens', [1334],
                   save_dir_key=lambda val: f'mxtk{val}'),
        hyperparam('--seed', [3, 4, 5, 6],
                   save_dir_key=lambda val: f'seed{val}'),
        hyperparam('--skip-invalid-size-inputs-valid-test'),
        hyperparam('--log-format', 'json'),
        hyperparam('--log-interval', [500]),
        hyperparam('--min-lr', 1e-09),
        hyperparam('--model-dim', 768),
    ]
Example No. 4
def get_grid_levenshtein(args):
    return [
        # task, model, criterion
        hyperparam('--task', 'translation_lev'),
        hyperparam('--arch',
                   'levenshtein_transformer_wmt_en_de',
                   save_dir_key=lambda val: MODEL[val]),
        # hyperparam('--arch', [
        #     'levenshtein_transformer_wmt_en_de_big',
        #     'levenshtein_transformer_wmt_en_de'
        # ],
        #            save_dir_key=lambda val: MODEL[val]),
        hyperparam('--criterion', 'label_smoothed_dual_imitation'),
        hyperparam('--noise', 'random_delete'),

        # task specific
        hyperparam('--fixed-validation-seed', 7),
        hyperparam('--append-bos', binary_flag=True),

        # model
        hyperparam('--encoder-learned-pos', binary_flag=True),
        hyperparam('--decoder-learned-pos',
                   binary_flag=True,
                   save_dir_key=lambda val: 'lp' if val else 'sp'),
        hyperparam('--share-all-embeddings', binary_flag=True),
        hyperparam('--apply-bert-init',
                   binary_flag=True,
                   save_dir_key=lambda val: 'bert' if val else ''),
        hyperparam('--early-exit',
                   '(6,6,6)',
                   save_dir_key=lambda val: f'ext-{val}'),

        # general
        hyperparam('--activation-fn',
                   'gelu',
                   save_dir_key=lambda val: f'act-{val}'),
        # hyperparam('--max-tokens', 8192, save_dir_key=lambda val: f'b{val}'),
        hyperparam('--max-tokens', 4096, save_dir_key=lambda val: f'b{val}'),
        hyperparam('--update-freq', 2),
        hyperparam('--fp16', binary_flag=True),
        hyperparam('--optimizer', 'adam'),
        hyperparam('--lr', 0.0005, save_dir_key=lambda val: f'lr{val}'),
        hyperparam('--min-lr', '1e-09'),
        hyperparam('--lr-scheduler', 'inverse_sqrt'),
        hyperparam('--max-update', 400000),
        hyperparam('--warmup-updates', 10000),
        hyperparam('--keep-last-epochs', 15),
        hyperparam('--keep-interval-updates', 5),
        hyperparam('--warmup-init-lr', '1e-07'),
        hyperparam('--adam-betas', '(0.9, 0.999)'),
        hyperparam('--dropout', 0.3),
        hyperparam('--label-smoothing', 0.1),
        hyperparam('--weight-decay', 0.01),
        hyperparam('--save-interval-updates', 10000),
        hyperparam('--log-format', 'simple'),
        hyperparam('--log-interval', 5),
        hyperparam('--seed', 2),
        # hyperparam('--seed', [1, 11], save_dir_key=lambda val: f'prefix{val % 10}'),
        # hyperparam('--seed', [3, 5, 7, 13, 15, 17], save_dir_key=lambda val: f'prefix{val % 10}'),
        # hyperparam('--seed', 5, save_dir_key=lambda val: f'fuse-0.{val}'),
    ]
Example No. 5
def get_grid(args):
    """
    Replicates the `16-bit+cumul+2x lr` results from Table 1 of
    "Scaling Neural Machine Translation" (https://arxiv.org/abs/1806.00187)
    """
    return [
        hyperparam('--fp16', save_dir_key=lambda val: 'fp16'),
        #hyperparam('--ddp-backend', 'no_c10d', save_dir_key=lambda val: 'no_c10d'),
        hyperparam('--max-epoch', 70),

        # equivalent to training on 16x GPUs
        hyperparam('--update-freq', 16 if not args.local else 1, save_dir_key=lambda val: f'updatefreq{val}'),

        hyperparam('--arch', 'transformer_wmt_en_de_big', save_dir_key=lambda val: val),
        hyperparam('--share-all-embeddings', [True], binary_flag=True, save_dir_key=lambda val: 'shareemb'),

        hyperparam('--optimizer', 'adam', save_dir_key=lambda val: val),
        hyperparam('--adam-betas', '(0.9, 0.98)', save_dir_key=lambda val: 'beta0.9,0.98'),
        hyperparam('--lr-scheduler', 'inverse_sqrt'),
        hyperparam('--warmup-init-lr', 1e-7, save_dir_key=lambda val: f'initlr{val}'),
        hyperparam('--warmup-updates', 4000, save_dir_key=lambda val: f'warmup{val}'),
        # use double the default learning rate, since we're using --update-freq=16
        hyperparam('--lr', 1e-3, save_dir_key=lambda val: f'lr{val}'),
        hyperparam('--min-lr', 1e-9),
        hyperparam('--clip-norm', 0.0, save_dir_key=lambda val: f'clip{val}'),

        hyperparam('--dropout', 0.3, save_dir_key=lambda val: f'drop{val}'),
        hyperparam('--weight-decay', 0.0, save_dir_key=lambda val: f'wd{val}'),
        hyperparam('--criterion', 'label_smoothed_cross_entropy'),
        hyperparam('--label-smoothing', 0.1, save_dir_key=lambda val: f'ls{val}'),

        hyperparam('--max-tokens', 3584, save_dir_key=lambda val: f'maxtok{val}'),
        hyperparam('--seed', [2], save_dir_key=lambda val: f'seed{val}'),

        hyperparam('--log-format', 'json'),
        hyperparam('--log-interval', 100 if not args.local else 10),
    ]
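This grid, like several of the non-autoregressive grids below, uses the `inverse_sqrt` scheduler: linear warmup from `--warmup-init-lr` to `--lr` over `--warmup-updates`, then decay proportional to the inverse square root of the update number. A small sketch of that rule, matching the behavior fairseq documents for this scheduler:

# inverse_sqrt schedule: linear warmup, then lr * sqrt(warmup / updates).
def inverse_sqrt_lr(num_updates, lr=1e-3, warmup_init_lr=1e-7,
                    warmup_updates=4000):
    if num_updates < warmup_updates:
        step = (lr - warmup_init_lr) / warmup_updates
        return warmup_init_lr + num_updates * step
    return lr * (warmup_updates / num_updates) ** 0.5

print(inverse_sqrt_lr(4000))   # peak lr: 0.001
print(inverse_sqrt_lr(16000))  # 4x more updates -> half the lr: 0.0005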
Example No. 6
def get_grid_cmlm(args):
    return [
        # task, model, criterion
        hyperparam('--task', 'translation_lev'),
        hyperparam('--arch',
                   'cmlm_transformer',
                   save_dir_key=lambda val: MODEL[val]),
        hyperparam('--criterion', 'label_smoothed_dual_imitation'),

        # task specific
        hyperparam('--fixed-validation-seed', 7),
        hyperparam('--append-bos', binary_flag=True),
        hyperparam('--noise', 'random_mask'),

        # model
        hyperparam('--encoder-learned-pos', True, binary_flag=True),
        hyperparam('--decoder-learned-pos',
                   True,
                   binary_flag=True,
                   save_dir_key=lambda val: 'lp' if val else 'sp'),
        hyperparam('--share-all-embeddings', binary_flag=True),
        hyperparam('--apply-bert-init',
                   binary_flag=True,
                   save_dir_key=lambda val: 'bert' if val else ''),

        # length prediction
        hyperparam('--pred-length-offset', binary_flag=True),
        # hyperparam('--sg-length-pred', binary_flag=True, save_dir_key=lambda val: f'sg' if val else f''),
        hyperparam('--length-loss-factor',
                   0.1,
                   save_dir_key=lambda val: f'lf{val}'),

        # general
        hyperparam('--activation-fn',
                   'gelu',
                   save_dir_key=lambda val: f'{val}'),
        hyperparam('--max-tokens', 4096, save_dir_key=lambda val: f'b{val}'),
        hyperparam('--update-freq', 2, save_dir_key=lambda val: f'u{val}'),
        hyperparam('--fp16', binary_flag=True),
        hyperparam('--optimizer', 'adam'),
        hyperparam('--lr', 0.0005, save_dir_key=lambda val: f'lr{val}'),
        hyperparam('--min-lr', '1e-09'),
        hyperparam('--lr-scheduler', 'inverse_sqrt'),
        hyperparam('--max-update', 400000),
        hyperparam('--warmup-updates', 10000),
        hyperparam('--keep-last-epochs', 5),
        hyperparam('--keep-interval-updates', 5),
        hyperparam('--warmup-init-lr', '1e-07'),
        hyperparam('--adam-betas', '(0.9, 0.999)'),
        hyperparam('--dropout', 0.3),
        hyperparam('--label-smoothing', 0.1),
        hyperparam('--weight-decay', 0.01),
        hyperparam('--save-interval-updates', 10000),
        hyperparam('--log-format', 'simple'),
        hyperparam('--log-interval', 5),
    ]
Example No. 7
def get_grid_insertion(args):
    return [
        # task, model, criterion
        hyperparam('--task', 'translation_lev'),
        hyperparam('--arch',
                   'insertion_transformer',
                   save_dir_key=lambda val: MODEL[val]),
        hyperparam('--criterion', 'label_smoothed_dual_imitation'),
        hyperparam('--noise', 'random_delete'),

        # task specific
        hyperparam('--fixed-validation-seed', 7),
        hyperparam('--append-bos', binary_flag=True),

        # model
        hyperparam('--encoder-learned-pos', binary_flag=True),
        hyperparam('--decoder-learned-pos',
                   binary_flag=True,
                   save_dir_key=lambda val: 'lp' if val else 'sp'),
        hyperparam('--share-all-embeddings', binary_flag=True),
        hyperparam('--apply-bert-init',
                   binary_flag=True,
                   save_dir_key=lambda val: 'bert' if val else ''),
        hyperparam('--label-tau',
                   1,
                   save_dir_key=lambda val: f'tau{val}'
                   if val < 1000 else 'uniform'),

        # general
        hyperparam('--activation-fn',
                   'gelu',
                   save_dir_key=lambda val: f'act-{val}'),
        # hyperparam('--max-tokens', 6144, save_dir_key=lambda val: f'bz{val}'),
        hyperparam('--max-tokens', 8192, save_dir_key=lambda val: f'b{val}'),
        hyperparam('--fp16', binary_flag=True),
        hyperparam('--optimizer', 'adam'),
        hyperparam('--lr', 0.0005, save_dir_key=lambda val: f'lr{val}'),
        hyperparam('--min-lr', '1e-09'),
        hyperparam('--lr-scheduler', 'inverse_sqrt'),
        hyperparam('--max-update', 400000),
        hyperparam('--warmup-updates', 10000),
        hyperparam('--warmup-init-lr', '1e-07'),
        hyperparam('--adam-betas', '(0.9, 0.999)'),
        hyperparam('--dropout', 0.3),
        hyperparam('--label-smoothing', 0.1),
        hyperparam('--weight-decay', 0.01),
        hyperparam('--save-interval-updates', 10000),
        hyperparam('--keep-last-epochs', 15),
        hyperparam('--keep-interval-updates', 5),
        hyperparam('--log-format', 'simple'),
        hyperparam('--log-interval', 5),
        # hyperparam('--seed', [1, 2, 3, 4, 5, 6, 7], save_dir_key=lambda val: f'rb-{val}'),
    ]
Example No. 8
def get_at_grid(args):
    """
    Auto-regressive Transformer
    """
    return [
        hyperparam('--fp16', save_dir_key=lambda val: 'fp16'),
        #hyperparam('--ddp-backend', 'no_c10d', save_dir_key=lambda val: 'no_c10d'),
        hyperparam('--max-update', 300000),

        # equivalent to training on 16x GPUs
        # hyperparam('--update-freq', 16, save_dir_key=lambda val: f'updatefreq{val}'),
        hyperparam('--arch', ['transformer_small'],
                   save_dir_key=lambda val: val),
        hyperparam('--share-all-embeddings', [True],
                   binary_flag=True,
                   save_dir_key=lambda val: 'shareemb'),
        hyperparam('--optimizer', 'adam', save_dir_key=lambda val: val),
        hyperparam('--adam-betas',
                   '(0.9, 0.98)',
                   save_dir_key=lambda val: 'beta0.9,0.98'),
        hyperparam('--lr-scheduler', 'inverse_sqrt'),
        hyperparam('--warmup-init-lr',
                   1e-7,
                   save_dir_key=lambda val: f'initlr{val}'),
        hyperparam('--warmup-updates',
                   4000,
                   save_dir_key=lambda val: f'warmup{val}'),

        # note: --update-freq is commented out above, so the default lr is kept
        hyperparam('--lr', 0.0005, save_dir_key=lambda val: f'lr{val}'),
        hyperparam('--min-lr', 1e-9),
        hyperparam('--clip-norm', 25, save_dir_key=lambda val: f'clip{val}'),
        hyperparam('--dropout', 0.1, save_dir_key=lambda val: f'drop{val}'),
        hyperparam('--weight-decay',
                   0.0001,
                   save_dir_key=lambda val: f'wd{val}'),
        hyperparam('--criterion', 'label_smoothed_cross_entropy'),
        hyperparam('--label-smoothing',
                   0.1,
                   save_dir_key=lambda val: f'ls{val}'),
        hyperparam('--max-tokens',
                   4096,
                   save_dir_key=lambda val: f'maxtok{val}'),
        hyperparam('--seed', [2], save_dir_key=lambda val: f'seed{val}'),
        hyperparam('--keep-last-epochs', 15),
        hyperparam('--keep-interval-updates', 5),
        hyperparam('--log-format', 'simple'),
        hyperparam('--log-interval', 100),
    ]
Example No. 9
def get_grid_inat(args):
    return [
        # task, model, criterion
        hyperparam('--task', 'translation_lev'),
        hyperparam('--arch',
                   'iterative_nonautoregressive_transformer',
                   save_dir_key=lambda val: MODEL[val]),
        hyperparam('--criterion', 'label_smoothed_dual_imitation'),

        # task specific
        hyperparam('--fixed-validation-seed', 7),
        hyperparam('--append-bos', binary_flag=True),
        hyperparam('--noise', 'full_mask'),

        # model
        hyperparam('--encoder-learned-pos', True, binary_flag=True),
        hyperparam('--decoder-learned-pos',
                   True,
                   binary_flag=True,
                   save_dir_key=lambda val: 'lp' if val else 'sp'),
        hyperparam('--share-all-embeddings', binary_flag=True),
        hyperparam('--apply-bert-init',
                   binary_flag=True,
                   save_dir_key=lambda val: 'bert' if val else ''),

        # iterative refinement settings
        hyperparam('--train-step', 3, save_dir_key=lambda val: f'iter{val}'),
        hyperparam('--dae-ratio', 0.5, save_dir_key=lambda val: f'dae{val}'),
        hyperparam('--stochastic-approx',
                   True,
                   binary_flag=True,
                   save_dir_key=lambda val: 'sa'),

        # length prediction
        hyperparam('--pred-length-offset', binary_flag=True),
        # hyperparam('--sg-length-pred', binary_flag=True, save_dir_key=lambda val: f'sg' if val else f''),
        hyperparam('--length-loss-factor',
                   0.1,
                   save_dir_key=lambda val: f'lf{val}'),
        # hyperparam('--src-embedding-copy', [True, False],
        #            binary_flag=True,
        #            save_dir_key=lambda val: 'copy'),
        # n-gram loss
        # hyperparam('--ngram-predictor',
        #            4,
        #            save_dir_key=lambda val: f'{val}-gram'),

        # general
        hyperparam('--activation-fn',
                   'gelu',
                   save_dir_key=lambda val: f'{val}'),
        hyperparam('--max-tokens', 2048, save_dir_key=lambda val: f'b{val}'),
        hyperparam('--update-freq', 2, save_dir_key=lambda val: f'u{val}'),
        hyperparam('--fp16', binary_flag=True),
        hyperparam('--optimizer', 'adam'),
        hyperparam('--lr', 0.0005, save_dir_key=lambda val: f'lr{val}'),
        hyperparam('--min-lr', '1e-09'),
        hyperparam('--lr-scheduler', 'inverse_sqrt'),
        hyperparam('--max-update', 400000),
        hyperparam('--warmup-updates', 10000),
        hyperparam('--keep-last-epochs', 5),
        hyperparam('--keep-interval-updates', 5),
        hyperparam('--warmup-init-lr', '1e-07'),
        hyperparam('--adam-betas', '(0.9, 0.999)'),
        hyperparam('--dropout', 0.3),
        hyperparam('--label-smoothing', 0.1),
        hyperparam('--weight-decay', 0.01),
        hyperparam('--save-interval-updates', 10000),
        hyperparam('--log-format', 'simple'),
        hyperparam('--log-interval', 5),

        # hyperparam('--seed', [1, 2, 3, 4, 5, 6, 7], save_dir_key=lambda val: f'rb-{val}'),
    ]
Example No. 10
def get_grid(args):
    return [
        hyperparam('--fp16', save_dir_key=lambda val: 'fp16'),
        hyperparam('--max-update', 50000),
        hyperparam('--task', 'language_modeling'),
        hyperparam('--arch', 'hf_gpt2', save_dir_key=lambda val: val),
        #hyperparam('--arch', 'transformer_lm_gpt', save_dir_key=lambda val: val),
        #hyperparam('--share-decoder-input-output-embed', save_dir_key=lambda val: 'shareemb'),
        hyperparam('--dropout', 0.1, save_dir_key=lambda val: f'drop{val}'),
        hyperparam('--optimizer', 'adam', save_dir_key=lambda val: val),
        hyperparam('--adam-betas',
                   '(0.9, 0.98)',
                   save_dir_key=lambda val: 'beta0.9,0.98'),
        hyperparam('--weight-decay', 0.01,
                   save_dir_key=lambda val: f'wd{val}'),
        hyperparam('--clip-norm', 0.0, save_dir_key=lambda val: f'clip{val}'),
        hyperparam('--lr', 5e-4, save_dir_key=lambda val: f'lr{val}'),
        hyperparam('--lr-scheduler', 'inverse_sqrt'),
        hyperparam('--warmup-updates',
                   4000,
                   save_dir_key=lambda val: f'warmup{val}'),
        hyperparam('--warmup-init-lr',
                   1e-7,
                   save_dir_key=lambda val: f'initlr{val}'),
        hyperparam('--tokens-per-sample',
                   512,
                   save_dir_key=lambda val: f'sampletok{val}'),
        hyperparam('--sample-break-mode',
                   'none',
                   save_dir_key=lambda val: f'break{val}'),
        hyperparam('--max-tokens',
                   2048,
                   save_dir_key=lambda val: f'maxtok{val}'),
        hyperparam('--update-freq',
                   4,
                   save_dir_key=lambda val: f'updatefreq{val}'),
        hyperparam('--seed', [2], save_dir_key=lambda val: f'seed{val}'),
        hyperparam('--log-format', 'json'),
        hyperparam('--log-interval', 25),
    ]
Example No. 11
def get_grid(args):

    max_update = 500000

    return [
        hyperparam('--train-subset', 'train' if not args.local else 'valid'),
        hyperparam('--skip-invalid-size-inputs-valid-test'),
        hyperparam('--fast-stat-sync', save_dir_key=lambda _: 'faststatsync'),
        hyperparam('--memory-efficient-fp16',
                   save_dir_key=lambda val: 'me_fp16'),
        hyperparam('--num-workers', 2),
        hyperparam('--task', 'masked_lm'),
        hyperparam('--criterion', 'masked_lm'),
        hyperparam('--arch', 'roberta_base', save_dir_key=lambda val: val),
        hyperparam('--sample-break-mode',
                   'complete',
                   save_dir_key=lambda val: 'cmpltdoc'),
        hyperparam('--tokens-per-sample',
                   512,
                   save_dir_key=lambda val: f'tps{val}'),
        hyperparam('--optimizer', 'adam', save_dir_key=lambda val: val),
        hyperparam('--adam-betas',
                   '(0.9, 0.98)',
                   save_dir_key=lambda val: 'b2_0.98'),
        hyperparam('--adam-eps', 1e-6, save_dir_key=lambda val: f'eps{val}'),
        hyperparam('--clip-norm', 0.0, save_dir_key=lambda val: f'cl{val}'),
        hyperparam('--lr-scheduler', 'polynomial_decay'),
        hyperparam('--lr', 6e-4, save_dir_key=lambda val: f'lr{val}'),
        hyperparam('--warmup-updates',
                   24000,
                   save_dir_key=lambda val: f'wu{val}'),
        hyperparam('--total-num-update', max_update),
        hyperparam('--dropout', 0.1, save_dir_key=lambda val: f'dr{val}'),
        hyperparam('--attention-dropout',
                   0.1,
                   save_dir_key=lambda val: f'atdr{val}'),
        hyperparam('--weight-decay', 0.01,
                   save_dir_key=lambda val: f'wd{val}'),
        hyperparam('--max-sentences', 32, save_dir_key=lambda val: f'ms{val}'),
        hyperparam('--update-freq', 1, save_dir_key=lambda val: f'uf{val}'),
        hyperparam('--max-update',
                   max_update,
                   save_dir_key=lambda val: f'mu{val}'),
        hyperparam('--seed', 1, save_dir_key=lambda val: f's{val}'),
        hyperparam('--log-format', 'json'),
        hyperparam('--log-interval', 25),
    ]
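The RoBERTa pretraining grids use `polynomial_decay`, which with fairseq's default end learning rate (0) and power (1) is linear warmup to the peak lr followed by linear decay to zero at `--total-num-update`. A sketch under those defaults:

# polynomial_decay schedule with default end lr 0 and power 1: linear
# warmup to the peak lr, then linear decay to 0 at total_num_update.
def polynomial_decay_lr(num_updates, lr=6e-4, warmup_updates=24000,
                        total_num_update=500000, end_lr=0.0, power=1.0):
    if num_updates < warmup_updates:
        return lr * num_updates / warmup_updates
    if num_updates >= total_num_update:
        return end_lr
    frac = (num_updates - warmup_updates) / (total_num_update - warmup_updates)
    return (lr - end_lr) * (1 - frac) ** power + end_lr

print(polynomial_decay_lr(24000))   # peak lr: 0.0006
print(polynomial_decay_lr(262000))  # halfway through the decay: 0.0003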
Example No. 12
def get_grid(args):

    config = '8k'  # alternative: '2k'

    if config == '8k':
        max_update = 100000
        save_interval = 5000
        valid_interval = 5000
        update_freq = 1
        lr = 5.2e-4
        warmup = 5000
    else:
        max_update = 100000
        save_interval = 5000
        valid_interval = 5000
        update_freq = 4
        lr = 5e-4
        warmup = 5000

    seeds = [0]
    grid = [
        # hyperparam('--train-subset', 'train' if not args.local else 'test'),
        hyperparam('--train-subset', 'valid'),
        hyperparam('--fp16', save_dir_key=lambda val: 'fp16'),
        hyperparam('--num-workers', 4),
        hyperparam('--task', 'multilingual_masked_lm'),
        hyperparam('--criterion', 'masked_lm'),
        hyperparam('--arch', 'roberta_large', save_dir_key=lambda val: val),
        hyperparam('--sample-break-mode',
                   'complete',
                   save_dir_key=lambda val: 'cmplt'),
        hyperparam('--tokens-per-sample',
                   512,
                   save_dir_key=lambda val: f'tps{val}'),
        hyperparam('--optimizer', 'adam', save_dir_key=lambda val: val),
        hyperparam('--adam-betas',
                   '(0.9, 0.98)',
                   save_dir_key=lambda val: 'b2_0.98'),
        hyperparam('--adam-eps', 1e-6, save_dir_key=lambda val: f'eps{val}'),
        hyperparam('--clip-norm', 1.0, save_dir_key=lambda val: f'cl{val}'),
        hyperparam('--lr-scheduler', 'polynomial_decay'),
        hyperparam('--lr', lr, save_dir_key=lambda val: f'lr{val}'),
        hyperparam('--warmup-updates',
                   warmup,
                   save_dir_key=lambda val: f'wu{val}'),
        hyperparam('--total-num-update', max_update),
        hyperparam('--dropout', 0.1, save_dir_key=lambda val: f'dr{val}'),
        hyperparam('--attention-dropout',
                   0.1,
                   save_dir_key=lambda val: f'atdr{val}'),
        hyperparam('--weight-decay', 0.01,
                   save_dir_key=lambda val: f'wd{val}'),

        # hyperparam('--max-tokens', 3200, save_dir_key=lambda val: f'mt{val}'),
        hyperparam('--max-sentences', 12, save_dir_key=lambda val: f'ms{val}'),
        hyperparam('--update-freq',
                   update_freq,
                   save_dir_key=lambda val: f'uf{val}'),
        hyperparam('--max-update',
                   max_update,
                   save_dir_key=lambda val: f'mu{val}'),
        hyperparam('--multilang-sampling-alpha',
                   0.7,
                   save_dir_key=lambda val: f's{val}'),
    ]
    grid += [
        hyperparam("--skip-invalid-size-inputs-valid-test"),
        hyperparam("--log-format", "json"),
        hyperparam("--log-interval", 100),
    ]

    # random seed
    grid += [
        hyperparam("--seed", seeds, save_dir_key=lambda val: f"seed{val}"),
    ]

    grid += [
        hyperparam("--validate-interval", valid_interval),
    ]
    grid += [
        hyperparam("--save-interval-updates", save_interval),
        hyperparam("--no-epoch-checkpoints"),
    ]

    if args.local:
        grid += [
            hyperparam("--log-format", "json"),
            hyperparam("--log-interval", 1),
        ]
    return grid
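`--multilang-sampling-alpha 0.7` applies the exponential smoothing used in multilingual pretraining recipes such as XLM-R: each language is sampled with probability proportional to its data share raised to alpha, which upweights low-resource languages. A worked example with hypothetical corpus sizes:

# Smoothed language sampling: p_l proportional to (share_l ** alpha).
# Corpus sizes below are hypothetical.
sizes = {'en': 1_000_000, 'sw': 10_000}
alpha = 0.7
total = sum(sizes.values())
weights = {lang: (n / total) ** alpha for lang, n in sizes.items()}
z = sum(weights.values())
probs = {lang: w / z for lang, w in weights.items()}
print(probs)  # 'sw' rises from ~1% of samples to ~4%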
Example No. 13
def get_grid(args):

    model_size = 'large'

    return [
        hyperparam('--train-subset', 'train' if not args.local else 'valid'),
        hyperparam('data',
                   list(tasks.keys()),
                   positional_arg=True,
                   save_dir_key=lambda val: get_save_dir_key(val)),
        hyperparam('--no-epoch-checkpoints'),
        hyperparam('--no-last-checkpoints'),
        hyperparam('--no-save-optimizer-state'),
        hyperparam('--save-interval-updates', 1000),
        hyperparam('--reset-optimizer'),
        hyperparam('--reset-dataloader'),
        hyperparam('--reset-meters'),
        hyperparam('--best-checkpoint-metric', 'accuracy'),
        hyperparam('--maximize-best-checkpoint-metric', [True],
                   binary_flag=True),
        hyperparam('--restore-file', '/private/home/myleott/roberta.' +
                   model_size + '/model.pt'),
        hyperparam('--fp16', save_dir_key=lambda val: 'fp16'),
        hyperparam('--ddp-backend', 'no_c10d'),
        hyperparam('--num-workers', 1 if not args.local else 0),
        hyperparam('--task',
                   'sentence_prediction',
                   save_dir_key=lambda val: 'sentpred'),
        hyperparam('--init-token', 0, save_dir_key=lambda val: f'bos{val}'),
        hyperparam('--separator-token',
                   2,
                   save_dir_key=lambda val: f'sep{val}'),
        hyperparam('--max-positions', 512),
        hyperparam('--regression-target', [False], binary_flag=True),
        hyperparam('--arch',
                   'roberta_' + model_size,
                   save_dir_key=lambda val: val),
        hyperparam('--bpe', 'gpt2'),
        hyperparam('--criterion', 'sentence_prediction'),
        hyperparam('--num-classes', [None]),
        hyperparam('--optimizer', 'adam', save_dir_key=lambda val: val),
        hyperparam('--adam-betas',
                   '(0.9, 0.98)',
                   save_dir_key=lambda val: 'b2_0.98'),
        hyperparam('--adam-eps', 1e-6, save_dir_key=lambda val: f'eps{val}'),
        hyperparam('--clip-norm', 0.0, save_dir_key=lambda val: f'clip{val}'),
        hyperparam('--lr-scheduler', 'polynomial_decay'),
        hyperparam('--lr', [None], save_dir_key=lambda val: f'lr{val}'),
        hyperparam('--warmup-updates', [None],
                   save_dir_key=lambda val: f'wu{val}'),
        hyperparam('--total-num-update', [None]),
        hyperparam('--dropout', 0.1, save_dir_key=lambda val: f'dr{val}'),
        hyperparam('--attention-dropout',
                   0.1,
                   save_dir_key=lambda val: f'atdr{val}'),
        hyperparam('--weight-decay', 0.01,
                   save_dir_key=lambda val: f'wd{val}'),
        hyperparam('--max-sentences', [None],
                   save_dir_key=lambda val: f'ms{val}'),
        hyperparam('--required-batch-size-multiple', 1),
        hyperparam('--update-freq', 1, save_dir_key=lambda val: f'uf{val}'),
        hyperparam('--max-update', [None],
                   save_dir_key=lambda val: f'mu{val}'),
        hyperparam('--seed', [1], save_dir_key=lambda val: f's{val}'),
        hyperparam('--log-format', 'json'),
        hyperparam('--log-interval', 25),
    ]
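The `[None]` values in this grid are per-task placeholders; the original script presumably substitutes task-specific values (keyed off the `tasks` dict referenced at the top) before launching. A hypothetical fill-in step, reusing the `hyperparam` sketch from after Example No. 1, with illustrative names and numbers only:

# Hypothetical per-task overrides; the names and values here are
# illustrative, not taken from the source.
TASK_OVERRIDES = {
    'RTE': {'--num-classes': 2, '--lr': 2e-5, '--max-sentences': 16,
            '--warmup-updates': 122, '--total-num-update': 2036,
            '--max-update': 2036},
}

def fill_placeholders(grid, task):
    overrides = TASK_OVERRIDES[task]
    for hp in grid:
        if hp.values == [None] and hp.name in overrides:
            hp.values = [overrides[hp.name]]
    return grid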
Example No. 14
def get_grid(args):

    max_update = 100000

    return [
        hyperparam('--train-subset', 'train' if not args.local else 'valid'),
        hyperparam('--fp16', save_dir_key=lambda val: 'fp16'),
        #hyperparam('--memory-efficient-fp16', save_dir_key=lambda val: 'me_fp16'),
        hyperparam('--num-workers', 2),
        hyperparam('--save-interval-updates', 10000),
        hyperparam('--no-epoch-checkpoints'),
        hyperparam('--task', 'language_modeling'),
        hyperparam('--sample-break-mode',
                   'none',
                   save_dir_key=lambda val: f'bm_{val}'),
        hyperparam('--tokens-per-sample',
                   1024,
                   save_dir_key=lambda val: f'tps{val}'),

        #hyperparam('--arch', 'transformer_lm_gpt', save_dir_key=lambda val: val),
        hyperparam('--arch',
                   'transformer_lm_gpt2_small',
                   save_dir_key=lambda val: val),
        #hyperparam('--arch', 'transformer_lm_gpt2_medium', save_dir_key=lambda val: val),
        #hyperparam('--arch', 'transformer_lm_gpt2_big', save_dir_key=lambda val: val),
        hyperparam('--share-decoder-input-output-embed',
                   save_dir_key=lambda val: 'share'),
        hyperparam('--optimizer', 'adam', save_dir_key=lambda val: val),
        hyperparam('--adam-betas',
                   '(0.9, 0.98)',
                   save_dir_key=lambda val: 'b2_0.98'),
        hyperparam('--adam-eps', 1e-8, save_dir_key=lambda val: f'eps{val}'),
        hyperparam('--clip-norm', 0.0, save_dir_key=lambda val: f'cl{val}'),
        hyperparam('--lr-scheduler', 'polynomial_decay'),
        hyperparam('--lr', 5e-3, save_dir_key=lambda val: f'lr{val}'),
        hyperparam('--total-num-update', max_update),
        hyperparam('--warmup-updates',
                   10000,
                   save_dir_key=lambda val: f'wu{val}'),
        hyperparam('--dropout', 0.1, save_dir_key=lambda val: f'dr{val}'),
        hyperparam('--attention-dropout',
                   0.1,
                   save_dir_key=lambda val: f'atdr{val}'),
        hyperparam('--weight-decay', 0.01,
                   save_dir_key=lambda val: f'wd{val}'),
        hyperparam('--max-sentences', 2, save_dir_key=lambda val: f'ms{val}'),
        hyperparam('--required-batch-size-multiple', 1),
        hyperparam('--update-freq', 1, save_dir_key=lambda val: f'uf{val}'),
        hyperparam('--max-update',
                   max_update,
                   save_dir_key=lambda val: f'mu{val}'),
        hyperparam('--seed', 1, save_dir_key=lambda val: f's{val}'),
        hyperparam('--log-format', 'json'),
        hyperparam('--log-interval', 25),
    ]
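`--sample-break-mode none` with `--tokens-per-sample 1024` cuts the training stream into fixed-length contiguous blocks regardless of document boundaries (the `complete` mode used in the RoBERTa grids above instead breaks at sentence/document ends). A small illustration with dummy token ids:

# break-mode 'none': contiguous fixed-length blocks, ignoring boundaries.
tokens = list(range(5000))  # dummy token ids
tokens_per_sample = 1024
blocks = [tokens[i:i + tokens_per_sample]
          for i in range(0, len(tokens), tokens_per_sample)]
print(len(blocks), len(blocks[0]), len(blocks[-1]))  # 5 1024 904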