def get_grid(args):
    # BERT sentence-pair classification fine-tuning grid.
    # Note: `hyperparam` and `max_update` are expected to be provided by the
    # surrounding sweep script (module-level imports/definitions not shown here).
    return [
        hyperparam('--save-interval', 1),
        hyperparam('--no-epoch-checkpoints'),
        hyperparam('--warmup', 0.1),
        hyperparam('--arch', 'finetuning_sentence_pair_classifier',
                   save_dir_key=lambda val: val),
        hyperparam('--task', 'sentence_pair_classification'),
        hyperparam('--max-update', [max_update], save_dir_key=lambda val: f'mxup{val}'),
        hyperparam('--optimizer', 'bert_adam', save_dir_key=lambda val: val),
        hyperparam('--lr', [3e-05, 2e-05], save_dir_key=lambda val: f'lr{val}'),
        hyperparam('--t-total', max_update),
        hyperparam('--bert-path', '/checkpoint/yinhanliu/hf_bert_implement_512.pt',
                   save_dir_key=lambda val: 'bert'),
        hyperparam('--min-lr', 1e-9),
        hyperparam('--criterion', ['cross_entropy'], save_dir_key=lambda val: 'crs_ent'),
        hyperparam('--sentence-avg', True, binary_flag=True),
        hyperparam('--num-labels', 2),
        hyperparam('--max-tokens', [2000, 4000], save_dir_key=lambda val: f'mxtk{val}'),
        hyperparam('--seed', [4, 5], save_dir_key=lambda val: f'seed{val}'),
        hyperparam('--skip-invalid-size-inputs-valid-test'),
        hyperparam('--log-format', 'json'),
        hyperparam('--log-interval', [500]),
        hyperparam('--model-dim', 768),
        hyperparam('--final-dropout', [0.1], save_dir_key=lambda val: f'f_drp{val}'),
    ]

def get_grid(args):
    # BART-large fine-tuning grid for sequence-to-sequence generation.
    grid = []

    total_num_updates = 20000
    warmup_updates = 500
    num_data_loaders = 4
    arch = 'bart_large'
    task = 'translation'
    criterion = 'label_smoothed_cross_entropy'
    adam_eps = 1e-08
    weight_decay = 0.01
    # Accumulate gradients on a single node so the effective batch size
    # matches the multi-node setup (see the sketch after this function).
    update_freq = 4 if args.num_nodes == 1 else 1

    grid += [
        hyperparam(
            "--restore-file",
            "/private/home/namangoyal/src/fairseq_denoising_codepush/fairseq-py/bart.large/model.pt",
        )
    ]

    # model settings
    grid += [
        hyperparam("--arch", arch, save_dir_key=lambda val: val),
        hyperparam("--task", task),
        hyperparam("--criterion", criterion),
        hyperparam("--source-lang", "source"),
        hyperparam("--target-lang", "target"),
        hyperparam("--truncate-source"),
        hyperparam("--label-smoothing", 0.1, save_dir_key=lambda val: f"ls{val}"),
    ]

    # batch size / number of updates
    grid += [
        hyperparam("--max-tokens", 2048, save_dir_key=lambda val: f"mt{val}"),
        hyperparam("--update-freq", update_freq, save_dir_key=lambda val: f"uf{val}"),
        hyperparam("--max-update", total_num_updates, save_dir_key=lambda val: f"mu{val}"),
        hyperparam("--required-batch-size-multiple", 1),
    ]

    # regularization
    grid += [
        hyperparam("--dropout", 0.1, save_dir_key=lambda val: f"dr{val}"),
        hyperparam("--attention-dropout", 0.1, save_dir_key=lambda val: f"atdr{val}"),
        hyperparam("--relu-dropout", 0.0, save_dir_key=lambda val: f"actdr{val}"),
        hyperparam("--weight-decay", weight_decay, save_dir_key=lambda val: f"wd{val}"),
    ]

    # optimization settings
    grid += [
        hyperparam("--optimizer", "adam", save_dir_key=lambda val: val),
        hyperparam("--adam-betas", "(0.9, 0.999)", save_dir_key=lambda val: "beta9999"),
        hyperparam("--adam-eps", adam_eps, save_dir_key=lambda val: f"eps{val}"),
        hyperparam("--clip-norm", 0.1, save_dir_key=lambda val: f"clip{val}"),
    ]

    # lr scheduler
    grid += [
        hyperparam("--lr-scheduler", "polynomial_decay"),
        hyperparam("--lr", 3e-05, save_dir_key=lambda val: f"lr{val}"),
        hyperparam("--total-num-update", total_num_updates),
        hyperparam("--warmup-updates", warmup_updates, save_dir_key=lambda val: f"warm{val}"),
    ]

    grid += [
        hyperparam("--fp16", save_dir_key=lambda val: "fp16"),
    ]

    # data loading settings
    grid += [
        hyperparam("--num-workers", num_data_loaders),
    ]

    # validation and checkpoint settings
    grid += [
        # hyperparam("--no-save"),
        hyperparam("--no-epoch-checkpoints"),
        hyperparam("--reset-meters"),
        hyperparam("--reset-optimizer"),
    ]

    # embedding sharing
    grid += [
        hyperparam("--share-all-embeddings"),
        hyperparam("--layernorm-embedding"),
        hyperparam("--share-decoder-input-output-embed"),
    ]

    # logging settings
    grid += [
        hyperparam("--skip-invalid-size-inputs-valid-test"),
        hyperparam("--log-format", "json"),
        hyperparam("--log-interval", 10),
    ]

    if args.local:
        # log more frequently when iterating locally
        grid += [
            hyperparam("--log-format", "json"),
            hyperparam("--log-interval", 1),
        ]

    return grid

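# Why `update_freq = 4 if args.num_nodes == 1 else 1` above: gradient
# accumulation keeps the effective batch size constant across node counts.
# A minimal sketch of the arithmetic, assuming 8 GPUs per node and a 4-node
# multi-node run (both counts are assumptions, not stated in the grid itself):
def effective_tokens_per_update(max_tokens, update_freq, num_nodes, gpus_per_node=8):
    """Approximate number of tokens consumed per optimizer step."""
    return max_tokens * update_freq * num_nodes * gpus_per_node

# 1 node  x 8 GPUs x update_freq 4 -> 2048 * 4 * 1 * 8 = 65,536 tokens/update
# 4 nodes x 8 GPUs x update_freq 1 -> 2048 * 1 * 4 * 8 = 65,536 tokens/update
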
def get_grid(args):
    # BERT SQuAD fine-tuning grid. As above, `max_update` is expected to be
    # defined at module level in the original sweep script.
    return [
        hyperparam('--save-interval', 1),
        hyperparam('--no-epoch-checkpoints'),
        hyperparam('--warmup', 0.1),
        hyperparam('--arch', 'finetuning_squad', save_dir_key=lambda val: val),
        hyperparam('--task', 'squad'),
        hyperparam('--max-update', [max_update], save_dir_key=lambda val: f'mxup{val}'),
        hyperparam('--data-file', '/private/home/yinhanliu/data/squad_bert/dev-v1.1.json'),
        hyperparam('--optimizer', 'bert_adam', save_dir_key=lambda val: val),
        hyperparam('--lr', [3e-05, 2e-05], save_dir_key=lambda val: f'lr{val}'),
        hyperparam('--t-total', max_update),
        hyperparam(
            '--bert-path',
            '/checkpoint/yinhanliu/20190322/hf_bert_implement/bert512.eps-06_0.0002/checkpoint_best.pt',
            save_dir_key=lambda val: 'bert'),
        hyperparam('--criterion', ['squad'], save_dir_key=lambda val: 'crs_ent'),
        hyperparam('--max-tokens', [1334], save_dir_key=lambda val: f'mxtk{val}'),
        hyperparam('--seed', [3, 4, 5, 6], save_dir_key=lambda val: f'seed{val}'),
        hyperparam('--skip-invalid-size-inputs-valid-test'),
        hyperparam('--log-format', 'json'),
        hyperparam('--log-interval', [500]),
        hyperparam('--min-lr', 1e-09),
        hyperparam('--model-dim', 768),
    ]

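# The nonautoregressive-translation grids below key their save directories
# through a module-level `MODEL` dict that is not shown in this file. A
# hypothetical sketch of its shape (the short names are illustrative
# assumptions, not the original values):
MODEL = {
    'levenshtein_transformer_wmt_en_de': 'lev',          # assumed short key
    'levenshtein_transformer_wmt_en_de_big': 'lev_big',  # assumed short key
    'cmlm_transformer': 'cmlm',                          # assumed short key
    'insertion_transformer': 'ins',                      # assumed short key
    'iterative_nonautoregressive_transformer': 'inat',   # assumed short key
}
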
def get_grid_levenshtein(args):
    return [
        # task, model, criterion
        hyperparam('--task', 'translation_lev'),
        hyperparam('--arch', 'levenshtein_transformer_wmt_en_de',
                   save_dir_key=lambda val: MODEL[val]),
        # hyperparam('--arch', [
        #     'levenshtein_transformer_wmt_en_de_big',
        #     'levenshtein_transformer_wmt_en_de',
        # ], save_dir_key=lambda val: MODEL[val]),
        hyperparam('--criterion', 'label_smoothed_dual_imitation'),
        hyperparam('--noise', 'random_delete'),

        # task specific
        hyperparam('--fixed-validation-seed', 7),
        hyperparam('--append-bos', binary_flag=True),

        # model
        hyperparam('--encoder-learned-pos', binary_flag=True),
        hyperparam('--decoder-learned-pos', binary_flag=True,
                   save_dir_key=lambda val: 'lp' if val else 'sp'),
        hyperparam('--share-all-embeddings', binary_flag=True),
        hyperparam('--apply-bert-init', binary_flag=True,
                   save_dir_key=lambda val: 'bert' if val else ''),
        hyperparam('--early-exit', '(6,6,6)', save_dir_key=lambda val: f'ext-{val}'),

        # general
        hyperparam('--activation-fn', 'gelu', save_dir_key=lambda val: f'act-{val}'),
        # hyperparam('--max-tokens', 8192, save_dir_key=lambda val: f'b{val}'),
        # the fixed 'b8192' key below presumably reflects the effective batch:
        # 4096 max-tokens x update-freq 2 = 8192 tokens
        hyperparam('--max-tokens', 4096, save_dir_key=lambda val: 'b8192'),
        hyperparam('--update-freq', 2),
        hyperparam('--fp16', binary_flag=True),
        hyperparam('--optimizer', 'adam'),
        hyperparam('--lr', 0.0005, save_dir_key=lambda val: f'lr{val}'),
        hyperparam('--min-lr', '1e-09'),
        hyperparam('--lr-scheduler', 'inverse_sqrt'),
        hyperparam('--max-update', 400000),
        hyperparam('--warmup-updates', 10000),
        hyperparam('--keep-last-epochs', 15),
        hyperparam('--keep-interval-updates', 5),
        hyperparam('--warmup-init-lr', '1e-07'),
        hyperparam('--adam-betas', '(0.9, 0.999)'),
        hyperparam('--dropout', 0.3),
        hyperparam('--label-smoothing', 0.1),
        hyperparam('--weight-decay', 0.01),
        hyperparam('--save-interval-updates', 10000),
        hyperparam('--log-format', 'simple'),
        hyperparam('--log-interval', 5),
        hyperparam('--seed', 2),
        # hyperparam('--seed', [1, 11], save_dir_key=lambda val: f'prefix{val % 10}'),
        # hyperparam('--seed', [3, 5, 7, 13, 15, 17], save_dir_key=lambda val: f'prefix{val % 10}'),
        # hyperparam('--seed', 5, save_dir_key=lambda val: f'fuse-0.{val}'),
    ]

def get_grid(args): """ Replicates the `16-bit+cumul+2x lr` results from Table 1 of "Scaling Neural Machine Translation" (https://arxiv.org/abs/1806.00187) """ return [ hyperparam('--fp16', save_dir_key=lambda val: 'fp16'), #hyperparam('--ddp-backend', 'no_c10d', save_dir_key=lambda val: 'no_c10d'), hyperparam('--max-epoch', 70), # equivalent to training on 16x GPUs hyperparam('--update-freq', 16 if not args.local else 1, save_dir_key=lambda val: f'updatefreq{val}'), hyperparam('--arch', 'transformer_wmt_en_de_big', save_dir_key=lambda val: val), hyperparam('--share-all-embeddings', [True], binary_flag=True, save_dir_key=lambda val: 'shareemb'), hyperparam('--optimizer', 'adam', save_dir_key=lambda val: val), hyperparam('--adam-betas', '(0.9, 0.98)', save_dir_key=lambda val: 'beta0.9,0.98'), hyperparam('--lr-scheduler', 'inverse_sqrt'), hyperparam('--warmup-init-lr', 1e-7, save_dir_key=lambda val: f'initlr{val}'), hyperparam('--warmup-updates', 4000, save_dir_key=lambda val: f'warmup{val}'), # use double the default learning rate, since we're using --update-freq=16 hyperparam('--lr', 10e-4, save_dir_key=lambda val: f'lr{val}'), hyperparam('--min-lr', 1e-9), hyperparam('--clip-norm', 0.0, save_dir_key=lambda val: f'clip{val}'), hyperparam('--dropout', 0.3, save_dir_key=lambda val: f'drop{val}'), hyperparam('--weight-decay', 0.0, save_dir_key=lambda val: f'wd{val}'), hyperparam('--criterion', 'label_smoothed_cross_entropy'), hyperparam('--label-smoothing', 0.1, save_dir_key=lambda val: f'ls{val}'), hyperparam('--max-tokens', 3584, save_dir_key=lambda val: f'maxtok{val}'), hyperparam('--seed', [2], save_dir_key=lambda val: f'seed{val}'), hyperparam('--log-format', 'json'), hyperparam('--log-interval', 100 if not args.local else 10), ]
def get_grid_cmlm(args):
    return [
        # task, model, criterion
        hyperparam('--task', 'translation_lev'),
        hyperparam('--arch', 'cmlm_transformer', save_dir_key=lambda val: MODEL[val]),
        hyperparam('--criterion', 'label_smoothed_dual_imitation'),

        # task specific
        hyperparam('--fixed-validation-seed', 7),
        hyperparam('--append-bos', binary_flag=True),
        hyperparam('--noise', 'random_mask'),

        # model
        hyperparam('--encoder-learned-pos', True, binary_flag=True),
        hyperparam('--decoder-learned-pos', True, binary_flag=True,
                   save_dir_key=lambda val: 'lp' if val else 'sp'),
        hyperparam('--share-all-embeddings', binary_flag=True),
        hyperparam('--apply-bert-init', binary_flag=True,
                   save_dir_key=lambda val: 'bert' if val else ''),

        # length prediction
        hyperparam('--pred-length-offset', binary_flag=True),
        # hyperparam('--sg-length-pred', binary_flag=True,
        #            save_dir_key=lambda val: 'sg' if val else ''),
        hyperparam('--length-loss-factor', 0.1, save_dir_key=lambda val: f'lf{val}'),

        # general
        hyperparam('--activation-fn', 'gelu', save_dir_key=lambda val: f'{val}'),
        hyperparam('--max-tokens', 4096, save_dir_key=lambda val: f'b{val}'),
        hyperparam('--update-freq', 2, save_dir_key=lambda val: f'u{val}'),
        hyperparam('--fp16', binary_flag=True),
        hyperparam('--optimizer', 'adam'),
        hyperparam('--lr', 0.0005, save_dir_key=lambda val: f'lr{val}'),
        hyperparam('--min-lr', '1e-09'),
        hyperparam('--lr-scheduler', 'inverse_sqrt'),
        hyperparam('--max-update', 400000),
        hyperparam('--warmup-updates', 10000),
        hyperparam('--keep-last-epochs', 5),
        hyperparam('--keep-interval-updates', 5),
        hyperparam('--warmup-init-lr', '1e-07'),
        hyperparam('--adam-betas', '(0.9, 0.999)'),
        hyperparam('--dropout', 0.3),
        hyperparam('--label-smoothing', 0.1),
        hyperparam('--weight-decay', 0.01),
        hyperparam('--save-interval-updates', 10000),
        hyperparam('--log-format', 'simple'),
        hyperparam('--log-interval', 5),
    ]

def get_grid_insertion(args):
    return [
        # task, model, criterion
        hyperparam('--task', 'translation_lev'),
        hyperparam('--arch', 'insertion_transformer', save_dir_key=lambda val: MODEL[val]),
        hyperparam('--criterion', 'label_smoothed_dual_imitation'),
        hyperparam('--noise', 'random_delete'),

        # task specific
        hyperparam('--fixed-validation-seed', 7),
        hyperparam('--append-bos', binary_flag=True),

        # model
        hyperparam('--encoder-learned-pos', binary_flag=True),
        hyperparam('--decoder-learned-pos', binary_flag=True,
                   save_dir_key=lambda val: 'lp' if val else 'sp'),
        hyperparam('--share-all-embeddings', binary_flag=True),
        hyperparam('--apply-bert-init', binary_flag=True,
                   save_dir_key=lambda val: 'bert' if val else ''),
        hyperparam('--label-tau', 1,
                   save_dir_key=lambda val: f'tau{val}' if val < 1000 else 'uniform'),

        # general
        hyperparam('--activation-fn', 'gelu', save_dir_key=lambda val: f'act-{val}'),
        # hyperparam('--max-tokens', 6144, save_dir_key=lambda val: f'bz{val}'),
        hyperparam('--max-tokens', 8192, save_dir_key=lambda val: f'b{val}'),
        hyperparam('--fp16', binary_flag=True),
        hyperparam('--optimizer', 'adam'),
        hyperparam('--lr', 0.0005, save_dir_key=lambda val: f'lr{val}'),
        hyperparam('--min-lr', '1e-09'),
        hyperparam('--lr-scheduler', 'inverse_sqrt'),
        hyperparam('--max-update', 400000),
        hyperparam('--warmup-updates', 10000),
        hyperparam('--warmup-init-lr', '1e-07'),
        hyperparam('--adam-betas', '(0.9, 0.999)'),
        hyperparam('--dropout', 0.3),
        hyperparam('--label-smoothing', 0.1),
        hyperparam('--weight-decay', 0.01),
        hyperparam('--save-interval-updates', 10000),
        hyperparam('--keep-last-epochs', 15),
        hyperparam('--keep-interval-updates', 5),
        hyperparam('--log-format', 'simple'),
        hyperparam('--log-interval', 5),
        # hyperparam('--seed', [1, 2, 3, 4, 5, 6, 7], save_dir_key=lambda val: f'rb-{val}'),
    ]

def get_at_grid(args):
    """Auto-regressive Transformer baseline."""
    return [
        hyperparam('--fp16', save_dir_key=lambda val: 'fp16'),
        # hyperparam('--ddp-backend', 'no_c10d', save_dir_key=lambda val: 'no_c10d'),
        hyperparam('--max-update', 300000),
        # equivalent to training on 16x GPUs:
        # hyperparam('--update-freq', 16, save_dir_key=lambda val: f'updatefreq{val}'),
        hyperparam('--arch', ['transformer_small'], save_dir_key=lambda val: val),
        hyperparam('--share-all-embeddings', [True], binary_flag=True,
                   save_dir_key=lambda val: 'shareemb'),
        hyperparam('--optimizer', 'adam', save_dir_key=lambda val: val),
        hyperparam('--adam-betas', '(0.9, 0.98)', save_dir_key=lambda val: 'beta0.9,0.98'),
        hyperparam('--lr-scheduler', 'inverse_sqrt'),
        hyperparam('--warmup-init-lr', 1e-7, save_dir_key=lambda val: f'initlr{val}'),
        hyperparam('--warmup-updates', 4000, save_dir_key=lambda val: f'warmup{val}'),
        hyperparam('--lr', 0.0005, save_dir_key=lambda val: f'lr{val}'),
        hyperparam('--min-lr', 1e-9),
        hyperparam('--clip-norm', 25, save_dir_key=lambda val: f'clip{val}'),
        hyperparam('--dropout', 0.1, save_dir_key=lambda val: f'drop{val}'),
        hyperparam('--weight-decay', 0.0001, save_dir_key=lambda val: f'wd{val}'),
        hyperparam('--criterion', 'label_smoothed_cross_entropy'),
        hyperparam('--label-smoothing', 0.1, save_dir_key=lambda val: f'ls{val}'),
        hyperparam('--max-tokens', 4096, save_dir_key=lambda val: f'maxtok{val}'),
        hyperparam('--seed', [2], save_dir_key=lambda val: f'seed{val}'),
        hyperparam('--keep-last-epochs', 15),
        hyperparam('--keep-interval-updates', 5),
        hyperparam('--log-format', 'simple'),
        hyperparam('--log-interval', 100),
    ]

def get_grid_inat(args):
    return [
        # task, model, criterion
        hyperparam('--task', 'translation_lev'),
        hyperparam('--arch', 'iterative_nonautoregressive_transformer',
                   save_dir_key=lambda val: MODEL[val]),
        hyperparam('--criterion', 'label_smoothed_dual_imitation'),

        # task specific
        hyperparam('--fixed-validation-seed', 7),
        hyperparam('--append-bos', binary_flag=True),
        hyperparam('--noise', 'full_mask'),

        # model
        hyperparam('--encoder-learned-pos', True, binary_flag=True),
        hyperparam('--decoder-learned-pos', True, binary_flag=True,
                   save_dir_key=lambda val: 'lp' if val else 'sp'),
        hyperparam('--share-all-embeddings', binary_flag=True),
        hyperparam('--apply-bert-init', binary_flag=True,
                   save_dir_key=lambda val: 'bert' if val else ''),

        # iterative refinement settings
        hyperparam('--train-step', 3, save_dir_key=lambda val: f'iter{val}'),
        hyperparam('--dae-ratio', 0.5, save_dir_key=lambda val: f'dae{val}'),
        hyperparam('--stochastic-approx', True, binary_flag=True,
                   save_dir_key=lambda val: 'sa'),

        # length prediction
        hyperparam('--pred-length-offset', binary_flag=True),
        # hyperparam('--sg-length-pred', binary_flag=True,
        #            save_dir_key=lambda val: 'sg' if val else ''),
        hyperparam('--length-loss-factor', 0.1, save_dir_key=lambda val: f'lf{val}'),
        # hyperparam('--src-embedding-copy', [True, False],
        #            binary_flag=True,
        #            save_dir_key=lambda val: 'copy'),

        # n-gram loss
        # hyperparam('--ngram-predictor', 4, save_dir_key=lambda val: f'{val}-gram'),

        # general
        hyperparam('--activation-fn', 'gelu', save_dir_key=lambda val: f'{val}'),
        hyperparam('--max-tokens', 2048, save_dir_key=lambda val: f'b{val}'),
        hyperparam('--update-freq', 2, save_dir_key=lambda val: f'u{val}'),
        hyperparam('--fp16', binary_flag=True),
        hyperparam('--optimizer', 'adam'),
        hyperparam('--lr', 0.0005, save_dir_key=lambda val: f'lr{val}'),
        hyperparam('--min-lr', '1e-09'),
        hyperparam('--lr-scheduler', 'inverse_sqrt'),
        hyperparam('--max-update', 400000),
        hyperparam('--warmup-updates', 10000),
        hyperparam('--keep-last-epochs', 5),
        hyperparam('--keep-interval-updates', 5),
        hyperparam('--warmup-init-lr', '1e-07'),
        hyperparam('--adam-betas', '(0.9, 0.999)'),
        hyperparam('--dropout', 0.3),
        hyperparam('--label-smoothing', 0.1),
        hyperparam('--weight-decay', 0.01),
        hyperparam('--save-interval-updates', 10000),
        hyperparam('--log-format', 'simple'),
        hyperparam('--log-interval', 5),
        # hyperparam('--seed', [1, 2, 3, 4, 5, 6, 7], save_dir_key=lambda val: f'rb-{val}'),
    ]

def get_grid(args):
    return [
        hyperparam('--fp16', save_dir_key=lambda val: 'fp16'),
        hyperparam('--max-update', 50000),
        hyperparam('--task', 'language_modeling'),
        hyperparam('--arch', 'hf_gpt2', save_dir_key=lambda val: val),
        # hyperparam('--arch', 'transformer_lm_gpt', save_dir_key=lambda val: val),
        # hyperparam('--share-decoder-input-output-embed', save_dir_key=lambda val: 'shareemb'),
        hyperparam('--dropout', 0.1, save_dir_key=lambda val: f'drop{val}'),
        hyperparam('--optimizer', 'adam', save_dir_key=lambda val: val),
        hyperparam('--adam-betas', '(0.9, 0.98)', save_dir_key=lambda val: 'beta0.9,0.98'),
        hyperparam('--weight-decay', 0.01, save_dir_key=lambda val: f'wd{val}'),
        hyperparam('--clip-norm', 0.0, save_dir_key=lambda val: f'clip{val}'),
        hyperparam('--lr', 5e-4, save_dir_key=lambda val: f'lr{val}'),
        hyperparam('--lr-scheduler', 'inverse_sqrt'),
        hyperparam('--warmup-updates', 4000, save_dir_key=lambda val: f'warmup{val}'),
        hyperparam('--warmup-init-lr', 1e-7, save_dir_key=lambda val: f'initlr{val}'),
        hyperparam('--tokens-per-sample', 512, save_dir_key=lambda val: f'sampletok{val}'),
        hyperparam('--sample-break-mode', 'none', save_dir_key=lambda val: f'break{val}'),
        hyperparam('--max-tokens', 2048, save_dir_key=lambda val: f'maxtok{val}'),
        hyperparam('--update-freq', 4, save_dir_key=lambda val: f'updatefreq{val}'),
        hyperparam('--seed', [2], save_dir_key=lambda val: f'seed{val}'),
        hyperparam('--log-format', 'json'),
        hyperparam('--log-interval', 25),
    ]

def get_grid(args):
    max_update = 500000
    return [
        hyperparam('--train-subset', 'train' if not args.local else 'valid'),
        hyperparam('--skip-invalid-size-inputs-valid-test'),
        hyperparam('--fast-stat-sync', save_dir_key=lambda _: 'faststatsync'),
        hyperparam('--memory-efficient-fp16', save_dir_key=lambda val: 'me_fp16'),
        hyperparam('--num-workers', 2),
        hyperparam('--task', 'masked_lm'),
        hyperparam('--criterion', 'masked_lm'),
        hyperparam('--arch', 'roberta_base', save_dir_key=lambda val: val),
        hyperparam('--sample-break-mode', 'complete', save_dir_key=lambda val: 'cmpltdoc'),
        hyperparam('--tokens-per-sample', 512, save_dir_key=lambda val: f'tps{val}'),
        hyperparam('--optimizer', 'adam', save_dir_key=lambda val: val),
        hyperparam('--adam-betas', '(0.9, 0.98)', save_dir_key=lambda val: 'b2_0.98'),
        hyperparam('--adam-eps', 1e-6, save_dir_key=lambda val: f'eps{val}'),
        hyperparam('--clip-norm', 0.0, save_dir_key=lambda val: f'cl{val}'),
        hyperparam('--lr-scheduler', 'polynomial_decay'),
        hyperparam('--lr', 6e-4, save_dir_key=lambda val: f'lr{val}'),
        hyperparam('--warmup-updates', 24000, save_dir_key=lambda val: f'wu{val}'),
        hyperparam('--total-num-update', max_update),
        hyperparam('--dropout', 0.1, save_dir_key=lambda val: f'dr{val}'),
        hyperparam('--attention-dropout', 0.1, save_dir_key=lambda val: f'atdr{val}'),
        hyperparam('--weight-decay', 0.01, save_dir_key=lambda val: f'wd{val}'),
        hyperparam('--max-sentences', 32, save_dir_key=lambda val: f'ms{val}'),
        hyperparam('--update-freq', 1, save_dir_key=lambda val: f'uf{val}'),
        hyperparam('--max-update', max_update, save_dir_key=lambda val: f'mu{val}'),
        hyperparam('--seed', 1, save_dir_key=lambda val: f's{val}'),
        hyperparam('--log-format', 'json'),
        hyperparam('--log-interval', 25),
    ]

def get_grid(args):
    config = '8k'  # or '2k'
    # The two configs share the same schedule and differ only in
    # update_freq and peak lr.
    if config == '8k':
        max_update = 100000
        save_interval = 5000
        valid_interval = 5000
        update_freq = 1
        lr = 5.2e-4
        warmup = 5000
    else:
        max_update = 100000
        save_interval = 5000
        valid_interval = 5000
        update_freq = 4
        lr = 5e-4
        warmup = 5000

    seeds = [0]

    grid = [
        # hyperparam('--train-subset', 'train' if not args.local else 'test'),
        hyperparam('--train-subset', 'valid'),
        hyperparam('--fp16', save_dir_key=lambda val: 'fp16'),
        hyperparam('--num-workers', 4),
        hyperparam('--task', 'multilingual_masked_lm'),
        hyperparam('--criterion', 'masked_lm'),
        hyperparam('--arch', 'roberta_large', save_dir_key=lambda val: val),
        hyperparam('--sample-break-mode', 'complete', save_dir_key=lambda val: 'cmplt'),
        hyperparam('--tokens-per-sample', 512, save_dir_key=lambda val: f'tps{val}'),
        hyperparam('--optimizer', 'adam', save_dir_key=lambda val: val),
        hyperparam('--adam-betas', '(0.9, 0.98)', save_dir_key=lambda val: 'b2_0.98'),
        hyperparam('--adam-eps', 1e-6, save_dir_key=lambda val: f'eps{val}'),
        hyperparam('--clip-norm', 1.0, save_dir_key=lambda val: f'cl{val}'),
        hyperparam('--lr-scheduler', 'polynomial_decay'),
        hyperparam('--lr', lr, save_dir_key=lambda val: f'lr{val}'),
        hyperparam('--warmup-updates', warmup, save_dir_key=lambda val: f'wu{val}'),
        hyperparam('--total-num-update', max_update),
        hyperparam('--dropout', 0.1, save_dir_key=lambda val: f'dr{val}'),
        hyperparam('--attention-dropout', 0.1, save_dir_key=lambda val: f'atdr{val}'),
        hyperparam('--weight-decay', 0.01, save_dir_key=lambda val: f'wd{val}'),
        # hyperparam('--max-tokens', 3200, save_dir_key=lambda val: f'mt{val}'),
        hyperparam('--max-sentences', 12, save_dir_key=lambda val: f'ms{val}'),
        hyperparam('--update-freq', update_freq, save_dir_key=lambda val: f'uf{val}'),
        hyperparam('--max-update', max_update, save_dir_key=lambda val: f'mu{val}'),
        hyperparam('--multilang-sampling-alpha', 0.7, save_dir_key=lambda val: f's{val}'),
    ]

    grid += [
        hyperparam("--skip-invalid-size-inputs-valid-test"),
        hyperparam("--log-format", "json"),
        hyperparam("--log-interval", 100),
    ]

    # random seed
    grid += [
        hyperparam("--seed", seeds, save_dir_key=lambda val: f"seed{val}"),
    ]

    grid += [
        hyperparam("--validate-interval", valid_interval),
    ]

    grid += [
        hyperparam("--save-interval-updates", save_interval),
        hyperparam("--no-epoch-checkpoints"),
    ]

    if args.local:
        grid += [
            hyperparam("--log-format", "json"),
            hyperparam("--log-interval", 1),
        ]

    return grid

def get_grid(args):
    # RoBERTa sentence-prediction fine-tuning. `tasks` and `get_save_dir_key`
    # are module-level helpers not shown in this file; the `[None]` placeholders
    # are filled in per task after the grid is built (see the sketch after this
    # function).
    model_size = 'large'
    return [
        hyperparam('--train-subset', 'train' if not args.local else 'valid'),
        hyperparam('data', list(tasks.keys()), positional_arg=True,
                   save_dir_key=lambda val: get_save_dir_key(val)),
        hyperparam('--no-epoch-checkpoints'),
        hyperparam('--no-last-checkpoints'),
        hyperparam('--no-save-optimizer-state'),
        hyperparam('--save-interval-updates', 1000),
        hyperparam('--reset-optimizer'),
        hyperparam('--reset-dataloader'),
        hyperparam('--reset-meters'),
        hyperparam('--best-checkpoint-metric', 'accuracy'),
        hyperparam('--maximize-best-checkpoint-metric', [True], binary_flag=True),
        hyperparam('--restore-file', '/private/home/myleott/roberta.' + model_size + '/model.pt'),
        hyperparam('--fp16', save_dir_key=lambda val: 'fp16'),
        hyperparam('--ddp-backend', 'no_c10d'),
        hyperparam('--num-workers', 1 if not args.local else 0),
        hyperparam('--task', 'sentence_prediction', save_dir_key=lambda val: 'sentpred'),
        hyperparam('--init-token', 0, save_dir_key=lambda val: f'bos{val}'),
        hyperparam('--separator-token', 2, save_dir_key=lambda val: f'sep{val}'),
        hyperparam('--max-positions', 512),
        hyperparam('--regression-target', [False], binary_flag=True),
        hyperparam('--arch', 'roberta_' + model_size, save_dir_key=lambda val: val),
        hyperparam('--bpe', 'gpt2'),
        hyperparam('--criterion', 'sentence_prediction'),
        hyperparam('--num-classes', [None]),
        hyperparam('--optimizer', 'adam', save_dir_key=lambda val: val),
        hyperparam('--adam-betas', '(0.9, 0.98)', save_dir_key=lambda val: 'b2_0.98'),
        hyperparam('--adam-eps', 1e-6, save_dir_key=lambda val: f'eps{val}'),
        hyperparam('--clip-norm', 0.0, save_dir_key=lambda val: f'clip{val}'),
        hyperparam('--lr-scheduler', 'polynomial_decay'),
        hyperparam('--lr', [None], save_dir_key=lambda val: f'lr{val}'),
        hyperparam('--warmup-updates', [None], save_dir_key=lambda val: f'wu{val}'),
        hyperparam('--total-num-update', [None]),
        hyperparam('--dropout', 0.1, save_dir_key=lambda val: f'dr{val}'),
        hyperparam('--attention-dropout', 0.1, save_dir_key=lambda val: f'atdr{val}'),
        hyperparam('--weight-decay', 0.01, save_dir_key=lambda val: f'wd{val}'),
        hyperparam('--max-sentences', [None], save_dir_key=lambda val: f'ms{val}'),
        hyperparam('--required-batch-size-multiple', 1),
        hyperparam('--update-freq', 1, save_dir_key=lambda val: f'uf{val}'),
        hyperparam('--max-update', [None], save_dir_key=lambda val: f'mu{val}'),
        hyperparam('--seed', [1], save_dir_key=lambda val: f's{val}'),
        hyperparam('--log-format', 'json'),
        hyperparam('--log-interval', 25),
    ]

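# A hypothetical sketch of how the `[None]` placeholders above could be
# resolved per task. The `tasks` table, its fields, and the
# `postprocess_hyperparams` hook are assumptions modeled on fairseq's
# fb_sweep convention; the real per-task values are not shown in this file.
tasks = {
    # dataset path -> per-task settings (illustrative values only)
    '/path/to/RTE-bin': dict(num_classes=2, lr=2e-5, max_sentences=16, total_updates=2036),
}

def get_save_dir_key(task_path):
    """Shorten a dataset path into a save-dir key (assumed helper)."""
    return task_path.rstrip('/').split('/')[-1].replace('-bin', '')

def postprocess_hyperparams(args, config):
    """Fill the [None] placeholders from the selected task's settings."""
    task = tasks[config['data'].current_value]
    config['--num-classes'].current_value = task['num_classes']
    config['--lr'].current_value = task['lr']
    config['--max-sentences'].current_value = task['max_sentences']
    config['--total-num-update'].current_value = task['total_updates']
    # 6% warmup is an illustrative choice, not taken from this file
    config['--warmup-updates'].current_value = int(0.06 * task['total_updates'])
    config['--max-update'].current_value = task['total_updates']
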
def get_grid(args):
    max_update = 100000
    return [
        hyperparam('--train-subset', 'train' if not args.local else 'valid'),
        hyperparam('--fp16', save_dir_key=lambda val: 'fp16'),
        # hyperparam('--memory-efficient-fp16', save_dir_key=lambda val: 'me_fp16'),
        hyperparam('--num-workers', 2),
        hyperparam('--save-interval-updates', 10000),
        hyperparam('--no-epoch-checkpoints'),
        hyperparam('--task', 'language_modeling'),
        hyperparam('--sample-break-mode', 'none', save_dir_key=lambda val: f'bm_{val}'),
        hyperparam('--tokens-per-sample', 1024, save_dir_key=lambda val: f'tps{val}'),
        # hyperparam('--arch', 'transformer_lm_gpt', save_dir_key=lambda val: val),
        hyperparam('--arch', 'transformer_lm_gpt2_small', save_dir_key=lambda val: val),
        # hyperparam('--arch', 'transformer_lm_gpt2_medium', save_dir_key=lambda val: val),
        # hyperparam('--arch', 'transformer_lm_gpt2_big', save_dir_key=lambda val: val),
        hyperparam('--share-decoder-input-output-embed', save_dir_key=lambda val: 'share'),
        hyperparam('--optimizer', 'adam', save_dir_key=lambda val: val),
        hyperparam('--adam-betas', '(0.9, 0.98)', save_dir_key=lambda val: 'b2_0.98'),
        hyperparam('--adam-eps', 1e-8, save_dir_key=lambda val: f'eps{val}'),
        hyperparam('--clip-norm', 0.0, save_dir_key=lambda val: f'cl{val}'),
        hyperparam('--lr-scheduler', 'polynomial_decay'),
        hyperparam('--lr', 50e-4, save_dir_key=lambda val: f'lr{val}'),
        hyperparam('--total-num-update', max_update),
        hyperparam('--warmup-updates', 10000, save_dir_key=lambda val: f'wu{val}'),
        hyperparam('--dropout', 0.1, save_dir_key=lambda val: f'dr{val}'),
        hyperparam('--attention-dropout', 0.1, save_dir_key=lambda val: f'atdr{val}'),
        hyperparam('--weight-decay', 0.01, save_dir_key=lambda val: f'wd{val}'),
        hyperparam('--max-sentences', 2, save_dir_key=lambda val: f'ms{val}'),
        hyperparam('--required-batch-size-multiple', 1),
        hyperparam('--update-freq', 1, save_dir_key=lambda val: f'uf{val}'),
        hyperparam('--max-update', max_update, save_dir_key=lambda val: f'mu{val}'),
        hyperparam('--seed', 1, save_dir_key=lambda val: f's{val}'),
        hyperparam('--log-format', 'json'),
        hyperparam('--log-interval', 25),
    ]

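# Each grid above is launched through fairseq's fb_sweep driver. A minimal
# sketch of the usual entry point, assuming the standard `sweep` module from
# fairseq's fb_sweep directory (the exact import path may differ by checkout;
# left commented because this file defines several `get_grid` variants):
#
# import sweep
# from sweep import hyperparam
#
# def postprocess_hyperparams(args, config):
#     """Postprocess a given hyperparameter configuration."""
#     pass
#
# if __name__ == '__main__':
#     sweep.main(get_grid, postprocess_hyperparams)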