def make_model(args, device, ntokens):
    ninp = 2048  # embedding dimension
    nhid = 2048  # the dimension of the feedforward network model in nn.TransformerEncoder
    nhead = 32  # the number of heads in the multiheadattention models
    dropout = 0
    initrange = 0.1
    ndecoder = args.num_decoder_layers

    if args.lazy_construction:
        # Lazy construction: return a list of callables that each build one layer
        # on demand, instead of an instantiated module.
        layers = [
            lambda: EmbeddingLayer(ntokens, ninp, initrange),
            lambda: PositionalEncodingLayer(ninp, dropout),
        ]
        for _ in range(ndecoder):
            layers.append(lambda: TransformerDecoderLayer(ninp, nhead, nhid, dropout))
        layers.append(lambda: LinearLayer(ninp, ntokens, initrange))
        model = layers
    else:
        model = TransformerLMSequntial(ntokens, ninp, nhead, nhid, dropout, initrange, ndecoder).to(device)

    criterion = nn.CrossEntropyLoss()
    lr = 0.01  # learning rate

    def make_adam(model):
        return Adam(model.parameters(), lr=lr)

    # Return the optimizer factory rather than an optimizer instance; the caller
    # builds the optimizer once the (possibly wrapped) model exists.
    optimizer = make_adam
    scaler = GradScaler()

    return model, criterion, optimizer, scaler

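# A minimal usage sketch (not part of the original code) of how the values returned
# by make_model above might drive one mixed-precision training step. It assumes
# args.lazy_construction is False, so `model` is an instantiated module, and the
# `batch`/`target` tensors are hypothetical placeholders. Note that the returned
# `optimizer` is a factory, so it is called on the model to build the actual Adam.
def example_train_step(args, device, ntokens, batch, target):
    model, criterion, make_optimizer, scaler = make_model(args, device, ntokens)
    opt = make_optimizer(model)  # build Adam from the factory returned above

    opt.zero_grad()
    output = model(batch)
    loss = criterion(output.view(-1, ntokens), target.view(-1))

    # Standard torch.cuda.amp pattern: scale the loss, step through the scaler,
    # then update the scale factor for the next iteration.
    scaler.scale(loss).backward()
    scaler.step(opt)
    scaler.update()
    return loss.item()
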
def get_model_config():
    return {
        "vocab_size": 10000,
        "ninp": 2048,  # embedding dimension
        "nhid": 2048,  # the dimension of the feedforward network model in nn.TransformerEncoder
        "nhead": 32,  # the number of heads in the multiheadattention models
        "dropout": 0,
        "initrange": 0.1,
        "scaler": GradScaler(),
        "clip_value": 0.05,
        "num_decoder_layers": 10,
        "seq_len": 32,
    }

def test_step_with_grad_scaler():
    weight, bias, input = make_half_precision_params()
    optimizer = Adam([weight, bias], lr=1e-3, precision=Precision.PURE_FP16)
    scaler = GradScaler()
    initial_value = None

    for _i in range(5):
        optimizer.zero_grad()
        loss = (weight.mv(input) + bias).pow(2).sum()
        if _i == 0:
            initial_value = loss.item()
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

    assert loss.item() < initial_value

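# `make_half_precision_params` is assumed to be a test helper defined elsewhere.
# A plausible sketch is given below (hypothetical, shown only to make the test
# above self-contained): it creates small fp16 CUDA tensors whose shapes are
# consistent with `weight.mv(input) + bias` in the test.
import torch

def make_half_precision_params():
    weight = torch.randn(2, 2, device="cuda", dtype=torch.float16, requires_grad=True)
    bias = torch.randn(2, device="cuda", dtype=torch.float16, requires_grad=True)
    input = torch.randn(2, device="cuda", dtype=torch.float16)
    return weight, bias, input
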
def get_benchmark_config():
    return {
        "epochs": 1,
        "vocab_size": 10000,
        "ninp": 2048,  # embedding dimension
        "nhid": 2048,  # the dimension of the feedforward network model in nn.TransformerEncoder
        "nhead": 32,  # the number of heads in the multiheadattention models
        "dropout": 0,
        "initrange": 0.1,
        "criterion": nn.CrossEntropyLoss(),
        "lr": 0.001,  # learning rate
        "scaler": GradScaler(),
        "clip_value": 0.05,
        "batch_size": 8,
    }

def create_benchmark_config(model_name):
    """Return a dict with configurations required for benchmarking `model_name` model."""
    if model_name == "lm":
        return {
            "vocab_size": 10000,
            "ninp": 2048,  # embedding dimension
            "nhid": 2048,  # the dimension of the feedforward network model in nn.TransformerEncoder
            "nhead": 32,  # the number of heads in the multiheadattention models
            "dropout": 0,
            "initrange": 0.1,
            "criterion": nn.CrossEntropyLoss(),
            "lr": 0.01,  # learning rate
            "scaler": GradScaler(),
            "clip_value": 0.05,
        }
    else:
        raise RuntimeError("Unrecognized model name: %s" % model_name)

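# A hedged sketch (not from the original code) of how the `scaler` and `clip_value`
# entries of the config dicts above might be consumed. Whether the benchmark uses
# value- or norm-based clipping is an assumption; value clipping is shown here.
# Gradients are unscaled before clipping, then the step goes through the scaler.
import torch

def example_clip_and_step(model, loss, optimizer, config):
    scaler = config["scaler"]
    scaler.scale(loss).backward()
    scaler.unscale_(optimizer)  # bring gradients back to their true scale before clipping
    torch.nn.utils.clip_grad_value_(model.parameters(), config["clip_value"])
    scaler.step(optimizer)
    scaler.update()
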
def make_model(device, ntokens):
    ninp = 50  # embedding dimension
    nhid = 50  # the dimension of the feedforward network model in nn.TransformerEncoder
    nhead = 2  # the number of heads in the multiheadattention models
    dropout = 0
    initrange = 0.1

    model = TransformerLMSequntial(ntokens, ninp, nhead, nhid, dropout, initrange).half().to(device)
    balance = generate_balance(min(num_devices, 4), len(model))
    p = Pipe(model, balance, chunks=len(balance))

    criterion = nn.CrossEntropyLoss()
    lr = 0.001  # learning rate
    try:
        # Use the precision-aware Adam (e.g. fairscale's) when available; if it is
        # not defined in this environment, a NameError falls back to a plain Adam.
        optimizer = Adam(p.parameters(), lr=lr, precision=Precision.PURE_FP16)
    except NameError:
        optimizer = Adam(p.parameters(), lr=lr)
    scaler = GradScaler()

    return p, criterion, optimizer, scaler
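
# A minimal sketch (under stated assumptions) of driving the Pipe wrapper returned
# by make_model above: the input batch is moved to the device holding the first
# pipeline partition and the target to the device holding the last one, since the
# output lives on the final stage. `batch`, `target`, `in_device`, and `out_device`
# are hypothetical placeholders supplied by the caller.
def example_pipe_step(p, criterion, optimizer, scaler, batch, target, ntokens, in_device, out_device):
    optimizer.zero_grad()
    output = p(batch.to(in_device))
    # The model was cast with .half(), so the output is cast to float for the loss.
    loss = criterion(output.view(-1, ntokens).float(), target.to(out_device).view(-1))
    scaler.scale(loss).backward()
    scaler.step(optimizer)
    scaler.update()
    return loss.item()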