def train():
    assert (
        args.batch_size % args.small_batch_size == 0
    ), "batch_size must be divisible by small_batch_size"

    # Turn on training mode which enables dropout.
    total_loss = 0
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    hidden = [
        model.init_hidden(args.small_batch_size)
        for _ in range(args.batch_size // args.small_batch_size)
    ]
    hidden_valid = [
        model.init_hidden(args.small_batch_size)
        for _ in range(args.batch_size // args.small_batch_size)
    ]
    batch, i = 0, 0
    while i < train_data.size(0) - 1 - 1:
        bptt = args.bptt if np.random.random() < 0.95 else args.bptt / 2.0
        # Prevent excessively small or negative sequence lengths
        # seq_len = max(5, int(np.random.normal(bptt, 5)))
        # There's a very small chance that it could select a very long sequence
        # length resulting in OOM
        # seq_len = min(seq_len, args.bptt + args.max_seq_len_delta)
        seq_len = int(bptt)

        lr2 = optimizer.param_groups[0]["lr"]
        optimizer.param_groups[0]["lr"] = lr2 * seq_len / args.bptt
        model.train()

        data_valid, targets_valid = get_batch(
            search_data, i % (search_data.size(0) - 1), args
        )
        data, targets = get_batch(train_data, i, args, seq_len=seq_len)

        optimizer.zero_grad()

        start, end, s_id = 0, args.small_batch_size, 0
        while start < args.batch_size:
            # cur_data, cur_targets = (
            #     data[:, start:end],
            #     targets[:, start:end].contiguous().view(-1),
            # )
            # cur_data_valid, cur_targets_valid = (
            #     data_valid[:, start:end],
            #     targets_valid[:, start:end].contiguous(),
            # )
            cur_data, cur_targets = data, targets.contiguous()
            cur_data_valid, cur_targets_valid = data_valid, targets_valid.contiguous()

            # Starting each batch, we detach the hidden state from how it was
            # previously produced. If we didn't, the model would try
            # backpropagating all the way to the start of the dataset.
            hidden[s_id] = repackage_hidden(hidden[s_id])
            hidden_valid[s_id] = repackage_hidden(hidden_valid[s_id])

            hidden_valid[s_id], grad_norm = architect.step(
                hidden[s_id],
                cur_data,
                cur_targets,
                hidden_valid[s_id],
                cur_data_valid,
                cur_targets_valid,
                optimizer,
                args.unrolled,
            )

            # Assuming small_batch_size == batch_size, so we don't accumulate gradients.
            optimizer.zero_grad()
            hidden[s_id] = repackage_hidden(hidden[s_id])

            log_prob, hidden[s_id], rnn_hs, dropped_rnn_hs = parallel_model(
                cur_data, hidden[s_id], return_h=True
            )
            raw_loss = nn.functional.nll_loss(
                log_prob.view(-1, log_prob.size(2)), cur_targets
            )

            loss = raw_loss
            # Activation Regularization
            if args.alpha > 0:
                loss = loss + sum(
                    args.alpha * dropped_rnn_h.pow(2).mean()
                    for dropped_rnn_h in dropped_rnn_hs[-1:]
                )
            # Temporal Activation Regularization (slowness)
            loss = loss + sum(
                args.beta * (rnn_h[1:] - rnn_h[:-1]).pow(2).mean()
                for rnn_h in rnn_hs[-1:]
            )
            loss *= args.small_batch_size / args.batch_size
            total_loss += raw_loss.data * args.small_batch_size / args.batch_size
            loss.backward()

            s_id += 1
            start = end
            end = start + args.small_batch_size

            gc.collect()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs.
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        optimizer.step()

        # total_loss += raw_loss.data
        optimizer.param_groups[0]["lr"] = lr2
        if batch % args.log_interval == 0 and batch > 0:
            logging.info(parallel_model.genotype())
            print(F.softmax(parallel_model.weights, dim=-1))
            cur_loss = total_loss.item() / args.log_interval
            elapsed = time.time() - start_time
            logging.info(
                "| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | "
                "loss {:5.2f} | ppl {:8.2f}".format(
                    epoch,
                    batch,
                    len(train_data) // args.bptt,
                    optimizer.param_groups[0]["lr"],
                    elapsed * 1000 / args.log_interval,
                    cur_loss,
                    math.exp(cur_loss),
                )
            )
            total_loss = 0
            start_time = time.time()
        batch += 1
        i += seq_len
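# The detach step in train() relies on a `repackage_hidden` helper from the AWD-LSTM
# codebase this search script builds on; its definition is not shown in this file.
# A minimal sketch, assuming the hidden state is a tensor or a (nested) tuple of
# tensors, could look like the following; the repo's own version may differ in detail.
def repackage_hidden(h):
    """Detach hidden states from their history so backprop stops at the batch boundary."""
    if isinstance(h, torch.Tensor):
        return h.detach()
    # LSTM-style states arrive as (h, c) tuples, possibly nested per layer.
    return tuple(repackage_hidden(v) for v in h)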
if args.continue_train:
    model = torch.load(os.path.join(args.save, "model.pt"))
else:
    model = model.RNNModelSearch(
        ntokens,
        args.emsize,
        args.nhid,
        args.nhidlast,
        args.dropout,
        args.dropouth,
        args.dropoutx,
        args.dropouti,
        args.dropoute,
    )

size = 0
for p in model.parameters():
    size += p.nelement()
logging.info("param size: {}".format(size))
logging.info("initial genotype:")
logging.info(model.genotype())

if torch.cuda.is_available():
    # if torch.cuda.device_count() > 1:
    #     parallel_model = nn.DataParallel(model, dim=1)
    #     parallel_model = parallel_model.to(device)
    # else:
    parallel_model = model.to(device)
else:
    parallel_model = model
train_data = batchify(corpus.train, args.batch_size, args)
search_data = batchify(corpus.valid, args.batch_size, args)
val_data = batchify(corpus.valid, eval_batch_size, args)
test_data = batchify(corpus.test, test_batch_size, args)

ntokens = len(corpus.dictionary)
if args.continue_train:
    model = torch.load(os.path.join(args.save, 'model.pt'))
else:
    model = model.RNNModelSearch(ntokens, args.emsize, args.nhid, args.nhidlast,
                                 args.dropout, args.dropouth, args.dropoutx,
                                 args.dropouti, args.dropoute)

size = 0
for p in model.parameters():
    size += p.nelement()
logging.info('param size: {}'.format(size))
logging.info('initial genotype:')
logging.info(model.genotype())

if args.cuda:
    if args.single_gpu:
        parallel_model = model.cuda()
    else:
        parallel_model = nn.DataParallel(model, dim=1).cuda()
else:
    parallel_model = model
print(parallel_model.rnns[0]._W0.U.device, parallel_model.rnns[0]._Ws[0].U.device)

architect = Architect(parallel_model, args)
train_data = batchify(corpus.train, args.batch_size, args)
search_data = batchify(corpus.valid, args.batch_size, args)
val_data = batchify(corpus.valid, eval_batch_size, args)
test_data = batchify(corpus.test, test_batch_size, args)

ntokens = len(corpus.dictionary)
if args.continue_train:
    model = torch.load(os.path.join(args.save, 'model.pt'))
else:
    model = model.RNNModelSearch(ntokens, args.emsize, args.nhid, args.nhidlast,
                                 args.dropout, args.dropouth, args.dropoutx,
                                 args.dropouti, args.dropoute)

size = 0
for p in model.parameters():
    size += p.nelement()
logging.info('param size: {}'.format(size))
logging.info('initial genotype:')
logging.info(model.genotype())

if args.cuda:
    if args.single_gpu:
        parallel_model = model.cuda()
    else:
        parallel_model = nn.DataParallel(model, dim=1).cuda()
else:
    parallel_model = model

architect = Architect(parallel_model, args)

total_params = sum(x.data.nelement() for x in model.parameters())
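# The data pipeline above assumes the standard AWD-LSTM-style helpers `batchify` and
# `get_batch`, which are defined elsewhere in the repo. The sketch below reflects how
# they are called here and is an assumption, not the repo's actual implementation.
def batchify(data, bsz, args):
    # Trim the token stream so it divides evenly into bsz columns, then reshape to
    # (num_steps, bsz) so successive rows are successive time steps.
    nbatch = data.size(0) // bsz
    data = data.narrow(0, 0, nbatch * bsz)
    data = data.view(bsz, -1).t().contiguous()
    return data.cuda() if args.cuda else data


def get_batch(source, i, args, seq_len=None):
    # Slice a BPTT window starting at row i; targets are the inputs shifted by one step.
    # Depending on the variant, targets may additionally be flattened with .view(-1).
    seq_len = min(seq_len if seq_len else args.bptt, len(source) - 1 - i)
    data = source[i:i + seq_len]
    target = source[i + 1:i + 1 + seq_len]
    return data, target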
def train(train_data, dev_data):
    assert args.batch_size % args.small_batch_size == 0, \
        'batch_size must be divisible by small_batch_size'

    ntokens = len(vocab.word2id)

    # Turn on training mode which enables dropout.
    total_loss = 0
    total_valid_loss = 0
    start_time = time.time()
    # ntokens = len(corpus.dictionary)
    # batch, i = 0, 0
    for batch in range(len(train_data)):
        train_batch = train_data.next_batch()
        dev_batch = dev_data.next_batch()
        # for batch, (train_batch, dev_batch) in enumerate(zip(train_data, dev_data)):
        # hidden = [model.init_hidden(args.small_batch_size)
        #           for _ in range(args.batch_size // args.small_batch_size)]
        # hidden_valid = [model.init_hidden(args.small_batch_size)
        #                 for _ in range(args.batch_size // args.small_batch_size)]
        # print('hidden shape: {} | hidden valid: {} |'.format(hidden.shape, hidden_valid.shape))
        # while i < train_data.size(0) - 1 - 1:
        bptt = args.bptt if np.random.random() < 0.95 else args.bptt / 2.
        # Prevent excessively small or negative sequence lengths
        # seq_len = max(5, int(np.random.normal(bptt, 5)))
        # There's a very small chance that it could select a very long sequence length resulting in OOM
        # seq_len = min(seq_len, args.bptt + args.max_seq_len_delta)
        # seq_len = int(bptt)

        lr2 = optimizer.param_groups[0]['lr']
        optimizer.param_groups[0]['lr'] = lr2  # * seq_len / args.bptt
        model.train()

        # data_valid, targets_valid = get_batch(search_data, i % (search_data.size(0) - 1), args)
        # data, targets = get_batch(train_data, i, args, seq_len=seq_len)

        optimizer.zero_grad()

        # start, end, s_id = 0, args.small_batch_size, 0
        cur_data = train_batch
        cur_targets = train_batch['relation']
        cur_data_valid = dev_batch
        cur_targets_valid = dev_batch['relation']

        hidden = model.init_hidden(len(train_batch['relation']))[0]
        hidden_valid = model.init_hidden(len(dev_batch['relation']))[0]
        # print('Train Batch Shapes: | Hidden: {} | Tokens: {} |'.format(hidden.shape, cur_data['tokens'].shape))
        # print('Dev Batch Shapes: | Hidden: {} | Tokens: {} |'.format(hidden_valid.shape, cur_data_valid['tokens'].shape))
        assert hidden.shape[1] == cur_data['tokens'].shape[0], \
            'Hidden shape: {} | tokens shape: {}'.format(hidden.shape, cur_data['tokens'].shape)
        assert hidden_valid.shape[1] == cur_data_valid['tokens'].shape[0], \
            'Hidden shape: {} | tokens shape: {}'.format(hidden_valid.shape, cur_data_valid['tokens'].shape)

        # while start < args.batch_size:
        #     cur_data, cur_targets = data[:, start: end], targets[:, start: end].contiguous().view(-1)
        #     cur_data_valid, cur_targets_valid = data_valid[:, start: end], targets_valid[:, start: end].contiguous().view(-1)

        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to the start of the dataset.
        # hidden[s_id] = repackage_hidden(hidden[s_id])
        # hidden_valid[s_id] = repackage_hidden(hidden_valid[s_id])
        # print(hidden.shape)
        # hidden = repackage_hidden(hidden)
        # hidden_valid = repackage_hidden(hidden_valid)

        # hidden_valid[s_id], grad_norm = architect.step(
        #     hidden[s_id], cur_data, cur_targets,
        #     hidden_valid[s_id], cur_data_valid, cur_targets_valid,
        #     optimizer,
        #     args.unrolled)
        hidden_valid, valid_loss = architect.step(hidden, cur_data, cur_targets,
                                                  hidden_valid, cur_data_valid,
                                                  cur_targets_valid, optimizer,
                                                  args.unrolled)
        total_valid_loss += valid_loss.data
        # print('Finished architect step...')

        # Assuming small_batch_size == batch_size, so we don't accumulate gradients.
        optimizer.zero_grad()
        # hidden[s_id] = repackage_hidden(hidden[s_id])
        # hidden = repackage_hidden(hidden)

        # log_prob, hidden[s_id], rnn_hs, dropped_rnn_hs = parallel_model(cur_data, hidden[s_id], return_h=True)
        # print('Entering model training...')
        hidden = torch.autograd.Variable(hidden.data)
        # Hidden should be all zeros
        print('hidden all zeros?: (not {})'.format(torch.sum(hidden)))
        log_prob, hidden, rnn_hs, dropped_rnn_hs = parallel_model(cur_data, hidden, return_h=True)
        # print('received predictions')
        raw_loss = nn.functional.nll_loss(log_prob, cur_targets)
        # print('received loss')

        loss = raw_loss
        # Activation Regularization
        if args.alpha > 0:
            loss = loss + sum(args.alpha * dropped_rnn_h.pow(2).mean()
                              for dropped_rnn_h in dropped_rnn_hs[-1:])
        # Temporal Activation Regularization (slowness)
        loss = loss + sum(args.beta * (rnn_h[1:] - rnn_h[:-1]).pow(2).mean()
                          for rnn_h in rnn_hs[-1:])
        # loss *= args.small_batch_size / args.batch_size
        total_loss += raw_loss.data  # * args.small_batch_size / args.batch_size
        loss.backward()

        # s_id += 1
        # start = end
        # end = start + args.small_batch_size
        # print('backpropagated...')

        gc.collect()
        # print('garbage collected...')

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs.
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        # print('clipped gradients...')
        optimizer.step()
        # print('updated gradients...')

        # total_loss += raw_loss.data
        optimizer.param_groups[0]['lr'] = lr2
        if batch % args.log_interval == 0:  # and batch > 0:
            logging.info(parallel_model.genotype())
            print(F.softmax(parallel_model.weights, dim=-1))
            # print('total loss: {}'.format(type(total_loss)))
            # print('total loss: {}'.format(total_loss))
            # print('total loss: {}'.format(total_loss.shape))
            # cur_loss = total_loss[0] / args.log_interval
            cur_loss = total_loss / args.log_interval
            cur_valid_loss = total_valid_loss / args.log_interval
            elapsed = time.time() - start_time
            logging.info(
                '| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                'loss {:5.2f} | ppl {:8.2f} | valid loss: {:5.2f} | valid ppl: {:5.2f}'.format(
                    epoch, batch, len(train_data),
                    optimizer.param_groups[0]['lr'],
                    elapsed * 1000 / args.log_interval,
                    cur_loss, math.exp(cur_loss),
                    cur_valid_loss, math.exp(cur_valid_loss)))
            total_loss = 0
            total_valid_loss = 0
            start_time = time.time()
        # print('on to next batch...')
        # batch += 1
        # i += seq_len

    print('Reached end of epoch training!')
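# The call to architect.step above performs the DARTS-style bilevel update: the
# architecture parameters (the weights behind parallel_model.weights, printed via
# F.softmax in the logging block) are updated on the dev batch while the regular
# optimizer updates the network weights on the train batch. The Architect class is
# defined elsewhere in the repo; purely as an illustration, a first-order
# approximation of that step could look like the sketch below. `arch_optimizer`
# (an optimizer over the architecture parameters only) is an assumed name, not
# something defined in this file, and the unrolled second-order variant selected by
# args.unrolled additionally backpropagates through one virtual weight update.
def first_order_arch_step(model, arch_optimizer, hidden_valid, data_valid, targets_valid):
    # Hold the network weights fixed and take one gradient step on the validation
    # loss with respect to the architecture parameters (first-order DARTS).
    arch_optimizer.zero_grad()
    log_prob, hidden_valid, _, _ = model(data_valid, hidden_valid, return_h=True)
    valid_loss = nn.functional.nll_loss(log_prob, targets_valid)
    valid_loss.backward()
    arch_optimizer.step()
    return hidden_valid, valid_loss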
def train():
    assert args.batch_size % args.small_batch_size == 0, \
        'batch_size must be divisible by small_batch_size'

    # Turn on training mode which enables dropout.
    total_loss = 0
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    hidden = [
        model.init_hidden(args.small_batch_size)
        for _ in range(args.batch_size // args.small_batch_size)
    ]
    batch, i = 0, 0
    model.train()
    while i < train_data.size(0) - 1 - 1:
        bptt = args.bptt if np.random.random() < 0.95 else args.bptt / 2.
        # Prevent excessively small or negative sequence lengths
        # seq_len = max(5, int(np.random.normal(bptt, 5)))
        # There's a very small chance that it could select a very long sequence length resulting in OOM
        # seq_len = min(seq_len, args.bptt + args.max_seq_len_delta)
        seq_len = int(bptt)

        lr2 = optimizer.param_groups[0]['lr']
        optimizer.param_groups[0]['lr'] = lr2 * seq_len / args.bptt

        data, targets = get_batch(train_data, i, args, seq_len=seq_len)

        start, end, s_id = 0, args.small_batch_size, 0
        while start < args.batch_size:
            cur_data, cur_targets = data[:, start:end], targets[:, start:end].contiguous().view(-1)

            optimizer.zero_grad()
            hidden[s_id] = repackage_hidden(hidden[s_id])

            parallel_model.sample_new_architecture()
            log_prob, hidden[s_id], rnn_hs, dropped_rnn_hs = parallel_model(
                cur_data, hidden[s_id], return_h=True)
            raw_loss = nn.functional.nll_loss(
                log_prob.view(-1, log_prob.size(2)), cur_targets)

            loss = raw_loss
            # Activation Regularization
            if args.alpha > 0:
                loss = loss + sum(args.alpha * dropped_rnn_h.pow(2).mean()
                                  for dropped_rnn_h in dropped_rnn_hs[-1:])
            # Temporal Activation Regularization (slowness)
            loss = loss + sum(args.beta * (rnn_h[1:] - rnn_h[:-1]).pow(2).mean()
                              for rnn_h in rnn_hs[-1:])
            loss *= args.small_batch_size / args.batch_size
            total_loss += raw_loss.data * args.small_batch_size / args.batch_size
            loss.backward()

            s_id += 1
            start = end
            end = start + args.small_batch_size

            gc.collect()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs.
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        optimizer.step()

        # total_loss += raw_loss.data
        optimizer.param_groups[0]['lr'] = lr2
        if batch % args.log_interval == 0 and batch > 0:
            cur_loss = total_loss.item() / args.log_interval
            elapsed = time.time() - start_time
            logging.info(
                '| dag_epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                'loss {:5.2f} | ppl {:8.2f}'.format(
                    epoch, batch, len(train_data) // args.bptt,
                    optimizer.param_groups[0]['lr'],
                    elapsed * 1000 / args.log_interval,
                    cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()
        batch += 1
        i += seq_len