def decompositionNet(data, lookBack, batchSize):
    scaler = MinMaxScaler(feature_range=(0, 1))
    dataset = scaler.fit_transform(data)
    # Split the sequence into samples and arrange them into the RNN input format
    trainData, testData = divideTrainTest(dataset)
    trainX, trainY = createSamples(trainData, lookBack, RNN=False)
    testX, testY = createSamples(testData, lookBack, RNN=False)
    print("testX shape:", testX.shape)
    print("testY shape:", testY.shape)
    print("trainX shape:", trainX.shape)
    print("trainY shape:", trainY.shape)
    net1 = DecompositionNetModel(inputDim=24, hiddenNum=100, outputDim=24)
    net2 = RNNModel(inputDim=1, hiddenNum=100, outputDim=1, layerNum=1, cell="RNN")
    optimizer1 = optim.RMSprop(net1.parameters(), lr=1e-4)
    optimizer2 = optim.SGD(net2.parameters(), lr=1e-3)
    prime = net1.forward()
def train_model(self): args = self.args # Load data corpus = Corpus(args.file) train_data = train.batchify(corpus.train, args.batch_size, self.device) # Build the model ntokens = len(corpus.dictionary) model = RNNModel(args.model, ntokens, args.emsize, args.nhid, args.nlayers, args.dropout, args.tied).to(self.device) # criterion = nn.NLLLoss() # criterion = nn.MSELoss() criterion = self.args.criterion optimizer = optim.Adam(model.parameters(), lr=args.lr) # Training code # Loop over epochs. lr = args.lr # At any point you can hit Ctrl + C to break out of training early. try: for epoch in range(1, args.epochs + 1): epoch_start_time = time.time() train.train(train_data, args, model, optimizer, criterion, corpus, epoch, lr, self.device) print('-' * 89) with open(args.save, 'wb') as f: torch.save(model, f) lr /= 4.0 except KeyboardInterrupt: print('-' * 89) print('Exiting from training early') return model
def train(): # Turn on training mode which enables dropout. model.train() total_loss = 0. start_time = time.time() ntokens = len(corpus.dictionary) hidden = model.init_hidden(args.batch_size) for batch, i in enumerate(range(0, train_data.size(0) - 1, args.bptt)): data, targets = get_batch(train_data, i) # Starting each batch, we detach the hidden state from how it was previously produced. # If we didn't, the model would try backpropagating all the way to start of the dataset. hidden = repackage_hidden(hidden) model.zero_grad() output, hidden = model(data, hidden) loss = criterion(output.view(-1, ntokens), targets) loss.backward() # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs. torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip) for p in model.parameters(): p.data.add_(-lr, p.grad.data) total_loss += loss.item() if batch % args.log_interval == 0 and batch > 0: cur_loss = total_loss / args.log_interval elapsed = time.time() - start_time print( '| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | ' 'loss {:5.2f} | ppl {:8.2f}'.format( epoch, batch, len(train_data) // args.bptt, lr, elapsed * 1000 / args.log_interval, cur_loss, math.exp(cur_loss))) total_loss = 0 start_time = time.time()
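# Minimal sketch, assuming a recent PyTorch version: the in-place update
# `p.data.add_(-lr, p.grad.data)` above uses a deprecated calling convention.
# The helper name `manual_sgd_step` is illustrative, not from the code above.
import torch


def manual_sgd_step(model: torch.nn.Module, lr: float, clip: float) -> None:
    """Clip gradients, then apply a plain SGD update in place."""
    torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
    with torch.no_grad():
        for p in model.parameters():
            if p.grad is not None:
                p.add_(p.grad, alpha=-lr)  # p <- p - lr * grad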
def train():
    # training file on disk
    train_file = args.train_data
    # training data class
    print("starting")
    trainData = textData(train_file, args.vocab_size)
    # model
    model = RNNModel(embedding_size=args.embedding_size, bidir=args.bidir,
                     hidden_units=args.hidden_units, vocab_size=args.vocab_size,
                     batch_size=args.batch_size, num_layers=args.num_layers,
                     num_entities=args.num_entities)
    # create the generator for the training set and validation set
    params = {
        'batch_size': args.batch_size,
        'shuffle': True,
        'num_workers': 1,
        'collate_fn': collate_fn
    }
    train_gen = data.DataLoader(trainData, **params)
    max_epochs = args.epochs
    # loss function and optimizer
    loss_func = nn.NLLLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=args.learning_rate)
    for epoch in range(max_epochs):
        for batch_x, batch_y in train_gen:
            # skip incomplete batches
            if batch_y.size()[0] < args.batch_size:
                continue
            print(batch_x)
            # zero the gradients before the backward pass
            optimizer.zero_grad()
            output = model(batch_x)
            loss = loss_func(output, batch_y)
            loss.backward()
            optimizer.step()
            print(loss)
def build_model(args, corpus): criterion = None ntokens = len(corpus.dictionary) model = RNNModel(args.model, ntokens, args.emsize, args.nhid, args.nlayers, args.dropout, args.dropouth, args.dropouti, args.dropoute, args.wdrop, args.tied) ### if args.resume: logging.info('Resuming model ...') model, criterion, optimizer = model_load(args.resume_path) optimizer.param_groups[0]['lr'] = args.lr model.dropouti, model.dropouth, model.dropout, args.dropoute = args.dropouti, args.dropouth, args.dropout, args.dropoute if args.wdrop: from weight_drop import WeightDrop for rnn in model.rnns: if type(rnn) == WeightDrop: rnn.dropout = args.wdrop elif rnn.zoneout > 0: rnn.zoneout = args.wdrop ### if not criterion: splits = [] if ntokens > 500000: # One Billion # This produces fairly even matrix mults for the buckets: # 0: 11723136, 1: 10854630, 2: 11270961, 3: 11219422 splits = [4200, 35000, 180000] elif ntokens > 75000: # WikiText-103 splits = [2800, 20000, 76000] logging.info(f'Using {splits}') criterion = SplitCrossEntropyLoss(args.emsize, splits=splits, verbose=False) ### params = list(model.parameters()) + list(criterion.parameters()) total_params = sum(x.size()[0] * x.size()[1] if len(x.size()) > 1 else x.size()[0] for x in params if x.size()) logging.info(f'Args: {args}') logging.info(f'Model total parameters: {total_params}') if args.cuda: model = model.cuda() criterion = criterion.cuda() return model, criterion
def build_model(args, ntokens: int): """ Returns model and loss function. """ print('INFO: Building model') model = RNNModel(args.model, ntokens, args.emsize, args.nhid, args.nlayers, args.dropout, args.dropouth, args.dropouti, args.dropoute, args.wdrop, args.tied) if args.cuda: print('INFO: Moving model to GPU') model.cuda() total_params = sum(x.size()[0] * x.size()[1] if len(x.size()) > 1 else x.size()[0] for x in model.parameters()) print('INFO: Model total parameters:', total_params) criterion = nn.CrossEntropyLoss() return model, criterion
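# Minimal sketch: the parameter count above only handles 1-D and 2-D tensors.
# A shape-agnostic equivalent for any torch.nn.Module (helper name is illustrative):
def num_parameters(model):
    """Total number of elements across all parameters, regardless of tensor rank."""
    return sum(p.numel() for p in model.parameters())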
                 args.n_layers).to(device)

# Load the dictionaries
with open(os.path.join(args.data_dir, "char_dict.pkl"), "rb") as f:
    model.char2int = pickle.load(f)
with open(os.path.join(args.data_dir, "int_dict.pkl"), "rb") as f:
    model.int2char = pickle.load(f)

print("Model loaded with embedding_dim {}, hidden_dim {}, vocab_size {}.".format(
    args.embedding_dim, args.hidden_dim, args.vocab_size))

# Train the model.
# Define Loss, Optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

train_main(model, optimizer, criterion, train_loader, num_batches, val_batches,
           args.batch_size, args.max_len, args.epochs, args.clip_norm, device)

# Save the parameters used to construct the model
model_info_path = os.path.join(args.model_dir, 'model_info.pth')
with open(model_info_path, 'wb') as f:
    model_info = {
        'n_layers': args.n_layers,
        'embedding_dim': args.embedding_dim,
        'hidden_dim': args.hidden_dim,
        'vocab_size': args.vocab_size,
        'drop_rate': 0.2
    }
def run(args): np.random.seed(args.seed) torch.manual_seed(args.seed) if torch.cuda.is_available(): if not args.cuda: print("WARNING: You have a CUDA device, so you should probably run with --cuda") else: torch.cuda.manual_seed(args.seed) ############################################################################### # Load data ############################################################################### def model_save(fn): with open(fn, 'wb') as f: torch.save([model, optimizer], f) def model_load(fn): global model, criterion, optimizer with open(fn, 'rb') as f: model, optimizer = torch.load(f) import os import hashlib fn = 'corpus.{}.data'.format(hashlib.md5(args.data.encode()).hexdigest()) if os.path.exists(fn): print('Loading cached dataset...') corpus = torch.load(fn) else: print('Producing dataset...') corpus = data.Corpus(args.data) torch.save(corpus, fn) # get token frequencies and eos_tokens frequencies, eos_tokens = None, None if not args.uni_freq: frequencies = corpus.frequencies if args.reinit_h: eos_tokens = corpus.reset_idxs # batchify eval_batch_size = 1 test_batch_size = 1 print(corpus.dictionary) if args.reinit_h: ntokens = len(corpus.dictionary) + 1 if args.batch_size > 1 else len(corpus.dictionary) train_data, seq_lens = batchify_padded(corpus.train, args.batch_size, args, ntokens, eos_tokens) else: ntokens = len(corpus.dictionary) train_data = batchify(corpus.train, args.batch_size, args) val_data = batchify(corpus.valid, eval_batch_size, args) test_data = batchify(corpus.test, test_batch_size, args) ############################################################################### # Build the model ############################################################################### model = RNNModel(ntokens, args.emsize, args.nhid, args.dropout, args.dropouth, args.dropouti, args.dropoute, args.wdrop, args.nsamples, args.temperature, frequencies, args.no_bias, args.bias_reg, args.dist_fn, args.activation_fn) ### if args.resume: print('Resuming model ...') model_load(args.resume) optimizer.param_groups[0]['lr'] = args.lr model.dropouti, model.dropouth, model.dropout, args.dropoute = args.dropouti, args.dropouth, args.dropout, args.dropoute ### if args.cuda: model = model.cuda() ### params = list(model.parameters()) total_params = sum(x.size()[0] * x.size()[1] if len(x.size()) > 1 else x.size()[0] for x in params if x.size()) print('Args:', args) print('Model total parameters:', total_params) ############################################################################### # Training code ############################################################################### def evaluate(data_source, epoch, batch_size=1): # Turn on evaluation mode which disables dropout. model.eval() if args.dump_hiddens: loss, entropy, hiddens = model.evaluate(data_source, eos_tokens, args.dump_hiddens) dump_hiddens(hiddens, 'hiddens_' + str(epoch)) else: loss, entropy = model.evaluate(data_source, eos_tokens) if args.dump_words: dump_words(model.encoder.weight.detach().cpu().numpy(), 'words_' + str(epoch)) if not args.dump_entropy is None: dump(entropy, args.dump_entropy + str(epoch)) return loss def train(): # Turn on training mode which enables dropout. total_loss, avrg_loss = 0, 0 start_time = time.time() ntokens = len(corpus.dictionary) batch, i = 0, 0 hidden = model.init_hidden(args.batch_size) while i < train_data.size(0)-1: if args.reinit_h: seq_len = seq_lens[batch] - 1 else: bptt = args.bptt if np.random.random() < 0.95 else args.bptt / 2. 
# Prevent excessively small or negative sequence lengths seq_len = max(5, int(np.random.normal(bptt, 5))) # prevent negative sequence lengths # There's a very small chance that it could select a very long sequence length resulting in OOM # seq_len = min(seq_len, args.bptt + 10) lr2 = optimizer.param_groups[0]['lr'] optimizer.param_groups[0]['lr'] = lr2 * seq_len / args.bptt model.train() data = get_batch(train_data, i, args, seq_len=seq_len) # Starting each batch, we detach the hidden state from how it was previously produced. # If we didn't, the model would try backpropagating all the way to start of the dataset. reset_hidden = args.reinit_h if reset_hidden: hidden = model.init_hidden(args.batch_size) hidden = repackage_hidden(hidden) optimizer.zero_grad() #raw_loss = model.train_crossentropy(data, eos_tokens) raw_loss, hidden = model(data, hidden) loss = raw_loss ''' See what we can do here! We don't need the regularization as it is implicit! # Activiation Regularization if args.alpha: loss = loss + sum(args.alpha * dropped_rnn_h.pow(2).mean() for dropped_rnn_h in dropped_rnn_hs[-1:]) # Temporal Activation Regularization (slowness) if args.beta: loss = loss + sum(args.beta * (rnn_h[1:] - rnn_h[:-1]).pow(2).mean() for rnn_h in rnn_hs[-1:]) ''' loss.backward() # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs. if args.clip: torch.nn.utils.clip_grad_norm_(params, args.clip) optimizer.step() total_loss += loss.data optimizer.param_groups[0]['lr'] = lr2 if batch % args.log_interval == 0 and batch > 0: cur_loss = total_loss.item() / args.log_interval elapsed = time.time() - start_time print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:05.5f} | ms/batch {:5.2f} | ' 'loss {:5.2f} | ppl {:8.2f} | bpc {:8.3f}'.format( epoch, batch, len(train_data) // args.bptt, optimizer.param_groups[0]['lr'], elapsed * 1000 / args.log_interval, cur_loss, cur_loss, cur_loss / math.log(2))) avrg_loss = avrg_loss + total_loss total_loss = 0 start_time = time.time() ### batch += 1 i += seq_len + 1 return avrg_loss / train_data.size(0) # Loop over epochs. lr = args.lr best_val_loss = [] valid_loss = [] stored_loss = 100000000 # At any point you can hit Ctrl + C to break out of training early. try: optimizer = None # Ensure the optimizer is optimizing params, which includes both the model's weights as well as the criterion's weight (i.e. 
Adaptive Softmax) if args.optimizer == 'sgd': optimizer = torch.optim.SGD(params, lr=args.lr, weight_decay=args.wdecay) if args.optimizer == 'adam': optimizer = torch.optim.Adam(params, lr=args.lr, weight_decay=args.wdecay) for epoch in range(1, args.epochs+1): epoch_start_time = time.time() train_loss = train() _, s, _= np.linalg.svd(model.rnn.module.weight_hh_l0.cpu().detach().numpy()) print(s[0]) #dump(model.decoder.bias.cpu().detach().numpy(), 'bias_' + str(epoch) +'.out') # skip to beginning if not in evaluation mode if epoch % args.evaluate_every > 0: print('-' * 89) print('| end of epoch {:3d} | time: {:5.2f}s | train loss {:5.2f} |'.format( epoch, (time.time() - epoch_start_time), train_loss)) print('-' * 89) continue # evaluate validation loss if 't0' in optimizer.param_groups[0]: tmp = {} for prm in model.parameters(): #if 'ax' in optimizer.state[prm]: tmp[prm] = prm.data.clone() if 'ax' in optimizer.state[prm]: prm.data = optimizer.state[prm]['ax'].clone() val_loss2 = evaluate(val_data, epoch) valid_loss.append(val_loss2) print('-' * 89) print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | ' 'valid ppl {:8.2f} | valid bpc {:8.3f}'.format( epoch, (time.time() - epoch_start_time), val_loss2, math.exp(val_loss2), val_loss2 / math.log(2))) print('-' * 89) if val_loss2 < stored_loss: model_save(args.save) print('Saving Averaged!') stored_loss = val_loss2 for prm in model.parameters(): prm.data = tmp[prm].clone() else: val_loss = evaluate(val_data, epoch, eval_batch_size) valid_loss.append(val_loss) print('-' * 89) print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | ' 'valid ppl {:8.2f} | valid bpc {:8.3f}'.format( epoch, (time.time() - epoch_start_time), val_loss, math.exp(val_loss), val_loss / math.log(2))) print('-' * 89) if val_loss < stored_loss: model_save(args.save) print('Saving model (new best validation)') stored_loss = val_loss if args.optimizer == 'sgd' and 't0' not in optimizer.param_groups[0] and (len(best_val_loss)>args.nonmono and val_loss > min(best_val_loss[:-args.nonmono])): print('Switching to ASGD') optimizer = torch.optim.ASGD(model.parameters(), lr=args.lr, t0=0, lambd=0., weight_decay=args.wdecay) if epoch in args.when: print('Saving model before learning rate decreased') model_save('{}.e{}'.format(args.save, epoch)) print('Dividing learning rate by 10') optimizer.param_groups[0]['lr'] /= 10. best_val_loss.append(val_loss) except KeyboardInterrupt: print('-' * 89) print('Exiting from training early') # Load the best saved model. model_load(args.save) # Run on test data. test_loss = evaluate(test_data, args.epochs+1, test_batch_size) print('=' * 89) print('| End of training | test loss {:5.2f} | test ppl {:8.2f} | test bpc {:8.3f}'.format( test_loss, math.exp(test_loss), test_loss / math.log(2))) print('=' * 89) return np.array(valid_loss), test_loss
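# Minimal sketch of the variable-BPTT trick used in the training loop above,
# shown in isolation: with probability 0.95 the base bptt is kept, otherwise it
# is halved, a sequence length is drawn around it, and the learning rate is
# rescaled in proportion so short sequences do not get an outsized update.
# `optimizer` and `args` are assumed to match the loop above; the helper name is illustrative.
import numpy as np


def sample_seq_len_and_rescale_lr(optimizer, args):
    bptt = args.bptt if np.random.random() < 0.95 else args.bptt / 2.
    # prevent excessively small or negative sequence lengths
    seq_len = max(5, int(np.random.normal(bptt, 5)))
    base_lr = optimizer.param_groups[0]['lr']
    optimizer.param_groups[0]['lr'] = base_lr * seq_len / args.bptt
    return seq_len, base_lr  # restore base_lr after optimizer.step()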
print(args) input_size = 88 X_train, X_valid, X_test = data_generator(args.data) nhid = args.nhid dropout = args.dropout rnn_type = args.rnn_type model = RNNModel(rnn_type, input_size, input_size, nhid) if args.cuda: model.cuda() criterion = nn.CrossEntropyLoss() lr = args.lr optimizer = getattr(optim, args.optim)(model.parameters(), lr=lr) def evaluate(X_data): model.eval() eval_idx_list = np.arange(len(X_data), dtype="int32") total_loss = 0.0 count = 0 for idx in eval_idx_list: data_line = X_data[idx] x, y = Variable(data_line[:-1]), Variable(data_line[1:]) if args.cuda: x, y = x.cuda(), y.cuda() output = model(x.unsqueeze(0)).squeeze(0) loss = -torch.trace( torch.matmul(y,
else:
    print('Building model and criterion...')
    model = RNNModel(corpus.ntoken, args.emsize, corpus.weight, args.nhid, args.nlayers,
                     args.dropouti, args.dropoutrnn, args.dropout, args.wdrop)
if args.cuda:
    model = model.cuda()
print('-' * 89)
print('Args:', args)
print('Model parameters:', count_parameters(model))
criterion = nn.CrossEntropyLoss()
if args.optimizer == 'sgd':
    optimizer = torch.optim.SGD(model.parameters(), lr=args.lr, weight_decay=args.wdecay)
else:
    optimizer = torch.optim.Adam(model.parameters(), weight_decay=args.wdecay)

###############################################################################
# evaluate function
###############################################################################
def evaluate(model, criterion, data_source, batch_size):
    total_loss = 0
    model.eval()
    with torch.no_grad():
        hidden = None
        for i in range(0, data_source.size(0) - 1, args.bptt):
                       tieweights=args.tieweights)
else:
    LMModel_start = torch.load(args.start_model).cpu()
    # Note: watch out if the model class has different methods from the loaded one to start with !!!
    LMModel = RNNModel(vocab_size=vocab_size, embed_size=args.embedsz, hidden_size=args.hiddensz,
                       num_layers=args.numlayers, dropout=args.dropout, padid=padid,
                       tieweights=args.tieweights)
    LMModel.load_state_dict(LMModel_start.state_dict())
    # LMModel = torch.load(args.save).cpu()

model_size = sum(p.nelement() for p in LMModel.parameters())
logging('-' * 30, f_log=f_log)
logging(f'Model total parameters: {model_size}', f_log=f_log)
logging('-' * 30, f_log=f_log)
# print('-' * 30)
# print(f'Model total parameters: {model_size}')
# print('-' * 30)

if torch.cuda.is_available() and cuda_device != 'cpu':
    LMModel = LMModel.cuda(cuda_device)
LMModel_parallel = None
if torch.cuda.is_available() and args.devids != 'off':
    LMModel_parallel = torch.nn.DataParallel(LMModel, device_ids=device_ids,
collate_fn=lambda x: lm_collate(x, tokenizer.term2id['PAD'])) test_loader = DataLoader( test_dataset, batch_size=64, num_workers=NUM_WORKERS, collate_fn=lambda x: lm_collate(x, tokenizer.term2id['PAD'])) model = RNNModel(ntokens, 100, 100, dropout=0.0, pad_token=tokenizer.term2id['PAD']) model = cudalize(model) loss_fn = nn.CrossEntropyLoss() optimizer = torch.optim.Adam(model.parameters(), weight_decay=0.0) cross_entropy = nn.CrossEntropyLoss() def loss_function(preds, labels, lens): # TODO: delete padding new_preds, new_labels = [], [] for pred, label, l in zip(preds, labels, lens): new_preds.append(pred[:l]) new_labels.append(label[:l]) preds = torch.cat(new_preds, dim=0) labels = torch.cat(new_labels, dim=0) return cross_entropy(preds, labels)
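# Minimal sketch of an alternative to slicing padded positions out by length:
# cross-entropy can be told to skip the PAD id directly. This assumes the labels
# carry the PAD id at padded positions; `tokenizer` is the same object used above,
# and `preds`/`labels` are assumed to be flattened to (num_tokens, vocab) / (num_tokens,).
import torch.nn as nn

pad_id = tokenizer.term2id['PAD']
masked_cross_entropy = nn.CrossEntropyLoss(ignore_index=pad_id)
# loss = masked_cross_entropy(preds.view(-1, preds.size(-1)), labels.view(-1))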
hidden_dim = 200 num_layers = 4 lr = 1e-3 log_dir = './ckpt' model_name = 'model.pth' device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") print(f"Found {device} ...") print("Instantiating RNN Model") if not os.path.exists(log_dir): os.mkdir(log_dir) model_save_path = os.path.join(log_dir, model_name) model = RNNModel(x_train.shape[-1], hidden_dim, num_layers, y_train.shape[-1]).to(device) optimizer = optim.Adam(model.parameters(), lr=lr) criterion = nn.MSELoss() print("< Training starts >") model = train(model, dataloader_train, dataloader_val, device, criterion, optimizer, n_epochs, model_save_path) print("Testing on test data-set ") log_dir = './ckpt' model_name = 'model.pth' model_save_path = os.path.join(log_dir, model_name) output_dim = 4 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") model = RNNModel(x_test.shape[-1], hidden_dim, num_layers, output_dim).to(device) y_test_pred = test(x_test, model, model_save_path, device)
D = args.emsize if args.proj else args.nhid ss = SampledSoftmax(ntokens, nsampled, D, tied_weight=twht) net.add_module("encoder", encoder) net.add_module("decoder", ss) net.cuda() tmp_net = net if world_size >= 1: tmp_net = DDP(net) tmp_net.init_hidden = net.init_hidden net = tmp_net print("Batch Size:", args.batch_size * args.scale, "Initial LR:", args.lr * args.scale) criterion = nn.CrossEntropyLoss() optimizer = Adam(net.parameters(), args.lr * args.scale, betas=(0.9, 0.999)) scheduler = LinearLR(optimizer, base_lr=args.lr * args.scale, max_iters=train_corpus.batch_num * args.epochs, last_iter=-1, min_lr=1e-8) ############################################################################### # Training code ############################################################################### def repackage_hidden(h, device_id=0): """Wraps hidden states in new Variables, to detach them from their history.""" if isinstance(h, Variable): return Variable(h.data).cuda(device_id)
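# Minimal sketch, assuming a recent PyTorch where hidden states are Tensors or
# nested tuples of Tensors (e.g. LSTM (h, c)) rather than Variables: a full
# recursive detach of the hidden state. The helper name is illustrative.
import torch


def repackage_hidden_full(h):
    """Detach hidden states from their history so backprop stops at the batch boundary."""
    if isinstance(h, torch.Tensor):
        return h.detach()
    return tuple(repackage_hidden_full(v) for v in h)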
# Set dataloader train_loader = DataLoader(dataset=TensorDataset(torch.FloatTensor(x_train), torch.LongTensor(y_train)), batch_size=batch_size, shuffle=True, num_workers=4) valid_loader = DataLoader(dataset=TensorDataset(torch.FloatTensor(x_valid), torch.LongTensor(y_valid)), batch_size=batch_size, shuffle=False, num_workers=4) print('Initial RNN model.') model = RNNModel(input_dim, output_dim).cuda() optimizer = torch.optim.RMSprop(model.parameters(), lr=0.0001, alpha=0.9) loss_func = torch.nn.CrossEntropyLoss() print('Start training.') best_ed = 999 early_stop_cnt = 0 for epoch in range(1, epochs + 1): print('Epoch: {}/{}'.format(epoch, epochs)) total_loss, total_acc, nonzeros = 0, 0, 0 widgets = [ FormatLabel(''), ' ',
train_loader, test_loader = data_generator(root, batch_size) permute = torch.Tensor(np.random.permutation(784).astype(np.float64)).long() model = RNNModel(rnn_type="LSTM", ntoken=n_classes, ninp=n_inputs, nhid=nhid, nlayers=args.num_layers) if args.cuda: model.cuda() permute = permute.cuda() lr = args.lr #optimizer = getattr(optim, args.optim)(model.parameters(), lr=lr) optimizer = optim.RMSprop(model.parameters(), lr=lr, momentum=0.9) def train(ep): global steps train_loss = 0 model.train() for batch_idx, (data, target) in enumerate(train_loader): if args.cuda: data, target = data.cuda(), target.cuda() data = data.view(-1, 1, seq_length) if args.permute: data = data[:, :, permute] # Data should be seq_len, batch, input_size, data = data.permute(2, 0, 1) data, target = Variable(data), Variable(target)
def train():
    # Load data and configure the model
    print("Loading data...")
    corpus = Corpus(train_dir)
    print(corpus)

    config = Config()
    config.vocab_size = len(corpus.dictionary)
    train_data = batchify(corpus.train, config.batch_size)
    train_len = train_data.size(0)
    seq_len = config.seq_len

    print("Configuring model...")
    model = RNNModel(config)
    if use_cuda:
        model.cuda()
    print(model)
    criterion = nn.CrossEntropyLoss()
    lr = config.learning_rate  # initial learning rate
    start_time = time.time()

    print("Training and generating...")
    for epoch in range(1, config.num_epochs + 1):  # train for multiple epochs
        total_loss = 0.0
        model.train()  # dropout is only active in training mode
        hidden = model.init_hidden(config.batch_size)  # initialize the hidden state

        for ibatch, i in enumerate(range(0, train_len - 1, seq_len)):
            data, targets = get_batch(train_data, i, seq_len)  # fetch one batch of data
            # Before each batch, detach the hidden state from how it was previously produced.
            # Otherwise, the model would try to backpropagate all the way to the start of the dataset.
            hidden = repackage_hidden(hidden)
            model.zero_grad()

            output, hidden = model(data, hidden)
            loss = criterion(output.view(-1, config.vocab_size), targets)
            loss.backward()  # backpropagation

            # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
            torch.nn.utils.clip_grad_norm(model.parameters(), config.clip)
            for p in model.parameters():  # gradient update
                p.data.add_(-lr, p.grad.data)

            total_loss += loss.data  # accumulate the loss

            if ibatch % config.log_interval == 0 and ibatch > 0:  # report status every log_interval batches
                cur_loss = total_loss[0] / config.log_interval
                elapsed = get_time_dif(start_time)
                print(
                    "Epoch {:3d}, {:5d}/{:5d} batches, lr {:2.3f}, loss {:5.2f}, ppl {:8.2f}, time {}"
                    .format(epoch, ibatch, train_len // seq_len, lr, cur_loss,
                            math.exp(cur_loss), elapsed))
                total_loss = 0.0

        lr /= 4.0  # after each epoch, try shrinking the learning rate

        # save the model parameters every save_interval epochs
        if epoch % config.save_interval == 0:
            torch.save(model.state_dict(), os.path.join(save_dir, model_name.format(epoch)))

        print(''.join(generate(model, corpus.dictionary.idx2word)))
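# Minimal sketch of an alternative formulation, not the snippet's own method:
# the manual per-epoch `lr /= 4.0` annealing expressed with an optimizer plus a
# step scheduler. `model` and `initial_lr` are assumed; the helper name is illustrative.
import torch


def make_sgd_with_quarter_decay(model, initial_lr):
    optimizer = torch.optim.SGD(model.parameters(), lr=initial_lr)
    # multiply the learning rate by 0.25 after every epoch (step_size=1)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.25)
    return optimizer, scheduler


# Usage per epoch: run the batches with optimizer.step(), then call scheduler.step().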
config = Config()
config.vocab_size = len(corpus.dictionary)
train_data = batchify(corpus.train, config.batch_size)
train_len = train_data.size(0)
seq_len = config.seq_len

print("Configuring model...")
model = RNNModel(config)
if use_cuda:
    model.cuda()
print(model)
criterion = nn.CrossEntropyLoss()
lr = config.learning_rate  # initial learning rate
best_train_loss = None
optimizer = torch.optim.Adam(model.parameters())

print("Training and generating...")
try:
    for epoch in range(1, config.num_epochs + 1):  # train for multiple epochs
        epoch_start_time = time.time()
        train_loss = train()
        print('-' * 89)
        print('| end of epoch {:3d} | time: {:5.2f}s | train loss {:5.2f} | '
              'train ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                         train_loss, math.exp(train_loss)))
        print('-' * 89)
        # Save the model if the validation loss is the best we've seen so far.
def __init__(self, save_path,seed=111,val_interval = 20, val_times=1,controller = None, batch_size=128, grad_clip=0.1, config='eval'): args = {'emsize':850, 'nhid':850, 'nhidlast':850, 'dropoute':0.1, 'wdecay':8e-7} args['config'] = config args['data'] = '../data/penn' args['lr'] = 20 args['clip'] = grad_clip args['batch_size'] = batch_size args['search_batch_size'] = 256*4 args['small_batch_size'] = batch_size args['bptt'] = 35 args['dropout'] = 0.75 args['dropouth'] = 0.25 args['dropoutx'] = 0.75 args['dropouti'] = 0.2 args['seed'] = seed args['nonmono'] = 5 args['log_interval'] = val_interval args['val_times'] = val_times args['save'] = save_path args['alpha'] = 0 args['beta'] = 1e-3 args['max_seq_length_delta'] = 20 args['unrolled'] = True args['gpu'] = 0 args['cuda'] = True args = AttrDict(args) self.args = args self.seed = seed self.controller = controller np.random.seed(args.seed) torch.manual_seed(args.seed) torch.cuda.set_device(args.gpu) cudnn.benchmark = True cudnn.enabled=True torch.cuda.manual_seed_all(args.seed) corpus = data.Corpus(args.data) self.corpus = corpus eval_batch_size = 64 test_batch_size = 1 args.eval_batch_size = eval_batch_size self.train_data = batchify(corpus.train, args.batch_size, args) self.search_data = batchify(corpus.valid, args.search_batch_size, args) # self.val_data = batchify(corpus.train[464794:], eval_batch_size, args) # self.test_data = batchify(corpus.test, test_batch_size, args) # raw_data = batchify(corpus.train, batch_size, None) # indx = np.arange(14524) # random.shuffle(indx) # self.train_data = raw_data[indx[0:int(14524/2)],:] # self.val_data = raw_data[indx[int(14524/2):],:] raw_data = batchify(corpus.valid, 1, None) val_data = [] for i in range(len(raw_data)-1-args.bptt): val_data.append(raw_data[i:i+args.bptt+1]) val_data = torch.cat(val_data,1) self.val_data = val_data print(self.train_data.shape) print(self.search_data.shape) print(self.val_data.shape) self.batch = 0 self.steps = 0 self.epochs = 0 self.total_loss = 0 self.start_time = time.time() ntokens = len(corpus.dictionary) #if args.continue_train: # model = torch.load(os.path.join(args.save, 'model.pt')) # try: # model = torch.load(os.path.join(args.save, 'model.pt')) # print('Loaded model from checkpoint') # except Exception as e: # print(e) model = RNNModel(ntokens, args.emsize, args.nhid, args.nhidlast, args.dropout, args.dropouth, args.dropoutx, args.dropouti, args.dropoute, genotype=genotypes.DARTS) size = 0 for p in model.parameters(): size += p.nelement() logging.info('param size: {}'.format(size)) logging.info('initial genotype:') logging.info(model.rnns[0].genotype) total_params = sum(x.data.nelement() for x in model.parameters()) logging.info('Args: {}'.format(args)) logging.info('Model total parameters: {}'.format(total_params)) self.model = model.cuda() self.optimizer = torch.optim.SGD(model.parameters(), lr=args.lr, weight_decay=args.wdecay)
def train():
    best_val_loss = 100
    ntokens = len(corpus.dictionary)
    train_data = batchify(corpus.train, args.batch_size)  # num_batches, batch_size
    val_data = batchify(corpus.valid, args.batch_size)
    model = RNNModel(rnn_type=args.model,
                     ntoken=ntokens,
                     ninp=args.emsize,
                     nfeat=args.nfeat,
                     nhid=args.nhid,
                     nlayers=args.nlayers,
                     font_path=args.font_path,
                     font_size=args.font_size,
                     dropout=args.dropout,
                     tie_weights=args.tied,
                     ).to(device)
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

    print('start training...')
    hidden = model.init_hidden(args.batch_size)
    epoch_start_time = time.time()
    for epoch in range(args.epochs):
        model.eval()  # evaluate on the validation set
        total_loss = 0.
        with torch.no_grad():
            for idx in range(0, val_data.size(0) - 1, args.bptt):
                data, targets = get_batch(val_data, idx)
                output, hidden = model(data, hidden)
                output_flat = output.view(-1, ntokens)  # (seq_len, batch, ntokens) -> (seq_len*batch, ntokens)
                total_loss += len(data) * criterion(output_flat, targets.view(-1)).item()
                hidden = repackage_hidden(hidden)
        val_loss = total_loss / len(val_data)
        best_val_loss = min(best_val_loss, val_loss)
        print('-' * 100)
        print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | valid ppl {:8.2f} | best valid ppl {:8.2f}'
              .format(epoch, (time.time() - epoch_start_time), val_loss, math.exp(val_loss), math.exp(best_val_loss)))
        print('-' * 100)
        epoch_start_time = time.time()
        if val_loss == best_val_loss:
            # Save the model if the validation loss is best so far.
            torch.save(model, os.path.join(args.save, 'model.pkl'))
        else:
            # Note: this anneals args.lr (the value printed below); the Adam
            # optimizer created above keeps the learning rate it was built with.
            args.lr /= 4.0

        model.train()  # train on the training set
        total_loss = 0.
        start_time = time.time()
        for i, idx in enumerate(range(0, train_data.size(0) - 1, args.bptt)):
            data, targets = get_batch(train_data, idx)
            hidden = repackage_hidden(hidden)
            model.zero_grad()
            # compute the loss and gradients
            output, hidden = model(data, hidden)
            loss = criterion(output.view(-1, ntokens), targets.view(-1))
            loss.backward()
            total_loss += loss.item()
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
            # update the parameters with the gradients
            optimizer.step()
            # for p in model.parameters():
            #     p.data.add_(-args.lr, p.grad.data)
            if i % args.log_interval == 0 and i > 0:
                cur_loss = total_loss / args.log_interval
                elapsed = time.time() - start_time
                print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | loss {:5.2f} | ppl {:8.2f}'
                      .format(epoch + 1, i, len(train_data) // args.bptt, args.lr,
                              elapsed * 1000 / args.log_interval, cur_loss, math.exp(cur_loss)))
                total_loss = 0
                start_time = time.time()
""" ----------- Model Creation ------------""" number_tokens = len(corpus.dictionary) # Number of unique word in our corpus model = RNNModel(rnn_type = rt, ntoken = number_tokens, ninp = embedding_size, nhid = number_hidden, nlayers = number_layer, drop_rate = dropout, tie_weights = tied) if cuda and torch.cuda.is_available(): model = model.cuda() criterion = nn.CrossEntropyLoss() optimizer = optim.RMSprop(model.parameters(), lr = learning_rate) """ ----------- Training Code ------------""" def detach_hidden(h): # detach from distant history if type(h) == V: return V(h.data) else: return tuple(detach_hidden(v) for v in h) def get_batch(source, i, sequence_length): seq_len = min(sequence_length, len(source) - 1 - i) # torch.cat([data.data.view(-1).unsqueeze(-1), target.data.unsqueeze(-1)], dim=1) data = V(source[i:i+seq_len]).type(LongType) target = V(source[i+1:i+1+seq_len].view(-1)).type(LongType) return data, target
class DartsTrainer(): def __init__(self, arm): # Default params for eval network args = { 'emsize': 850, 'nhid': 850, 'nhidlast': 850, 'dropoute': 0.1, 'wdecay': 8e-7 } args['data'] = '/home/liamli4465/darts/data/penn' args['lr'] = 20 args['clip'] = 0.25 args['batch_size'] = 64 args['search_batch_size'] = 256 * 4 args['small_batch_size'] = 64 args['bptt'] = 35 args['dropout'] = 0.75 args['dropouth'] = 0.25 args['dropoutx'] = 0.75 args['dropouti'] = 0.2 args['seed'] = arm['seed'] args['nonmono'] = 5 args['log_interval'] = 50 args['save'] = arm['dir'] args['alpha'] = 0 args['beta'] = 1e-3 args['max_seq_length_delta'] = 20 args['unrolled'] = True args['gpu'] = 0 args['cuda'] = True args['genotype'] = arm['genotype'] args = AttrDict(args) self.args = args self.epoch = 0 np.random.seed(args.seed) torch.manual_seed(args.seed) torch.cuda.set_device(args.gpu) cudnn.benchmark = True cudnn.enabled = True torch.cuda.manual_seed_all(args.seed) corpus = data.Corpus(args.data) self.corpus = corpus self.eval_batch_size = 10 self.test_batch_size = 1 self.train_data = batchify(corpus.train, args.batch_size, args) self.search_data = batchify(corpus.valid, args.search_batch_size, args) self.val_data = batchify(corpus.valid, self.eval_batch_size, args) self.test_data = batchify(corpus.test, self.test_batch_size, args) self.ntokens = len(corpus.dictionary) def model_save(self, fn, to_save): if self.epoch % 150 == 0: with open( os.path.join(self.args.save, "checkpoint-incumbent-%d" % self.epoch), 'wb') as f: torch.save(to_save, f) with open(fn, 'wb') as f: torch.save(to_save, f) def model_load(self, fn): with open(fn, 'rb') as f: self.model, self.optimizer, rng_state, cuda_state = torch.load(f) torch.set_rng_state(rng_state) torch.cuda.set_rng_state(cuda_state) def model_resume(self, filename): logging.info('Resuming model from %s' % filename) self.model_load(filename) self.optimizer.param_groups[0]['lr'] = self.args.lr for rnn in self.model.rnns: rnn.genotype = self.args.genotype def train_epochs(self, epochs): args = self.args resume_filename = os.path.join(self.args.save, "checkpoint.incumbent") if os.path.exists(resume_filename): self.model_resume(resume_filename) logging.info('Loaded model from checkpoint') else: self.model = RNNModel(self.ntokens, args.emsize, args.nhid, args.nhidlast, args.dropout, args.dropouth, args.dropoutx, args.dropouti, args.dropoute, genotype=args.genotype) self.optimizer = torch.optim.SGD(self.model.parameters(), lr=args.lr, weight_decay=args.wdecay) size = 0 for p in self.model.parameters(): size += p.nelement() logging.info('param size: {}'.format(size)) logging.info('initial genotype:') logging.info(self.model.rnns[0].genotype) total_params = sum(x.data.nelement() for x in self.model.parameters()) logging.info('Args: {}'.format(args)) logging.info('Model total parameters: {}'.format(total_params)) self.model = self.model.cuda() # Loop over epochs. lr = args.lr best_val_loss = [] stored_loss = 100000000 # At any point you can hit Ctrl + C to break out of training early. 
try: for epoch in range(epochs): epoch_start_time = time.time() self.train() if 't0' in self.optimizer.param_groups[0]: tmp = {} for prm in self.model.parameters(): tmp[prm] = prm.data.clone() prm.data = self.optimizer.state[prm]['ax'].clone() val_loss2 = self.evaluate(self.val_data) logging.info('-' * 89) logging.info( '| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | ' 'valid ppl {:8.2f} | valid bpc {:8.3f}'.format( self.epoch, (time.time() - epoch_start_time), val_loss2, math.exp(val_loss2), val_loss2 / math.log(2))) logging.info('-' * 89) if val_loss2 < stored_loss: self.model_save( os.path.join(args.save, 'checkpoint.incumbent'), [ self.model, self.optimizer, torch.get_rng_state(), torch.cuda.get_rng_state() ]) logging.info('Saving Averaged!') stored_loss = val_loss2 for prm in self.model.parameters(): prm.data = tmp[prm].clone() else: val_loss = self.evaluate(self.val_data, self.eval_batch_size) logging.info('-' * 89) logging.info( '| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | ' 'valid ppl {:8.2f} | valid bpc {:8.3f}'.format( self.epoch, (time.time() - epoch_start_time), val_loss, math.exp(val_loss), val_loss / math.log(2))) logging.info('-' * 89) if val_loss < stored_loss: self.model_save( os.path.join(args.save, 'checkpoint.incumbent'), [ self.model, self.optimizer, torch.get_rng_state(), torch.cuda.get_rng_state() ]) logging.info('Saving model (new best validation)') stored_loss = val_loss if (self.epoch > 75 and 't0' not in self.optimizer.param_groups[0] and (len(best_val_loss) > args.nonmono and val_loss > min(best_val_loss[:-args.nonmono]))): logging.info('Switching to ASGD') self.optimizer = torch.optim.ASGD( self.model.parameters(), lr=args.lr, t0=0, lambd=0., weight_decay=args.wdecay) best_val_loss.append(val_loss) except Exception as e: logging.info('-' * 89) logging.info(e) logging.info('Exiting from training early') return 0, 10000, 10000 # Load the best saved model. self.model_load(os.path.join(args.save, 'checkpoint.incumbent')) # Run on test data. val_loss = self.evaluate(self.val_data, self.eval_batch_size) logging.info(math.exp(val_loss)) test_loss = self.evaluate(self.test_data, self.test_batch_size) logging.info('=' * 89) logging.info( '| End of training | test loss {:5.2f} | test ppl {:8.2f} | test bpc {:8.3f}' .format(test_loss, math.exp(test_loss), test_loss / math.log(2))) logging.info('=' * 89) return 0, math.exp(val_loss), math.exp(test_loss) def train(self): args = self.args corpus = self.corpus total_loss = 0 start_time = time.time() hidden = [ self.model.init_hidden(args.small_batch_size) for _ in range(args.batch_size // args.small_batch_size) ] batch, i = 0, 0 while i < self.train_data.size(0) - 1 - 1: bptt = args.bptt if np.random.random() < 0.95 else args.bptt / 2. # Prevent excessively small or negative sequence lengths seq_len = max(5, int(np.random.normal(bptt, 5))) # There's a very small chance that it could select a very long sequence length resulting in OOM seq_len = min(seq_len, args.bptt + args.max_seq_length_delta) lr2 = self.optimizer.param_groups[0]['lr'] self.optimizer.param_groups[0]['lr'] = lr2 * seq_len / args.bptt self.model.train() data, targets = get_batch(self.train_data, i, args, seq_len=seq_len) self.optimizer.zero_grad() start, end, s_id = 0, args.small_batch_size, 0 while start < args.batch_size: cur_data, cur_targets = data[:, start: end], targets[:, start: end].contiguous( ).view(-1) # Starting each batch, we detach the hidden state from how it was previously produced. 
# If we didn't, the model would try backpropagating all the way to start of the dataset. hidden[s_id] = repackage_hidden(hidden[s_id]) log_prob, hidden[s_id], rnn_hs, dropped_rnn_hs = self.model( cur_data, hidden[s_id], return_h=True) raw_loss = nn.functional.nll_loss( log_prob.view(-1, log_prob.size(2)), cur_targets) loss = raw_loss # Activiation Regularization if args.alpha > 0: loss = loss + sum(args.alpha * dropped_rnn_h.pow(2).mean() for dropped_rnn_h in dropped_rnn_hs[-1:]) # Temporal Activation Regularization (slowness) loss = loss + sum(args.beta * (rnn_h[1:] - rnn_h[:-1]).pow(2).mean() for rnn_h in rnn_hs[-1:]) loss *= args.small_batch_size / args.batch_size total_loss += raw_loss.data * args.small_batch_size / args.batch_size loss.backward() s_id += 1 start = end end = start + args.small_batch_size gc.collect() # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs. torch.nn.utils.clip_grad_norm(self.model.parameters(), args.clip) self.optimizer.step() # total_loss += raw_loss.data self.optimizer.param_groups[0]['lr'] = lr2 if np.isnan(total_loss[0]): raise #if batch % args.log_interval == 0 and batch > 0: # cur_loss = total_loss[0] / args.log_interval # elapsed = time.time() - start_time # logging.info('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | ' # 'loss {:5.2f} | ppl {:8.2f}'.format( # self.epoch, batch, len(self.train_data) // args.bptt, self.optimizer.param_groups[0]['lr'], # elapsed * 1000 / args.log_interval, cur_loss, math.exp(cur_loss))) # total_loss = 0 # start_time = time.time() batch += 1 i += seq_len self.epoch += 1 def evaluate(self, data_source, batch_size=10): # Turn on evaluation mode which disables dropout. self.model.eval() total_loss = 0 hidden = self.model.init_hidden(batch_size) for i in range(0, data_source.size(0) - 1, self.args.bptt): data, targets = get_batch(data_source, i, self.args, evaluation=True) targets = targets.view(-1) log_prob, hidden = self.model(data, hidden) loss = nn.functional.nll_loss(log_prob.view(-1, log_prob.size(2)), targets).data total_loss += loss * len(data) hidden = repackage_hidden(hidden) return total_loss[0] / len(data_source)
class WordLanguageModelTrial(PyTorchTrial): def __init__(self, context: PyTorchTrialContext): self.context = context data_config = self.context.get_data_config() hparams = self.context.get_hparams() using_bind_mount = data_config["use_bind_mount"] use_cache = data_config["use_cache"] self.eval_batch_size = hparams["eval_batch_size"] download_directory = ( Path(data_config["bind_mount_path"]) if using_bind_mount else Path("/data")) / f"data-rank{self.context.distributed.get_rank()}" self.corpus = data.load_and_cache_dataset(download_directory, use_cache) self.model_cls = hparams["model_cls"] emsize = hparams["word_embeddings_size"] num_hidden = hparams["num_hidden"] num_layers = hparams["num_layers"] dropout = hparams["dropout"] self.bptt = hparams["bptt"] if self.model_cls.lower() == "transformer": num_heads = hparams["num_heads"] self.model = TransformerModel(self.corpus.ntokens, emsize, num_heads, num_hidden, num_layers, dropout) else: tied = hparams["tied"] self.model = RNNModel( self.model_cls, self.corpus.ntokens, emsize, num_hidden, num_layers, dropout, tied, ) self.model = self.context.wrap_model(self.model) self.criterion = nn.NLLLoss() lr = hparams["lr"] optimizer = torch.optim.SGD(self.model.parameters(), lr=lr) self.optimizer = self.context.wrap_optimizer(optimizer) self.lr_scheduler = self.context.wrap_lr_scheduler( torch.optim.lr_scheduler.ReduceLROnPlateau( self.optimizer, factor=0.25, patience=0, threshold=0.001, threshold_mode="abs", verbose=True, ), LRScheduler.StepMode.MANUAL_STEP, ) def build_training_data_loader(self) -> DataLoader: train_dataset = data.WikiTextDataset( self.corpus, batch_size=self.context.get_per_slot_batch_size(), ) batch_samp = data.BatchSamp(train_dataset, self.bptt) return DataLoader(train_dataset, batch_sampler=batch_samp) def build_validation_data_loader(self) -> DataLoader: val_dataset = data.WikiTextDataset( self.corpus, batch_size=self.eval_batch_size, valid=True, ) self.val_data_len = len(val_dataset) - 1 batch_samp = data.BatchSamp(val_dataset, self.bptt) return DataLoader(val_dataset, batch_sampler=batch_samp) def train_batch(self, batch: TorchData, epoch_idx: int, batch_idx: int) -> Dict[str, Union[torch.Tensor, float]]: if batch_idx == 0 and self.model_cls.lower() != "transformer": self.hidden = self.model.init_hidden( self.context.get_per_slot_batch_size()) inputs = batch[:-1] labels = batch[1:].view(-1) if self.model_cls.lower() == "transformer": output = self.model(inputs) output = output.view(-1, self.corpus.ntokens) else: self.hidden = self.model.repackage_hidden(self.hidden) output, self.hidden = self.model(inputs, self.hidden) loss = self.criterion(output, labels) self.context.backward(loss) self.context.step_optimizer( self.optimizer, clip_grads=lambda params: torch.nn.utils.clip_grad_norm_( params, self.context.get_hparam("max_grad_norm")), ) return { "loss": loss, "lr": float(self.optimizer.param_groups[0]["lr"]) } def evaluate_full_dataset( self, data_loader: DataLoader) -> Dict[str, torch.Tensor]: validation_loss = 0.0 if self.model_cls.lower() != "transformer": self.hidden = self.model.init_hidden(self.eval_batch_size) for batch in data_loader: batch = self.context.to_device(batch) if self.model_cls.lower() == "transformer": output = self.model(batch[:-1]) output = output.view(-1, self.corpus.ntokens) else: output, self.hidden = self.model(batch[:-1], self.hidden) self.hidden = self.model.repackage_hidden(self.hidden) validation_loss += ( len(batch[:-1]) * self.criterion(output, batch[1:].view(-1)).item()) validation_loss /= 
len(data_loader.dataset) - 1 self.lr_scheduler.step(validation_loss) if self.model_cls.lower() != "transformer": self.hidden = self.model.init_hidden( self.context.get_per_slot_batch_size()) return {"validation_loss": validation_loss}
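# Minimal sketch: the other snippets report perplexity and bits-per-character
# from the same cross-entropy loss, so the validation metric returned above
# could be extended the same way. The helper name is illustrative.
import math


def loss_to_metrics(loss: float) -> dict:
    return {
        "validation_loss": loss,
        "validation_ppl": math.exp(loss),       # perplexity = e^loss
        "validation_bpc": loss / math.log(2),   # bits per character/token
    }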
def __init__(self, save_path, seed, batch_size, grad_clip, config='eval'): if config == 'search': args = { 'emsize': 300, 'nhid': 300, 'nhidlast': 300, 'dropoute': 0, 'wdecay': 5e-7 } elif config == 'eval': args = { 'emsize': 850, 'nhid': 850, 'nhidlast': 850, 'dropoute': 0.1, 'wdecay': 8e-7 } args['config'] = config args['data'] = '/home/liamli4465/darts/data/penn' args['lr'] = 20 args['clip'] = grad_clip args['batch_size'] = batch_size args['search_batch_size'] = 256 * 4 args['small_batch_size'] = batch_size args['bptt'] = 35 args['dropout'] = 0.75 args['dropouth'] = 0.25 args['dropoutx'] = 0.75 args['dropouti'] = 0.2 args['seed'] = seed args['nonmono'] = 5 args['log_interval'] = 50 args['save'] = save_path args['alpha'] = 0 args['beta'] = 1e-3 args['max_seq_length_delta'] = 20 args['unrolled'] = True args['gpu'] = 0 args['cuda'] = True args = AttrDict(args) self.args = args self.seed = seed np.random.seed(args.seed) torch.manual_seed(args.seed) torch.cuda.set_device(args.gpu) cudnn.benchmark = True cudnn.enabled = True torch.cuda.manual_seed_all(args.seed) corpus = data.Corpus(args.data) self.corpus = corpus eval_batch_size = 10 test_batch_size = 1 self.train_data = batchify(corpus.train, args.batch_size, args) self.search_data = batchify(corpus.valid, args.search_batch_size, args) self.val_data = batchify(corpus.valid, eval_batch_size, args) self.test_data = batchify(corpus.test, test_batch_size, args) self.batch = 0 self.steps = 0 self.epochs = 0 self.total_loss = 0 self.start_time = time.time() ntokens = len(corpus.dictionary) # if args.continue_train: # model = torch.load(os.path.join(args.save, 'model.pt')) try: model = torch.load(os.path.join(args.save, 'model.pt')) print('Loaded model from checkpoint') except Exception as e: print(e) model = RNNModel(ntokens, args.emsize, args.nhid, args.nhidlast, args.dropout, args.dropouth, args.dropoutx, args.dropouti, args.dropoute, genotype=genotypes.DARTS) size = 0 for p in model.parameters(): size += p.nelement() logging.info('param size: {}'.format(size)) logging.info('initial genotype:') logging.info(model.rnns[0].genotype) total_params = sum(x.data.nelement() for x in model.parameters()) logging.info('Args: {}'.format(args)) logging.info('Model total parameters: {}'.format(total_params)) self.model = model.cuda() self.optimizer = torch.optim.SGD(model.parameters(), lr=args.lr, weight_decay=args.wdecay)
ntokens, args.emsize, args.nhid, args.nlayers, args.dropout, args.rnn_dropout, args.output_dropout, args.tied, adasoft=args.adasoft, cutoff=cutoff) if torch.cuda.is_available(): model.cuda() if args.optim == 'SGD': optimizer = torch.optim.SGD(params=model.parameters(), lr=args.lr) elif args.optim == 'rms': optimizer = torch.optim.RMSprop(params=model.parameters(), lr=args.lr, weight_decay=0.00001) else: raise Exception criterion = None if args.adasoft: criterion = AdaptiveLoss([*cutoff, ntokens + 1]) else: criterion = nn.CrossEntropyLoss() ############################################################################### # Training code
datefmt='%H:%M:%S', level=logging.INFO) else: logging.basicConfig(format='%(asctime)s: %(message)s', datefmt='%H:%M:%S', filename=os.path.join(args.out, 'train.log'), level=logging.INFO) tb.configure(args.out) random.seed(1024) torch.manual_seed(1024) torch.cuda.manual_seed_all(1024) model = RNNModel(123, 62, 250, 3, args.dropout, bidirectional=args.bi) if args.init: model.load_state_dict(torch.load(args.init)) else: for param in model.parameters(): torch.nn.init.uniform(param, -0.1, 0.1) if args.cuda: model.cuda() optimizer = torch.optim.SGD(model.parameters(), lr=args.lr, momentum=.9) criterion = CTCLoss() # data set trainset = SequentialLoader('train', args.batch_size) devset = SequentialLoader('dev', args.batch_size) tri = cvi = 0 def eval(): global cvi