def train_model(dataloader, model, criterion, optimizer, device, num_epochs, dataset_size):
    model.to(device)
    since = time.time()
    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0
    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)
        for phase in ['train', 'test']:
            if phase == 'train':
                model.train()
            else:
                model.eval()
            running_loss = 0.0
            running_corrects = 0
            for inputs, labels in tqdm(dataloader[phase]):
                inputs = inputs.to(device)
                labels = labels.to(device)
                optimizer.zero_grad()
                # Track gradients only during the training phase.
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(inputs)
                    _, pred = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(pred == labels.data)
            epoch_loss = running_loss / dataset_size[phase]
            epoch_acc = running_corrects.double() / dataset_size[phase]
            print('{} Loss: {:.4f} Acc: {:.4f}'.format(phase, epoch_loss, epoch_acc))
            # Keep a copy of the best-performing weights on the test split.
            if phase == 'test' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())
                torch.save(best_model_wts,
                           osp.join(Config['root_path'], Config['checkpoint_path'], 'model.pth'))
                print('Model saved at: {}'.format(
                    osp.join(Config['root_path'], Config['checkpoint_path'], 'model.pth')))
    time_elapsed = time.time() - since
    print('Time taken to complete training: {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best acc: {:.4f}'.format(best_acc))
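# A minimal usage sketch for train_model above. The dataset paths, Config values, and
# model choice are illustrative assumptions; the snippet additionally relies on
# time, copy, os.path as osp, and tqdm being imported at module level.
import torch, torch.nn as nn, torch.optim as optim
from torchvision import datasets, models, transforms

Config = {'root_path': '.', 'checkpoint_path': 'checkpoints'}  # hypothetical checkpoint location
transform = transforms.Compose([transforms.Resize((224, 224)), transforms.ToTensor()])
image_datasets = {split: datasets.ImageFolder('data/' + split, transform)
                  for split in ['train', 'test']}
dataloader = {split: torch.utils.data.DataLoader(image_datasets[split], batch_size=32,
                                                 shuffle=(split == 'train'))
              for split in ['train', 'test']}
dataset_size = {split: len(image_datasets[split]) for split in ['train', 'test']}

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
net = models.resnet18(pretrained=True)
net.fc = nn.Linear(net.fc.in_features, len(image_datasets['train'].classes))
train_model(dataloader, net, nn.CrossEntropyLoss(),
            optim.SGD(net.parameters(), lr=0.001, momentum=0.9),
            device, num_epochs=10, dataset_size=dataset_size)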
def save_checkpoint(epoch):
    model_folder = "../training/"
    model_out_path = model_folder + "epoch_{}.pth".format(epoch + args.save_epoch_bias)
    if not os.path.exists(model_folder):
        os.makedirs(model_folder)
    torch.save(model.state_dict(), model_out_path)
    print("===> Checkpoint saved to {}".format(model_out_path))
def train_iters():
    best_valid_acc = 0
    for e in range(EPOCHS):
        print(f"epoch: {e}")
        train_acc, train_loss, train_tgt, train_pred = train(model)
        print(f"training loss: {train_loss:.4f} | training accuracy: {train_acc:.4f} | training precision:"
              f" {precision_score(train_tgt, train_pred):.4f} | training recall:"
              f" {recall_score(train_tgt, train_pred):.4f}")
        valid_acc, valid_loss, expected, prediction = evaluate(model, valid_loader, valid_len)
        print(f"validation loss: {valid_loss:.4f} | validation accuracy: {valid_acc:.4f} | validation precision:"
              f" {precision_score(expected, prediction):.4f} | validation recall: "
              f"{recall_score(expected, prediction):.4f}")
        if best_valid_acc < valid_acc:
            # Report the gain over the previous best (valid_acc - best_valid_acc, not the reverse).
            print("new best model! improvement: %f" % (valid_acc - best_valid_acc))
            best_valid_acc = valid_acc
            torch.save(model.state_dict(), 'model.pt')
def main():
    model = Net()
    if torch.cuda.is_available():
        model.cuda()
    model.apply(weights_init)
    if args.resume:
        if isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            model.load_state_dict(checkpoint['state_dict'])
            print("=> loaded checkpoint '{}'".format(args.resume))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    # Data preprocessing is handled directly inside train().
    # dataParser = DataParser(batch_size)

    loss_function = nn.L1Loss()
    optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=0.9, weight_decay=5e-4)
    # train_scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=settings.MILESTONES, gamma=0.2)  # learning rate decay
    scheduler = lr_scheduler.StepLR(optimizer, step_size=args.stepsize, gamma=args.gamma)

    log = Logger(join(TMP_DIR, '%s-%d-log.txt' % ('SGD', args.lr)))  # label the log after the optimizer actually used
    sys.stdout = log

    train_loss = []
    train_loss_detail = []
    for epoch in range(args.start_epoch, args.maxepoch):
        if epoch == 0:
            print("Performing initial testing...")
            # left empty for now
        tr_avg_loss, tr_detail_loss = train(model=model, optimizer=optimizer, epoch=epoch,
                                            save_dir=join(TMP_DIR, 'epoch-%d-training-record' % epoch))
        test()
        log.flush()
        # Save checkpoint
        save_file = os.path.join(TMP_DIR, 'checkpoint_epoch{}.pth'.format(epoch))
        save_checkpoint({'epoch': epoch,
                         'state_dict': model.state_dict(),
                         'optimizer': optimizer.state_dict()},
                        filename=save_file)
        scheduler.step()  # adjust the learning rate automatically
        train_loss.append(tr_avg_loss)
        train_loss_detail += tr_detail_loss
def train_model(dataloader, model, criterion, optimizer, device, num_epochs, dataset_size):
    model.to(device)
    for epoch in range(num_epochs):
        print('-' * 15)
        print('Epoch {}/{}'.format(epoch + 1, num_epochs))
        for phase in ['train', 'val']:  # train and validate every epoch
            if phase == 'train':
                model.train()
            else:
                model.eval()
            running_loss = 0.0
            # Iterate over samples one at a time; each is unsqueezed into a batch of 1.
            for i in tqdm(range(len(dataloader[phase].dataset[0]))):
                inputs = dataloader[phase].dataset[0][i]
                labels = dataloader[phase].dataset[1][i]
                inputs = inputs.unsqueeze(0).to(device)
                labels = labels.unsqueeze(0).to(device)
                optimizer.zero_grad()
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(inputs)
                    loss = criterion(outputs, labels)
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()
                running_loss += loss.item() * inputs.size(0)
            epoch_loss = running_loss / dataset_size[phase]
            print('{} Loss: {:.4f}'.format(phase, epoch_loss))
    # save the model
    # saved_model = copy.deepcopy(model.state_dict())
    with open(osp.join(Config['path'], "my_model.pth"), "wb") as output_file:
        torch.save(model.state_dict(), output_file)
def train(epoch):
    print('#' * 15)
    print('Epoch {}, Latent Size {}'.format(epoch, model.latent_size))
    print('#' * 15)
    model.train()
    for index, (x, _) in enumerate(loader):
        # Collapse the channel dimension to grayscale before encoding.
        x = x.mean(dim=1, keepdim=True).to(device)
        optimizer.zero_grad()
        x_generated, mu, logvar = model(x)
        loss = get_loss(x_generated, x, mu, logvar)
        loss.backward()
        optimizer.step()
        if index % 100 == 0:
            print('Loss at iteration {0}: {1:.4f}'.format(index, loss.item()))
    if epoch == 4:
        filename = 'epoch{}_ls{}.pkl'.format(epoch, model.latent_size)
        torch.save(model.state_dict(), os.path.join(weights_dir, filename))
    if epoch < 4:
        scheduler.step()
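# get_loss is not defined in this snippet; below is a minimal sketch assuming the
# standard VAE objective (reconstruction + KL divergence), which matches the
# (x_generated, x, mu, logvar) signature used above. Inputs are assumed to lie in [0, 1].
import torch
import torch.nn.functional as F

def get_loss(x_generated, x, mu, logvar):
    # Reconstruction term: per-pixel binary cross-entropy, summed over the batch.
    recon = F.binary_cross_entropy(x_generated, x, reduction='sum')
    # KL term for a diagonal Gaussian posterior vs. a standard normal prior:
    # -0.5 * sum(1 + log(sigma^2) - mu^2 - sigma^2)
    kld = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
    return recon + kld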
def train_model(dataloader, model, criterion, optimizer, device, num_epochs, dataset_size):
    model.to(device)
    since = time.time()
    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0
    acc_list = []
    loss_list = []
    test_acc_list = []
    test_loss_list = []
    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)
        for phase in ['train', 'test']:
            if phase == 'train':
                model.train()
            else:
                model.eval()
            running_loss = 0.0
            running_corrects = 0
            for input1, input2, labels in tqdm(dataloader[phase], position=0, leave=True):
                input1 = input1.to(device)
                input2 = input2.to(device)
                labels = labels.to(device)
                optimizer.zero_grad()
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(input1, input2)
                    outputs = torch.reshape(outputs, (outputs.shape[0],))
                    outputs = outputs.type(torch.DoubleTensor)
                    labels = labels.type(torch.DoubleTensor)
                    # Threshold the scores: outputs above 0.5 map to class 0, otherwise class 1.
                    pred = (outputs <= 0.5).float()
                    loss = criterion(outputs, labels)
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()
                running_loss += loss.item() * input1.size(0)
                running_corrects += torch.sum(pred == labels.data)
            epoch_loss = running_loss / dataset_size[phase]
            # Convert to a plain float so the lists can be written out with np.savetxt.
            epoch_acc = (running_corrects.double() / dataset_size[phase]).item()
            if phase == 'train':
                acc_list.append(epoch_acc)
                loss_list.append(epoch_loss)
            elif phase == 'test':
                test_acc_list.append(epoch_acc)
                test_loss_list.append(epoch_loss)
            print('{} Loss: {:.4f} Acc: {:.4f}'.format(phase, epoch_loss, epoch_acc))
            if phase == 'test' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())
                torch.save(best_model_wts,
                           osp.join(Config['root_path'], Config['checkpoint_path'], 'model.pth'))
                print('Model saved at: {}'.format(
                    osp.join(Config['root_path'], Config['checkpoint_path'], 'model.pth')))
    time_elapsed = time.time() - since
    print('Time taken to complete training: {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best acc: {:.4f}'.format(best_acc))
    np.savetxt('acc_list.txt', acc_list)
    np.savetxt('test_acc_list.txt', test_acc_list)
    np.savetxt('loss_list.txt', loss_list)
    np.savetxt('test_loss_list.txt', test_loss_list)
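# Hypothetical follow-up for the snippet above: reloading the best checkpoint it saved.
# The Config keys are the ones the snippet already assumes.
model.load_state_dict(torch.load(osp.join(Config['root_path'], Config['checkpoint_path'], 'model.pth')))
model.eval()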
epoch_start_time = time.time()
train(args, model, train_dataset, epoch)
val_loss = evaluate(args, model, test_dataset)
print('-' * 89)
print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.4f} | '.format(
    epoch, (time.time() - epoch_start_time), val_loss))
print('-' * 89)
generate_output(args, epoch, model, gen_dataset, startPoint=1500)

if epoch % args.save_interval == 0:
    # Save the model if the validation loss is the best we've seen so far
    # (lower is better, so compare with < and keep the minimum).
    is_best = val_loss < best_val_loss
    best_val_loss = min(val_loss, best_val_loss)
    model_dictionary = {'epoch': epoch,
                        'best_loss': best_val_loss,
                        'state_dict': model.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'args': args}
    model.save_checkpoint(model_dictionary, is_best)
except KeyboardInterrupt:
    print('-' * 89)
    print('Exiting from training early')

# Calculate mean and covariance for each channel's prediction errors, and save them with the trained model
print('=> calculating mean and covariance')
means, covs = list(), list()
train_dataset = TimeseriesData.batchify(args, TimeseriesData.trainData, bsz=1)
for channel_idx in range(model.enc_input_size):
if use_cuda:
    print('Using GPU')
    model.cuda()
else:
    print('Using CPU')

criterion = nn.CrossEntropyLoss()
# Observe that all parameters are being optimized
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
# Decay LR by a factor of 0.1 every 7 epochs
exp_lr_scheduler = lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.1)

#best_acc = 0
for epoch in range(1, epoch + 1):
    train(epoch)
    validation()
    #acc = 100. * correct / len(val_loader.dataset)
    #if acc > best_acc:
    #    best_acc = acc
    #    best_model = copy.deepcopy(model.state_dict())
    model_file = experiment + '/model_' + str(epoch) + '.pth'
    torch.save(model.state_dict(), model_file)
    print('Saved model to ' + model_file +
          '. You can run `python evaluate.py --model ' + model_file +
          '` to generate the Kaggle formatted csv file\n')
#model.load_state_dict(best_model)
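# A quick standalone sketch of how the StepLR schedule above behaves: the learning
# rate is multiplied by gamma=0.1 every step_size=7 calls to scheduler.step().
# The one-parameter model/optimizer here are stand-ins, not part of the snippet.
import torch
from torch import nn, optim
from torch.optim import lr_scheduler

params = [nn.Parameter(torch.zeros(1))]
opt = optim.SGD(params, lr=0.001, momentum=0.9)
sched = lr_scheduler.StepLR(opt, step_size=7, gamma=0.1)
for ep in range(1, 22):
    opt.step()
    sched.step()
    if ep % 7 == 0:
        print(ep, opt.param_groups[0]['lr'])  # 1e-4 after epoch 7, 1e-5 after 14, 1e-6 after 21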
num_minibatches = len(train_inputs) // minibatch_size
for epoch in range(30):
    # Training
    print("Training")
    # Put the model in training mode
    m.train()
    start_train = time.time()
    for group in tqdm(range(num_minibatches)):
        total_loss = None
        optimizer.zero_grad()
        # Accumulate the loss over one minibatch of individually-processed examples.
        for i in range(group * minibatch_size, (group + 1) * minibatch_size):
            input_seq = train_inputs[i]
            gold_seq = torch.tensor(train_outputs[i])
            prediction = m(input_seq)
            loss = m.compute_Loss(prediction, gold_seq)
            # On the first gradient update
            if total_loss is None:
                total_loss = loss
            else:
                total_loss += loss
        # Average over the minibatch (the original hard-coded 3, which assumed minibatch_size == 3).
        total_loss = total_loss / minibatch_size
        total_loss.backward()
        optimizer.step()
    print("Training time: {} for epoch {}".format(time.time() - start_train, epoch))
    torch.save(m.state_dict(), 'model_1.pt')
def main():
    """Run training"""
    from model import model
    parser = argparse.ArgumentParser(
        description='PyTorch RNN Prediction Model on Time-series Dataset')
    parser.add_argument('--data', type=str, default='ecg',
                        help='type of the dataset (ecg, gesture, power_demand, space_shuttle, respiration, nyc_taxi)')
    parser.add_argument('--filename', type=str, default='chfdb_chf13_45590.pkl',
                        help='filename of the dataset')
    parser.add_argument('--model', type=str, default='LSTM',
                        help='type of recurrent net (RNN_TANH, RNN_RELU, LSTM, GRU, SRU)')
    parser.add_argument('--augment', type=bool, default=True, help='augment')
    parser.add_argument('--emsize', type=int, default=32, help='size of rnn input features')
    parser.add_argument('--nhid', type=int, default=32, help='number of hidden units per layer')
    parser.add_argument('--nlayers', type=int, default=2, help='number of layers')
    parser.add_argument('--res_connection', action='store_true', help='residual connection')
    parser.add_argument('--lr', type=float, default=0.0002, help='initial learning rate')
    parser.add_argument('--weight_decay', type=float, default=1e-4, help='weight decay')
    parser.add_argument('--clip', type=float, default=10, help='gradient clipping')
    parser.add_argument('--epochs', type=int, default=400, help='upper epoch limit')
    parser.add_argument('--batch_size', type=int, default=64, metavar='N', help='batch size')
    parser.add_argument('--eval_batch_size', type=int, default=64, metavar='N', help='eval_batch size')
    parser.add_argument('--bptt', type=int, default=50, help='sequence length')
    parser.add_argument('--teacher_forcing_ratio', type=float, default=0.7,
                        help='teacher forcing ratio (deprecated)')
    parser.add_argument('--dropout', type=float, default=0.2,
                        help='dropout applied to layers (0 = no dropout)')
    parser.add_argument('--tied', action='store_true',
                        help='tie the word embedding and softmax weights (deprecated)')
    parser.add_argument('--seed', type=int, default=1111, help='random seed')
    parser.add_argument('--device', type=str, default='cuda', help='cuda or cpu')
    parser.add_argument('--log_interval', type=int, default=10, metavar='N', help='report interval')
    parser.add_argument('--save_interval', type=int, default=10, metavar='N', help='save interval')
    parser.add_argument('--save_fig', action='store_true', help='save figure')
    parser.add_argument('--resume', '-r', action='store_true',
                        help='use checkpoint model parameters as initial parameters (default: False)')
    parser.add_argument('--pretrained', '-p', action='store_true',
                        help='use checkpoint model parameters and do not train anymore (default: False)')
    parser.add_argument('--prediction_window_size', type=int, default=10,
                        help='prediction_window_size')
    args = parser.parse_args()

    # Set the random seed manually for reproducibility.
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    ###############################################################################
    # Load data
    ###############################################################################
    TimeseriesData = preprocess_data.PickleDataLoad(data_type=args.data,
                                                    filename=args.filename,
                                                    augment_test_data=args.augment)
    train_dataset = TimeseriesData.batchify(args, TimeseriesData.trainData, args.batch_size)
    test_dataset = TimeseriesData.batchify(args, TimeseriesData.testData, args.eval_batch_size)
    gen_dataset = TimeseriesData.batchify(args, TimeseriesData.testData, 1)

    ###############################################################################
    # Build the model
    ###############################################################################
    feature_dim = TimeseriesData.trainData.size(1)
    model = model.RNNPredictor(rnn_type=args.model,
                               enc_inp_size=feature_dim,
                               rnn_inp_size=args.emsize,
                               rnn_hid_size=args.nhid,
                               dec_out_size=feature_dim,
                               nlayers=args.nlayers,
                               dropout=args.dropout,
                               tie_weights=args.tied,
                               res_connection=args.res_connection).to(args.device)
    optimizer = optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.weight_decay)
    criterion = nn.MSELoss()

    ###############################################################################
    # Training code
    ###############################################################################
    def get_batch(args, source, i):
        seq_len = min(args.bptt, len(source) - 1 - i)
        data = source[i:i + seq_len]            # [ seq_len * batch_size * feature_size ]
        target = source[i + 1:i + 1 + seq_len]  # [ seq_len * batch_size * feature_size ]
        return data, target

    def generate_output(args, epoch, model, gen_dataset, disp_uncertainty=True,
                        startPoint=500, endPoint=3500):
        if args.save_fig:
            # Turn on evaluation mode which disables dropout.
            model.eval()
            hidden = model.init_hidden(1)
            outSeq = []
            upperlim95 = []
            lowerlim95 = []
            with torch.no_grad():
                for i in range(endPoint):
                    if i >= startPoint:
                        # if disp_uncertainty and epoch > 40:
                        #     outs = []
                        #     model.train()
                        #     for i in range(20):
                        #         out_, hidden_ = model.forward(out+0.01*Variable(torch.randn(out.size())).cuda(), hidden, noise=True)
                        #         outs.append(out_)
                        #     model.eval()
                        #     outs = torch.cat(outs, dim=0)
                        #     out_mean = torch.mean(outs, dim=0)  # [bsz * feature_dim]
                        #     out_std = torch.std(outs, dim=0)    # [bsz * feature_dim]
                        #     upperlim95.append(out_mean + 2.58*out_std/np.sqrt(20))
                        #     lowerlim95.append(out_mean - 2.58*out_std/np.sqrt(20))
                        # Recursive prediction: feed the previous output back in.
                        out, hidden = model.forward(out, hidden)
                        # print(out_mean, out)
                    else:
                        # 1-step prediction: feed the ground-truth sequence.
                        out, hidden = model.forward(gen_dataset[i].unsqueeze(0), hidden)
                    outSeq.append(out.data.cpu()[0][0].unsqueeze(0))

            outSeq = torch.cat(outSeq, dim=0)  # [seqLength * feature_dim]
            target = preprocess_data.reconstruct(gen_dataset.cpu(), TimeseriesData.mean, TimeseriesData.std)
            outSeq = preprocess_data.reconstruct(outSeq, TimeseriesData.mean, TimeseriesData.std)
            # if epoch > 40:
            #     upperlim95 = torch.cat(upperlim95, dim=0)
            #     lowerlim95 = torch.cat(lowerlim95, dim=0)
            #     upperlim95 = preprocess_data.reconstruct(upperlim95.data.cpu().numpy(), TimeseriesData.mean, TimeseriesData.std)
            #     lowerlim95 = preprocess_data.reconstruct(lowerlim95.data.cpu().numpy(), TimeseriesData.mean, TimeseriesData.std)

            plt.figure(figsize=(15, 5))
            for i in range(target.size(-1)):
                plt.plot(target[:, :, i].numpy(), label='Target' + str(i),
                         color='black', marker='.', linestyle='--', markersize=1, linewidth=0.5)
                plt.plot(range(startPoint), outSeq[:startPoint, i].numpy(),
                         label='1-step predictions for target' + str(i),
                         color='green', marker='.', linestyle='--', markersize=1.5, linewidth=1)
                # if epoch > 40:
                #     plt.plot(range(startPoint, endPoint), upperlim95[:, i].numpy(), label='upperlim' + str(i),
                #              color='skyblue', marker='.', linestyle='--', markersize=1.5, linewidth=1)
                #     plt.plot(range(startPoint, endPoint), lowerlim95[:, i].numpy(), label='lowerlim' + str(i),
                #              color='skyblue', marker='.', linestyle='--', markersize=1.5, linewidth=1)
                plt.plot(range(startPoint, endPoint), outSeq[startPoint:, i].numpy(),
                         label='Recursive predictions for target' + str(i),
                         color='blue', marker='.', linestyle='--', markersize=1.5, linewidth=1)
            plt.xlim([startPoint - 500, endPoint])
            plt.xlabel('Index', fontsize=15)
            plt.ylabel('Value', fontsize=15)
            plt.title('Time-series Prediction on ' + args.data + ' Dataset', fontsize=18, fontweight='bold')
            plt.legend()
            plt.tight_layout()
            plt.text(startPoint - 500 + 10, target.min(), 'Epoch: ' + str(epoch), fontsize=15)
            save_dir = Path('result', args.data, args.filename).with_suffix('').joinpath('fig_prediction')
            save_dir.mkdir(parents=True, exist_ok=True)
            plt.savefig(save_dir.joinpath('fig_epoch' + str(epoch)).with_suffix('.png'))
            # plt.show()
            plt.close()
            return outSeq
        else:
            pass

    def evaluate_1step_pred(args, model, test_dataset):
        # Turn on evaluation mode which disables dropout.
        model.eval()
        total_loss = 0
        with torch.no_grad():
            hidden = model.init_hidden(args.eval_batch_size)
            for nbatch, i in enumerate(range(0, test_dataset.size(0) - 1, args.bptt)):
                inputSeq, targetSeq = get_batch(args, test_dataset, i)
                outSeq, hidden = model.forward(inputSeq, hidden)
                # Use eval_batch_size here: the hidden state was initialized with it.
                loss = criterion(outSeq.view(args.eval_batch_size, -1),
                                 targetSeq.view(args.eval_batch_size, -1))
                hidden = model.repackage_hidden(hidden)
                total_loss += loss.item()
        # nbatch is zero-based, so divide by nbatch + 1 (matching evaluate below).
        return total_loss / (nbatch + 1)

    def train(args, model, train_dataset, epoch):
        with torch.enable_grad():
            # Turn on training mode which enables dropout.
            model.train()
            total_loss = 0
            start_time = time.time()
            hidden = model.init_hidden(args.batch_size)
            for batch, i in enumerate(range(0, train_dataset.size(0) - 1, args.bptt)):
                inputSeq, targetSeq = get_batch(args, train_dataset, i)
                # inputSeq: [ seq_len * batch_size * feature_size ]
                # targetSeq: [ seq_len * batch_size * feature_size ]

                # Starting each batch, we detach the hidden state from how it was previously produced.
                # If we didn't, the model would try backpropagating all the way to start of the dataset.
                hidden = model.repackage_hidden(hidden)
                hidden_ = model.repackage_hidden(hidden)
                optimizer.zero_grad()

                '''Loss1: Free running loss'''
                outVal = inputSeq[0].unsqueeze(0)
                outVals = []
                hids1 = []
                for step in range(inputSeq.size(0)):
                    outVal, hidden_, hid = model.forward(outVal, hidden_, return_hiddens=True)
                    outVals.append(outVal)
                    hids1.append(hid)
                outSeq1 = torch.cat(outVals, dim=0)
                hids1 = torch.cat(hids1, dim=0)
                loss1 = criterion(outSeq1.contiguous().view(args.batch_size, -1),
                                  targetSeq.contiguous().view(args.batch_size, -1))

                '''Loss2: Teacher forcing loss'''
                outSeq2, hidden, hids2 = model.forward(inputSeq, hidden, return_hiddens=True)
                loss2 = criterion(outSeq2.contiguous().view(args.batch_size, -1),
                                  targetSeq.contiguous().view(args.batch_size, -1))

                '''Loss3: Simplified Professor forcing loss'''
                loss3 = criterion(hids1.view(args.batch_size, -1),
                                  hids2.view(args.batch_size, -1).detach())

                '''Total loss = Loss1 + Loss2 + Loss3'''
                loss = loss1 + loss2 + loss3
                loss.backward()

                # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
                torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
                optimizer.step()

                total_loss += loss.item()

                if batch % args.log_interval == 0 and batch > 0:
                    cur_loss = total_loss / args.log_interval
                    elapsed = time.time() - start_time
                    print('| epoch {:3d} | {:5d}/{:5d} batches | ms/batch {:5.4f} | '
                          'loss {:5.2f} '.format(epoch, batch, len(train_dataset) // args.bptt,
                                                 elapsed * 1000 / args.log_interval, cur_loss))
                    total_loss = 0
                    start_time = time.time()

    def evaluate(args, model, test_dataset):
        # Turn on evaluation mode which disables dropout.
        model.eval()
        with torch.no_grad():
            total_loss = 0
            hidden = model.init_hidden(args.eval_batch_size)
            nbatch = 1
            for nbatch, i in enumerate(range(0, test_dataset.size(0) - 1, args.bptt)):
                inputSeq, targetSeq = get_batch(args, test_dataset, i)
                # inputSeq: [ seq_len * batch_size * feature_size ]
                # targetSeq: [ seq_len * batch_size * feature_size ]
                hidden_ = model.repackage_hidden(hidden)

                '''Loss1: Free running loss'''
                outVal = inputSeq[0].unsqueeze(0)
                outVals = []
                hids1 = []
                for step in range(inputSeq.size(0)):
                    outVal, hidden_, hid = model.forward(outVal, hidden_, return_hiddens=True)
                    outVals.append(outVal)
                    hids1.append(hid)
                outSeq1 = torch.cat(outVals, dim=0)
                hids1 = torch.cat(hids1, dim=0)
                loss1 = criterion(outSeq1.contiguous().view(args.batch_size, -1),
                                  targetSeq.contiguous().view(args.batch_size, -1))

                '''Loss2: Teacher forcing loss'''
                outSeq2, hidden, hids2 = model.forward(inputSeq, hidden, return_hiddens=True)
                loss2 = criterion(outSeq2.contiguous().view(args.batch_size, -1),
                                  targetSeq.contiguous().view(args.batch_size, -1))

                '''Loss3: Simplified Professor forcing loss'''
                loss3 = criterion(hids1.view(args.batch_size, -1),
                                  hids2.view(args.batch_size, -1).detach())

                '''Total loss = Loss1 + Loss2 + Loss3'''
                loss = loss1 + loss2 + loss3
                total_loss += loss.item()
        return total_loss / (nbatch + 1)

    # Loop over epochs.
    if args.resume or args.pretrained:
        print("=> loading checkpoint ")
        checkpoint = torch.load(Path('save', args.data, 'checkpoint', args.filename).with_suffix('.pth'))
        args, start_epoch, best_val_loss = model.load_checkpoint(args, checkpoint, feature_dim)
        optimizer.load_state_dict(checkpoint['optimizer'])
        del checkpoint
        epoch = start_epoch
        print("=> loaded checkpoint")
    else:
        epoch = 1
        start_epoch = 1
        best_val_loss = float('inf')
        print("=> Start training from scratch")
    print('-' * 89)
    print(args)
    print('-' * 89)

    if not args.pretrained:
        # At any point you can hit Ctrl + C to break out of training early.
        try:
            for epoch in range(start_epoch, args.epochs + 1):
                epoch_start_time = time.time()
                train(args, model, train_dataset, epoch)
                val_loss = evaluate(args, model, test_dataset)
                print('-' * 89)
                print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.4f} | '.format(
                    epoch, (time.time() - epoch_start_time), val_loss))
                print('-' * 89)

                generate_output(args, epoch, model, gen_dataset, startPoint=1500)

                if epoch % args.save_interval == 0:
                    # Save the model if the validation loss is the best we've seen so far.
                    is_best = val_loss < best_val_loss
                    best_val_loss = min(val_loss, best_val_loss)
                    model_dictionary = {'epoch': epoch,
                                        'best_loss': best_val_loss,
                                        'state_dict': model.state_dict(),
                                        'optimizer': optimizer.state_dict(),
                                        'args': args}
                    model.save_checkpoint(model_dictionary, is_best)
        except KeyboardInterrupt:
            print('-' * 89)
            print('Exiting from training early')

    # Calculate mean and covariance for each channel's prediction errors, and save them with the trained model
    print('=> calculating mean and covariance')
    means, covs = list(), list()
    train_dataset = TimeseriesData.batchify(args, TimeseriesData.trainData, bsz=1)
    for channel_idx in range(model.enc_input_size):
        mean, cov = fit_norm_distribution_param(args, model,
                                                train_dataset[:TimeseriesData.length], channel_idx)
        means.append(mean), covs.append(cov)
    model_dictionary = {'epoch': max(epoch, start_epoch),
                        'best_loss': best_val_loss,
                        'state_dict': model.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'args': args,
                        'means': means,
                        'covs': covs}
    model.save_checkpoint(model_dictionary, True)
    print('-' * 89)
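# Two small standalone illustrations for the script above; both are sketches, not part
# of the original file.
#
# 1) The windowing that get_batch performs: target is the input shifted one time step
#    ahead, so the model learns next-step prediction.
import torch

source = torch.arange(10).view(10, 1, 1).float()  # [seq_len * batch * feature]
bptt = 4
i = 0
seq_len = min(bptt, len(source) - 1 - i)
data = source[i:i + seq_len]            # steps 0..3
target = source[i + 1:i + 1 + seq_len]  # steps 1..4 (next-step prediction targets)
print(data.squeeze().tolist())    # [0.0, 1.0, 2.0, 3.0]
print(target.squeeze().tolist())  # [1.0, 2.0, 3.0, 4.0]

# 2) A typical implementation of the repackage_hidden pattern used in train()/evaluate()
#    (an assumption; the real method lives on RNNPredictor): detach hidden-state tensors
#    so backprop cannot reach back through previous BPTT windows.
def repackage_hidden(h):
    if isinstance(h, torch.Tensor):
        return h.detach()
    return tuple(repackage_hidden(v) for v in h)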
model = model.to(device)
l1_criterion = l1_criterion.to(device)

if args.pretrained:
    if os.path.isfile(args.pretrained):
        print("===> loading models '{}'".format(args.pretrained))
        checkpoint = torch.load(args.pretrained)
        # Strip the 'module.' prefix left behind by DataParallel, if present.
        new_state_dict = OrderedDict()
        for k, v in checkpoint.items():
            if 'module' in k:
                name = k[7:]
            else:
                name = k
            new_state_dict[name] = v
        model_dict = model.state_dict()
        pretrained_dict = {k: v for k, v in new_state_dict.items() if k in model_dict}
        # Report any model parameters missing from the checkpoint.
        for k, v in model_dict.items():
            if k not in pretrained_dict:
                print(k)
        model.load_state_dict(pretrained_dict, strict=True)
    else:
        print("===> no models found at '{}'".format(args.pretrained))

print("===> Setting Optimizer")
def train_model(model, device, train_data_loader, valid_data_loader, criterion,
                optimizer, scheduler, num_epochs=5):
    """
    training

    Parameters
    --------------
    model : DogClassificationModel
        Network model to be trained.
    device : device
        cuda or cpu
    train_data_loader : dataloader
        dataloader for training
    valid_data_loader : dataloader
        dataloader for validation
    criterion :
        Loss function.
    optimizer :
        Optimizer.
    scheduler :
        Learning rate scheduler.
    num_epochs : int
        The number of epochs.

    Returns
    --------------
    model : DogClassificationModel
        Trained model.
    """
    since = time.time()
    model = model.to(device)
    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0
    for epoch in range(num_epochs):
        bar = tqdm(total=len(train_data_loader))
        bar.set_description("Epoch: {}/{}".format(epoch + 1, num_epochs))

        """ Training Phase """
        model.train()
        running_loss = 0.0
        running_corrects = 0
        for j, (inputs, labels) in enumerate(train_data_loader):
            optimizer.zero_grad()
            tmp_loss_item = 0.0
            # training
            with torch.set_grad_enabled(True):
                outputs = model(inputs.to(device))
                torch.cuda.empty_cache()
                _, preds = torch.max(outputs, 1)
                loss = criterion(outputs, labels.to(device))
                # backward + optimize only if in training phase
                loss.backward()
                optimizer.step()
                tmp_loss_item = loss.item()
            # statistics
            running_loss += tmp_loss_item * inputs.size(0)
            running_corrects += torch.sum(preds.to('cpu') == labels.data)
            # progress bar
            bar.update(1)
            tmp_loss = running_loss / ((j + 1) * 32)  # 32: mini-batch size
            tmp_acc = float(running_corrects) / ((j + 1) * 32)  # use true division, not //
            bar.set_postfix(OrderedDict(loss=tmp_loss, acc=tmp_acc))
        # update learning rate scheduler
        scheduler.step()
        dataset_size = len(train_data_loader.dataset)
        epoch_loss = running_loss / dataset_size
        epoch_acc = running_corrects.double() / dataset_size

        """ Validation Phase """
        model.eval()  # Set model to validation mode
        val_running_loss = 0.0
        val_running_corrects = 0
        # Iterate over data.
        for inputs, labels in valid_data_loader:
            val_inputs = inputs.to(device)
            val_labels = labels.to(device)
            # zero the parameter gradients
            optimizer.zero_grad()
            # forward; history is not tracked during validation
            with torch.no_grad():
                val_outputs = model(val_inputs)
                _, preds = torch.max(val_outputs, 1)
                loss = criterion(val_outputs, val_labels)
            # statistics
            val_running_loss += loss.item() * val_inputs.size(0)
            val_running_corrects += torch.sum(preds == val_labels.data)
        dataset_size = len(valid_data_loader.dataset)
        val_epoch_loss = val_running_loss / dataset_size
        val_epoch_acc = val_running_corrects.double() / dataset_size
        print('VALIDATION Loss: {:.4f} Acc: {:.4f}'.format(val_epoch_loss, val_epoch_acc))
        print("Elapsed time: {} [sec]".format(time.time() - since))
        # deep copy the model
        if val_epoch_acc > best_acc:
            best_acc = val_epoch_acc
            best_model_wts = copy.deepcopy(model.state_dict())

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:.4f}'.format(best_acc))
    # load best model weights
    model.load_state_dict(best_model_wts)
    return model
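# A minimal sketch of wiring up train_model above. DogClassificationModel is the class
# named in the docstring; the dataloaders and hyperparameters are assumptions for
# illustration, not part of the original code.
import torch, torch.nn as nn, torch.optim as optim
from torch.optim import lr_scheduler

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = DogClassificationModel()  # assumed to be defined elsewhere in the project
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
scheduler = lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.1)
# train_data_loader / valid_data_loader are assumed to be built already.
model = train_model(model, device, train_data_loader, valid_data_loader,
                    criterion, optimizer, scheduler, num_epochs=5)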
    drop_last=False)
LFW_valid_loader = torch.utils.data.DataLoader(
    dataset.ImageList_x(root=args.root_path,
                        fileList=LFW_valid_std,
                        transform=valid_transform),
    batch_size=args.batch_size,
    shuffle=False,
    num_workers=args.workers,
    pin_memory=False,
    drop_last=False)

# 4.4 load model
model_dir = './imageset/label/model_ir_se50.pth'
pretrained_dict = torch.load(model_dir)
model = model.Backbone(num_layers=50, drop_ratio=0.6, mode='ir')
model_dict = model.state_dict()
pretrained_dict = {k: v for k, v in pretrained_dict.items() if k in model_dict}
model_dict.update(pretrained_dict)  # update parameters with the pretrained weights
model.load_state_dict(model_dict)   # load the merged dict, not the partial one
model = torch.nn.DataParallel(model).to(args.device)

# 4.5 set loss_function
loss_function_A = torch.nn.MarginRankingLoss().to(args.device)
loss_function_B = torch.nn.MSELoss().to(args.device)

# 4.6 choose optimizer
optimizer = torch.optim.SGD(model.parameters(),
                            lr=args.lr,
def train(model, optimizer, epoch, save_dir):
    dataParser = DataParser(args.batch_size)
    batch_time = Averagvalue()
    data_time = Averagvalue()
    losses = Averagvalue()
    # switch to train mode
    model.train()
    end = time.time()
    epoch_loss = []
    counter = 0
    for batch_index, (images, labels_numpy) in enumerate(generate_minibatches(dataParser, True)):
        # measure data loading time
        data_time.update(time.time() - end)
        labels = []
        if torch.cuda.is_available():
            images = torch.from_numpy(images).cuda()
            for item in labels_numpy:
                labels.append(torch.from_numpy(item).cuda())
        else:
            images = torch.from_numpy(images)
            for item in labels_numpy:
                labels.append(torch.from_numpy(item))
        if torch.cuda.is_available():
            loss = torch.zeros(1).cuda()
        else:
            loss = torch.zeros(1)
        optimizer.zero_grad()
        outputs = model(images)
        # supervised by the four GT maps
        for o in outputs[9:]:  # o2 o3 o4
            t_loss = cross_entropy_loss(o, labels[-1])
            loss = loss + t_loss
        counter += 1
        for c_index, c in enumerate(outputs[:8]):
            loss = loss + cross_entropy_loss(c, labels[c_index])
        loss = loss / 11  # average over the 11 summed loss terms
        loss.backward()
        acc_score = my_accuracy_score(outputs[9].cpu().detach().numpy(),
                                      labels[-1].cpu().detach().numpy())
        print('the acc is :', acc_score)
        # The block below was presumably meant to work around too-small batch sizes
        # by accumulating gradients over args.itersize iterations.
        # if counter == args.itersize:
        #     optimizer.step()
        #     optimizer.zero_grad()
        #     counter = 0
        optimizer.step()
        optimizer.zero_grad()
        # measure the accuracy and record loss
        losses.update(loss.item(), images.size(0))
        epoch_loss.append(loss.item())
        batch_time.update(time.time() - end)
        end = time.time()
        # display and logging
        if not isdir(save_dir):
            os.makedirs(save_dir)
        if batch_index % args.print_freq == 0:
            info = 'Epoch: [{0}/{1}][{2}/{3}] '.format(epoch, args.maxepoch, batch_index, dataParser.steps_per_epoch) + \
                   'Time {batch_time.val:.3f} (avg:{batch_time.avg:.3f}) '.format(batch_time=batch_time) + \
                   'Loss {loss.val:f} (avg:{loss.avg:f}) '.format(loss=losses)
            print(info)
    # torch.save(model, join(save_dir, "checkpoint.pth"))
    # save the parameters once per epoch
    save_checkpoint({'epoch': epoch,
                     'state_dict': model.state_dict(),
                     'optimizer': optimizer.state_dict()},
                    filename=join(save_dir, "epoch-%d-checkpoint.pth" % epoch))
    return losses.avg, epoch_loss
        optimizer.zero_grad()                 # zero the gradients
        output = model(data)                  # forward pass
        loss = criterion(output, target)      # compute the loss
        loss.backward()                       # backpropagate
        optimizer.step()                      # optimizer gradient-descent step
        predictions = output.argmax(dim=1, keepdim=True).squeeze()  # predict
        correct += (predictions == target).sum().item()  # count correct predictions
        accuracy = correct / (BATCH_SIZE * batch)        # compute accuracy
        tepoch.set_postfix(loss=loss.item(), accuracy=100. * accuracy)

if epoch % 15 == 0:
    print("Epoch done, evaluating:", epoch)
    torch.save(model.state_dict(), "./chkpoint_res.bin")  # save a checkpoint every 15 epochs
    model.eval()  # evaluation
    with tqdm(eval_dataloader, unit="batch") as eepoch:
        correct = 0
        batch = 0
        for data, target in eepoch:
            batch += 1
            eepoch.set_description(f"Epoch {epoch}")
            data, target = data.cuda(), target.cuda()
            output = model(data)
            predictions = output.argmax(dim=1, keepdim=True).squeeze()
            correct += (predictions == target).sum().item()
            accuracy = correct / (BATCH_SIZE * batch)
            eepoch.set_postfix(loss=loss.item(), accuracy=100. * accuracy)
    train()
    val_loss = evaluate(model, val_data)
    print("-" * 89)
    print("| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.4f} | "
          "".format(epoch, (time.time() - epoch_start_time), val_loss))
    print("-" * 89)

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        best_model = copy.deepcopy(model)  # snapshot the weights; a plain alias would track later updates (requires `import copy`)

    scheduler.step()

######################################################################
# Evaluate the model with the test dataset
# -------------------------------------
#
# Apply the best model to check the result with the test dataset.

test_loss = evaluate(best_model, test_data)
print("=" * 89)
print("| End of training | test loss {:5.2f} | test ppl {:8.2f}".format(
    test_loss, math.exp(test_loss)))  # perplexity = exp(loss); requires `import math`
print("=" * 89)
torch.save(best_model.state_dict(), "model.pt")  # persist the best model, not the last one
start_time = time.time()
train_iterator = DataIter(srclocationDatas[0:numTrainData],
                          trgLocationDatas[0:numTrainData], device, 2, centerLocs)
valid_iterator = DataIter(srclocationDatas[numTrainData:],
                          trgLocationDatas[numTrainData:], device, 2, centerLocs)
# train
train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
# validate
valid_loss = evaluate(model, valid_iterator, criterion)
end_time = time.time()
epoch_mins, epoch_secs = epoch_time(start_time, end_time)
if valid_loss < best_valid_loss:
    best_valid_loss = valid_loss
    torch.save(model.state_dict(), 'my-model-test.pt')
print("Epoch:", epoch + 1, "| Time:", epoch_mins, "m", epoch_secs, "s")
print("\tTrain Loss:", train_loss)
print("\tVal Loss:", valid_loss)

print('best_valid_loss is ')
print(best_valid_loss)

# Store the training and test data in txt files, and store mu and sig in .npy files
fileTest = open(homeDirectory + 'testPath.txt', mode='w')
fileTrain = open(homeDirectory + 'trainPath.txt', mode='w')
for testPath in paths[int(numTrainPaths):]:
    fileTest.write(testPath)
    fileTest.write('\n')
fileTest.close()
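# epoch_time is referenced above but not defined in this snippet; a minimal sketch of
# the usual helper (an assumption, matching the (start, end) -> (mins, secs) usage):
def epoch_time(start_time, end_time):
    elapsed = end_time - start_time
    elapsed_mins = int(elapsed / 60)
    elapsed_secs = int(elapsed - elapsed_mins * 60)
    return elapsed_mins, elapsed_secs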
val_acc = 0.0
model.train()
for idx, (image, label) in enumerate(train_loader):
    optimizer.zero_grad()
    output = model(image.cuda())
    loss = loss_function(output, label.cuda())
    loss.backward()
    optimizer.step()
    print('Epoch: [{0}][{1}/{2}] loss: {3}'.format(epoch + 1, idx + 1,
                                                   len(train_loader), loss.item()))
model.eval()
with torch.no_grad():
    for idx, (image, label) in enumerate(val_loader):
        output = model(image.cuda())
        for i in range(BATCH):
            # Count a hit when the max-scoring class of the output matches the one-hot label.
            pred = torch.max(output[i])
            for j in range(3):
                if output[i][j] == pred and label[i][j] == 1.0:
                    val_acc += 1
                    break
print('Epoch: [{0}] val_acc: {1}'.format(epoch + 1, val_acc / len(val_label)))
if val_acc / len(val_label) > best_acc:
    best_acc = val_acc / len(val_label)
    torch.save(model.state_dict(), os.path.join('model', 'model.tar'))
scheduler.step()