def main():
    """Train a small classifier (CNN or CapsuleNet, chosen by `args.model`)
    on truncated data slices and checkpoint the best-accuracy weights.

    Relies on module-level `args`, `config`, `device` and the project's
    load_data/train/test helpers.
    """
    X_tr, y_tr, X_te, y_te = load_data()
    # work on small slices to keep the run cheap
    X_tr, y_tr = X_tr[:1024], y_tr[:1024]
    X_te, y_te = X_te[:128], y_te[:128]

    if args.model == 'cnn':
        model = ConvNet()
        model_save_path = config.CNN_MODEL_PATH
    else:
        model = CapsuleNet()
        model_save_path = config.CAPSULE_MODEL_PATH
    model.to(device)
    optimizer = Adam(model.parameters())

    train_loss = []
    train_accuracy = []
    best_acc = 0.0
    for epoch in range(10):
        print(("Epoch %d " + "-" * 70) % (epoch + 1))
        loss = train(model, optimizer, X_tr, y_tr)
        train_loss.append(loss)
        # NOTE(review): accuracy is measured on the training slice, not X_te —
        # confirm this is intentional.
        acc = test(model, X_tr, y_tr, "Train")
        train_accuracy.append(acc)
        # keep only the best-performing checkpoint
        if acc > best_acc:
            best_acc = acc
            torch.save(model.state_dict(), model_save_path)

    # FIX: the results file handle was opened inline and never closed; a
    # context manager guarantees the pickle is flushed and the handle released.
    with open('result/' + args.model + '_train.p', 'wb') as fh:
        pickle.dump((train_loss, train_accuracy), fh)
def main():
    """Build a fresh ConvNet on the best available device and persist its weights."""
    # Device configuration: prefer CUDA when present.
    use_cuda = torch.cuda.is_available()
    device = torch.device('cuda' if use_cuda else 'cpu')
    print("Using device: ", device)

    net = ConvNet().to(device)
    # Persist only the state dict (not the full module) to FILE.
    torch.save(net.state_dict(), FILE)
    print("Finished saving model.")
class Training:
    """Owns one end-to-end training run: wires up the data pipeline and a
    10-class ConvNet, then immediately runs the optimisation loop."""

    def __init__(self, epoch, learningRate, batchSize, imageSize, L2Rate, trainPath):
        super(Training, self).__init__()
        # Hyper-parameters for this run.
        self.epoch = epoch
        self.learningRate = learningRate
        self.batchSize = batchSize
        self.imageSize = imageSize
        self.L2Rate = L2Rate
        self.trainPath = trainPath
        # Derived dataset statistics and the shuffled training loader.
        self.data_size = calculate_data_size(self.trainPath)
        self.num_batches = self.data_size // batchSize
        self.data_loader = run_loader('train', trainPath, batchSize, imageSize,
                                      shuffle=True)
        self.model = ConvNet(10)
        # Training starts as soon as the object is constructed.
        self.train()

    def train(self):
        """Run the Adam/cross-entropy loop and checkpoint after every epoch."""
        self.model.train()
        criterion = torch.nn.CrossEntropyLoss()
        optimizer = torch.optim.Adam(self.model.parameters(),
                                     lr=self.learningRate,
                                     weight_decay=self.L2Rate)
        for epoch in range(self.epoch):
            running_loss = 0
            correct = 0
            for inputs, targets in tqdm(self.data_loader):
                optimizer.zero_grad()
                logits = self.model(inputs)
                batch_loss = criterion(logits, targets)
                batch_loss.backward()
                optimizer.step()
                running_loss += batch_loss.item()  # .item() -> plain python float
                correct += torch.sum(torch.argmax(logits, 1) == targets).item()
            # Report per-epoch averages and save a checkpoint.
            print(f"Epoch {epoch}:", "ACC:", correct / self.data_size,
                  "LOSS:", running_loss / self.num_batches)
            torch.save(self.model.state_dict(), f"Trained/Model_{epoch}.model")
def main():
    """Restore a saved ConvNet checkpoint and dump its parameters for inspection."""
    # Device configuration
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print("Using device: ", device)

    net = ConvNet().to(device)
    try:
        net.load_state_dict(torch.load(FILE))
        print("Finished loading model.")
        net.eval()
    except IOError:
        # Checkpoint missing/unreadable — nothing to inspect.
        print("Failed to load model. Model might not exist.")
        return

    print("Print Network Parameters:")
    for p in net.parameters():
        print(p)
    print("Print model state dict: ", net.state_dict())

    # Inference would go here; no gradients are needed for it.
    with torch.no_grad():
        print("Perform inference/testing here...")
# NOTE(review): fragment — the opening of this wandb.log({ call, the
# validation loop (val_loop, i) and the epoch loop all lie outside this view.
        "val_loss_batch": batch_loss,
        "val_acc_batch": batch_acc
    })
# Show running averages in the tqdm bar; assumes `i` is a 1-based batch
# counter — TODO confirm (a 0-based index would divide by zero on batch 0).
val_loop.set_postfix_str(
    f"val_loss: {round(val_loss/i, 4)} - val_acc: {round(val_acc/i, 4)}"
)

# Change learning rate
lr_scheduler.step()

# Calculate and log averages
train_loss /= len(train_loader)
val_loss /= len(val_loader)
train_acc /= len(train_loader)
val_acc /= len(val_loader)
wandb.log({
    "train_loss_epoch": train_loss,
    "train_acc_epoch": train_acc,
    "val_loss_epoch": val_loss,
    "val_acc_epoch": val_acc,
})

total_time = round(time.time() - start_time, 1)
print(f"Time per epoch: {total_time}s")

# Save model
# Checkpoint into the wandb run dir so the file is synced with the run;
# the filename embeds epoch and epoch-average validation loss.
torch.save(
    model.state_dict(),
    os.path.join(wandb.run.dir, f"model_{epoch}_{round(val_loss, 4)}.pth"),
)
# NOTE(review): fragment of a test loop — the enclosing `for im_batch ...`
# and the definitions of model/device/count/total lie outside this view.
images = im_batch['image']
images = images.to(device)
labels = im_batch['arrangement']
# flatten labels to one row per sample — assumes 'arrangement' is a one-hot
# (or score) vector of length num_classes; TODO confirm
labels = labels.reshape(-1, num_classes)
labels = labels.float().to(device)
outputs = model(images)
# NOTE(review): Tensor.reshape is not in-place and the result is discarded,
# so this line is a no-op as written.
outputs.reshape(test_batch_size, num_classes)
# get class index from one-hot
_, predicted = torch.max(outputs.data, 1)
_, classes = torch.max(labels.data, 1)
total += outputs.size(0)
# NOTE(review): `predicted == classes` is an element-wise tensor comparison;
# using it as a bool only works for a single-sample batch, and it counts at
# most one hit per batch — verify test_batch_size == 1.
if predicted == classes:
    count = count + 1
print('Test Accuracy of the model on the 1453 test images: {} %'.format(
    100 * count / total))

# Save the model checkpoint
# save model
if not os.path.exists('./saved_models/'):
    os.makedirs('./saved_models/')
ts = time.time()
# timestamped filename so repeated runs don't overwrite each other
st = datetime.datetime.fromtimestamp(ts).strftime('%Y%m%d_%H%M%S')
torch.save(model.state_dict(), "./saved_models/model_arr_" + st + ".ckpt")
from model import SimpleNet, ResNet, ConvNet
from mcts import mcts
from agents import netAgent, processObservation
from epoch_training import selfplay, net_update
from evaluation import evaluate

# Candidate network being trained, and the frozen opponent it must beat.
model = ConvNet(42, 7, 64)
defaultModel = ConvNet(42, 7, 64)
# defaultModel.load_state_dict(torch.load('parameters_simple128.pth'))
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

# FIX: the log file was opened with open("log.txt", 'w') and never closed or
# flushed, so progress written during this long loop could be lost on a crash.
with open("log.txt", 'w') as log:
    for epoch in range(1000):
        # Self-play data generation: the learner plays the frozen model.
        agent = netAgent(model, return_probs=True)
        against = netAgent(defaultModel, incorrect_moves=False)
        training_data = selfplay(agent, against, num=10)
        net_update(model, training_data, optimizer)

        # Evaluation match-up with deterministic, legal-move-only agents.
        agent = netAgent(model, incorrect_moves=False, best_move=False)
        against = netAgent(defaultModel, incorrect_moves=False, best_move=False)
        result = evaluate(agent, against, 1000)
        log.write("Epoch " + str(epoch) + " Result: " + str(result) + "\n")
        log.flush()  # make each epoch's result durable immediately
        print("Test result: ", result)

        # Promote the learner when it clearly beats the frozen opponent.
        if (result > 0.65):
            torch.save(model.state_dict(), "parameters_simple128.pth")
            defaultModel.load_state_dict(model.state_dict())
            print("switch")
            log.write("Switch\n")
summary(model, input_size=(3, 640, 640), device='cpu') # model.load_state_dict(torch.load('no_gassuion_epoch35.pth')) criterion = MultiBranchLoss(input_size=(640, 640), writer=writer, obj_scale=obj_scale, nobj_scale=nobj_scale, loc_scale=loc_scale) optimizer = Adam(model.parameters(), lr=learing_rate) batchs_loss = 0 for epoch in range(epochs): model.train() dataset = WIDERFaceDetection(WIDERFace_ROOT, transform=SSDAugmentation(640, (127.5, 127.5, 127.5))) dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn) for i, (images, labels) in enumerate(dataloader): batch_num = epoch * len(dataloader) + i + 1 optimizer.zero_grad() if torch.cuda.is_available(): images = images.cuda() outputs = model(images) loss = criterion(outputs, labels, batch_num) batchs_loss += loss.item() loss.backward() optimizer.step() if batch_num % show_iter == 0: average_loss = batchs_loss / show_iter print("epoch {} batch {}:".format(epoch, i)) print('total_loss', average_loss) writer.add_scalar('total_loss', average_loss, global_step=batch_num) batchs_loss = 0 if epoch%10 == 0: torch.save(model.state_dict(), "no_gassuion_epoch{}.pth".format(epoch))
# Show the training information if batch % 500 == 0 or batch == len(val_loader): acc = val_correct_cnt / val_total_cnt ave_loss = val_total_loss / batch print( 'Validation batch index: {}, val loss: {:.6f}, acc: {:.3f}' .format(batch, ave_loss, acc)) validation_loss.append(ave_loss) validation_acc.append(acc) model.train() # Save trained model torch.save(model.state_dict(), './checkpoint/%s.pth' % model.name()) # Plot Learning Curve # TODO fig, axs = plt.subplots(nrows=2, ncols=2, constrained_layout=True) axs[0, 0].plot(train_loss) axs[0, 0].set_xlabel('epoch', fontsize=12) axs[0, 0].set_ylabel('loss', fontsize=12) axs[0, 0].set_title('Training Loss', fontsize=14) axs[0, 1].plot(validation_loss) axs[0, 1].set_xlabel('epoch', fontsize=12) axs[0, 1].set_ylabel('loss', fontsize=12) axs[0, 1].set_title('Validation Loss', fontsize=14) axs[1, 0].plot(train_acc)
def train(pre_trained=None):
    """Train a ConvNet classifier on the DataGenerator dataset.

    Creates a timestamped checkpoint folder, pickles the hyper-parameter dict
    into it, trains for hp['num_epochs'] epochs, evaluates on the validation
    set each epoch, and saves an accuracy plot plus a resumable checkpoint
    (keys 'net', 'opt', 'epoch') per epoch.

    Args:
        pre_trained: optional path to a checkpoint produced by this function;
            model/optimizer state are restored and training resumes at
            ckpt['epoch'] + 1.
    """
    # create folder to save models and loss graphs
    reference = hp['net_type'] + str(time.strftime("_%Y%m%d_%H%M%S"))
    checkpoints_folder = hp["output_dir"] + '/checkpoints/' + reference
    os.makedirs(checkpoints_folder, exist_ok=True)

    # save hyper parameter settings
    # FIX: use a context manager so the file is closed even if dump raises.
    pickle_file_location = checkpoints_folder + "/hp.pkl"
    with open(pickle_file_location, "wb") as pickle_file:
        pickle.dump(hp, pickle_file)

    # create data iterators (train + fixed-batch validation)
    train_data_set = DataGenerator(hp)
    iterator = DataLoader(dataset=train_data_set, batch_size=hp['batch_size'],
                          num_workers=hp['num_workers'], pin_memory=True,
                          shuffle=False, drop_last=True)
    val_set = ValidationDataGenerator(hp)
    val_set_iterator = DataLoader(dataset=val_set, batch_size=50,
                                  num_workers=hp['num_workers'], pin_memory=True,
                                  shuffle=False, drop_last=True)

    # create model and loss
    model = ConvNet().to(device)
    loss = CrossEntropyLoss().to(device)

    # optimizer
    optimizer = torch.optim.Adam(params=model.parameters(), lr=hp['learning_rate'])

    start_epoch = 0
    # load pre trained model (resume support)
    if pre_trained is not None:
        ckpt = torch.load(pre_trained)
        model.load_state_dict(ckpt['net'])
        optimizer.load_state_dict(ckpt['opt'])
        start_epoch = ckpt['epoch'] + 1

    # per-epoch metric arrays
    classification_loss = np.zeros(hp['num_epochs'])
    train_accuracy = np.zeros(hp['num_epochs'])
    val_accuracy = np.zeros(hp['num_epochs'])

    # training loop
    for epoch in range(start_epoch, hp['num_epochs']):
        c_loss = 0
        acc = 0
        for i, (img, label) in enumerate(iterator):
            img = img.to(device, dtype=torch.float)
            label = label.to(device, dtype=torch.float)

            optimizer.zero_grad()
            logits = model(img)
            # labels arrive as float tensors; CrossEntropyLoss needs int64
            batch_loss = loss(logits, label.long())
            batch_loss.backward()
            optimizer.step()
            c_loss += batch_loss.item()

            # calc accuracy on this batch
            logits = logits.detach().cpu().numpy()
            label = label.detach().cpu().numpy()
            acc += utils.classification_accuracy(logits, label)
            print("epoch = {}, Training_sample={}, classification loss ={}".
                  format(epoch, i, batch_loss.item()))

        # average loss per epoch
        classification_loss[epoch] = c_loss / (i + 1)
        # average accuracy per epoch
        train_accuracy[epoch] = acc / (i + 1)
        print("epoch = {}, average classification loss ={}".format(
            epoch, classification_loss[epoch]))
        print("epoch = {}, Training accuracy ={}".format(
            epoch, train_accuracy[epoch]))

        # NOTE(review): the model stays in train mode during validation; if
        # ConvNet uses dropout/batch-norm, model.eval() should bracket this
        # loop — confirm before changing.
        with torch.no_grad():
            val_acc = 0
            for i, (img, label) in enumerate(val_set_iterator):
                img = img.to(device, dtype=torch.float)
                label = label.to(device, dtype=torch.float)
                logits = model(img)
                # calc accuracy
                logits = logits.detach().cpu().numpy()
                label = label.detach().cpu().numpy()
                val_acc += utils.classification_accuracy(logits, label)
            val_accuracy[epoch] = val_acc / (i + 1)
            print("epoch = {}, Validation set accuracy ={}".format(
                epoch, val_accuracy[epoch]))

        # plot accuracy curves and save model
        plt.plot(range(1, len(train_accuracy) + 1), train_accuracy, 'b-',
                 label=" Train Accuracy")
        plt.plot(range(1, len(val_accuracy) + 1), val_accuracy, 'r-',
                 label="Validation Accuracy")
        plt.xlabel("epochs")
        plt.ylabel("accuracy")
        plt.legend(loc='best')
        plt.savefig(checkpoints_folder + "/accuracy.jpeg", bbox_inches="tight")
        plt.clf()

        # resumable checkpoint for this epoch
        net_save = {
            'net': model.state_dict(),
            'opt': optimizer.state_dict(),
            'epoch': epoch
        }
        torch.save(
            net_save,
            checkpoints_folder +
            "/convnet_ethiopian_mnist_epoch{}.pth".format(epoch))
# Load the character-index mapping produced during preprocessing.
with open('char_dict', 'rb') as f:
    class_dict = pickle.load(f)
num_classes = len(class_dict)

# Build the data pipeline: resize to 64x64 and convert to tensors.
preprocess = transforms.Compose([
    transforms.Resize((64, 64)),
    transforms.ToTensor(),
])
hwdb = HWDB(path=data_path, transform=preprocess)
print("训练集数据:", hwdb.train_size)
print("测试集数据:", hwdb.test_size)
trainloader, testloader = hwdb.get_loader(batch_size)

# Resume from the epoch-9 checkpoint — hence the training range below.
net = ConvNet(num_classes)
if torch.cuda.is_available():
    net = net.cuda()
net.load_state_dict(torch.load('checkpoints/handwriting_iter_009.pth'))
print('网络结构:\n')
#summary(net, input_size=(3, 64, 64), device='cuda')

ce_loss = nn.CrossEntropyLoss()
sgd = optim.SGD(net.parameters(), lr=lr)
tb_writer = SummaryWriter(log_path)
for epoch in range(10, epochs):
    # One training pass, one validation pass, then checkpoint.
    train(epoch, net, ce_loss, sgd, trainloader, writer=tb_writer)
    valid(epoch, net, testloader, writer=tb_writer)
    print("epoch%d 结束, 正在保存模型..." % epoch)
    torch.save(net.state_dict(), save_path + 'handwriting_iter_%03d.pth' % epoch)
# Forward pass outputs = model(images) loss = criterion(outputs, labels) # Backward and optimize optimizer.zero_grad() loss.backward() optimizer.step() if (i+1) % 100 == 0: print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}' .format(epoch+1, num_epochs, i+1, total_step, loss.item())) # Test the model model.eval() # eval mode (batchnorm uses moving mean/variance instead of mini-batch mean/variance) with torch.no_grad(): correct = 0 total = 0 for images, labels in test_loader: images = images.to(device) labels = labels.to(device) outputs = model(images) _, predicted = torch.max(outputs.data, 1) total += labels.size(0) correct += (predicted == labels).sum().item() print('Test Accuracy of the model on the 10000 test images: {} %'.format(100 * correct / total)) # Save the model checkpoint torch.save(model.state_dict(), 'model.ckpt')
def main(args):
    """Entry point: build model/criterion/optimizer, optionally resume from a
    checkpoint, then train with per-epoch validation, scalar logging and
    best-accuracy checkpointing."""
    best_acc1 = 0
    os.makedirs('checkpoints', exist_ok=True)

    args.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print('device: {}'.format(args.device))

    # create model
    model = ConvNet(cfg.NUM_CLASSES).to(args.device)
    #model.apply(weights_init_normal)

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().to(args.device)
    optimizer = torch.optim.SGD(model.parameters(), args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            ckpt = torch.load(args.resume, map_location=args.device)
            args.start_epoch = ckpt['epoch']
            best_acc1 = ckpt['best_acc1']
            model.load_state_dict(ckpt['state_dict'])
            optimizer.load_state_dict(ckpt['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, ckpt['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    # Data loading code
    train_dataset = ImageFolder(cfg.TRAIN_PATH)
    val_dataset = ImageFolder(cfg.VAL_PATH)
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               num_workers=args.workers,
                                               pin_memory=True)
    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=args.batch_size,
                                             shuffle=False,
                                             num_workers=args.workers,
                                             pin_memory=True)

    logger = Logger('./logs')

    for epoch in range(args.start_epoch, args.epochs):
        # train for one epoch
        adjust_learning_rate(optimizer, epoch, args)
        train_loss, train_acc = train(train_loader, model, criterion,
                                      optimizer, epoch, args)

        # evaluate on validation set
        val_loss, val_acc = validate(val_loader, model, criterion, args)

        # remember best acc@1 and save checkpoint
        is_best = val_acc > best_acc1
        best_acc1 = max(val_acc, best_acc1)

        # push this epoch's scalar summaries to the logger
        metrics = {
            'train_loss': float(train_loss),
            'train_acc': float(train_acc),
            'val_loss': float(val_loss),
            'val_acc': float(val_acc),
        }
        for tag, value in metrics.items():
            logger.scalar_summary(tag, value, epoch)

        save_checkpoint(
            {
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'best_acc1': best_acc1,
                'optimizer': optimizer.state_dict(),
            }, is_best)
class Learner:
    """Categorical-DQN (C51) learner.

    Pulls transition batches from a shared queue and trains an online/target
    network pair whose outputs are probability distributions over a fixed
    support of `n_atom` return values in [v_min, v_max].
    """

    def __init__(self, args, q_batch):
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.q_batch = q_batch  # queue yielding (s, a, r, s', done) batches
        self.learn_step_counter = 0
        self.gamma = args.gamma
        self.batch_size = args.batch_size
        self.env = gym.make(args.env)
        self.n_act = self.env.action_space.n
        self.n_state = self.env.observation_space.shape[0]
        # Fixed support z_i = v_min + i * dz with n_atom evenly spaced atoms.
        self.n_atom = args.atom
        self.v_min = args.v_min
        self.v_max = args.v_max
        self.dz = (self.v_max - self.v_min) / (self.n_atom - 1)
        self.z = [self.v_min + i * self.dz for i in range(self.n_atom)]
        self.z_space = torch.FloatTensor(self.z).to(self.device)
        self.net = ConvNet(self.n_state, self.n_act, self.n_atom).to(self.device)
        self.target_net = ConvNet(self.n_state, self.n_act, self.n_atom).to(self.device)
        self.optimizer = optim.Adam(self.net.parameters(), lr=args.lr)

    def learn(self):
        """Consume batches forever, minimising the cross-entropy between the
        projected target distribution and the online network's prediction."""
        while True:
            self.learn_step_counter += 1
            # target parameter update every 10 steps
            if self.learn_step_counter % 10 == 0:
                self.update_target()
            states, actions, rewards, next_states, dones = self.q_batch.get(block=True)
            states = torch.FloatTensor(states).to(self.device)
            actions = torch.LongTensor(actions).to(self.device)
            next_states = torch.FloatTensor(next_states).to(self.device)
            dones = [int(i) for i in dones]
            # action value distribution prediction: (m, N_ACTIONS, N_ATOM)
            curr_q = self.net(states)
            # keep only the distribution of the action actually taken -> (m, N_ATOM)
            curr_q = torch.stack([curr_q[i].index_select(0, actions[i])
                                  for i in range(self.batch_size)]).squeeze(1)
            # greedy next action under the online net (expected value over atoms)
            next_q = self.net(next_states).detach()  # (m, N_ACTIONS, N_ATOM)
            next_q = torch.sum(next_q * self.z_space.view(1, 1, -1), dim=2)  # (m, N_ACTIONS)
            next_action = next_q.argmax(dim=1)  # (m)
            # target distribution from the target net for that action
            target_q = self.target_net(next_states).detach().cpu().numpy()
            target_q = [target_q[i, action, :] for i, action in enumerate(next_action)]
            target_q = np.array(target_q)  # (m, N_ATOM)
            m_prob = np.zeros((self.batch_size, self.n_atom))  # (m, N_ATOM)
            # Project Tz onto the fixed support (not vectorized).
            for i in range(self.batch_size):
                for j in range(self.n_atom):
                    # FIX: the discount was hard-coded as 0.99 here, silently
                    # ignoring the configured args.gamma stored in self.gamma.
                    Tz = np.fmin(self.v_max,
                                 np.fmax(self.v_min,
                                         rewards[i] + (1 - dones[i]) * self.gamma
                                         * (self.v_min + j * self.dz)))
                    bj = (Tz - self.v_min) / self.dz
                    lj = np.floor(bj).astype(int)  # m_l
                    uj = np.ceil(bj).astype(int)   # m_u
                    # split the probability mass between the neighbouring
                    # atoms, weighted by distance; terminal transitions place
                    # a point mass at Tz
                    m_prob[i, lj] += (dones[i] + (1 - dones[i]) * target_q[i][j]) * (uj - bj)
                    m_prob[i, uj] += (dones[i] + (1 - dones[i]) * target_q[i][j]) * (bj - lj)
            # renormalise (guards against mass lost when bj is exactly integral)
            m_prob = m_prob / m_prob.sum(axis=1, keepdims=1)
            m_prob = torch.FloatTensor(m_prob).to(self.device)
            # cross-entropy between projected target and prediction
            loss = - torch.mean(torch.sum(m_prob * torch.log(curr_q + 1e-20), dim=1))
            if self.learn_step_counter % 100 == 0:
                print('loss:', loss.item())
            # backprop loss
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

    def update_target(self):
        """Hard-copy the online weights into the target network."""
        self.target_net.load_state_dict(self.net.state_dict())
class Learner:
    """Quantile-regression DQN learner: pulls transition batches from a queue
    and trains an online/target network pair that each output per-action
    quantile value estimates; periodically evaluates on a gym env."""

    def __init__(self, args, q_batch):
        self.device = torch.device(
            'cuda' if torch.cuda.is_available() else 'cpu')
        self.q_batch = q_batch  # queue yielding (s, a, r, s', done) batches
        self.update_count = 0
        self.gamma = args.gamma
        self.batch_size = args.batch_size
        self.env_eval = gym.make(args.env)  # env used only for evaluation
        self.n_act = self.env_eval.action_space.n
        self.n_state = self.env_eval.observation_space.shape[0]
        self.n_quant = args.quant  # number of quantiles per action
        self.target_net_update_freq = args.target_net_update_freq
        self.net = ConvNet(self.n_state, self.n_act, self.n_quant).to(self.device)
        self.target_net = ConvNet(self.n_state, self.n_act, self.n_quant).to(self.device)
        self.optimizer = optim.Adam(self.net.parameters(), lr=args.lr)

    def learn(self):
        """Consume batches forever, minimising the quantile Huber loss."""
        while True:
            self.update_count += 1
            # every 10 updates: run evaluation episodes and report mean return
            if self.update_count % 10 == 0:
                rewards = self.evaluation()
                rewards_mu = np.array(
                    [np.sum(np.array(l_i), 0) for l_i in rewards]).mean()
                print('update cnt %d Eval Reward %.2f' %
                      (self.update_count, rewards_mu))
            # target parameter update
            if self.update_count % self.target_net_update_freq == 0:
                self.update_target()
            states, actions, rewards, next_states, dones = self.q_batch.get(
                block=True)
            states = torch.FloatTensor(states).to(self.device)
            actions = torch.LongTensor(actions).to(self.device)
            next_states = torch.FloatTensor(next_states).to(self.device)
            dones = np.array([int(i) for i in dones])
            # action value distribution prediction
            # [BATCH, N_QUANT, N_ACTIONS] — assumes the net also returns the
            # sampled quantile fractions tau; TODO confirm against ConvNet
            curr_q, tau = self.net(states)
            # keep only the quantiles of the action actually taken
            # [BATCH, N_QUANT, 1]
            curr_q = torch.stack([
                curr_q[i].index_select(1, actions[i])
                for i in range(self.batch_size)
            ])
            # # [BATCH, N_QUANT, N_QUANT]
            curr_q = curr_q.repeat(1, 1, self.n_quant)
            # get next state value
            # [BATCH, N_QUANT, N_ACTIONS]
            next_q, _ = self.net(next_states)
            # greedy action under the online net (sum over quantiles ∝ mean)
            next_action = next_q.sum(dim=1).argmax(dim=1)
            # target_q
            with torch.no_grad():
                # [BATCH, N_QUANT, N_ACT]
                target_q, _ = self.target_net(next_states)
                target_q = target_q.detach().cpu().numpy()
                # [BATCH, N_QUANT, 1]
                target_q = np.array([
                    target_q[i, :, action]
                    for i, action in enumerate(next_action)
                ])
                # Bellman target; terminal transitions keep only the reward
                target_q = rewards.reshape(
                    -1, 1) + self.gamma * target_q * (1 - dones.reshape(-1, 1))
                target_q = torch.FloatTensor(target_q).to(
                    self.device).unsqueeze(2)
                # # [BATCH, N_QUANT, N_QUANT]
                target_q = target_q.repeat(1, 1, self.n_quant)
                target_q = target_q.permute(0, 2, 1)
            # loss = F.smooth_l1_loss(curr_q, target_q.detach(), reduction='none')
            # (BATCH, N_QUANT, N_QUANT)
            tau = tau.repeat(1, 1, self.n_quant)
            diff = target_q - curr_q
            # asymmetric quantile weighting of the Huber loss
            loss = self.huber(diff)
            I_delta = (diff < 0).double()
            loss *= torch.abs(tau - I_delta)
            # huber loss
            loss = torch.mean(torch.sum(torch.mean(loss, dim=2), dim=1))
            # backprop loss
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

    def huber(self, x):
        """Element-wise Huber loss with threshold 1.0 (quadratic inside,
        linear outside)."""
        cond = (x.abs() < 1.0).float().detach()
        return 0.5 * x.pow(2) * cond + (x.abs() - 0.5) * (1.0 - cond)

    def update_target(self):
        """Hard-copy the online weights into the target network."""
        self.target_net.load_state_dict(self.net.state_dict())

    def evaluation(self):
        """Play 10 greedy episodes on env_eval; return per-episode reward lists."""
        rewards = []
        for _ in range(10):
            rewards_i = []
            state = self.env_eval.reset()
            action = self.action(state)
            state, reward, done, _ = self.env_eval.step(action)
            rewards_i.append(reward)
            while not done:
                action = self.action(state)
                state, reward, done, _ = self.env_eval.step(action)
                rewards_i.append(reward)
            rewards.append(rewards_i)
        return rewards

    def action(self, state):
        """Greedy action: sum quantile values per action and take the argmax."""
        state = torch.FloatTensor(state).to(self.device).unsqueeze(0)
        action_value, _ = self.net(state)
        # if self.update_count > 3000:
        #     dist_action = action_value[0].detach().cpu().numpy()
        #     sns.distplot(dist_action[:, 0], bins=10, color='red')
        #     sns.distplot(dist_action[:, 1], bins=10, color='blue')
        #     plt.show()
        action_value = action_value[0].sum(dim=0)
        action = torch.argmax(action_value).detach().cpu().item()
        return action
class Wrapper(object):
    """Train/evaluate harness around a ConvNet kanji classifier.

    Loads a JSON config, optionally restores weights (`cont`), and exposes
    train / valid / eval / predict / save_model / load_model.
    """

    def __init__(self, config, cont=None):
        super(Wrapper, self).__init__()
        with open(config, 'r') as f:
            config = json.load(f)
        self.config = config
        # path used for the rolling "best so far" checkpoint
        self.best_path = str(self.config['model']['model_save_path'] +
                             self.config['name'] + '_model_best.pt')
        self.model = ConvNet(config['model'])
        self.continuing = False
        if cont is not None:
            print('loading in weights')
            self.load_model(cont)
            self.continuing = True
        self.cuda = torch.cuda.is_available()
        if self.cuda:
            print('using cuda')
            self.model.cuda()

    def train(self):
        """Run the training loop with best-accuracy checkpointing and early
        stopping after `max_past` epochs without improvement."""
        model = self.model
        config = self.config
        trainloader = DataLoader(
            KanjiDataset(self.config, train=True),
            batch_size=config['train']['batch_size'],
            shuffle=True, pin_memory=True)
        self.valset = KanjiDataset(self.config, train=False)
        objective = nn.CrossEntropyLoss()
        self.objective = objective
        optimizer = optim.Adam(model.parameters(),
                               lr=config['train']['learning_rate'])
        # when resuming, start from the restored model's current accuracy
        bestacc = 0.0 if not self.continuing else self.eval()[0]
        past_best = 0
        max_past = 50
        for e in range(config['train']['epochs']):
            avgloss = 0.0
            for i, (x, y) in enumerate(trainloader):
                if self.cuda:
                    # FIX: `async=True` is a SyntaxError on Python >= 3.7
                    # (async became a keyword); the kwarg is non_blocking.
                    x = x.cuda(non_blocking=True)
                    y = y.cuda(non_blocking=True)
                optimizer.zero_grad()
                preds = model(x)
                loss = objective(preds, y)
                avgloss += loss.item()
                loss.backward()
                optimizer.step()
                preds = None
                gc.collect()
            avgloss /= len(trainloader)
            vacc = self.eval()[0]
            if e%5==0:
                print('epoch: {}, loss: {:.4f}, val_acc: {:.4f}'
                      .format(e+1, avgloss, vacc))
            if vacc > bestacc:
                # checkpoint both an accuracy-stamped copy and the best path
                path = str(self.config['model']['model_save_path'] +
                           self.config['name'] +
                           '_model_{:.4f}.pt'.format(vacc))
                self.save_model(path)
                self.save_model(self.best_path)
                bestacc = vacc
                past_best = 0
            else:
                past_best += 1
                if past_best >= max_past:
                    print('past')
                    break
        self.valloader = None
        self.print_acc()
        return

    def valid(self):
        """Average cross-entropy loss over self.valloader."""
        loss = 0.0
        for (x, y) in self.valloader:
            if self.cuda:
                # FIX: non_blocking replaces the removed `async` kwarg.
                x = x.cuda(non_blocking=True)
                y = y.cuda(non_blocking=True)
            loss += self.objective(self.model(x), y).item()
        return loss/len(self.valloader)

    def eval(self, train=False):
        """Return (accuracy, confusion matrix) over the validation set.

        NOTE(review): with train=True this reuses self.valset (set during
        train()); with train=False it builds a fresh validation dataset —
        the flag's naming looks inverted; confirm intent.
        """
        validset = self.valset if train else KanjiDataset(self.config, train=False)
        acc = 0
        conf = np.zeros((self.config['model']['classes'],
                         self.config['model']['classes']), dtype=np.int32)
        for (x, y) in validset:
            pred = self.predict(x)
            acc += (pred == y)
            conf[y, pred] = conf[y, pred] + 1
        return acc/len(validset), conf

    def print_acc(self):
        """Print validation accuracy and the confusion matrix."""
        acc, conf = self.eval()
        print('acc:', acc)
        print('conf:\n', conf)

    def predict(self, image):
        """Classify a single image tensor; returns the predicted class index."""
        image = torch.unsqueeze(image, 0)
        if self.cuda:
            # FIX: non_blocking replaces the removed `async` kwarg.
            image = image.cuda(non_blocking=True)
        pred = self.model(image)
        pred = torch.argmax(pred[0])
        return pred.item()

    def save_model(self, path):
        """Write the model's state dict to `path`."""
        torch.save(self.model.state_dict(), path)
        print('save:', path)

    def load_model(self, cont):
        """Load weights from the best checkpoint ('cont') or a named file."""
        path = self.best_path
        if cont != 'cont':
            path = join(self.config['model']['model_save_path'], cont)
        print('loading path:', path)
        self.model.load_state_dict(torch.load(path))