def validate_1epoch(self):
    print('==> Epoch:[{0}/{1}][validation stage]'.format(self.epoch, self.nb_epochs))
    batch_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()

    # switch to evaluate mode
    self.model.eval()
    end = time.time()

    progress = tqdm(self.testloader)
    for i, (data, label) in enumerate(progress):
        label = label.cuda(non_blocking=True)
        input_var = Variable(data).cuda()
        target_var = Variable(label).cuda()
        # print('load to cuda:', time.time() - t)

        output = self.model(input_var)

        prec1, prec5 = accuracy(output.data, label, topk=(1, 5))
        top1.update(prec1.item(), output.data.size(0))
        top5.update(prec5.item(), output.data.size(0))

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

    info = {
        'Epoch': [self.epoch],
        'Batch Time': [round(batch_time.avg, 3)],
        'Loss': [0],
        'Prec@1': [round(top1.avg, 4)],
        'Prec@5': [round(top5.avg, 4)],
    }
    record_info(info, 'record/spatial/rgb_test.csv', 'test')
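# ---------------------------------------------------------------------------
# The loops in this file call into small utilities (AverageMeter, accuracy,
# record_info) that are not shown here. The following is only a minimal sketch
# of what they might look like, assuming the usual PyTorch-ImageNet-style
# meters and a pandas-backed CSV logger; the repository's real implementations
# may differ.
import pandas as pd
import torch


class AverageMeter(object):
    """Tracks the current value, running sum, count, and average of a metric."""

    def __init__(self):
        self.val = 0.0
        self.sum = 0.0
        self.count = 0
        self.avg = 0.0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / max(self.count, 1)


def accuracy(output, target, topk=(1,)):
    """Top-k accuracy (in percent) for a batch of logits."""
    maxk = max(topk)
    batch_size = target.size(0)
    _, pred = output.topk(maxk, 1, True, True)
    pred = pred.t()
    correct = pred.eq(target.view(1, -1).expand_as(pred))
    res = []
    for k in topk:
        correct_k = correct[:k].reshape(-1).float().sum(0)
        res.append(correct_k.mul_(100.0 / batch_size))
    return res


def record_info(info, filename, mode):
    """Append one row of epoch statistics to a CSV file (header written on first use)."""
    df = pd.DataFrame.from_dict(info)
    with open(filename, 'a') as f:
        df.to_csv(f, header=f.tell() == 0, index=False)
# ---------------------------------------------------------------------------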
def train_1epoch(self):
    print('==> Epoch:[{0}/{1}][training stage]'.format(self.epoch, self.nb_epochs))
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()

    # switch to train mode
    self.model.train()
    end = time.time()

    # mini-batch training
    progress = tqdm(self.train_loader)
    for i, (data_dict, label) in enumerate(progress):
        # measure data loading time
        data_time.update(time.time() - end)

        label = label.cuda(non_blocking=True)
        target_var = Variable(label).cuda()

        # compute output: sum the model's predictions over every frame in the dict
        output = Variable(torch.zeros(len(data_dict['img1']), 101).float()).cuda()
        for j in range(len(data_dict)):
            key = 'img' + str(j)
            data = data_dict[key]
            input_var = Variable(data).cuda()
            output += self.model(input_var)

        loss = self.criterion(output, target_var)

        # measure accuracy and record loss
        prec1, prec5 = accuracy(output.data, label, topk=(1, 5))
        losses.update(loss.item(), data.size(0))
        top1.update(prec1.item(), data.size(0))
        top5.update(prec5.item(), data.size(0))

        # compute gradient and do SGD step
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

    info = {
        'Epoch': [self.epoch],
        'Batch Time': [np.round(batch_time.avg, 4)],
        'Data Time': [np.round(data_time.avg, 4)],
        'Loss': [np.round(losses.avg, 4)],
        'Prec@1': [np.round(top1.avg, 4)],
        'Prec@5': [np.round(top5.avg, 4)],
        'lr': self.optimizer.param_groups[0]['lr']
    }
    record_info(info, filename='record/spatial/rgb_train.csv', mode='train')
def train_1epoch(self):
    print('==> Epoch:[{0}/{1}][training stage]'.format(self.epoch, self.nb_epochs))
    batch_time = utils.AverageMeter()
    data_time = utils.AverageMeter()
    losses = utils.AverageMeter()
    top1 = utils.AverageMeter()
    top5 = utils.AverageMeter()

    # switch to train mode
    self.model.train()
    end = time.time()

    # mini-batch training
    progress = tqdm.tqdm(self.train_loader)
    for i, (data, label) in enumerate(progress):
        # probabilistically withhold a data batch
        if 100 * random.random() > self.percent:
            continue

        # measure data loading time
        data_time.update(time.time() - end)

        label = label.cuda(non_blocking=True)
        input_var = Variable(data).cuda()
        target_var = Variable(label).cuda()

        # compute output
        output = self.model(input_var)
        loss = self.criterion(output, target_var)

        # measure accuracy and record loss
        prec1, prec5 = utils.accuracy(output.data, label, topk=(1, 5))
        losses.update(loss.item(), data.size(0))
        top1.update(prec1.item(), data.size(0))
        top5.update(prec5.item(), data.size(0))

        # compute gradient and do SGD step
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

    info = {
        'Epoch': [self.epoch],
        'Batch Time': [round(float(batch_time.avg), 3)],
        'Data Time': [round(float(data_time.avg), 3)],
        'Loss': [round(float(losses.avg), 5)],
        'Prec@1': [round(float(top1.avg), 4)],
        'Prec@5': [round(float(top5.avg), 4)],
        'lr': self.optimizer.param_groups[0]['lr']
    }
    utils.record_info(
        info,
        os.path.join(self.output_dir, 'opf_train_{}.csv'.format(self.model_type)),
        'train')
def validate_1epoch(self):
    print('==> Epoch:[{0}/{1}][validation stage]'.format(self.epoch, self.nb_epochs))
    batch_time = utils.AverageMeter()
    losses = utils.AverageMeter()
    top1 = utils.AverageMeter()
    top5 = utils.AverageMeter()

    # switch to evaluate mode
    self.model.eval()
    self.dic_video_level_preds = {}
    end = time.time()

    progress = tqdm.tqdm(self.test_loader)
    for i, (keys, data, label) in enumerate(progress):
        label = label.cuda(non_blocking=True)
        with torch.no_grad():
            data_var = Variable(data).cuda(non_blocking=True)
            label_var = Variable(label).cuda(non_blocking=True)

            # compute output
            output = self.model(data_var)

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        # calculate video-level prediction
        preds = output.data.cpu().numpy()
        nb_data = preds.shape[0]
        for j in range(nb_data):
            videoName = keys[j].split('|', 1)[0]  # e.g. ApplyMakeup_g01_c01
            if videoName not in self.dic_video_level_preds.keys():
                self.dic_video_level_preds[videoName] = preds[j, :]
            else:
                self.dic_video_level_preds[videoName] += preds[j, :]

    # frame-level to video-level accuracy
    video_top1, video_top5, video_loss = self.frame2_video_level_accuracy()

    info = {
        'Epoch': [self.epoch],
        'Batch Time': [np.round(batch_time.avg, 3)],
        'Loss': [np.round(video_loss, 5)],
        'Prec@1': [np.round(video_top1, 3)],
        'Prec@5': [np.round(video_top5, 3)]
    }
    utils.record_info(
        info,
        os.path.join(self.output_dir, 'opf_test_{}.csv'.format(self.model_type)),
        'test')
    return video_top1, video_loss
def train_1epoch(self):
    print('==> Epoch:[{0}/{1}][training stage]'.format(self.epoch, self.nb_epochs))
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()

    self.model.train()
    end = time.time()

    progress = tqdm(self.trainloader)
    for i, (data, label) in enumerate(progress):
        data_time.update(time.time() - end)

        label = label.cuda(non_blocking=True)
        input_var = Variable(data).cuda()
        target_var = Variable(label).cuda()
        # print('load to cuda:', time.time() - t)
        # t = time.time()

        output = self.model(input_var)
        loss = self.criterion(output, target_var)
        # print('loss:', loss.item())
        # print('calculate loss:', time.time() - t)

        prec1, prec5 = accuracy(output.data, label, topk=(1, 5))
        losses.update(loss.item(), output.data.size(0))
        top1.update(prec1.item(), output.data.size(0))
        top5.update(prec5.item(), output.data.size(0))
        # print('test', output.data.size(0))

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        batch_time.update(time.time() - end)
        end = time.time()

    info = {
        'Epoch': [self.epoch],
        'Batch Time': [round(batch_time.avg, 3)],
        'Data Time': [round(data_time.avg, 3)],
        'Loss': [round(losses.avg, 5)],
        'Prec@1': [round(top1.avg, 4)],
        'Prec@5': [round(top5.avg, 4)],
        'lr': self.optimizer.param_groups[0]['lr']
    }
    record_info(info, './record/spatial/rgb_train.csv', 'train')
def validate_1epoch(self):
    print('==> Epoch:[{0}/{1}][validation stage]'.format(self.epoch, self.nb_epochs))
    batch_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()

    # switch to evaluate mode
    self.model.eval()
    self.dic_video_level_preds = {}
    end = time.time()

    progress = tqdm(self.test_loader)
    for i, (keys, data, label) in enumerate(progress):
        # data = data.sub_(127.353346189).div_(14.971742063)
        label = label.cuda(non_blocking=True)
        with torch.no_grad():  # replaces the deprecated volatile=True flag
            data_var = Variable(data).cuda(non_blocking=True)
            label_var = Variable(label).cuda(non_blocking=True)

            # compute output
            output = self.model(data_var)

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        # calculate video-level prediction
        preds = output.data.cpu().numpy()
        nb_data = preds.shape[0]
        for j in range(nb_data):
            videoName = keys[j].split('-', 1)[0]  # e.g. ApplyMakeup_g01_c01
            if videoName not in self.dic_video_level_preds.keys():
                self.dic_video_level_preds[videoName] = preds[j, :]
            else:
                self.dic_video_level_preds[videoName] += preds[j, :]

    # frame-level to video-level accuracy
    video_top1, video_top5, video_loss = self.frame2_video_level_accuracy()

    info = {
        'Epoch': [self.epoch],
        'Batch Time': [np.round(batch_time.avg, 4)],
        'Loss': [np.round(video_loss, 4)],
        'Prec@1': [np.round(video_top1, 4)],
        'Prec@5': [np.round(video_top5, 4)]
    }
    record_info(info, filename='record/motion/opf_test.csv', mode='test')
    return video_top1, video_loss
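# ---------------------------------------------------------------------------
# Several of these validators defer to a frame2_video_level_accuracy method
# that is not shown. The sketch below is only a plausible reconstruction: it
# assumes self.dic_video_level_preds holds summed frame logits per video and
# that a hypothetical self.test_video dict maps each video name to its
# ground-truth class index; the repository's real method may differ.
def frame2_video_level_accuracy(self):
    nb_videos = len(self.dic_video_level_preds)
    nb_classes = len(next(iter(self.dic_video_level_preds.values())))
    video_level_preds = torch.zeros(nb_videos, nb_classes)
    video_level_labels = torch.zeros(nb_videos).long()

    for idx, name in enumerate(sorted(self.dic_video_level_preds.keys())):
        video_level_preds[idx, :] = torch.from_numpy(self.dic_video_level_preds[name])
        video_level_labels[idx] = int(self.test_video[name])  # assumed lookup

    # top-1 / top-5 accuracy and cross-entropy loss at the video level
    top1, top5 = accuracy(video_level_preds, video_level_labels, topk=(1, 5))
    loss = self.criterion(video_level_preds.cuda(), video_level_labels.cuda())
    return top1.item(), top5.item(), loss.item()
# ---------------------------------------------------------------------------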
def validate_1epoch(self):
    '''Run validation on the test set.'''
    print('==> Epoch:[{0}/{1}][validation stage]'.format(self.epoch, self.nb_epochs))
    batch_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()

    # Switch to evaluate mode
    self.model.eval()
    self.dic_video_level_preds = {}
    end = time.time()

    if self.arg.prog:
        progress = tqdm(self.test_loader)
    else:
        progress = self.test_loader

    # Iterate over dataset
    for i, (keys, data, label) in enumerate(progress):
        label = label.cuda(non_blocking=True)
        with torch.no_grad():  # replaces the deprecated volatile=True flag
            data_var = Variable(data).cuda(non_blocking=True)
            label_var = Variable(label).cuda(non_blocking=True)

            # Compute output
            output = self.model(data_var)

        # Measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        # Calculate video level prediction
        preds = output.data.cpu().numpy()
        nb_data = preds.shape[0]
        for j in range(nb_data):
            videoName = keys[j].split('/', 1)[0]
            if videoName not in self.dic_video_level_preds.keys():
                self.dic_video_level_preds[videoName] = preds[j, :]
            else:
                self.dic_video_level_preds[videoName] += preds[j, :]

    # Calculate and record top-1 accuracy, top-5 accuracy, and evaluation loss
    video_top1, video_top5, video_loss = self.frame2_video_level_accuracy()
    losses.update(video_loss)
    top1.update(video_top1)
    top5.update(video_top5)

    # Record to TensorBoard
    self.writer.add_scalars("Eval Loss", {
        "val": losses.val,
        "average": losses.avg
    }, self.epoch)
    self.writer.add_scalars("Eval Acc@1", {
        "val": top1.val,
        "average": top1.avg
    }, self.epoch)
    self.writer.add_scalars("Eval Acc@5", {
        "val": top5.val,
        "average": top5.avg
    }, self.epoch)

    # Save validation info to CSV
    info = {
        'Epoch': [self.epoch],
        'Batch Time': [round(batch_time.avg, 3)],
        'Loss': [round(video_loss, 5)],
        'Prec@1': [round(video_top1, 3)],
        'Prec@5': [round(video_top5, 3)]
    }
    record_info(info, os.path.join(self.arg.savedir, 'rgb_train.csv'), 'test')
    return video_top1, video_loss
def train_1epoch(self):
    '''Train for a single epoch.'''
    print('==> Epoch:[{0}/{1}][training stage]'.format(self.epoch, self.nb_epochs))
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()

    # Switch to train mode
    self.model.train()
    end = time.time()

    # Mini-batch training
    if self.arg.prog:
        progress = tqdm(self.train_loader)
    else:
        progress = self.train_loader

    # Iterate over dataset
    for i, (data, label) in enumerate(progress):
        # Measure data loading time
        data_time.update(time.time() - end)

        label = label.cuda(non_blocking=True)
        target_var = Variable(label).cuda()

        # Compute loss
        input_var = Variable(data).cuda()
        output = self.model(input_var)
        loss = self.criterion(output, target_var)

        # Measure accuracy and record loss
        prec1, prec5 = accuracy(output.data, label, topk=(1, 5))
        losses.update(loss.item(), data.size(0))
        top1.update(prec1.item(), data.size(0))
        top5.update(prec5.item(), data.size(0))

        # Compute gradient and do SGD step
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Measure elapsed time and record results
        batch_time.update(time.time() - end)
        end = time.time()

        # Write to TensorBoard
        self.writer.add_scalars("Loss", {
            "val": losses.val,
            "average": losses.avg
        }, self.epoch * len(self.train_loader) + i)
        self.writer.add_scalars("Acc@1", {
            "val": top1.val,
            "average": top1.avg
        }, self.epoch * len(self.train_loader) + i)
        self.writer.add_scalars("Acc@5", {
            "val": top5.val,
            "average": top5.avg
        }, self.epoch * len(self.train_loader) + i)

        # Print every 10 iterations
        if i % 10 == 0:
            print('Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                  'Acc@1 {top1.val:.3f} ({top1.avg:.3f})\t'
                  'Acc@5 {top5.val:.3f} ({top5.avg:.3f})'.format(
                      loss=losses, top1=top1, top5=top5))

    # Save training info to CSV
    info = {
        'Epoch': [self.epoch],
        'Batch Time': [round(batch_time.avg, 3)],
        'Data Time': [round(data_time.avg, 3)],
        'Loss': [round(losses.avg, 5)],
        'Prec@1': [round(top1.avg, 4)],
        'Prec@5': [round(top5.avg, 4)],
        'lr': self.optimizer.param_groups[0]['lr']
    }
    record_info(info, os.path.join(self.arg.savedir, 'rgb_train.csv'), 'train')
    return None
def train(self,
          train_data_path='data/TFRdata/BSDS500_64.tfrecords',
          test_data_dir='data/test',
          epoch_volume=63000,
          epoch_to_train=None,
          time_str=None,
          train_batch_size=64,
          steps=None,
          max_steps=None,
          log_print_interval=50,
          test_interval=500,
          save_interval=10000,
          loss_func='l2',
          optimizer='adam',
          learning_rate=0.001,
          decay=None,
          decay_epoch=1,
          decay_strategy='exponent'):
    # params:
    time_str = time_str or get_time_str()
    self.loss_func = loss_func
    self.optimizer = optimizer

    # paths:
    log_dir = os.path.join('./logs', self.model_name, self.name, time_str)
    ckpt_dir = os.path.join('./checkpoints', self.model_name, self.name, time_str)
    test_imgs_dir = os.path.join(log_dir, 'test_imgs')
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)
    if not os.path.exists(ckpt_dir):
        os.makedirs(ckpt_dir)
    if not os.path.exists(test_imgs_dir):
        os.makedirs(test_imgs_dir)
    latest_ckpt_path = tf.train.latest_checkpoint(ckpt_dir)

    # info:
    self.info_train = {}
    self.info_train['time_str'] = str(time_str)
    self.info_train['train_batch_size'] = str(train_batch_size)
    self.info_train['loss_func'] = str(loss_func)
    self.info_train['optimizer'] = str(optimizer)
    if isinstance(decay_strategy, str):
        self.info_train['learning_rate'] = str(learning_rate)
        self.info_train['decay'] = str(decay)
        self.info_train['decay_strategy'] = str(decay_strategy)
    else:
        self.info_train['learning_rate'] = 'CustomStrategy'
    self.info_train['train_data_path'] = str(train_data_path)
    self.info_train['test_data_dir'] = str(test_data_dir)
    print('\n\n********** Train **********')
    print_info([self.info_train])
    print('********** ***** **********')
    record_info([self.info_top, self.info_train], os.path.join(log_dir, 'info.txt'))

    # define graph:
    print('\n** Define graph...')
    self.train_graph = tf.Graph()
    with self.train_graph.as_default():
        self._build(self.MODE_TRAIN)
        # logs:
        log_train_MSE = tf.summary.scalar('MSE_train', self.mse)
        log_test_MSE = tf.summary.scalar('MSE_test', self.mse)
        log_train_PSNR = tf.summary.scalar('PSNR_train', self.psnr_float)
        log_test_PSNR = tf.summary.scalar('PSNR_test', self.psnr_float)
        log_lr = tf.summary.scalar('learning_rate', self.learning_rate)
        test_PSNR_mean = tf.placeholder(tf.float32, name='PSNR_mean')
        log_test_PSNR_mean = tf.summary.scalar('PSNR_mean_test', test_PSNR_mean)
        log_writer = tf.summary.FileWriter(log_dir)
        log_writer.add_graph(self.train_graph)
        log_writer.flush()
        # saver:
        saver_all = tf.train.Saver(max_to_keep=0, name='saver_all')
    print('Done.')

    # datasets:
    print('\n** Generate datasets...')
    print('train data path:', train_data_path)
    print('test data dir:', test_data_dir)
    with self.train_graph.as_default():
        get_train_batch = dataset_TFR(train_data_path, train_batch_size, epoch_volume)
        test_batches = dataset_IMG(test_data_dir)
    print('Done.')

    print('\n** Initialize and prepare...')
    # init:
    sess = tf.Session(graph=self.train_graph)
    if latest_ckpt_path:
        saver_all.restore(sess, latest_ckpt_path)
    else:
        sess.run(self.variable_init)
    step = tf.train.global_step(sess, self.global_step)
    epoch = self._get_epoch(step, train_batch_size, epoch_volume)
    steps_to_run = None
    if steps or max_steps:
        steps_to_run = steps or max(max_steps - step, 0)

    # define process functions:
    def train_once(step, epoch=None, print_log=True):
        train_batch = sess.run(get_train_batch)
        feed_dic = {
            self.inputs: train_batch,
            self.labels: train_batch,
            self.learning_rate: lr
        }
        mse, mse_log, psnr, psnr_log, lr_log, _ = sess.run([
            self.mse, log_train_MSE, self.psnr_float, log_train_PSNR,
            log_lr, self.train_op
        ], feed_dic)
        log_writer.add_summary(mse_log, step)
        log_writer.add_summary(psnr_log, step)
        log_writer.add_summary(lr_log, step)
        if print_log:
            log = 'step: %d lr: %.8f train-loss: %.10f train-PSNR: %.6f' % (step, lr, mse, psnr)
            if epoch is not None:
                log = ('epoch: %d ' % epoch) + log
            print(log)

    def test_all(step, epoch=None, print_log=True, save_dir=None):
        if print_log:
            print('--------------------------------------------------------------')
            print('Test all:')
        img_num = len(test_batches['imgs'])
        psnr_sum = 0
        for tb in range(img_num):
            img = test_batches['imgs'][tb][np.newaxis, :]
            name = test_batches['names'][tb]
            feed_dic = {self.inputs: img, self.labels: img}
            run_list = [self.mse, log_test_MSE, self.psnr_float, log_test_PSNR]
            if save_dir is not None:
                run_list.append(self.outputs)
            run_results = sess.run(run_list, feed_dic)
            if save_dir is None:
                mse, mse_log, psnr, psnr_log = run_results
            else:
                mse, mse_log, psnr, psnr_log, outputs = run_results
                name_no_ext = os.path.splitext(name)[0]
                if epoch is not None:
                    cv_imwrite(
                        os.path.join(
                            save_dir,
                            'epoch_%d_step_%d_%s_psnr_%.4f.png' % (epoch, step, name_no_ext, psnr)),
                        outputs[0], 'RGB')
                else:
                    cv_imwrite(
                        os.path.join(
                            save_dir,
                            'step_%d_%s_psnr_%.4f.png' % (step, name_no_ext, psnr)),
                        outputs[0], 'RGB')
            log_writer.add_summary(mse_log, step)
            log_writer.add_summary(psnr_log, step)
            log_writer.flush()
            psnr_sum += psnr
            if print_log:
                log = 'step: %d test-loss: %.10f test-PSNR: %.6f' % (step, mse, psnr)
                if epoch is not None:
                    log = ('epoch: %d ' % epoch) + log
                log = ('| img: %s ' % name) + log
                print(log)
        psnr_mean = psnr_sum / img_num
        log_writer.add_summary(
            sess.run(log_test_PSNR_mean, {test_PSNR_mean: psnr_mean}), step)
        if print_log:
            print('PSNR-mean: %.6f (img_num: %d)' % (psnr_mean, img_num))
            print('--------------------------------------------------------------')
        return psnr_mean

    def save_once(step, print_log=True):
        save_path = os.path.join(ckpt_dir, get_time_str())
        saver_all.save(sess=sess, save_path=save_path, global_step=step, write_meta_graph=False)
        if print_log:
            print('save:', save_path)
        return save_path

    print('Done.')

    # run:
    print('\n** Begin training:')
    save_path = None
    if latest_ckpt_path is None:
        test_all(0, 0, True)
        save_path = save_once(0)
    else:
        test_all(step, epoch, True)
    save_flag_final = False
    save_flag_max = False
    psnr_max = 0
    lr = self._lr_update(learning_rate, step, epoch, decay, decay_strategy)
    t = time.time()
    while (steps_to_run is None) or (steps_to_run > 0):
        # main loop
        step = tf.train.global_step(sess, self.global_step) + 1
        epoch_old = epoch
        epoch = self._get_epoch(step, train_batch_size, epoch_volume)
        if epoch_to_train and (epoch > epoch_to_train):
            break
        if epoch_old != epoch:
            # change lr only when a new epoch starts
            if isinstance(decay_strategy, str):
                if epoch_old % decay_epoch == 0:
                    lr = self._lr_update(learning_rate, step, epoch, decay, decay_strategy)
            else:
                lr = self._lr_update(learning_rate, step, epoch, decay, decay_strategy)
            save_flag_final = True
            save_flag_max = False
        if (step % log_print_interval) == 0:
            train_once(step, epoch, print_log=True)
        else:
            train_once(step, epoch, print_log=False)
        if (step % test_interval) == 0:
            print('time: train_%d %.6fs' % (test_interval, time.time() - t))
            t = time.time()
            psnr_tmp = test_all(step, epoch, True)
            print('time: test_once %.6fs' % (time.time() - t))
            if psnr_tmp > psnr_max:
                test_all(step, epoch, False, test_imgs_dir)
                psnr_max = psnr_tmp
                print('psnr_max: %.6f epoch: %d step: %d' % (psnr_max, epoch, step))
                save_flag_max = True
            t = time.time()
        if (step % save_interval) == 0 or save_flag_max:
            t = time.time()
            save_path = save_once(step)
            save_flag_final = False
            save_flag_max = False
            print('time: save_once %.6fs' % (time.time() - t))
            t = time.time()
        if steps_to_run is not None:
            steps_to_run -= 1
    if save_flag_final:
        save_path = save_once(step)
    sess.close()
    print('\nALL DONE.')
    return save_path
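# ---------------------------------------------------------------------------
# The loop above leans on two helpers, self._get_epoch and self._lr_update,
# that are not shown. The following is only a minimal sketch of what they might
# compute, assuming one epoch equals epoch_volume samples and that the
# 'exponent' strategy multiplies the base rate by decay once per epoch; these
# are assumptions, not the author's confirmed behavior.
def _get_epoch(self, step, train_batch_size, epoch_volume):
    # step counts mini-batches of train_batch_size samples each
    return (step * train_batch_size) // epoch_volume


def _lr_update(self, learning_rate, step, epoch, decay, decay_strategy):
    if not isinstance(decay_strategy, str):
        # decay_strategy is assumed to be a callable custom schedule
        return decay_strategy(learning_rate, step, epoch)
    if decay is None or decay_strategy != 'exponent':
        return learning_rate
    # exponential decay: lr = base_lr * decay ** epoch
    return learning_rate * (decay ** epoch)
# ---------------------------------------------------------------------------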
def main():
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)

    dataloader = utils.load_dataset(args.data_path, args.batch_size,
                                    args.batch_size, args.batch_size)
    args.scaler = dataloader['scaler']
    engine = a(args)

    print("start training...")
    his_loss = []
    val_time = []
    train_time = []
    for epoch_num in range(args.epochs + 1):
        train_loss = []
        train_mape = []
        train_rmse = []
        t1 = time.time()
        dataloader['train_loader'].shuffle()
        for iter, (x, y) in enumerate(dataloader["train_loader"].get_iterator()):
            trainX = torch.Tensor(x).to(args.device)
            trainy = torch.Tensor(y).to(args.device)
            metrics = engine.train(trainX, trainy[:, :, :, 0])
            train_loss.append(metrics[0])
            train_mape.append(metrics[1])
            train_rmse.append(metrics[2])
            if iter % 500 == 0:
                log = 'Iter: {:03d}, Train Loss: {:.4f}, Train MAPE: {:.4f}, Train RMSE: {:.4f}'
                print(log.format(iter, train_loss[-1], train_mape[-1], train_rmse[-1]), flush=True)
                utils.record_info(
                    log.format(iter, train_loss[-1], train_mape[-1], train_rmse[-1]),
                    args.info_dir)
        t2 = time.time()
        train_time.append(t2 - t1)

        valid_loss = []
        valid_mape = []
        valid_rmse = []
        print("eval...")
        s1 = time.time()
        for iter, (x, y) in enumerate(dataloader['val_loader'].get_iterator()):
            valx = torch.Tensor(x).cuda()
            valy = torch.Tensor(y).cuda()
            metrics = engine.eval(valx, valy[:, :, :, 0])
            valid_loss.append(metrics[0])
            valid_mape.append(metrics[1])
            valid_rmse.append(metrics[2])
        s2 = time.time()
        log = 'Epoch: {:03d}, Inference Time: {:.4f} secs'
        print(log.format(epoch_num, (s2 - s1)))
        utils.record_info(log.format(epoch_num, (s2 - s1)), args.info_dir)
        val_time.append(s2 - s1)

        mtrain_loss = np.mean(train_loss)
        mtrain_mape = np.mean(train_mape)
        mtrain_rmse = np.mean(train_rmse)
        mvalid_loss = np.mean(valid_loss)
        mvalid_mape = np.mean(valid_mape)
        mvalid_rmse = np.mean(valid_rmse)
        his_loss.append(mvalid_loss)

        log = ('Epoch: {:03d}, Train Loss: {:.4f}, Train MAPE: {:.4f}, Train RMSE: {:.4f}, '
               'Valid Loss: {:.4f}, Valid MAPE: {:.4f}, Valid RMSE: {:.4f}, '
               'Training Time: {:.4f}/epoch')
        print(log.format(epoch_num, mtrain_loss, mtrain_mape, mtrain_rmse,
                         mvalid_loss, mvalid_mape, mvalid_rmse, (t2 - t1)), flush=True)
        utils.record_info(
            log.format(epoch_num, mtrain_loss, mtrain_mape, mtrain_rmse,
                       mvalid_loss, mvalid_mape, mvalid_rmse, (t2 - t1)),
            args.info_dir)
        torch.save(engine.model, "./model/" + "_epoch_" + str(epoch_num) + ".pkl")

    print("Average Training Time: {:.4f} secs/epoch".format(np.mean(train_time)))
    print("Average Inference Time: {:.4f} secs".format(np.mean(val_time)))
parser.add_argument('--out_dim', type=int, default=1)
parser.add_argument('--epochs', type=int, default=100)
parser.add_argument("--lr", type=float, default=0.0005)
parser.add_argument("--clip", type=float, default=5.)
parser.add_argument('--weight_decay', type=float, default=0.000001, help='weight decay rate')
parser.add_argument("--his_len", type=int, default=12, help="")
parser.add_argument("--pred_len", type=int, default=12, help="")
parser.add_argument("--seed", type=int, default=1314, help="random seed")
parser.add_argument('--info_dir', type=str, default="./infos/metr12/ratio003.txt")
parser.add_argument('--channels', type=int, default=2)
parser.add_argument('--layers', type=int, default=5)
parser.add_argument('--snpsts_len', type=int, default=4)
parser.add_argument('--dropout', type=float, default=0.3)

args = parser.parse_args()
utils.record_info('snapshot increased to 6, pearson matrix count changed to 0.05', args.info_dir)
utils.record_info(str(args), args.info_dir)
print(args)

if args.data == "metr":
    args.data_path = './data/METR-LA'
    args.adj_mx_path = './data/sensor_graph/adj_mx.pkl'
    args.adj_mx = torch.Tensor(utils.load_pickle(args.adj_mx_path)[-1])
    args.num_node = 207
    args.pearson_path = "./data/METR-LA/pearson_corr.pkl"
    args.dilations = [1, 2, 4, 2, 1, 1]
elif args.data == "bay":
    args.data_path = './data/PEMS-BAY'
    args.adj_mx_path = './data/sensor_graph/adj_mx_bay.pkl'
    args.adj_mx = torch.Tensor(utils.load_pickle(args.adj_mx_path)[-1])
    args.num_node = 325
def train(self):
    for self.epoch in range(self.numepoches):
        print('==> Epoch:[{0}/{1}][training stage]'.format(self.epoch, self.numepoches))
        batch_time = utils.AverageMeter()
        data_time = utils.AverageMeter()
        top1 = utils.AverageMeter()
        top5 = utils.AverageMeter()
        losses = utils.AverageMeter()
        running_loss = 0.0

        progress = tqdm(self.trainloader)
        end = time.time()
        for i, data in enumerate(progress, 0):
            data_time.update(time.time() - end)

            # get the inputs
            inputs, labels = data

            # wrap them in Variables
            # inputs, labels = Variable(inputs), Variable(labels)
            if torch.cuda.is_available():
                inputs = Variable(inputs).cuda()
                labels = Variable(labels).cuda()
            else:
                inputs = Variable(inputs)
                labels = Variable(labels)

            # zero the parameter gradients (clear gradients from the previous step)
            self.optimizer.zero_grad()
            # forward
            outputs = self.net(inputs)
            # loss
            loss = self.criterion(outputs, labels)
            # backward
            loss.backward()
            # update weights
            self.optimizer.step()

            prec1, prec5 = utils.accuracy(outputs.data, labels, topk=(1, 5))
            # print(data)
            losses.update(loss.item(), inputs.size(0))
            top1.update(prec1.item(), inputs.size(0))
            top5.update(prec5.item(), inputs.size(0))

            # print statistics
            running_loss = running_loss + loss.data.item()
            if i % 2000 == 1999:  # print every 2000 mini-batches
                print('[%d, %5d] loss: %.3f' % (self.epoch + 1, i + 1, running_loss / 2000))
                running_loss = 0.0

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

        info = {
            'Epoch': [self.epoch],
            'Batch Time': [round(batch_time.avg, 3)],
            'Data Time': [round(data_time.avg, 3)],
            'Loss': [round(losses.avg, 5)],
            'Prec@1': [round(top1.avg, 4)],
            'Prec@5': [round(top5.avg, 4)],
            'lr': self.optimizer.param_groups[0]['lr']
        }
        utils.record_info(info, 'record/opf_train.csv', 'train')

        # prec1, val_loss = self.validate_1epoch()  # validation
        # print(prec1)
        is_best = running_loss > self.best_prec1
        # lr_scheduler
        # self.scheduler.step(val_loss)

        # save model
        if is_best:
            self.best_prec1 = running_loss
            self.save_checkpoint(
                self.net.state_dict(),
                is_best,
                'record/checkpoint.pth.tar',
            )
    print("Finished Training")
def train(self, time_str, patch_mode=None, patch_size=60, step=36,
          patch_height=1080, patch_width=1920, train_batch_size=64,
          max_epoch=2400, learning_rate=0.001, resume=None):
    time_str = time_str or get_time_str()
    log_dir = os.path.join('./logs', self.net_name,
                           self.video_name + '_QP' + str(self.QP), time_str)
    backup_dir = os.path.join('./checkpoints', self.net_name,
                              self.video_name + '_QP' + str(self.QP), time_str)
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)
    if not os.path.exists(backup_dir):
        os.makedirs(backup_dir)
    writer = SummaryWriter(logdir=log_dir)

    train_input_frame = ('./data/HM_compressed/' + self.video_name + '_QP' + str(self.QP)
                         + '_' + self.codec_mode + '_rec_HM.yuv')
    train_label_frame = './data/raw/' + self.video_name + '.yuv'

    print('\n')
    print('===> Loading datasets')
    im_input, _, _ = YUVread(train_input_frame, [self.height, self.width],
                             self.frame_num, self.start_frame)
    im_label, _, _ = YUVread(train_label_frame, [self.height, self.width],
                             self.frame_num, self.start_frame)
    frame_num = im_input.shape[0]
    if patch_mode == 'small':
        train_set = MultiFrameDataset(rec_y=im_input, label_y=im_label,
                                      totalFrames=frame_num, nFrames=self.neighbor_frames,
                                      width=self.width, height=self.height,
                                      width_cut=patch_size, height_cut=patch_size)
        total_count = train_set.__len__()
    else:
        train_set = MultiFrameDataset(rec_y=im_input, label_y=im_label,
                                      totalFrames=frame_num, nFrames=self.neighbor_frames,
                                      width=self.width, height=self.height,
                                      width_cut=self.width, height_cut=self.height)
        total_count = train_set.__len__()
    if patch_mode == 'small':
        training_data_loader = DataLoader(train_set, batch_size=train_batch_size,
                                          shuffle=True, num_workers=4)
    else:
        training_data_loader = DataLoader(train_set, batch_size=train_batch_size,
                                          shuffle=False, num_workers=4)
    print('===> Done\n')

    print('===> Building model ')
    model = CRNN(input_channel=1, base_channel=self.channel,
                 neighbor_frames=self.neighbor_frames,
                 use_norm_at_begin=self.use_BN_at_begin,
                 use_norm_in_ru=self.use_BN_in_ru,
                 use_norm_at_end=self.use_BN_at_end)
    calculate_variables(model, print_vars=False)
    model = model.cuda()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate, betas=(0.9, 0.999), eps=1e-8)
    l1_loss_fn = nn.L1Loss()
    l2_loss_fn = nn.MSELoss(reduction='mean')  # 'elementwise_mean' was renamed to 'mean'
    l1_loss_fn = l1_loss_fn.cuda()
    l2_loss_fn = l2_loss_fn.cuda()
    print('===> Done\n')

    print('===> Try resume from checkpoint')
    if resume != 'none':
        checkpoint = torch.load(resume)
        model.load_state_dict(checkpoint['state'])
        if patch_mode == 'large':
            start_epoch = 1
        else:
            optimizer.load_state_dict(checkpoint['optimizer'])
            start_epoch = checkpoint['epoch']
        print(resume.split('_')[-7])
        psnr_gain_max = float(resume.split('_')[-7])
        print('===> Load checkpoint')
    else:
        start_epoch = 1
        psnr_gain_max = 0.0
        print('===> Start from scratch')

    # info:
    self.info_train = {}
    if resume != 'none':
        self.info_train['checkpoint_to_load'] = resume
    self.info_train['time_str'] = time_str
    self.info_train['max_epoch'] = max_epoch
    self.info_train['learning_rate'] = str(learning_rate)
    self.info_train['num_of_patches'] = str(total_count)
    if patch_mode == 'small':
        self.info_train['patch_size'] = str(patch_size) + 'x' + str(patch_size)
    else:
        self.info_train['patch_size'] = str(patch_height) + 'x' + str(patch_width)
    self.info_train['train_batch_size'] = str(train_batch_size)
    self.info_train['log_dir'] = log_dir
    self.info_train['backup_dir'] = backup_dir
    self.info_train['train_input'] = train_input_frame
    self.info_train['train_label'] = train_label_frame
    self.info_train['loss_function'] = 'L1-absolute_difference'

    print('\n\n********** Train **********')
    print_info([self.info_train])
    print('********** ***** **********')
    record_info([self.info_top, self.info_train], os.path.join(backup_dir, 'info.txt'))
    record_info([self.info_top, self.info_train], os.path.join(log_dir, 'info.txt'))

    count = 0
    for epoch in range(start_epoch, max_epoch + 1):
        # global psnr_gain_max
        model.train()
        psnr_gain = 0.0
        total_psnr_before = 0.0
        for iteration, batch in enumerate(training_data_loader):
            batch_input, batch_neighor, batch_label = batch[0], batch[1], batch[2]
            batch_input = batch_input.cuda()
            batch_neighor = batch_neighor.cuda()
            batch_label = batch_label.cuda()

            batch_output = model(batch_input, batch_neighor)
            mse_loss_before = l2_loss_fn(batch_input, batch_label)
            l1_loss = l1_loss_fn(batch_output, batch_label)
            mse_loss = l2_loss_fn(batch_output, batch_label)

            optimizer.zero_grad()
            l1_loss.backward()
            optimizer.step()

            with torch.no_grad():
                psnr_before = np.multiply(10.0, np.log(1.0 * 1.0 / mse_loss_before.cpu()) / np.log(10.0))
                psnr = np.multiply(10.0, np.log(1.0 * 1.0 / mse_loss.cpu()) / np.log(10.0))
                psnr_gain += (psnr - psnr_before)
                print("Train(%.10s:QP%.2d):> Epoch[%.4d](%.3d/%.3d)== lr: %.8f "
                      "train-loss: %.10f train_PSNR: %.6f PSNR_before: %.6f PSNR_gain: %.6f"
                      % (self.video_name, self.QP, epoch, iteration + 1, len(training_data_loader),
                         optimizer.param_groups[0]['lr'], mse_loss.cpu(), psnr, psnr_before,
                         psnr - psnr_before))
                total_psnr_before += psnr_before
                writer.add_scalar('Train_loss', l1_loss.cpu(), count)
                writer.add_scalar('Train_PSNR', psnr, count)
            count += 1  # global iteration counter used as the TensorBoard step

        total_psnr_before = total_psnr_before / len(training_data_loader)
        print(total_psnr_before)
        psnr_gain = psnr_gain / len(training_data_loader)

        self.checkpoint(model, epoch, optimizer, psnr_gain_max, backup_dir=backup_dir)
        if epoch % 50 == 0:
            self.checkpoint(model, epoch, optimizer, psnr_gain, backup_dir=backup_dir)

        # learning-rate decay schedule
        if self.QP in [22, 27]:
            if (epoch + 1) == 50 or (epoch + 1) == 300:
                for param_group in optimizer.param_groups:
                    param_group['lr'] /= 10
                print('Learning rate decay: lr={}'.format(optimizer.param_groups[0]['lr']))
        else:
            if (epoch + 1) == 100 or (epoch + 1) == 300:
                for param_group in optimizer.param_groups:
                    param_group['lr'] /= 10
                print('Learning rate decay: lr={}'.format(optimizer.param_groups[0]['lr']))
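# ---------------------------------------------------------------------------
# The epoch loop above calls a self.checkpoint helper that is not shown. The
# sketch below only illustrates a helper with the same call signature; the
# actual filename scheme is unknown (the resume path above recovers the PSNR
# gain via resume.split('_')[-7], which this hypothetical naming does not
# guarantee to reproduce).
def checkpoint(self, model, epoch, optimizer, psnr_gain, backup_dir):
    state = {
        'state': model.state_dict(),          # keys match the resume code above
        'optimizer': optimizer.state_dict(),
        'epoch': epoch,
    }
    name = '{}_QP{}_gain_{:.6f}_epoch_{}.pth'.format(
        self.video_name, self.QP, psnr_gain, epoch)  # assumed naming scheme
    save_path = os.path.join(backup_dir, name)
    torch.save(state, save_path)
    print('Checkpoint saved to {}'.format(save_path))
# ---------------------------------------------------------------------------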
def validate_1epoch(self):
    print('==> Epoch:[{0}/{1}][validation stage]'.format(self.epoch, self.nb_epochs))
    batch_time = utils.AverageMeter()
    data_time = utils.AverageMeter()
    losses = utils.myAverageMeter()
    perf = utils.myAverageMeter()

    # switch to eval mode
    self.model.eval()
    end = time.time()

    # mini-batch evaluation
    progress = tqdm(self.test_loader, ascii=True)
    with torch.no_grad():
        for _, (image, label_map, label_pts, loss_weight) in enumerate(progress):
            if Config.use_cuda:
                image = image.cuda()
                label_map = label_map.cuda()
                loss_weight = loss_weight.cuda()

            # measure data loading time
            data_time.update(time.time() - end)

            pred, _ = self.model(image)
            loss = self.weighted_loss(pred, label_map, loss_weight)
            # loss = self.loss_func(pred, label_map)

            # measure accuracy and record loss
            losses.update(loss.item(), image.size(0))
            pred_pts = utils.getPointByMap(pred)

            # weighted RMSE between predicted and ground-truth landmark points
            rmse = 0.0
            for b in range(0, label_pts.size(0)):
                x_mse = 0.0
                y_mse = 0.0
                b_count = 0.0
                for p in range(0, label_pts.size(1)):
                    x_mse += loss_weight[b, p, 0] * ((label_pts[b, p, 0] - pred_pts[b, p, 0]).pow(2))
                    y_mse += loss_weight[b, p, 1] * ((label_pts[b, p, 1] - pred_pts[b, p, 1]).pow(2))
                    b_count += loss_weight[b, p, 0] + loss_weight[b, p, 1]
                b_rmse = torch.sqrt((x_mse + y_mse) / b_count)
                rmse += b_rmse
            rmse = rmse / label_pts.size(0)
            perf.update(rmse, image.size(0))

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

    info = {
        'Epoch': [self.epoch],
        'Batch Time': [round(batch_time.avg, 3)],
        'Data Time': [round(data_time.avg, 3)],
        'Perf': [round(perf.average(), 5)],
        'Loss': [round(losses.average(), 5)]
    }
    utils.record_info(info, 'record/test.csv', 'test')
    return round(perf.average(), 5), round(losses.average(), 5)
def train_1epoch(self):
    print('==> Epoch:[{0}/{1}][training stage]'.format(self.epoch, self.nb_epochs))
    batch_time = utils.AverageMeter()
    data_time = utils.AverageMeter()
    losses = utils.myAverageMeter()
    perf = utils.myAverageMeter()

    # switch to train mode
    self.model.train()
    end = time.time()

    # mini-batch training
    progress = tqdm(self.train_loader, ascii=True)
    for _, (image, label_map, label_pts) in enumerate(progress):
        if Config.use_cuda:
            image = image.cuda()
            label_map = label_map.cuda()

        # measure data loading time
        data_time.update(time.time() - end)

        self.optimizer.zero_grad()
        pred, _ = self.model(image)
        loss = self.loss_func(pred, label_map)
        loss.backward()
        self.optimizer.step()

        # measure accuracy and record loss
        losses.update(loss.item(), image.size(0))
        pred_pts = utils.getPointByMap(pred)
        with torch.no_grad():
            rmse = 0.0
            for b in range(0, label_pts.size(0)):
                x_mse = 0.0
                y_mse = 0.0
                for p in range(0, label_pts.size(1)):
                    x_mse += (label_pts[b, p, 0] - pred_pts[b, p, 0]).pow(2)
                    y_mse += (label_pts[b, p, 1] - pred_pts[b, p, 1]).pow(2)
                b_rmse = torch.sqrt((x_mse + y_mse) / (2 * label_pts.size(1)))
                # print("RMSE of the item:", b_rmse)
                # print("pred :", pred_pts[b:, :, :])
                # print("label:", label_pts[b:, :, :])
                rmse += b_rmse
            rmse = rmse / label_pts.size(0)
            perf.update(rmse, image.size(0))

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

    info = {
        'Epoch': [self.epoch],
        'Batch Time': [round(batch_time.avg, 3)],
        'Data Time': [round(data_time.avg, 3)],
        'Perf': [round(perf.average(), 5)],
        'Loss': [round(losses.average(), 5)],
        'lr': self.optimizer.param_groups[0]['lr']
    }
    utils.record_info(info, 'record/train.csv', 'train')