class TensorBoard(Callback):
    # TODO: add option to write images; find fix for graph
    def __init__(self, log_dir, update_frequency=10):
        super(TensorBoard, self).__init__()
        self.log_dir = log_dir
        self.writer = None
        self.update_frequency = update_frequency

    def on_train_begin(self, **_):
        self.writer = SummaryWriter(
            os.path.join(self.log_dir, str(datetime.datetime.now())))
        rndm_input = torch.autograd.Variable(
            torch.rand(1, *self.model.input_shape),
            requires_grad=True).to(self.logger['device'])
        # fwd_pass = self.model(rndm_input)
        self.writer.add_graph(self.model, rndm_input)
        return self

    def on_epoch_end(self, **_):
        if (self.logger['epoch'] % self.update_frequency) == 0:
            epoch_metrics = self.logger['epoch_metrics'][self.logger['epoch']]
            for e_metric, e_metric_dct in epoch_metrics.items():
                for e_metric_split, e_metric_val in e_metric_dct.items():
                    self.writer.add_scalar(
                        '{}/{}'.format(e_metric_split, e_metric),
                        e_metric_val, self.logger['epoch'])
            for name, param in self.model.named_parameters():
                self.writer.add_histogram(
                    name.replace('.', '/'),
                    param.clone().cpu().data.numpy(),
                    self.logger['epoch'])
        return self

    def on_train_end(self, **_):
        return self.writer.close()
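# --- Illustrative sketch, not part of the callback above ---
# The callback assumes a trainer that sets `self.model` and `self.logger`
# before the hooks fire. Below is a minimal, self-contained example of the
# same add_graph pattern it relies on; `TinyNet` is a hypothetical stand-in
# model and the run directory name is arbitrary. Assumes tensorboardX (or,
# equivalently, torch.utils.tensorboard) is installed.
import torch
import torch.nn as nn
from tensorboardX import SummaryWriter

class TinyNet(nn.Module):
    def __init__(self):
        super(TinyNet, self).__init__()
        self.fc = nn.Linear(8, 2)

    def forward(self, x):
        return self.fc(x)

demo_writer = SummaryWriter('runs/graph_demo')
demo_net = TinyNet()
demo_writer.add_graph(demo_net, torch.rand(1, 8))  # a dummy input is traced through the model
demo_writer.close()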
np.random.shuffle(lines)
np.random.seed(None)
num_val = int(len(lines) * val_split)
num_train = len(lines) - num_val
print('Train dataset: {0}, Val dataset: {1}'.format(num_train, num_val))

writer = SummaryWriter(log_dir='logs', flush_secs=60)
if Cuda:
    graph_inputs = torch.from_numpy(
        np.random.rand(1, 3, input_shape[0], input_shape[1])).type(
            torch.FloatTensor).cuda()
else:
    graph_inputs = torch.from_numpy(
        np.random.rand(1, 3, input_shape[0], input_shape[1])).type(
            torch.FloatTensor)
writer.add_graph(model, (graph_inputs, ))

if True:
    lr = 1e-3
    Batch_size = 4
    Init_Epoch = 0
    Freeze_Epoch = 50

    optimizer = optim.Adam(net.parameters(), lr, weight_decay=5e-4)
    if Cosine_lr:
        lr_scheduler = optim.lr_scheduler.CosineAnnealingLR(
            optimizer, T_max=5, eta_min=1e-5)
    else:
        lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=1,
      % (args.session, epoch, step, iters_per_epoch, loss_temp, lr))
print("\t\t\tfg/bg=(%d/%d), time cost: %f" % (fg_cnt, bg_cnt, end - start))
print("\t\t\trpn_cls: %.4f, rpn_box: %.4f, rcnn_cls: %.4f, rcnn_box %.4f"
      % (loss_rpn_cls, loss_rpn_box, loss_rcnn_cls, loss_rcnn_box))

if args.use_tfboard:
    info = {
        'loss': loss_temp,
        'loss_rpn_cls': loss_rpn_cls,
        'loss_rpn_box': loss_rpn_box,
        'loss_rcnn_cls': loss_rcnn_cls,
        'loss_rcnn_box': loss_rcnn_box
    }
    logger.add_scalars("logs_s_{}/losses".format(args.session), info,
                       (epoch - 1) * iters_per_epoch + step)
if args.use_tfboard and step == 1:
    logger.add_graph(fasterRCNN, (im_data, im_info, gt_boxes, num_boxes))

loss_temp = 0
start = time.time()

save_name = os.path.join(
    output_dir,
    'faster_rcnn_{}_{}_{}.pth'.format(args.session, epoch, step))
save_checkpoint({
    'session': args.session,
    'epoch': epoch + 1,
    'model': fasterRCNN.module.state_dict() if args.mGPUs else fasterRCNN.state_dict(),
    'optimizer': optimizer.state_dict(),
    'pooling_mode': cfg.POOLING_MODE,
    'class_agnostic': args.class_agnostic,
}, save_name)
print('save model: {}'.format(save_name))
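# --- Illustrative sketch, not part of the training script above ---
# add_scalars, as used above, writes several related curves under one parent
# tag so they share a plot in TensorBoard. A minimal, self-contained example
# with hypothetical values (assumes tensorboardX or torch.utils.tensorboard):
from tensorboardX import SummaryWriter

demo_logger = SummaryWriter('runs/scalars_demo')
for demo_step in range(10):
    demo_logger.add_scalars("demo/losses", {
        'loss_rpn_cls': 1.0 / (demo_step + 1),
        'loss_rpn_box': 0.5 / (demo_step + 1),
    }, demo_step)  # all curves land in the same TensorBoard chart
demo_logger.close()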
if __name__ == '__main__':
    resnet50 = ResNet50().to(device)
    optimizer = torch.optim.SGD(resnet50.parameters(),
                                lr=lr,
                                momentum=0.9,
                                weight_decay=0.0005,
                                nesterov=True)
    scheduler = StepLR(optimizer, step_size=step_size, gamma=0.5)
    loss_func = torch.nn.CrossEntropyLoss()
    summary_writer = SummaryWriter()

    dump_input = torch.rand(1, 3, 224, 224).to(device)
    summary_writer.add_graph(resnet50, (dump_input, ), verbose=False)

    for epoch in range(num_epoches):
        resnet50.train()
        running_loss = 0.0
        running_acc = 0.0
        for step, (batch_x, batch_y) in enumerate(train_loader):
            # each step, the loader yields one mini-batch for training
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)
            out = resnet50(batch_x)
            loss = loss_func(out, batch_y)
            running_loss += loss.data.item() * batch_y.size(0)
            _, pred = torch.max(out, 1)
class LossHistory():
    def __init__(self, model, patience=5):
        import datetime
        curr_time = datetime.datetime.now()
        time_str = datetime.datetime.strftime(curr_time, '%Y_%m_%d_%H_%M_%S')
        self.log_dir = "logs/SegNet/"
        self.time_str = time_str
        self.save_path = os.path.join(self.log_dir, "loss_" + str(self.time_str))
        self.losses = []
        self.val_loss = []
        self.writer = SummaryWriter(
            log_dir=os.path.join(self.log_dir, "run_" + str(self.time_str)))
        self.freeze = False
        # write model summary
        x = threading.Thread(target=self.write_summary,
                             args=(deepcopy(model.module).cpu(), ))
        x.start()
        # launch tensorboard
        t = threading.Thread(target=self.launchTensorBoard,
                             args=(self.log_dir, ))
        t.start()
        # initialize EarlyStopping
        self.patience = patience
        self.reset_stop()
        os.makedirs(self.save_path)

    def write_summary(self, cpu_model):
        print("write model summary ready")
        rndm_input = torch.autograd.Variable(torch.rand(1, 3, 512, 512),
                                             requires_grad=False).cpu()
        self.writer.add_graph(cpu_model, rndm_input)
        print("tensorboard model summary finished")
        f = io.StringIO()
        with redirect_stdout(f):
            summary(cpu_model, (3, 512, 512), device="cpu")
        lines = f.getvalue()
        with open(os.path.join(self.log_dir, "summary.txt"), "w") as f:
            f.write(lines)
        print("write model summary finished")
        return

    def launchTensorBoard(self, tensorBoardPath, port=8888):
        os.system('tensorboard --logdir=%s --port=%s' % (tensorBoardPath, port))
        url = "http://localhost:%s/" % (port)
        # webbrowser.open_new(url)
        return

    def reset_stop(self):
        self.best_epoch_loss = np.inf
        self.stopping = False
        self.counter = 0

    def set_status(self, freeze):
        self.freeze = freeze

    def epoch_loss(self, loss, val_loss, epoch):
        self.losses.append(loss)
        self.val_loss.append(val_loss)
        with open(
                os.path.join(self.save_path,
                             "epoch_loss_" + str(self.time_str) + ".txt"),
                'a') as f:
            f.write(str(loss))
            f.write("\n")
        with open(
                os.path.join(self.save_path,
                             "epoch_val_loss_" + str(self.time_str) + ".txt"),
                'a') as f:
            f.write(str(val_loss))
            f.write("\n")
        self.loss_plot()
        prefix = "Freeze_epoch/" if self.freeze else "UnFreeze_epoch/"
        self.writer.add_scalar(prefix + 'Loss/Train', loss, epoch)
        self.writer.add_scalar(prefix + 'Loss/Val', val_loss, epoch)
        self.decide(val_loss)

    def epoch_loss_no_val(self, loss, epoch):
        self.losses.append(loss)
        with open(
                os.path.join(self.save_path,
                             "epoch_loss_" + str(self.time_str) + ".txt"),
                'a') as f:
            f.write(str(loss))
            f.write("\n")
        self.loss_plot()
        prefix = "Freeze_epoch/" if self.freeze else "UnFreeze_epoch/"
        self.writer.add_scalar(prefix + 'Loss/Train', loss, epoch)

    def step(self, steploss, stepfscore, iteration):
        prefix = "Freeze_step/" if self.freeze else "UnFreeze_step/"
        self.writer.add_scalar(prefix + 'Train/Loss', steploss, iteration)
        self.writer.add_scalar(prefix + 'Train/F_Score', stepfscore, iteration)

    def decide(self, epoch_loss):
        if epoch_loss > self.best_epoch_loss:
            self.counter += 1
            print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                print(f'Best lower loss: {self.best_epoch_loss}')
                self.stopping = True
        else:
            self.best_epoch_loss = epoch_loss
            self.counter = 0
            self.stopping = False

    def loss_plot(self):
        iters = range(len(self.losses))

        plt.figure()
        plt.plot(iters, self.losses, 'red', linewidth=2, label='train loss')
        plt.plot(iters, self.val_loss, 'coral', linewidth=2, label='val loss')
        try:
            if len(self.losses) < 25:
                num = 5
            else:
                num = 15

            plt.plot(iters,
                     scipy.signal.savgol_filter(self.losses, num, 3),
                     'green',
                     linestyle='--',
                     linewidth=2,
                     label='smooth train loss')
            plt.plot(iters,
                     scipy.signal.savgol_filter(self.val_loss, num, 3),
                     '#8B4513',
                     linestyle='--',
                     linewidth=2,
                     label='smooth val loss')
        except Exception:
            pass

        plt.grid(True)
        plt.xlabel('Epoch')
        plt.ylabel('Loss')
        plt.legend(loc="upper right")
        plt.savefig(
            os.path.join(self.save_path,
                         "epoch_loss_" + str(self.time_str) + ".png"))
        plt.cla()
        plt.close("all")
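# --- Illustrative sketch, not part of LossHistory ---
# The decide() method above implements patience-based early stopping: a
# counter grows while the monitored loss fails to improve and `stopping`
# trips once it reaches `patience`. The same bookkeeping, stripped down and
# self-contained (all names and the loss values are hypothetical):
best, counter, patience, stop = float('inf'), 0, 5, False
for val_loss in [0.9, 0.8, 0.85, 0.86, 0.87, 0.88, 0.89, 0.9]:
    if val_loss > best:
        counter += 1          # no improvement this epoch
        if counter >= patience:
            stop = True       # trip early stopping
            break
    else:
        best, counter = val_loss, 0  # improvement: reset the counter
print(stop)  # True: the loss stopped improving for `patience` epochs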
def train(args, data_root, save_root):
    weight_dir = "{}weights/".format(save_root)
    log_dir = "{}logs/MobileNetV2Vortex-{}".format(
        save_root, time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime()))

    # +++++++++++++++++++++++++++++++++++++++++++++++++++ #
    # 1. Setup Augmentations
    # +++++++++++++++++++++++++++++++++++++++++++++++++++ #
    net_h, net_w = int(args.img_rows * args.crop_ratio), int(args.img_cols * args.crop_ratio)
    augment_train = Compose([
        RandomHorizontallyFlip(),
        RandomSized((0.5, 0.75)),
        RandomRotate(5),
        RandomCrop((net_h, net_w))
    ])
    augment_valid = Compose([
        RandomHorizontallyFlip(),
        Scale((args.img_rows, args.img_cols)),
        CenterCrop((net_h, net_w))
    ])

    print("> # +++++++++++++++++++++++++++++++++++++++++++++++++++++++ #")
    print("> 0. Setting up DataLoader...")
    print("> # +++++++++++++++++++++++++++++++++++++++++++++++++++++++ #")
    train_loader = CityscapesLoader(data_root,
                                    gt="gtFine",
                                    is_transform=True,
                                    split='train',
                                    img_size=(args.img_rows, args.img_cols),
                                    augmentations=augment_train)
    valid_loader = CityscapesLoader(data_root,
                                    gt="gtFine",
                                    is_transform=True,
                                    split='val',
                                    img_size=(args.img_rows, args.img_cols),
                                    augmentations=augment_valid)
    n_classes = train_loader.n_classes

    # +++++++++++++++++++++++++++++++++++++++++++++++++++ #
    # 2. Setup Metrics
    # +++++++++++++++++++++++++++++++++++++++++++++++++++ #
    running_metrics = RunningScore(n_classes)

    # +++++++++++++++++++++++++++++++++++++++++++++++++++ #
    # 4. Setup Model
    # +++++++++++++++++++++++++++++++++++++++++++++++++++ #
    print("> # +++++++++++++++++++++++++++++++++++++++++++++++++++++++ #")
    print("> 1. Setting up Model...")
    model = MobileNetV2Vortex(n_class=19,
                              in_size=(net_h, net_w),
                              width_mult=1.,
                              out_sec=256,
                              rate_sec=(3, 9, 27),
                              norm_act=partial(InPlaceABNWrapper,
                                               activation="leaky_relu",
                                               slope=0.1))
    """
    model = MobileNetV2Plus(n_class=n_classes, in_size=(net_h, net_w), width_mult=1.0,
                            out_sec=256, aspp_sec=(12, 24, 36),
                            norm_act=partial(InPlaceABNWrapper, activation="leaky_relu", slope=0.1))
    """

    # np.arange(torch.cuda.device_count())
    model = torch.nn.DataParallel(model, device_ids=[0, 1]).cuda()

    # 4.1 Setup Optimizer
    # +++++++++++++++++++++++++++++++++++++++++++++++++++ #
    # Check if model has custom optimizer / loss
    if hasattr(model.module, 'optimizer'):
        optimizer = model.module.optimizer
    else:
        optimizer = torch.optim.SGD(model.parameters(),
                                    lr=args.l_rate,
                                    momentum=0.90,
                                    weight_decay=5e-4,
                                    nesterov=True)
    # for pg in optimizer.param_groups:
    #     print(pg['lr'])
    # optimizer = torch.optim.Adam(model.parameters(), lr=0.001, betas=(0.9, 0.999),
    #                              eps=1e-08, weight_decay=0, amsgrad=True)
    # optimizer = YFOptimizer(model.parameters(), lr=2.5e-3, mu=0.9, clip_thresh=10000, weight_decay=5e-4)

    # 4.2 Setup Loss
    # +++++++++++++++++++++++++++++++++++++++++++++++++++ #
    class_weight = None
    if hasattr(model.module, 'loss'):
        print('> Using custom loss')
        loss_fn = model.module.loss
    else:
        # loss_fn = cross_entropy2d
        class_weight = np.array([
            0.05570516, 0.32337477, 0.08998544, 1.03602707, 1.03413147,
            1.68195437, 5.58540548, 3.56563995, 0.12704978, 1., 0.46783719,
            1.34551528, 5.29974114, 0.28342531, 0.9396095, 0.81551811,
            0.42679146, 3.6399074, 2.78376194
        ], dtype=float)
        """
        class_weight = np.array([3.045384, 12.862123, 4.509889, 38.15694, 35.25279,
                                 31.482613, 45.792305, 39.694073, 6.0639296, 32.16484,
                                 17.109228, 31.563286, 47.333973, 11.610675, 44.60042,
                                 45.23716, 45.283024, 48.14782, 41.924667], dtype=float) / 10.0
        """
        class_weight = torch.from_numpy(class_weight).float().cuda()
        loss_fn = bootstrapped_cross_entropy2d
        # loss_fn = cross_entropy2d

    # +++++++++++++++++++++++++++++++++++++++++++++++++++ #
    # 5. Resume Model
    # +++++++++++++++++++++++++++++++++++++++++++++++++++ #
    best_iou = -100.0
    args.start_epoch = 0
    if args.resume is not None:
        full_path = "{}{}".format(weight_dir, args.resume)
        if os.path.isfile(full_path):
            print("> Loading model and optimizer from checkpoint '{}'".format(args.resume))

            checkpoint = torch.load(full_path)
            args.start_epoch = checkpoint['epoch']
            best_iou = checkpoint['best_iou']
            model.load_state_dict(checkpoint['model_state'])          # weights
            optimizer.load_state_dict(checkpoint['optimizer_state'])  # gradient state
            # for param_group in optimizer.param_groups:
            #     param_group['lr'] = 1e-5
            del checkpoint
            print("> Loaded checkpoint '{}' (epoch {}, iou {})".format(
                args.resume, args.start_epoch, best_iou))
        else:
            print("> No checkpoint found at '{}'".format(args.resume))
    else:
        if args.pre_trained is not None:
            print("> Loading weights from pre-trained model '{}'".format(args.pre_trained))
            full_path = "{}{}".format(weight_dir, args.pre_trained)

            pre_weight = torch.load(full_path)
            pre_weight = pre_weight["model_state"]
            # pre_weight = pre_weight["state_dict"]

            model_dict = model.state_dict()
            pretrained_dict = {k: v for k, v in pre_weight.items() if k in model_dict}
            model_dict.update(pretrained_dict)
            model.load_state_dict(model_dict)

            del pre_weight
            del model_dict
            del pretrained_dict

    # +++++++++++++++++++++++++++++++++++++++++++++++++++ #
    # 3. Setup tensor_board for visualization
    # +++++++++++++++++++++++++++++++++++++++++++++++++++ #
    writer = None
    if args.tensor_board:
        writer = SummaryWriter(log_dir=log_dir, comment="MobileNetV2Vortex")

        dummy_input = Variable(torch.rand(1, 3, net_h, net_w).cuda(),
                               requires_grad=True)
        writer.add_graph(model, dummy_input)

    # +++++++++++++++++++++++++++++++++++++++++++++++++++ #
    # 6. Train Model
    # +++++++++++++++++++++++++++++++++++++++++++++++++++ #
    print("> 2. Model Training start...")
    train_loader = data.DataLoader(train_loader,
                                   batch_size=args.batch_size,
                                   num_workers=6,
                                   shuffle=True)
    valid_loader = data.DataLoader(valid_loader,
                                   batch_size=args.batch_size,
                                   num_workers=6)

    num_batches = int(
        math.ceil(
            len(train_loader.dataset.files[train_loader.dataset.split]) /
            float(train_loader.batch_size)))
    lr_period = 20 * num_batches
    swa_weights = model.state_dict()

    # scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.90)
    # scheduler = CyclicLR(optimizer, base_lr=1.0e-3, max_lr=6.0e-3, step_size=2*num_batches)
    # scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=32, gamma=0.1)
    # scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[30, 80], gamma=0.1)

    topk_init = 512
    # topk_multipliers = [64, 128, 256, 512]
    for epoch in np.arange(args.start_epoch, args.n_epoch):
        # +++++++++++++++++++++++++++++++++++++++++++++++++++ #
        # 7.1 Mini-Batch Learning
        # +++++++++++++++++++++++++++++++++++++++++++++++++++ #
        # print("> Training Epoch [%d/%d]:" % (epoch + 1, args.n_epoch))
        model.train()

        last_loss = 0.0
        topk_base = topk_init
        pbar = tqdm(np.arange(num_batches))
        for train_i, (images, labels) in enumerate(train_loader):  # one mini-batch, one iteration
            full_iter = (epoch * num_batches) + train_i + 1
            # poly_lr_scheduler(optimizer, init_lr=args.l_rate, iter=full_iter,
            #                   lr_decay_iter=1, max_iter=args.n_epoch*num_batches, power=0.9)
            batch_lr = args.l_rate * cosine_annealing_lr(lr_period, full_iter)
            optimizer = set_optimizer_lr(optimizer, batch_lr)
            topk_base = poly_topk_scheduler(init_topk=topk_init,
                                            iter=full_iter,
                                            topk_decay_iter=1,
                                            max_iter=args.n_epoch * num_batches,
                                            power=0.95)

            images = Variable(images.cuda(), requires_grad=True)  # images fed into the network
            labels = Variable(labels.cuda(), requires_grad=False)

            optimizer.zero_grad()
            net_out = model(images)  # here we have 3 outputs for 3 losses

            topk = topk_base * 512
            if random.random() < 0.20:
                train_loss = loss_fn(input=net_out, target=labels, K=topk, weight=class_weight)
            else:
                train_loss = loss_fn(input=net_out, target=labels, K=topk, weight=None)

            last_loss = train_loss.data[0]
            pbar.update(1)
            pbar.set_description("> Epoch [%d/%d]" % (epoch + 1, args.n_epoch))
            pbar.set_postfix(Loss=last_loss, TopK=topk_base, LR=batch_lr)

            train_loss.backward()
            optimizer.step()

            if full_iter % lr_period == 0:
                swa_weights = update_aggregated_weight_average(
                    model, swa_weights, full_iter, lr_period)
                state = {'model_state': swa_weights}
                torch.save(state, "{}{}_mobilenetv2vortex_swa_model.pkl".format(
                    weight_dir, args.dataset))

            if (train_i + 1) % 31 == 0:
                loss_log = "Epoch [%d/%d], Iter: %d Loss: \t %.4f" % (
                    epoch + 1, args.n_epoch, train_i + 1, last_loss)

                net_out = F.softmax(net_out, dim=1)
                pred = net_out.data.max(1)[1].cpu().numpy()
                gt = labels.data.cpu().numpy()
                running_metrics.update(gt, pred)
                score, class_iou = running_metrics.get_scores()

                metric_log = ""
                for k, v in score.items():
                    metric_log += " {}: \t %.4f, ".format(k) % v
                running_metrics.reset()

                logs = loss_log + metric_log
                # print(logs)

                if args.tensor_board:
                    writer.add_scalar('Training/Losses', last_loss, full_iter)
                    writer.add_scalars('Training/Metrics', score, full_iter)
                    writer.add_text('Training/Text', logs, full_iter)

                    for name, param in model.named_parameters():
                        writer.add_histogram(name, param.clone().cpu().data.numpy(), full_iter)

        # +++++++++++++++++++++++++++++++++++++++++++++++++++ #
        # 7.2 Mini-Batch Validation
        # +++++++++++++++++++++++++++++++++++++++++++++++++++ #
        # print("> Validation for Epoch [%d/%d]:" % (epoch + 1, args.n_epoch))
        model.eval()

        mval_loss = 0.0
        vali_count = 0
        for i_val, (images_val, labels_val) in enumerate(valid_loader):
            vali_count += 1
            images_val = Variable(images_val.cuda(), volatile=True)
            labels_val = Variable(labels_val.cuda(), volatile=True)

            net_out = model(images_val)  # here we have 4 outputs for 4 losses

            topk = topk_base * 512
            val_loss = loss_fn(input=net_out, target=labels_val, K=topk, weight=None)
            mval_loss += val_loss.data[0]

            net_out = F.softmax(net_out, dim=1)
            pred = net_out.data.max(1)[1].cpu().numpy()
            gt = labels_val.data.cpu().numpy()
            running_metrics.update(gt, pred)

        mval_loss /= vali_count
        loss_log = "Epoch [%d/%d] Loss: \t %.4f" % (epoch + 1, args.n_epoch, mval_loss)
        metric_log = ""
        score, class_iou = running_metrics.get_scores()
        for k, v in score.items():
            metric_log += " {} \t %.4f, ".format(k) % v
        running_metrics.reset()

        logs = loss_log + metric_log
        # print(logs)
        pbar.set_postfix(Train_Loss=last_loss,
                         Vali_Loss=mval_loss,
                         Vali_mIoU=score['Mean_IoU'])

        if args.tensor_board:
            writer.add_scalar('Validation/Losses', mval_loss, epoch)
            writer.add_scalars('Validation/Metrics', score, epoch)
            writer.add_text('Validation/Text', logs, epoch)

            for name, param in model.named_parameters():
                writer.add_histogram(name, param.clone().cpu().data.numpy(), epoch)

            # export scalar data to JSON for external processing
            # writer.export_scalars_to_json("{}/all_scalars.json".format(log_dir))

        if score['Mean_IoU'] >= best_iou:
            best_iou = score['Mean_IoU']
            state = {
                'epoch': epoch + 1,
                "best_iou": best_iou,
                'model_state': model.state_dict(),
                'optimizer_state': optimizer.state_dict()
            }
            torch.save(state, "{}{}_mobilenetv2vortex_best_model.pkl".format(
                weight_dir, args.dataset))

        # scheduler.step()
        # scheduler.batch_step()
        pbar.close()

    if args.tensor_board:
        # export scalar data to JSON for external processing
        # writer.export_scalars_to_json("{}/all_scalars.json".format(log_dir))
        writer.close()

    print("> # +++++++++++++++++++++++++++++++++++++++++++++++++++++++ #")
    print("> Training Done!!!")
    print("> # +++++++++++++++++++++++++++++++++++++++++++++++++++++++ #")
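# --- Illustrative sketch, not part of the script above ---
# The training loop recomputes the learning rate every iteration with
# `cosine_annealing_lr` and pushes it into the optimizer via
# `set_optimizer_lr`. Below is a self-contained version of that manual
# per-iteration update pattern; the cosine shape is an assumption matching
# common SGDR-style restarts, since the original helper is not shown.
import math
import torch

def demo_cosine_lr(period, it):
    # cosine factor in [0, 1], restarting every `period` iterations
    return 0.5 * (1.0 + math.cos(math.pi * (it % period) / period))

demo_params = [torch.nn.Parameter(torch.zeros(1))]
demo_opt = torch.optim.SGD(demo_params, lr=0.1)
for it in range(5):
    for group in demo_opt.param_groups:
        group['lr'] = 0.1 * demo_cosine_lr(100, it)  # base_lr * cosine factor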
class BaseTrainer(object):
    """Operations of training a model, including data loading, gradient descent, and validation.
    """

    def __init__(self, **kwargs):
        """
        :param kwargs: dict of (key, value), or dict-like object. key is str.

        The base trainer requires the following keys:
            - epochs: int, the number of epochs in training
            - validate: bool, whether or not to validate on the dev set
            - batch_size: int
            - pickle_path: str, the path to pickle files for pre-processing
        """
        super(BaseTrainer, self).__init__()

        """
        "default_args" provides default values for important settings.
        The initialization arguments "kwargs" with the same key (name) will override the default value.
        "kwargs" must have the same type as "default_args" on corresponding keys.
        Otherwise, an error will be raised.
        """
        default_args = {
            "epochs": 3,
            "batch_size": 8,
            "validate": True,
            "use_cuda": True,
            "pickle_path": "./save/",
            "save_best_dev": True,
            "model_name": "default_model_name.pkl",
            "print_every_step": 1,
            "loss": Loss(None),  # used to pass type check
            "optimizer": Optimizer("Adam", lr=0.001, weight_decay=0)
        }
        """
        "required_args" is the collection of arguments that users must pass to Trainer explicitly.
        This is used to warn users of essential settings in the training.
        Obviously, "required_args" is a subset of "default_args".
        The value in "default_args" for the keys in "required_args" is simply for type check.
        """
        # add required arguments here
        required_args = {}

        for req_key in required_args:
            if req_key not in kwargs:
                logger.error("Trainer lacks argument {}".format(req_key))
                raise ValueError("Trainer lacks argument {}".format(req_key))

        for key in default_args:
            if key in kwargs:
                if isinstance(kwargs[key], type(default_args[key])):
                    default_args[key] = kwargs[key]
                else:
                    msg = "Argument %s type mismatch: expected %s while got %s" % (
                        key, type(default_args[key]), type(kwargs[key]))
                    logger.error(msg)
                    raise ValueError(msg)
            else:
                # BaseTrainer doesn't care about extra arguments
                pass
        print(default_args)

        self.n_epochs = default_args["epochs"]
        self.batch_size = default_args["batch_size"]
        self.pickle_path = default_args["pickle_path"]
        self.validate = default_args["validate"]
        self.save_best_dev = default_args["save_best_dev"]
        self.use_cuda = default_args["use_cuda"]
        self.model_name = default_args["model_name"]
        self.print_every_step = default_args["print_every_step"]

        self._model = None
        self._loss_func = default_args["loss"].get()  # return a pytorch loss function or None
        self._optimizer = None
        self._optimizer_proto = default_args["optimizer"]
        self._summary_writer = SummaryWriter(self.pickle_path + 'tensorboard_logs')
        self._graph_summaried = False
        self.best_accuracy = 0.0  # best dev accuracy so far, used by best_eval_result()

    def train(self, network, train_data, dev_data=None):
        """General Training Procedure

        :param network: a model
        :param train_data: three-level list, the training set.
        :param dev_data: three-level list, the validation data (optional)
        """
        # transfer model to gpu if available
        if torch.cuda.is_available() and self.use_cuda:
            self._model = network.cuda()
            # self._model is used to access model-specific loss
        else:
            self._model = network

        # define Tester over dev data
        if self.validate:
            default_valid_args = {
                "save_output": True,
                "validate_in_training": True,
                "save_dev_input": True,
                "save_loss": True,
                "batch_size": self.batch_size,
                "pickle_path": self.pickle_path,
                "use_cuda": self.use_cuda,
                "print_every_step": 0
            }
            validator = self._create_validator(default_valid_args)
            logger.info("validator defined as {}".format(str(validator)))

        # optimizer and loss
        self.define_optimizer()
        logger.info("optimizer defined as {}".format(str(self._optimizer)))
        self.define_loss()
        logger.info("loss function defined as {}".format(str(self._loss_func)))

        # main training procedure
        start = time.time()
        logger.info("training epochs started")
        for epoch in range(1, self.n_epochs + 1):
            logger.info("training epoch {}".format(epoch))

            # turn on network training mode
            self.mode(network, test=False)
            # prepare mini-batch iterator
            data_iterator = iter(
                Batchifier(RandomSampler(train_data), self.batch_size, drop_last=False))
            logger.info("prepared data iterator")

            # one forward and backward pass
            self._train_step(data_iterator, network, start=start,
                             n_print=self.print_every_step, epoch=epoch)

            # validation
            if self.validate:
                logger.info("validation started")
                validator.test(network, dev_data)

                if self.save_best_dev and self.best_eval_result(validator):
                    self.save_model(network, self.model_name)
                    print("Saved better model selected by validation.")
                    logger.info("Saved better model selected by validation.")

                valid_results = validator.show_metrics()
                print("[epoch {}] {}".format(epoch, valid_results))
                logger.info("[epoch {}] {}".format(epoch, valid_results))

    def _train_step(self, data_iterator, network, **kwargs):
        """Training process in one epoch.

        kwargs should contain:
            - n_print: int, print training information every n steps.
            - start: time.time(), the starting time of this step.
            - epoch: int,
        """
        step = 0
        for batch_x, batch_y in self.make_batch(data_iterator):
            prediction = self.data_forward(network, batch_x)

            loss = self.get_loss(prediction, batch_y)
            self.grad_backward(loss)
            self.update()
            self._summary_writer.add_scalar("loss", loss.item(), global_step=step)

            if not self._graph_summaried:
                self._summary_writer.add_graph(network, batch_x)
                self._graph_summaried = True

            if kwargs["n_print"] > 0 and step % kwargs["n_print"] == 0:
                end = time.time()
                diff = timedelta(seconds=round(end - kwargs["start"]))
                print_output = "[epoch: {:>3} step: {:>4}] train loss: {:>4.2} time: {}".format(
                    kwargs["epoch"], step, loss.item(), diff)
                print(print_output)
                logger.info(print_output)
            step += 1

    def cross_validate(self, network, train_data_cv, dev_data_cv):
        """Training with cross validation.

        :param network: the model
        :param train_data_cv: four-level list, of shape [num_folds, num_examples, 2, ?]
        :param dev_data_cv: four-level list, of shape [num_folds, num_examples, 2, ?]
        """
        if len(train_data_cv) != len(dev_data_cv):
            logger.error(
                "the number of folds in train and dev data differs: {} != {}".format(
                    len(train_data_cv), len(dev_data_cv)))
            raise RuntimeError("the number of folds in train and dev data differs")
        if self.validate is False:
            logger.warn(
                "Cross validation requires self.validate to be True. Please turn it on.")
            print(
                "[warning] Cross validation requires self.validate to be True. Please turn it on.")
            self.validate = True

        n_fold = len(train_data_cv)
        logger.info("perform {} folds cross validation.".format(n_fold))
        for i in range(n_fold):
            print("CV:", i)
            logger.info("running the {} of {} folds cross validation".format(
                i + 1, n_fold))
            network_copy = copy.deepcopy(network)
            self.train(network_copy, train_data_cv[i], dev_data_cv[i])

    def make_batch(self, iterator):
        raise NotImplementedError

    def mode(self, network, test):
        Action.mode(network, test)

    def define_optimizer(self):
        """Define framework-specific optimizer specified by the models."""
        self._optimizer = self._optimizer_proto.construct_from_pytorch(
            self._model.parameters())

    def update(self):
        """Perform weight update on a model.

        For PyTorch, just call optimizer to update.
        """
        self._optimizer.step()

    def data_forward(self, network, x):
        raise NotImplementedError

    def grad_backward(self, loss):
        """Compute gradient with link rules.

        :param loss: a scalar where back-prop starts

        For PyTorch, just do "loss.backward()"
        """
        self._model.zero_grad()
        loss.backward()

    def get_loss(self, predict, truth):
        """Compute loss given prediction and ground truth.

        :param predict: prediction label vector
        :param truth: ground truth label vector
        :return: a scalar
        """
        return self._loss_func(predict, truth)

    def define_loss(self):
        """Define a loss for the trainer.

        If the model defines a loss, use the model's loss.
        Otherwise, the Trainer must have a loss argument; use it as the loss.
        These two losses cannot be defined at the same time.
        The Trainer does not handle loss definition or choose default losses.
        """
        if hasattr(self._model, "loss") and self._loss_func is not None:
            raise ValueError(
                "Both the model and Trainer define loss. Please take out your loss.")

        if hasattr(self._model, "loss"):
            self._loss_func = self._model.loss
            logger.info("The model has a loss function, use it.")
        else:
            if self._loss_func is None:
                raise ValueError("Please specify a loss function.")
            logger.info("The model didn't define loss, use Trainer's loss.")

    def best_eval_result(self, validator):
        """Check if the current epoch yields better validation results.

        :param validator: a Tester instance
        :return: bool, True means the current results on the dev set are the best.
        """
        accuracy = validator.metrics()
        if accuracy > self.best_accuracy:
            self.best_accuracy = accuracy
            return True
        else:
            return False

    def save_model(self, network, model_name):
        """Save this model with such a name.
        This method may be called multiple times by Trainer to overwrite a better model.

        :param network: the PyTorch model
        :param model_name: str
        """
        if model_name[-4:] != ".pkl":
            model_name += ".pkl"
        ModelSaver(os.path.join(self.pickle_path, model_name)).save_pytorch(network)

    def _create_validator(self, valid_args):
        raise NotImplementedError
def train(opt):
    if torch.cuda.is_available():
        torch.cuda.manual_seed(123)
    else:
        torch.manual_seed(123)
    learning_rate_schedule = {"0": 1e-5, "5": 1e-4, "80": 1e-5, "110": 1e-6}
    training_params = {
        "batch_size": opt.batch_size,
        "shuffle": True,
        "drop_last": True,
        "collate_fn": custom_collate_fn
    }
    test_params = {
        "batch_size": opt.batch_size,
        "shuffle": False,
        "drop_last": False,
        "collate_fn": custom_collate_fn
    }

    training_set = []
    training_generator = []
    training_set.append(COCODataset(opt.data_path, "2014", "train", opt.image_size))
    training_set.append(COCODataset(opt.data_path, "2014", "val", opt.image_size))
    training_set.append(COCODataset(opt.data_path, "2017", "train", opt.image_size))
    training_generator.append(DataLoader(training_set[0], **training_params))
    training_generator.append(DataLoader(training_set[1], **training_params))
    training_generator.append(DataLoader(training_set[2], **training_params))

    test_set = COCODataset(opt.data_path, "2017", "val", opt.image_size,
                           is_training=False)
    test_generator = DataLoader(test_set, **test_params)

    if torch.cuda.is_available():
        if opt.pre_trained_model_type == "model":
            model = torch.load(opt.pre_trained_model_path)
        else:
            model = Yolo(training_set[0].num_classes)
            model.load_state_dict(torch.load(opt.pre_trained_model_path))
    else:
        if opt.pre_trained_model_type == "model":
            model = torch.load(opt.pre_trained_model_path,
                               map_location=lambda storage, loc: storage)
        else:
            model = Yolo(training_set[0].num_classes)
            model.load_state_dict(
                torch.load(opt.pre_trained_model_path,
                           map_location=lambda storage, loc: storage))
    # The following line re-initializes the weights of the last layer, which is useful
    # when you want to retrain the model based on my trained weights. With it enabled,
    # you will see the loss is already very small at the beginning.
    nn.init.normal_(list(model.modules())[-1].weight, 0, 0.01)

    log_path = os.path.join(opt.log_path, "{}".format("2014and2017"))
    if os.path.isdir(log_path):
        shutil.rmtree(log_path)
    os.makedirs(log_path)
    writer = SummaryWriter(log_path)
    if torch.cuda.is_available():
        writer.add_graph(model,
                         torch.rand(opt.batch_size, 3, opt.image_size, opt.image_size))
        model.cuda()
    else:
        writer.add_graph(model,
                         torch.rand(opt.batch_size, 3, opt.image_size, opt.image_size))

    criterion = YoloLoss(training_set[0].num_classes, model.anchors, opt.reduction)
    optimizer = torch.optim.SGD(model.parameters(), lr=1e-5,
                                momentum=opt.momentum, weight_decay=opt.decay)
    best_loss = 1e10
    best_epoch = 0
    model.train()
    num_iter_per_epoch = 0
    for generator in training_generator:
        num_iter_per_epoch += len(generator)

    for epoch in range(opt.num_epoches):
        if str(epoch) in learning_rate_schedule.keys():
            for param_group in optimizer.param_groups:
                param_group['lr'] = learning_rate_schedule[str(epoch)]
        for generator in training_generator:
            for iter, batch in enumerate(generator):
                image, label = batch
                if torch.cuda.is_available():
                    image = Variable(image.cuda(), requires_grad=True)
                else:
                    image = Variable(image, requires_grad=True)
                optimizer.zero_grad()
                logits = model(image)
                loss, loss_coord, loss_conf, loss_cls = criterion(logits, label)
                loss.backward()
                optimizer.step()
                print("Epoch: {}/{}, Iteration: {}/{}, Lr: {}, Loss:{:.2f} (Coord:{:.2f} Conf:{:.2f} Cls:{:.2f})"
                      .format(epoch + 1, opt.num_epoches, iter + 1,
                              num_iter_per_epoch, optimizer.param_groups[0]['lr'],
                              loss, loss_coord, loss_conf, loss_cls))
                writer.add_scalar('Train/Total_loss', loss,
                                  epoch * num_iter_per_epoch + iter)
                writer.add_scalar('Train/Coordination_loss', loss_coord,
                                  epoch * num_iter_per_epoch + iter)
                writer.add_scalar('Train/Confidence_loss', loss_conf,
                                  epoch * num_iter_per_epoch + iter)
                writer.add_scalar('Train/Class_loss', loss_cls,
                                  epoch * num_iter_per_epoch + iter)

        if epoch % opt.test_interval == 0:
            model.eval()
            loss_ls = []
            loss_coord_ls = []
            loss_conf_ls = []
            loss_cls_ls = []
            for te_iter, te_batch in enumerate(test_generator):
                te_image, te_label = te_batch
                num_sample = len(te_label)
                if torch.cuda.is_available():
                    te_image = te_image.cuda()
                with torch.no_grad():
                    te_logits = model(te_image)
                    batch_loss, batch_loss_coord, batch_loss_conf, batch_loss_cls = criterion(
                        te_logits, te_label)
                loss_ls.append(batch_loss * num_sample)
                loss_coord_ls.append(batch_loss_coord * num_sample)
                loss_conf_ls.append(batch_loss_conf * num_sample)
                loss_cls_ls.append(batch_loss_cls * num_sample)
            te_loss = sum(loss_ls) / len(test_set)
            te_coord_loss = sum(loss_coord_ls) / len(test_set)
            te_conf_loss = sum(loss_conf_ls) / len(test_set)
            te_cls_loss = sum(loss_cls_ls) / len(test_set)
            print("Epoch: {}/{}, Lr: {}, Loss:{:.2f} (Coord:{:.2f} Conf:{:.2f} Cls:{:.2f})"
                  .format(epoch + 1, opt.num_epoches,
                          optimizer.param_groups[0]['lr'], te_loss,
                          te_coord_loss, te_conf_loss, te_cls_loss))
            writer.add_scalar('Test/Total_loss', te_loss, epoch)
            writer.add_scalar('Test/Coordination_loss', te_coord_loss, epoch)
            writer.add_scalar('Test/Confidence_loss', te_conf_loss, epoch)
            writer.add_scalar('Test/Class_loss', te_cls_loss, epoch)
            model.train()
            if te_loss + opt.es_min_delta < best_loss:
                best_loss = te_loss
                best_epoch = epoch
                # torch.save(model, opt.saved_path + os.sep + "trained_yolo_coco")
                torch.save(model.state_dict(),
                           opt.saved_path + os.sep + "only_params_trained_yolo_coco")
                torch.save(model,
                           opt.saved_path + os.sep + "whole_model_trained_yolo_coco")

            # Early stopping
            if epoch - best_epoch > opt.es_patience > 0:
                print("Stop training at epoch {}. The lowest loss achieved is {}"
                      .format(epoch, te_loss))
                break

    # writer.export_scalars_to_json(log_path + os.sep + "all_logs.json")
    writer.close()
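# --- Illustrative sketch, not part of the YOLO script above ---
# The dict-based schedule above ({"0": 1e-5, "5": 1e-4, ...}) switches the
# learning rate only at the listed epochs, leaving it untouched in between.
# Minimal self-contained version of that lookup-and-apply pattern:
import torch

demo_schedule = {"0": 1e-5, "5": 1e-4, "80": 1e-5, "110": 1e-6}
demo_opt = torch.optim.SGD([torch.nn.Parameter(torch.zeros(1))], lr=1e-5)
for demo_epoch in range(120):
    if str(demo_epoch) in demo_schedule:
        for group in demo_opt.param_groups:
            group['lr'] = demo_schedule[str(demo_epoch)]  # step change at listed epochs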
def main():
    if args.arch == "resnet50":
        backbone = resnet50()
    elif args.arch == "resnet18":
        backbone = resnet18()
    else:
        raise NotImplementedError

    net = RetinaFace(backbone, pretrained_model_path=args.pretrained)

    if torch.cuda.is_available():
        if args.cuda:
            # torch.set_default_tensor_type('torch.cuda.FloatTensor')
            if args.num_workers > 1:
                net = torch.nn.DataParallel(net)  # must come after loading the model weights
            else:
                # raise NotImplementedError
                pass
            net.cuda()
            # net.to(device)
            cudnn.benchmark = True

    if args.use_tensorboard:
        from tensorboardX import SummaryWriter
        if not osp.exists(args.log_dir):
            os.mkdir(args.log_dir)
        train_writer = SummaryWriter(log_dir="{}".format(args.log_dir),
                                     comment=args.arch)
        dummy_input = torch.rand(1, 3, 640, 640).cuda()
        train_writer.add_graph(backbone, (dummy_input, ))

    train_dataset = WiderFaceDetection(root_path=args.root,
                                       data_path=args.dataset_root,
                                       phase="train",
                                       dataset_name="WiderFace",
                                       transform=None)
    train_loader = data.DataLoader(train_dataset,
                                   args.batch_size,
                                   shuffle=True,
                                   num_workers=args.num_workers,
                                   collate_fn=detection_collate)
    print("success train_loader")

    anchors = Anchor_Box()
    with torch.no_grad():
        anchors = anchors.forward()
        anchors = anchors.cuda()
    print("anchors ready")

    optimizer = optim.SGD(net.parameters(),
                          lr=args.lr,
                          momentum=args.momentum,
                          weight_decay=args.weight_decay)
    start_epoch = 0
    end_epoch = args.max_epoch
    criterion = MultiTaskLoss()

    for epoch in range(start_epoch + 1, end_epoch + 1):
        lr = adjust_learning_rate(optimizer=optimizer,
                                  epoch=epoch,
                                  step_epoch=[55, 68, 80],
                                  gamma=0.1,
                                  base_lr=args.lr,  # 0.001
                                  warm_up_end_lr=0.01,
                                  warmup_epoch=5)
        print("Epoch[{}] lr: {}".format(epoch, lr))
        if args.use_tensorboard:
            train_writer.add_scalar('learning_rate', lr, epoch)
            # train
            train_net(train_loader, net, criterion, optimizer, epoch, anchors,
                      train_writer=train_writer)
        else:
            train_net(train_loader, net, criterion, optimizer, epoch, anchors)

        if epoch % 5 == 0:
            pass  # TODO

        if (epoch == end_epoch) or (epoch % 5 == 0):
            torch.save(net.state_dict(),
                       "/home/shanma/Workspace/zhubin/github_file/RetinaFace-pytorch/weights/retinaface_epoch{}_{}.pth".format(epoch, get_cur_time()))
            # torch.save(net.state_dict(), "/home/dc2-user/zhubin/RetinaFace-pytorch/weights/retinaface_epoch{}_{}.pth".format(epoch, get_cur_time()))

        # if (epoch >= 50 and epoch % 10 == 0):
        #     eval_net(
        #         val_dataset,
        #         val_loader,
        #         net,
        #         detector,
        #         cfg,
        #         ValTransform,
        #         top_k,
        #         thresh=thresh,
        #         batch_size=batch_size)
        # save_checkpoint(net, end_epoch, size, optimizer)

    if args.use_tensorboard:
        train_writer.close()
def main(args):
    r"""Performs the main training loop
    """
    # Load dataset
    print('> Loading dataset ...')
    dataset_train = Dataset(train=True, gray_mode=args.gray, shuffle=True)
    dataset_val = Dataset(train=False, gray_mode=args.gray, shuffle=False)
    loader_train = DataLoader(dataset=dataset_train, num_workers=6,
                              batch_size=args.batch_size, shuffle=True)
    print("\t# of training samples: %d\n" % int(len(dataset_train)))

    # Init loggers
    if not os.path.exists(args.log_dir):
        os.makedirs(args.log_dir)
    writer = SummaryWriter(args.log_dir)
    logger = init_logger(args)

    # Create model
    if not args.gray:
        in_ch = 3
    else:
        in_ch = 1
    net = FFDNet(num_input_channels=in_ch)
    # Initialize model with He init
    net.apply(weights_init_kaiming)
    # Define loss
    criterion = nn.MSELoss(size_average=False)

    # Move to GPU
    device_ids = [0]
    model = nn.DataParallel(net, device_ids=device_ids).cuda()
    criterion.cuda()
    # Optimizer
    optimizer = optim.Adam(model.parameters(), lr=args.lr)

    # Resume training or start anew
    if args.resume_training:
        resumef = os.path.join(args.log_dir, 'ckpt.pth')
        if os.path.isfile(resumef):
            checkpoint = torch.load(resumef)
            print("> Resuming previous training")
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            new_epoch = args.epochs
            new_milestone = args.milestone
            current_lr = args.lr
            args = checkpoint['args']
            training_params = checkpoint['training_params']
            start_epoch = training_params['start_epoch']
            args.epochs = new_epoch
            args.milestone = new_milestone
            args.lr = current_lr
            print("=> loaded checkpoint '{}' (epoch {})".format(resumef, start_epoch))
            print("=> loaded parameters :")
            print("==> checkpoint['optimizer']['param_groups']")
            print("\t{}".format(checkpoint['optimizer']['param_groups']))
            print("==> checkpoint['training_params']")
            for k in checkpoint['training_params']:
                print("\t{}, {}".format(k, checkpoint['training_params'][k]))
            argpri = vars(checkpoint['args'])
            print("==> checkpoint['args']")
            for k in argpri:
                print("\t{}, {}".format(k, argpri[k]))

            args.resume_training = False
        else:
            raise Exception("Couldn't resume training with checkpoint {}".format(resumef))
    else:
        start_epoch = 0
        training_params = {}
        training_params['step'] = 0
        training_params['current_lr'] = 0
        training_params['no_orthog'] = args.no_orthog

    # Training
    for epoch in range(start_epoch, args.epochs):
        # Learning rate value scheduling according to args.milestone
        if epoch > args.milestone[1]:
            current_lr = args.lr / 1000.
            training_params['no_orthog'] = True
        elif epoch > args.milestone[0]:
            current_lr = args.lr / 10.
        else:
            current_lr = args.lr

        # set learning rate in optimizer
        for param_group in optimizer.param_groups:
            param_group["lr"] = current_lr
        print('learning rate %f' % current_lr)

        # train
        for i, data in enumerate(loader_train, 0):
            # Pre-training step
            model.train()
            model.zero_grad()
            optimizer.zero_grad()

            # inputs: noise and noisy image
            img_train = data
            noise = torch.zeros(img_train.size())
            stdn = np.random.uniform(args.noiseIntL[0], args.noiseIntL[1],
                                     size=noise.size()[0])
            for nx in range(noise.size()[0]):
                sizen = noise[0, :, :, :].size()
                noise[nx, :, :, :] = torch.FloatTensor(sizen).normal_(mean=0, std=stdn[nx])
            imgn_train = img_train + noise

            # Create input Variables
            img_train = Variable(img_train.cuda())
            imgn_train = Variable(imgn_train.cuda())
            noise = Variable(noise.cuda())
            stdn_var = Variable(torch.cuda.FloatTensor(stdn))

            # Evaluate model and optimize it
            out_train = model(imgn_train, stdn_var)
            loss = criterion(out_train, noise) / (imgn_train.size()[0] * 2)
            loss.backward()
            optimizer.step()

            # Results
            model.eval()
            out_train = torch.clamp(imgn_train - model(imgn_train, stdn_var), 0., 1.)
            psnr_train = batch_psnr(out_train, img_train, 1.)
            # PyTorch v0.4.0: loss.data[0] --> loss.item()
            if training_params['step'] % args.save_every == 0:
                # Apply regularization by orthogonalizing filters
                if not training_params['no_orthog']:
                    model.apply(svd_orthogonalization)
                # Log the scalar values
                writer.add_scalar('loss', loss.item(), training_params['step'])
                writer.add_scalar('PSNR on training data', psnr_train,
                                  training_params['step'])
                print("[epoch %d][%d/%d] loss: %.4f PSNR_train: %.4f" %
                      (epoch + 1, i + 1, len(loader_train), loss.item(), psnr_train))
            training_params['step'] += 1

        # The end of each epoch
        model.eval()

        # Validation
        psnr_val = 0
        for valimg in dataset_val:
            img_val = torch.unsqueeze(valimg, 0)
            noise = torch.FloatTensor(img_val.size()).normal_(mean=0, std=args.val_noiseL)
            imgn_val = img_val + noise
            img_val, imgn_val = Variable(img_val.cuda()), Variable(imgn_val.cuda())
            sigma_noise = Variable(torch.cuda.FloatTensor([args.val_noiseL]))
            out_val = torch.clamp(imgn_val - model(imgn_val, sigma_noise), 0., 1.)
            psnr_val += batch_psnr(out_val, img_val, 1.)
        psnr_val /= len(dataset_val)
        print("\n[epoch %d] PSNR_val: %.4f" % (epoch + 1, psnr_val))
        writer.add_scalar('PSNR on validation data', psnr_val, epoch)
        writer.add_scalar('Learning rate', current_lr, epoch)

        # Log val images
        try:
            if epoch == 0:
                # Log graph of the model
                writer.add_graph(model, (imgn_val, sigma_noise), )
                # Log validation images
                for idx in range(2):
                    imclean = utils.make_grid(img_val.data[idx].clamp(0., 1.),
                                              nrow=2, normalize=False, scale_each=False)
                    imnsy = utils.make_grid(imgn_val.data[idx].clamp(0., 1.),
                                            nrow=2, normalize=False, scale_each=False)
                    writer.add_image('Clean validation image {}'.format(idx), imclean, epoch)
                    writer.add_image('Noisy validation image {}'.format(idx), imnsy, epoch)
            for idx in range(2):
                imrecons = utils.make_grid(out_val.data[idx].clamp(0., 1.),
                                           nrow=2, normalize=False, scale_each=False)
                writer.add_image('Reconstructed validation image {}'.format(idx),
                                 imrecons, epoch)
            # Log training images
            imclean = utils.make_grid(img_train.data, nrow=8, normalize=True,
                                      scale_each=True)
            writer.add_image('Training patches', imclean, epoch)
        except Exception as e:
            logger.error("Couldn't log results: {}".format(e))

        # save model and checkpoint
        training_params['start_epoch'] = epoch + 1
        torch.save(model.state_dict(), os.path.join(args.log_dir, 'net.pth'))
        save_dict = {
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'training_params': training_params,
            'args': args
        }
        torch.save(save_dict, os.path.join(args.log_dir, 'ckpt.pth'))
        if epoch % args.save_every_epochs == 0:
            torch.save(save_dict,
                       os.path.join(args.log_dir, 'ckpt_e{}.pth'.format(epoch + 1)))
        del save_dict
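# --- Illustrative sketch, not part of the FFDNet script above ---
# The training loop corrupts each image in the batch with Gaussian noise of a
# different, uniformly drawn standard deviation. The same idea in a
# self-contained form; the batch shape and noise interval are hypothetical:
import numpy as np
import torch

demo_imgs = torch.zeros(4, 3, 32, 32)  # a dummy clean batch
demo_stdn = np.random.uniform(0.0, 75.0 / 255.0, size=demo_imgs.size(0))
demo_noise = torch.zeros_like(demo_imgs)
for nx in range(demo_imgs.size(0)):
    # per-sample noise level: each image gets its own std
    demo_noise[nx] = torch.FloatTensor(demo_imgs[nx].size()).normal_(
        mean=0, std=float(demo_stdn[nx]))
demo_noisy = demo_imgs + demo_noise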
        num_workers=0)
    return dataloader


from torchnet.meter import AverageValueMeter

if __name__ == '__main__':
    dataloader = getdataloader(mode=True)
    model = LinearModel()
    model.cuda()
    trainable_num = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print("trainable parameters ", trainable_num)

    criterion = torch.nn.MSELoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01)  # model.parameters() handles parameter initialization automatically

    summary = SummaryWriter('log')
    x = torch.rand(size=(17, IN_FEATURE)).cuda()
    summary.add_graph(model, x)

    loss_meter = AverageValueMeter()
    # training cycle: forward, backward, update
    for i in range(EPOCH):
        myloss = 0
        iter = 0
        for epoch, (x_data, y_data) in enumerate(dataloader):
            x_data, y_data = x_data.cuda(), y_data.cuda()  # move the batch to the same device as the model
            y_pred = model(x_data)            # forward: predict
            loss = criterion(y_pred, y_data)  # forward: loss
            myloss += loss.item()
            loss_meter.add(loss.item())
            # print(myloss)
            optimizer.zero_grad()  # gradients from .backward() accumulate, so zero them before each backward pass
            loss.backward()        # backward: autograd
            optimizer.step()       # update the parameters, i.e., w and b
            iter = epoch
                      weight_decay=5e-4)
train_scheduler = optim.lr_scheduler.MultiStepLR(
    optimizer, milestones=settings.MILESTONES, gamma=0.2)  # learning rate decay
iter_per_epoch = len(cifar100_training_loader)
warmup_scheduler = WarmUpLR(optimizer, iter_per_epoch * args.warm)
checkpoint_path = os.path.join(settings.CHECKPOINT_PATH, args.net, settings.TIME_NOW)

# use tensorboard
if not os.path.exists(settings.LOG_DIR):
    os.mkdir(settings.LOG_DIR)
writer = SummaryWriter(
    logdir=os.path.join(settings.LOG_DIR, args.net, settings.TIME_NOW))
input_tensor = torch.Tensor(12, 3, 32, 32).cuda()
writer.add_graph(net, Variable(input_tensor, requires_grad=True))

# create checkpoint folder to save model
if not os.path.exists(checkpoint_path):
    os.makedirs(checkpoint_path)
checkpoint_path = os.path.join(checkpoint_path, '{net}-{epoch}-{type}.pth')

best_acc = 0.0
for epoch in range(1, settings.EPOCH):
    if epoch > args.warm:
        train_scheduler.step(epoch)

    train(epoch)
    acc = eval_training(epoch)

    # start to save best performance model after learning rate decay to 0.01
print(model)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)  # optimize all cnn parameters
writer = SummaryWriter(comment='Action_Net')

for epoch in range(5):
    for step, (inputs, labels) in enumerate(train_loader):
        optimizer.zero_grad()  # clear the gradients from the last step
        inputs = torch.tensor(inputs, dtype=torch.float32)
        out = model(inputs)
        loss = criterion(out, labels)  # calculate loss
        loss.backward()   # back-propagate to compute new gradients
        optimizer.step()  # apply the update to the network parameters
        if epoch == 0 and step == 0:
            writer.add_graph(model, inputs)  # the graph only needs to be logged once
        writer.add_scalar('Loss', loss, epoch * 100 + step)
        if step % 100 == 0:
            for i, (test_data, test_label) in enumerate(test_loader):
                test_data = torch.tensor(test_data, dtype=torch.float32)
                test_output = model(test_data)
                pred_y = torch.max(test_output, 1)[1].data.numpy()
                accuracy = float(
                    (pred_y == test_label.data.numpy()).astype(int).sum()) / float(
                        test_label.size(0))
                writer.add_scalar('Accuracy', accuracy, epoch * 100 + step)
            print('Epoch: ', epoch,
                  '| train loss: %.4f' % loss.data.numpy(),
                  '| test accuracy: %.2f' % accuracy)

writer.close()
torch.save(model, 'model/net.pkl')  # save the whole network structure and parameters
val_dataloader = DataLoader(val_dataset,
                            batch_size=args.bs,
                            shuffle=False,
                            num_workers=args.workers,
                            pin_memory=True)

if args.ckpt:
    pass
else:
    # save graph and clips_order samples
    for data in train_dataloader:
        clips, idxs = data
        writer.add_video('train/clips', clips, 0, fps=8)
        writer.add_text('train/idxs', str(idxs.tolist()), 0)
        clips = clips.to(device)
        writer.add_graph(model, clips)
        break
    # save init params at step 0
    for name, param in model.named_parameters():
        writer.add_histogram('params/{}'.format(name), param, 0)

### loss function, optimizer and scheduler ###
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(),
                      lr=args.lr,
                      momentum=args.momentum,
                      weight_decay=args.wd)
# scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', min_lr=1e-5, patience=50, factor=0.1)
scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=[150], gamma=0.1)
class TrickLearner(object):
    """ Performs vanilla training with tricks:
        1. Label Smooth;
        2. Mixup;
        3. SE-module is deployed in nets/decode models.
    """

    def __init__(self, model, loaders, args, device):
        self.args = args
        self.device = device
        self.model = model
        self.__build_path()
        self.train_loader, self.test_loader = loaders
        self.setup_optim()
        self.criterion = nn.CrossEntropyLoss().cuda()
        if self.args.label_smooth:
            classes = 10 if args.dataset == 'cifar10' else (
                1000 if args.dataset == 'ilsvrc_12' else 100)
            self.criterion_smooth = CrossEntropyLabelSmooth(
                classes, args.label_smooth_eps).cuda()
        if self.check_is_primary():
            self.writer = SummaryWriter(os.path.dirname(self.save_path))
            # self.add_graph()

    def train(self, train_sampler=None):
        for epoch in range(self.args.epochs):
            if self.args.distributed:
                assert train_sampler is not None
                train_sampler.set_epoch(epoch)

            self.model.train()
            if self.check_is_primary():
                logging.info("Training at Epoch: %d" % epoch)
            train_acc, train_loss = self.epoch(True)

            if self.check_is_primary():
                self.writer.add_scalar('train_acc', train_acc, epoch)
                self.writer.add_scalar('train_loss', train_loss, epoch)

            if (epoch + 1) % self.args.eval_epoch == 0:
                # evaluate on every GPU, but only show the results on a single one!
                if self.check_is_primary():
                    logging.info("Evaluation at Epoch: %d" % epoch)
                self.evaluate(True, epoch)

                if self.check_is_primary():
                    self.save_model()

    def evaluate(self, is_train=False, epoch=None):
        self.model.eval()
        # NOTE: synchronizing the BN statistics
        if self.args.distributed:
            sync_bn_stat(self.model, self.args.world_size)

        if not is_train:
            self.load_model()
        with torch.no_grad():
            test_acc, test_loss = self.epoch(False)
            if is_train and epoch and self.check_is_primary():
                self.writer.add_scalar('test_acc', test_acc, epoch)
                self.writer.add_scalar('test_loss', test_loss, epoch)
        return test_acc, test_loss

    def finetune(self, train_sampler):
        self.load_model()
        self.evaluate()
        for epoch in range(self.args.epochs):
            if self.args.distributed:
                assert train_sampler is not None
                train_sampler.set_epoch(epoch)

            self.model.train()
            # NOTE: use the preset learning rate for all epochs.
            ft_acc, ft_loss = self.epoch(True)

            if self.check_is_primary():
                self.writer.add_scalar('ft_acc', ft_acc, epoch)
                self.writer.add_scalar('ft_loss', ft_loss, epoch)

            # evaluate every k steps
            if (epoch + 1) % self.args.eval_epoch == 0:
                if self.check_is_primary():
                    logging.info("Evaluation at Epoch: %d" % epoch)
                self.evaluate(True, epoch)

                # save the model
                if self.check_is_primary():
                    self.save_model()

    def misc(self):
        raise NotImplementedError("Misc functions are implemented in sub classes")

    def epoch(self, is_train):
        """ Rewrite this function if necessary in the sub-classes. """
        loader = self.train_loader if is_train else self.test_loader

        # setup statistics
        batch_time = AverageMeter('Time', ':3.3f')
        # data_time = AverageMeter('Data', ':6.3f')
        losses = AverageMeter('Loss', ':.4e')
        lrs = AverageMeter('Lr', ':.4e')
        top1 = AverageMeter('Acc@1', ':3.3f')
        top5 = AverageMeter('Acc@5', ':3.3f')
        metrics = [batch_time, lrs, top1, top5, losses]

        loader_len = len(loader)
        progress = ProgressMeter(loader_len, *metrics,
                                 prefix='Job id: %s, ' % self.args.job_id)

        end = time.time()
        for idx, (X, y) in enumerate(loader):
            # data_time.update(time.time() - end)
            criterion = self.criterion_smooth if is_train and self.args.label_smooth else self.criterion
            X, y = X.to(self.device), y.to(self.device)

            if is_train and self.args.mixup:
                mixed_X, y_a, y_b, lam = self.mixup_data(X, y)
                mixed_yp = self.model(mixed_X)
                loss = (lam * criterion(mixed_yp, y_a) +
                        (1.0 - lam) * criterion(mixed_yp, y_b)) / self.args.world_size
                acc1_a, acc5_a = accuracy(mixed_yp, y_a, topk=(1, 5))
                acc1_b, acc5_b = accuracy(mixed_yp, y_b, topk=(1, 5))
                acc1 = lam * acc1_a + (1 - lam) * acc1_b
                acc5 = lam * acc5_a + (1 - lam) * acc5_b
            else:
                yp = self.model(X)
                loss = criterion(yp, y) / self.args.world_size
                acc1, acc5 = accuracy(yp, y, topk=(1, 5))

            reduced_loss = loss.data.clone()
            reduced_acc1 = acc1.clone() / self.args.world_size
            reduced_acc5 = acc5.clone() / self.args.world_size

            if self.args.distributed:
                dist.all_reduce(reduced_loss)
                dist.all_reduce(reduced_acc1)
                dist.all_reduce(reduced_acc5)

            if is_train:
                self.opt.zero_grad()
                loss.backward()
                if self.args.distributed:
                    average_gradients(self.model)  # NOTE: important
                self.opt.step()
                if self.lr_scheduler:
                    self.lr_scheduler.step()

            # update statistics
            top1.update(reduced_acc1[0].item(), X.shape[0])
            top5.update(reduced_acc5[0].item(), X.shape[0])
            losses.update(reduced_loss.item(), X.shape[0])
            lrs.update(self.lr_scheduler.get_lr()[0])
            batch_time.update(time.time() - end)
            end = time.time()

            # show the training/evaluating statistics
            if self.check_is_primary() and ((idx % self.args.print_freq == 0)
                                            or (idx + 1) % loader_len == 0):
                progress.show(idx)

        return top1.avg, losses.avg

    def setup_optim(self):
        max_iter = len(self.train_loader) * self.args.epochs
        if self.args.model_type.startswith('model_'):
            self.opt = optim.SGD(self.model.parameters(), lr=self.args.lr,
                                 momentum=self.args.momentum, nesterov=self.args.nesterov,
                                 weight_decay=self.args.weight_decay)
            self.lr_scheduler = optim.lr_scheduler.MultiStepLR(
                self.opt, milestones=[int(max_iter * 0.5), int(max_iter * 0.75)])
        elif self.args.model_type.startswith('resnet_'):
            self.opt = optim.SGD(self.model.parameters(), lr=self.args.lr,
                                 momentum=self.args.momentum, nesterov=self.args.nesterov,
                                 weight_decay=self.args.weight_decay)
            if self.args.lr_decy_type == 'cosine':
                self.lr_scheduler = optim.lr_scheduler.CosineAnnealingLR(
                    self.opt, max_iter, eta_min=0)
            elif self.args.lr_decy_type == 'multi_step':
                self.lr_scheduler = optim.lr_scheduler.MultiStepLR(
                    self.opt, milestones=[int(max_iter * 0.5), int(max_iter * 0.75)])
            else:
                raise ValueError("Unknown learning rate decay type")
        elif self.args.model_type.startswith('mobilenet_v2'):
            # default on 8-gpu: 250 epochs, 2e-1 lr with cosine, 4e-5 wd, no dropout,
            # warmup to 8e-1, nesterov, no wd for BN and bias
            param_groups = self.get_param_group()
            self.opt = optim.SGD(param_groups, lr=self.args.lr,
                                 momentum=self.args.momentum, nesterov=self.args.nesterov,
                                 weight_decay=self.args.weight_decay)
            # self.lr_scheduler = optim.lr_scheduler.CosineAnnealingLR(self.opt, self.args.epochs, eta_min=0)
            warmup_lr = 4 * self.args.lr
            warmup_steps = 1250
            self.lr_scheduler = WarmUpCosineLRScheduler(self.opt, max_iter,
                                                        self.args.lr_min, self.args.lr,
                                                        warmup_lr, warmup_steps,
                                                        last_iter=-1)
        else:
            raise ValueError("Unknown model, failed to initialize optim")

    def add_graph(self):
        # create dummy input
        x = torch.randn(self.args.batch_size, 3, 32, 32)
        with self.writer:
            self.writer.add_graph(self.model, (x, ))

    def __build_path(self):
        if self.args.exec_mode == 'finetune':
            self.load_path = self.args.load_path
            self.save_path = os.path.join(os.path.dirname(self.load_path), 'model_ft.pt')
        elif self.args.exec_mode == 'train':
            self.save_path = os.path.join(
                self.args.save_path,
                '_'.join([self.args.model_type, self.args.learner]),
                self.args.job_id, 'model.pt')
            self.load_path = self.save_path
        else:
            self.load_path = self.args.load_path
            self.save_path = self.load_path

    def check_is_primary(self):
        if (self.args.distributed and self.args.rank == 0) or \
                not self.args.distributed:
            return True
        else:
            return False

    def save_model(self):
        state = {'state_dict': self.model.state_dict(),
                 'optimizer': self.opt.state_dict()}
        torch.save(state, self.save_path)
        logging.info("Model stored at: " + self.save_path)

    def load_model(self):
        if self.args.distributed:
            # read parameters to each GPU separately
            loc = 'cuda:{}'.format(torch.cuda.current_device())
            checkpoint = torch.load(self.load_path, map_location=loc)
        else:
            checkpoint = torch.load(self.load_path)

        self.model.load_state_dict(checkpoint['state_dict'])
        self.opt.load_state_dict(checkpoint['optimizer'])
        logging.info("Model successfully restored from %s" % self.load_path)

        if self.args.distributed:
            broadcast_params(self.model)

    def mixup_data(self, X, y):
        batch_size = X.size()[0]
        alpha = self.args.mixup_alpha
        if alpha > 0:
            lam = np.random.beta(alpha, alpha)
        else:
            lam = 1
        index = torch.randperm(batch_size).to(self.device)
        mixed_X = lam * X + (1 - lam) * X[index, :]
        y_a, y_b = y, y[index]
        return mixed_X, y_a, y_b, lam

    def get_param_group(self):
        param_group_no_wd = []
        names_no_wd = []
        param_group_normal = []

        for name, m in self.model.named_modules():
            if isinstance(m, nn.Conv2d):
                if m.bias is not None:
                    param_group_no_wd.append(m.bias)
                    names_no_wd.append(name + '.bias')
            elif isinstance(m, nn.Linear):
                if m.bias is not None:
                    param_group_no_wd.append(m.bias)
                    names_no_wd.append(name + '.bias')
            elif isinstance(m, (nn.BatchNorm1d, nn.BatchNorm2d)):
                if m.weight is not None:
                    param_group_no_wd.append(m.weight)
                    names_no_wd.append(name + '.weight')
                if m.bias is not None:
                    param_group_no_wd.append(m.bias)
                    names_no_wd.append(name + '.bias')

        for name, p in self.model.named_parameters():
            if name not in names_no_wd:
                param_group_normal.append(p)

        return [{'params': param_group_normal},
                {'params': param_group_no_wd, 'weight_decay': 0.0}]
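# --- Illustrative sketch, not part of TrickLearner ---
# get_param_group() above splits parameters so that BatchNorm affine
# parameters and all biases receive no weight decay. A simplified,
# self-contained version of hooking such groups into an optimizer (the tiny
# network and hyperparameters are hypothetical):
import torch
import torch.nn as nn

demo_net = nn.Sequential(nn.Conv2d(3, 8, 3), nn.BatchNorm2d(8))
demo_decay, demo_no_decay = [], []
for name, p in demo_net.named_parameters():
    # exempt biases and BatchNorm parameters from weight decay
    if name.endswith('.bias') or isinstance(demo_net[int(name.split('.')[0])], nn.BatchNorm2d):
        demo_no_decay.append(p)
    else:
        demo_decay.append(p)
demo_opt = torch.optim.SGD(
    [{'params': demo_decay},
     {'params': demo_no_decay, 'weight_decay': 0.0}],
    lr=0.1, momentum=0.9, weight_decay=4e-5)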
class model: def __init__(self, package_name, model_name, description='', model_path=None, args=None): self.package_name = __import__(package_name) self.model_name = model_name self.model_path = model_path self.model = None self.writer = None self.loss = '' self.optimizer_name = '' self.accuracy_name = '' self.checkpoint_name = '' self.optimizer = None self.accuracy = None self.criterion = None self.epoch = 0 self.description = description self.create_model(args=args) # self.create_writer() def create_model(self, args): if self.model_path is not None: print("=> Loading checkpoint '{}'".format(self.model_path)) self.load_checkpoint(self.model_path, args=args) else: print("=> Creating new model") self.model = getattr(self.package_name, self.model_name)(args) if torch.cuda.is_available(): print("Using GPU") # self.model = nn.DataParallel(self.model, device_ids=[0]) self.model = self.model.cuda() def create_writer(self, checkpoint_name=''): if checkpoint_name is '': self.checkpoint_name = datetime.datetime.now().strftime( '%b%d_%H-%M') + '_' + self.model_name + '_' + self.description else: self.checkpoint_name = checkpoint_name writer_name = '{checkpoint_name}_{optimizer}_{loss_name}.pth.tar'\ .format(checkpoint_name=self.checkpoint_name, optimizer=self.optimizer_name, loss_name=self.loss) writer_dir = os.path.join('runs', writer_name) self.writer = SummaryWriter(log_dir=writer_dir) def fit(self, loss='MSELoss', optimizer_name='Adam', lr=0.01, weight_decay=0, accuracy_name='', create_writer=True): self.loss = loss self.optimizer_name = optimizer_name self.accuracy_name = accuracy_name self.criterion = getattr(nn, loss)() # self.criterion = getattr(self.package_name, loss)() if optimizer_name == 'SGD': self.optimizer = getattr(optim, optimizer_name)(self.model.parameters(), lr=lr, momentum=0.9) else: self.optimizer = getattr(optim, optimizer_name)(self.model.parameters(), lr=lr) if accuracy_name == 'argmax': self.accuracy = argmax elif accuracy_name == 'count_success': self.accuracy = count_success elif accuracy_name != '': self.accuracy = getattr(nn, accuracy_name)() # self.accuracy = getattr(self.package_name, accuracy_name)() if create_writer: self.create_writer(self.checkpoint_name) def save_checkpoint(self, save_dir='', epochs_per_save=1): # TODO: add is_best to save, to save the best model in the training filename = '{checkpoint_name}_{epoch_num}_{optimizer}_{loss_name}.pth.tar'\ .format(checkpoint_name=self.checkpoint_name, epoch_num=self.epoch, optimizer=self.optimizer_name, loss_name=self.loss) torch.save( { 'epoch': self.epoch, 'model_state_dict': self.model.state_dict(), 'optimizer_state_dict': self.optimizer.state_dict(), 'loss': self.loss, 'optimizer_name': self.optimizer_name, 'accuracy_name': self.accuracy_name, 'checkpoint_name': self.checkpoint_name }, os.path.join(save_dir, filename)) print("Saved checkpoint as: {}".format(os.path.join( save_dir, filename))) # removing the old checkpoint: # TODO: need to check if the remove is working old_filename = '{checkpoint_name}_{epoch_num}_{optimizer}_{loss_name}.pth.tar'\ .format(checkpoint_name=self.checkpoint_name, epoch_num=self.epoch-epochs_per_save, optimizer=self.optimizer_name, loss_name=self.loss) #TODO check it # if os.path.exists(os.path.join(save_dir, old_filename)): # os.remove(os.path.join(save_dir, old_filename)) def load_checkpoint(self, filename, args): """ loads checkpoint (that was save with save_checkpoint) No need to do .fit after :param filename: path to the checkpoint :return: """ self.model = 
getattr(self.package_name, self.model_name)(args) checkpoint = torch.load(filename) self.model.load_state_dict(checkpoint['model_state_dict']) self.epoch = checkpoint['epoch'] self.loss = checkpoint['loss'] self.optimizer_name = checkpoint['optimizer_name'] self.accuracy_name = checkpoint['accuracy_name'] # self.checkpoint_name = None self.checkpoint_name = checkpoint['checkpoint_name'] self.fit(self.loss, self.optimizer_name, accuracy_name=self.accuracy_name) self.optimizer.load_state_dict(checkpoint['optimizer_state_dict']) print("Loaded checkpoint as: {}".format(filename)) def print_summary(self, input_size=(1, 32, 128, 128)): summ = summary(self.model, input_size=input_size) # self.writer.add_text('Summary', summ) def print_graph(self, dummy_input): # dummy_input = Variable(torch.rand(1, 1, 32, 64, 64)) if self.writer is not None: self.writer.add_graph(model=self.model, input_to_model=(dummy_input, )) def print_epoch_statistics(self, epoch, epoch_time, running_loss, running_accuracy, validation_accuracy=None): """ :param epoch: number of epoch this results from :param running_loss: array of all the losses in this epoch :param running_accuracy: array of all the training accuracies in this epoch :param validation_accuracy: array of all the validation accuracies in this epoch :return: print on the stdout the results and log in tensorboard if defined """ if validation_accuracy is None: print( "End of epoch {:3d} in {:3d} sec | Training loss = {:5.4f} | Training acc = {:5.4f}" .format(epoch, int(epoch_time), np.mean(running_loss), np.mean(running_accuracy))) else: print( "End of epoch {:3d} in {:3d} sec | Training loss = {:5.4f} | Training acc = {:5.4f} | Valid acc = {:5.4f}" .format(epoch, int(epoch_time), np.mean(running_loss), np.mean(running_accuracy), np.mean(validation_accuracy))) if self.writer is not None: self.writer.add_scalar('Train/Loss', float(np.mean(running_loss)), epoch) self.writer.add_scalar('Train/accuracy', float(np.mean(running_accuracy)), epoch) if validation_accuracy is not None: self.writer.add_scalar('Validation/accuracy', float(np.mean(validation_accuracy)), epoch) def add_images_tensorboard(self, inputs, labels, outputs): # TODO: check this function """ :param inputs: the net input, a 5 dim tensor shape: [batch, channels, z, x, y] :param labels: the ground truth, a 5 dim tensor shape: [batch, channels, z, x, y] :param outputs: the net output, a 5 dim tensor shape: [batch, channels, z, x, y] :return: add images to tensorboard """ def test_validation(self, validationloader=None): validation_accuracy = None if validationloader is not None: self.model.eval() # changing to eval mode valid_running_accuracy = [] with torch.no_grad(): for k, sample in enumerate(validationloader, 0): if isinstance(sample, dict): if self.model_name == 'LSTMClassifaierAndDenoise': valid_mfcc = sample['mfcc'].reshape(-1, 1, 39) valid_stft = sample['stft'].reshape(-1, 1, 257) valid_labels = sample['ground_truth'].reshape( -1, 257) else: valid_mfcc = sample['mfcc'].reshape(-1, 1, 351) valid_stft = sample['stft'].reshape(-1, 1, 2313) valid_labels = sample['ground_truth'].reshape( -1, 1, 257) else: valid_mfcc, valid_stft, valid_labels = sample # wrap them in Variable if torch.cuda.is_available(): valid_mfcc,valid_stft, valid_labels = Variable(valid_mfcc.cuda()).float(),\ Variable(valid_stft.cuda()).float(),\ Variable(valid_labels.cuda()).float() else: valid_mfcc,valid_stft, valid_labels = Variable(valid_mfcc), Variable(valid_stft), \ Variable(valid_labels) valid_outputs = self.model(valid_stft, 
valid_mfcc).cuda()
                    acc = self.accuracy(valid_outputs.cpu().data,
                                        valid_labels.cpu().data)
                    valid_running_accuracy.append(acc)
            validation_accuracy = valid_running_accuracy
            self.model.train()  # back to train mode
        return validation_accuracy

    def train(self, num_epochs, trainloader, valloader=None,
              epochs_per_save=10):
        print("Start training")
        start_train_time = time.time()
        # loop over the dataset multiple times; starting from self.epoch
        # lets training resume from the saved epoch after loading a checkpoint
        for epoch in range(self.epoch, num_epochs):
            start_epoch_time = time.time()
            self.epoch = epoch
            running_loss = []
            running_accuracy = []
            for i, sample in enumerate(trainloader, 0):
                # print(i)
                if isinstance(sample, dict):
                    if self.model_name == 'LSTMClassifaierAndDenoise':
                        mfcc = sample['mfcc'].reshape(-1, 1, 39)
                        stft = sample['stft'].reshape(-1, 1, 257)
                        labels = sample['ground_truth'].reshape(-1, 257)
                    else:
                        # reshape because we entered the batch as one sample
                        mfcc = sample['mfcc'].reshape(-1, 1, 351)
                        stft = sample['stft'].reshape(-1, 1, 2313)
                        labels = sample['ground_truth'].reshape(-1, 1, 257)
                else:
                    inputs, labels = sample
                # wrap them in Variable
                if torch.cuda.is_available():
                    mfcc, stft, labels = Variable(mfcc.cuda()).float(), \
                        Variable(stft.cuda()).float(), \
                        Variable(labels.cuda()).float()
                else:
                    mfcc, stft, labels = Variable(mfcc), Variable(stft), \
                        Variable(labels)
                # forward + backward + optimize
                outputs = self.model(stft, mfcc).cuda()
                loss = self.criterion(outputs, labels)
                # zero the parameter gradients
                self.optimizer.zero_grad()
                loss.backward()
                # if i > 0:
                #     loss.backward()
                # else:
                #     loss.backward(retain_graph=True)
                self.optimizer.step()
                # for loss per epoch
                running_loss.append(loss.item())
                if self.accuracy is not None:
                    # for accuracy per epoch
                    running_accuracy.append(
                        self.accuracy(outputs.cpu().data, labels.cpu().data))
                if i % 10 == 0:
                    print('tmp accuracy {} in i = {} in epoch {}'.format(
                        np.mean(running_accuracy), i, epoch))
            validation_accuracy = self.test_validation(valloader)
            self.print_epoch_statistics(
                epoch,
                int(time.time() - start_epoch_time),
                running_loss,
                running_accuracy,
                validation_accuracy,
            )
            if epoch % epochs_per_save == 0:
                self.save_checkpoint('saved_models', epochs_per_save)
            # self.add_images_tensorboard(inputs, labels, outputs)
        self.save_checkpoint('saved_models')
        print('=' * 89)
        print("Finished training, {} epochs in {} seconds".format(
            num_epochs, int(time.time() - start_train_time)))
        print('=' * 89)
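# A minimal, self-contained sketch of resuming from the checkpoint layout
# used by save_checkpoint()/load_checkpoint() above (core keys 'epoch',
# 'model_state_dict', 'optimizer_state_dict'); the tiny model is a stand-in,
# not from the source.
import torch
import torch.nn as nn

net = nn.Linear(10, 2)
opt = torch.optim.Adam(net.parameters(), lr=0.01)
torch.save({'epoch': 5,
            'model_state_dict': net.state_dict(),
            'optimizer_state_dict': opt.state_dict()}, 'demo.pth.tar')

ckpt = torch.load('demo.pth.tar', map_location='cpu')
net.load_state_dict(ckpt['model_state_dict'])
opt.load_state_dict(ckpt['optimizer_state_dict'])
start_epoch = ckpt['epoch']  # train() resumes its range() from this value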
class SummaryWorker(multiprocessing.Process): def __init__(self, env): super(SummaryWorker, self).__init__() self.env = env self.config = env.config self.queue = multiprocessing.Queue() try: self.timer_scalar = utils.train.Timer(env.config.getfloat('summary', 'scalar')) except configparser.NoOptionError: self.timer_scalar = lambda: False try: self.timer_image = utils.train.Timer(env.config.getfloat('summary', 'image')) except configparser.NoOptionError: self.timer_image = lambda: False try: self.timer_histogram = utils.train.Timer(env.config.getfloat('summary', 'histogram')) except configparser.NoOptionError: self.timer_histogram = lambda: False with open(os.path.expanduser(os.path.expandvars(env.config.get('summary_histogram', 'parameters'))), 'r') as f: self.histogram_parameters = utils.RegexList([line.rstrip() for line in f]) self.draw_points = utils.visualize.DrawPoints(env.limbs_index, colors=env.config.get('draw_points', 'colors').split()) self._draw_points = utils.visualize.DrawPoints(env.limbs_index, thickness=1) self.draw_bbox = utils.visualize.DrawBBox() self.draw_feature = utils.visualize.DrawFeature() self.draw_cluster = utils.visualize.DrawCluster() def __call__(self, name, **kwargs): if getattr(self, 'timer_' + name)(): kwargs = getattr(self, 'copy_' + name)(**kwargs) self.queue.put((name, kwargs)) def stop(self): self.queue.put((None, {})) def run(self): self.writer = SummaryWriter(os.path.join(self.env.model_dir, self.env.args.run)) try: height, width = tuple(map(int, self.config.get('image', 'size').split())) tensor = torch.randn(1, 3, height, width) step, epoch, dnn, stages = self.env.load() inference = model.Inference(self.config, dnn, stages) forward = inference.forward inference.forward = lambda self, *x: list(forward(self, *x)[-1].values()) self.writer.add_graph(inference, (tensor,)) except: traceback.print_exc() while True: name, kwargs = self.queue.get() if name is None: break func = getattr(self, 'summary_' + name) try: func(**kwargs) except: traceback.print_exc() def copy_scalar(self, **kwargs): step, loss_total, losses, losses_hparam = (kwargs[key] for key in 'step, loss_total, losses, losses_hparam'.split(', ')) loss_total = loss_total.detach().cpu().numpy() losses = [{name: l.detach().cpu().numpy() for name, l in loss.items()} for loss in losses] losses_hparam = [{name: l.detach().cpu().numpy() for name, l in loss.items()} for loss in losses_hparam] return dict( step=step, loss_total=loss_total, losses=losses, losses_hparam=losses_hparam, ) def summary_scalar(self, **kwargs): step, loss_total, losses, losses_hparam = (kwargs[key] for key in 'step, loss_total, losses, losses_hparam'.split(', ')) for i, loss in enumerate(losses): for name, l in loss.items(): self.writer.add_scalar('loss/%s%d' % (name, i), l, step) if self.config.getboolean('summary_scalar', 'loss_hparam'): self.writer.add_scalars('loss_hparam', {'%s%d' % (name, i): l for name, l in loss.items() for i, loss in enumerate(losses_hparam)}, step) self.writer.add_scalar('loss_total', loss_total, step) def copy_image(self, **kwargs): step, height, width, data, outputs = (kwargs[key] for key in 'step, height, width, data, outputs'.split(', ')) image, mask, keypoints, yx_min, yx_max, parts, limbs, index = (data[key].clone().cpu().numpy() for key in 'image, mask, keypoints, yx_min, yx_max, parts, limbs, index'.split(', ')) output = outputs[self.config.getint('summary_image', 'stage')] output = {name: output[name].detach().cpu().numpy() for name in self.config.get('summary_image', 'output').split()} return 
dict( step=step, height=height, width=width, image=image, mask=mask, keypoints=keypoints, yx_min=yx_min, yx_max=yx_max, parts=parts, limbs=limbs, index=index, output=output, ) def summary_image(self, **kwargs): step, height, width, image, mask, keypoints, yx_min, yx_max, parts, limbs, index, output = (kwargs[key] for key in 'step, height, width, image, mask, keypoints, yx_min, yx_max, parts, limbs, index, output'.split(', ')) limit = min(self.config.getint('summary_image', 'limit'), image.shape[0]) image = image[:limit, :, :, :] if self.config.getboolean('summary_image', 'estimate'): canvas = np.copy(image) fn = pybenchmark.profile('output/estimate')(self.draw_clusters) canvas = [fn(canvas, parts[:-1], limbs) for canvas, parts, limbs in zip(canvas, *(output[name] for name in 'parts, limbs'.split(', ')))] self.writer.add_image('output/estimate', torchvision.utils.make_grid(torch.from_numpy(np.stack(canvas)).permute(0, 3, 1, 2).float(), normalize=True, scale_each=True), step) if self.config.getboolean('summary_image', 'data_keypoints'): canvas = np.copy(image) fn = pybenchmark.profile('data/keypoints')(self.draw_keypoints) canvas = [fn(canvas, mask, keypoints, yx_min, yx_max, index) for canvas, mask, keypoints, yx_min, yx_max, index in zip(canvas, mask, keypoints, yx_min, yx_max, index)] self.writer.add_image('data/keypoints', torchvision.utils.make_grid(torch.from_numpy(np.stack(canvas)).permute(0, 3, 1, 2).float(), normalize=True, scale_each=True), step) if self.config.getboolean('summary_image', 'data_parts'): fn = pybenchmark.profile('data/parts')(self.draw_feature) for i in range(parts.shape[1]): canvas = np.copy(image) canvas = [fn(canvas, feature[i]) for canvas, feature in zip(canvas, parts)] self.writer.add_image('data/parts%d' % i, torchvision.utils.make_grid(torch.from_numpy(np.stack(canvas)).permute(0, 3, 1, 2).float(), normalize=True, scale_each=True), step) if self.config.getboolean('summary_image', 'data_limbs'): fn = pybenchmark.profile('data/limbs')(self.draw_feature) for i in range(limbs.shape[1]): canvas = np.copy(image) canvas = [fn(canvas, feature[i]) for canvas, feature in zip(canvas, limbs)] self.writer.add_image('data/limbs%d' % i, torchvision.utils.make_grid(torch.from_numpy(np.stack(canvas)).permute(0, 3, 1, 2).float(), normalize=True, scale_each=True), step) for name, feature in output.items(): fn = pybenchmark.profile('output/' + name)(self.draw_feature) for i in range(feature.shape[1]): canvas = np.copy(image) canvas = [fn(canvas, feature[i]) for canvas, feature in zip(canvas, feature)] self.writer.add_image('output/%s%d' % (name, i), torchvision.utils.make_grid(torch.from_numpy(np.stack(canvas)).permute(0, 3, 1, 2).float(), normalize=True, scale_each=True), step) def draw_keypoints(self, image, mask, keypoints, yx_min, yx_max, index): image = utils.visualize.draw_mask(image, mask, 1) size = yx_max - yx_min target = np.logical_and(*[np.squeeze(a, -1) > 0 for a in np.split(size, size.shape[-1], -1)]) keypoints, yx_min, yx_max = (a[target] for a in (keypoints, yx_min, yx_max)) for i, points in enumerate(keypoints): if i == index: image = self.draw_points(image, points) else: image = self._draw_points(image, points) image = self.draw_bbox(image, yx_min.astype(np.int), yx_max.astype(np.int)) return image def draw_clusters(self, image, parts, limbs): try: interpolation = getattr(cv2, 'INTER_' + self.config.get('estimate', 'interpolation').upper()) parts, limbs = (np.stack([cv2.resize(feature, image.shape[1::-1], interpolation=interpolation) for feature in a]) for a in 
(parts, limbs)) except configparser.NoOptionError: pass clusters = pyopenpose.estimate( parts, limbs, self.env.limbs_index, self.config.getfloat('nms', 'threshold'), self.config.getfloat('integration', 'step'), tuple(map(int, self.config.get('integration', 'step_limits').split())), self.config.getfloat('integration', 'min_score'), self.config.getint('integration', 'min_count'), self.config.getfloat('cluster', 'min_score'), self.config.getint('cluster', 'min_count'), ) scale_y, scale_x = np.array(image.shape[1::-1], parts.dtype) / np.array(parts.shape[-2:], parts.dtype) for cluster in clusters: cluster = [((i1, int(y1 * scale_y), int(x1 * scale_x)), (i2, int(y2 * scale_y), int(x2 * scale_x))) for (i1, y1, x1), (i2, y2, x2) in cluster] image = self.draw_cluster(image, cluster) return image def copy_histogram(self, **kwargs): return { 'step': kwargs['step'], 'state_dict': self.env.dnn.state_dict(), } def summary_histogram(self, **kwargs): step, state_dict = (kwargs[key] for key in 'step, state_dict'.split(', ')) for name, var in state_dict.items(): if self.histogram_parameters(name): self.writer.add_histogram(name, var, step)
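# SummaryWorker above decouples TensorBoard I/O from the training loop: the
# trainer calls the instance, which only copies tensors to CPU and enqueues
# them, while the child process owns the SummaryWriter. A stripped-down
# sketch of that producer/consumer pattern (scalar-only, no config):
import multiprocessing

from tensorboardX import SummaryWriter

class ScalarWorker(multiprocessing.Process):
    def __init__(self, log_dir):
        super(ScalarWorker, self).__init__()
        self.log_dir = log_dir
        self.queue = multiprocessing.Queue()

    def __call__(self, tag, value, step):
        self.queue.put((tag, value, step))  # cheap; runs in the trainer

    def stop(self):
        self.queue.put(None)  # sentinel, mirrors SummaryWorker.stop()

    def run(self):  # runs in the child process
        writer = SummaryWriter(self.log_dir)
        while True:
            item = self.queue.get()
            if item is None:
                break
            writer.add_scalar(*item)
        writer.close()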
def train(opt): if torch.cuda.is_available(): torch.cuda.manual_seed(123) else: torch.manual_seed(123) training_params = { "batch_size": opt.batch_size, "shuffle": True, "drop_last": True, "collate_fn": custom_collate_fn } test_params = { "batch_size": opt.batch_size, "shuffle": False, "drop_last": False, "collate_fn": custom_collate_fn } training_set = VOCDataset(opt.data_path, opt.dataset, opt.image_size) training_generator = DataLoader(training_set, **training_params) test_set = VOCDataset(opt.data_path, opt.dataset, opt.image_size, is_training=False) test_generator = DataLoader(test_set, **test_params) model = Deeplab(num_classes=training_set.num_classes + 1) model.load_state_dict(torch.load(opt.pre_trained_model)) log_path = os.path.join(opt.log_path, "{}".format(opt.dataset)) if os.path.isdir(log_path): shutil.rmtree(log_path) os.makedirs(log_path) writer = SummaryWriter(log_path) writer.add_graph( model, torch.rand(opt.batch_size, 3, opt.image_size, opt.image_size)) if torch.cuda.is_available(): model.cuda() best_loss = 1e10 best_epoch = 0 model.train() num_iter_per_epoch = len(training_generator) for epoch in range(opt.num_epoches): for iter, batch in enumerate(training_generator): current_step = epoch * num_iter_per_epoch + iter current_lr = update_lr(opt.lr, current_step, num_iter_per_epoch * opt.num_epoches) optimizer = get_optimizer(model, current_lr, opt.momentum, opt.decay) if torch.cuda.is_available(): batch = [torch.Tensor(record).cuda() for record in batch] else: batch = [torch.Tensor(record) for record in batch] image, gt1, gt2 = batch gt1 = gt1.long() gt2 = gt2.long() optimizer.zero_grad() results = model(image) mul_losses = multiple_losses(results, [gt1, gt1, gt2, gt1]) mul_losses[4].backward() optimizer.step() print( "Epoch: {}/{}, Iteration: {}/{}, Lr: {}, Loss: {:.2f} (1xloss: {:.2f} 0.75xloss: {:.2f} 0.5xloss: {:.2f} Max_merged_loss: {:.2f})" .format(epoch + 1, opt.num_epoches, iter + 1, num_iter_per_epoch, optimizer.param_groups[0]['lr'], mul_losses[4], mul_losses[0], mul_losses[1], mul_losses[2], mul_losses[3])) writer.add_scalar('Train/Total_loss', mul_losses[4], current_step) writer.add_scalar('Train/1x_scale_loss', mul_losses[0], current_step) writer.add_scalar('Train/0.75x_scale_loss', mul_losses[1], current_step) writer.add_scalar('Train/0.5x_scale_loss', mul_losses[2], current_step) writer.add_scalar('Train/Max_merged_loss', mul_losses[3], current_step) if epoch % opt.test_interval == 0: model.eval() loss_ls = [] loss_scale_1_ls = [] loss_scale_2_ls = [] loss_scale_3_ls = [] loss_max_merged_ls = [] for te_batch in test_generator: if torch.cuda.is_available(): te_batch = [ torch.Tensor(record).cuda() for record in te_batch ] else: te_batch = [torch.Tensor(record) for record in te_batch] te_image, te_gt1, te_gt2 = te_batch te_gt1 = te_gt1.long() te_gt2 = te_gt2.long() num_sample = len(te_gt1) with torch.no_grad(): te_results = model(te_image) te_mul_losses = multiple_losses( te_results, [te_gt1, te_gt1, te_gt2, te_gt1]) loss_ls.append(te_mul_losses[4] * num_sample) loss_scale_1_ls.append(te_mul_losses[0] * num_sample) loss_scale_2_ls.append(te_mul_losses[1] * num_sample) loss_scale_3_ls.append(te_mul_losses[2] * num_sample) loss_max_merged_ls.append(te_mul_losses[3] * num_sample) te_loss = sum(loss_ls) / test_set.__len__() te_scale_1_loss = sum(loss_scale_1_ls) / test_set.__len__() te_scale_2_loss = sum(loss_scale_2_ls) / test_set.__len__() te_scale_3_loss = sum(loss_scale_3_ls) / test_set.__len__() te_max_merged_loss = sum(loss_max_merged_ls) / 
test_set.__len__() print( "Epoch: {}/{}, Lr: {}, Loss: {:.2f} (1xloss: {:.2f} 0.75xloss: {:.2f} 0.5xloss: {:.2f} Max_merged_loss: {:.2f})" .format(epoch + 1, opt.num_epoches, optimizer.param_groups[0]['lr'], te_loss, te_scale_1_loss, te_scale_2_loss, te_scale_3_loss, te_max_merged_loss)) writer.add_scalar('Test/Total_loss', te_loss, epoch) writer.add_scalar('Test/1x_scale_loss', te_scale_1_loss, epoch) writer.add_scalar('Test/0.75x_scale_loss', te_scale_2_loss, epoch) writer.add_scalar('Test/0.5x_scale_loss', te_scale_3_loss, epoch) writer.add_scalar('Test/Max_merged_loss', te_max_merged_loss, epoch) model.train() if te_loss + opt.es_min_delta < best_loss: best_loss = te_loss best_epoch = epoch torch.save( model.state_dict(), opt.saved_path + os.sep + "only_params_trained_deeplab_voc") torch.save( model, opt.saved_path + os.sep + "whole_model_trained_deeplab_voc") # Early stopping if epoch - best_epoch > opt.es_patience > 0: print( "Stop training at epoch {}. The lowest loss achieved is {}" .format(epoch, te_loss)) break writer.close()
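# The loop above recomputes the learning rate every step via update_lr();
# DeepLab training commonly uses the "poly" schedule below. This is an
# assumption about what update_lr() computes, not taken from the snippet.
def poly_lr(base_lr, current_step, max_step, power=0.9):
    # decays smoothly from base_lr toward 0 over max_step steps (assumed form)
    return base_lr * (1 - current_step / max_step) ** power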
net = build_refinedet(Config.INPUT_SIZE, len(Config.CLASSES), is_refine=True)
if torch.cuda.device_count() > 1:  # check whether multiple GPUs are available
    print("Let's use", torch.cuda.device_count(), "GPUs!")
    net = torch.nn.DataParallel(net,
                                device_ids=range(torch.cuda.device_count()))
device = torch.device('cpu')
if torch.cuda.is_available() and Config.DEVICE == 'gpu':
    device = torch.device('cuda')
net.to(device)
cudnn.benchmark = True
if Config.IS_TENSORBOARDX:
    net_input_size = torch.zeros(Config.BATCH_SIZE, 3, Config.INPUT_SIZE[0],
                                 Config.INPUT_SIZE[1])
    writer.add_graph(net, (net_input_size, ))
model_info = {'RESUME_EPOCH': 0, 'RESUME_MODEL': None}
if not op.exists('tools/generate_dep_info/model_info.json'):
    with open('tools/generate_dep_info/model_info.json', 'w',
              encoding='utf-8') as f:
        json.dump(model_info, f)
with open('tools/generate_dep_info/model_info.json', 'r',
          encoding='utf-8') as f:
    model_info = json.load(f)
if model_info['RESUME_MODEL'] is None or not op.exists(
        model_info['RESUME_MODEL']):
    model_info['RESUME_EPOCH'] = 0
    print('Loading base network...')
def train_challenge2020(hype_space): # Paths to save log, checkpoint, tensorboard logs and results run_id = datetime.now().strftime(r'%m%d_%H%M%S') base_path = save_path + '/' + run_id os.makedirs(base_path) write_json(hype_space, base_path + '/hype_space.json') checkpoint_dir = base_path + '/checkpoints' log_dir = base_path + '/log' tb_dir = base_path + '/tb_log' result_dir = base_path + '/results' os.makedirs(result_dir) os.makedirs(log_dir) os.makedirs(checkpoint_dir) os.makedirs(tb_dir) # Logger for train logger = get_logger(log_dir + '/info.log', name='train' + run_id) logger.info(hype_space) # Tensorboard train_writer = SummaryWriter(tb_dir + '/train') val_writer = SummaryWriter(tb_dir + '/valid') # Hyper Parameters split_index = "../process/data_split/" + hype_space['data_split'] # Setup Cuda use_cuda = torch.cuda.is_available() device = torch.device("cuda" if use_cuda else "cpu") # Data_loader train_loader = ChallengeDataLoader2( label_dir, data_dir, split_index, batch_size=hype_space['trainer']['batch_size'], normalization=hype_space['data_normalization'], augmentations=hype_space['augmentation']['method'], p=hype_space['augmentation']['prob']) valid_loader = train_loader.valid_data_loader test_loader = train_loader.test_data_loader # Build model architecture global model for file, types in files_models.items(): for type in types: if hype_space["arch"]["type"] == type: model = init_obj(hype_space, 'arch', eval("module_arch_" + file)) dummy_input = Variable(torch.rand(16, 12, 3000)) train_writer.add_graph(model, (dummy_input, )) model.to(device) # Get function handles of loss and metrics criterion = getattr(module_loss, hype_space['loss']['type']) # Get function handles of metrics challenge_metrics = ChallengeMetric(label_dir) metric = challenge_metrics.challenge_metric # Get indices of the scored labels if hype_space['only_scored']: indices = challenge_metrics.indices else: indices = None # Build optimizer, learning rate scheduler trainable_params = filter(lambda p: p.requires_grad, model.parameters()) optimizer = init_obj(hype_space, 'optimizer', torch.optim, trainable_params) if hype_space['lr_scheduler']['type'] == 'GradualWarmupScheduler': params = hype_space["lr_scheduler"]["args"] scheduler_steplr_args = dict(params["after_scheduler"]["args"]) scheduler_steplr = getattr(torch.optim.lr_scheduler, params["after_scheduler"]["type"])( optimizer, **scheduler_steplr_args) lr_scheduler = GradualWarmupScheduler( optimizer, multiplier=params["multiplier"], total_epoch=params["total_epoch"], after_scheduler=scheduler_steplr) else: lr_scheduler = init_obj(hype_space, 'lr_scheduler', torch.optim.lr_scheduler, optimizer) # Begin training process trainer = hype_space['trainer'] epochs = trainer['epochs'] # Full train and valid logic mnt_metric_name, mnt_mode, mnt_best, early_stop = get_mnt_mode(trainer) not_improved_count = 0 for epoch in range(epochs): best = False train_loss, train_metric = train(model, optimizer, train_loader, criterion, metric, indices, epoch, device=device) val_loss, val_metric = valid(model, valid_loader, criterion, metric, indices, device=device) if hype_space['lr_scheduler']['type'] == 'ReduceLROnPlateau': # if hype_space['lr_scheduler']['args']['mode'] == 'min': # lr_scheduler.step(train_loss) # else: # lr_scheduler.step(train_metric) lr_scheduler.step(val_loss) elif hype_space['lr_scheduler']['type'] == 'GradualWarmupScheduler': lr_scheduler.step(epoch, val_loss) else: lr_scheduler.step() logger.info('Epoch:[{}/{}]\t {:10s}: {:.5f}\t {:10s}: {:.5f}'.format( 
epoch, epochs, 'loss', train_loss, 'metric', train_metric)) logger.info(' \t {:10s}: {:.5f}\t {:10s}: {:.5f}'.format( 'val_loss', val_loss, 'val_metric', val_metric)) logger.info(' \t learning_rate: {}'.format( optimizer.param_groups[0]['lr'])) # check whether model performance improved or not, according to specified metric(mnt_metric) if mnt_mode != 'off': mnt_metric = val_loss if mnt_metric_name == 'val_loss' else val_metric improved = (mnt_mode == 'min' and mnt_metric <= mnt_best) or \ (mnt_mode == 'max' and mnt_metric >= mnt_best) if improved: mnt_best = mnt_metric not_improved_count = 0 best = True else: not_improved_count += 1 if not_improved_count > early_stop: logger.info( "Validation performance didn\'t improve for {} epochs. Training stops." .format(early_stop)) break if best == True: save_checkpoint(model, epoch, optimizer, mnt_best, hype_space, checkpoint_dir, save_best=True) logger.info("Saving current best: model_best.pth ...") # Tensorboard log train_writer.add_scalar('loss', train_loss, epoch) train_writer.add_scalar('metric', train_metric, epoch) train_writer.add_scalar('learning_rate', optimizer.param_groups[0]['lr'], epoch) val_writer.add_scalar('loss', val_loss, epoch) val_writer.add_scalar('metric', val_metric, epoch) # Logger for test logger = get_logger(result_dir + '/info.log', name='test' + run_id) logger.propagate = False # Load model_best checkpoint model = load_checkpoint(model, checkpoint_dir + '/model_best.pth', logger) # Testing test_loss, test_metric = test(model, test_loader, criterion, metric, device=device) logger.info(' {:10s}: {:.5f}\t {:10s}: {:.5f}'.format( 'loss', test_loss, 'metric', test_metric)) challenge_metrics.return_metric_list() analyze(model, test_loader, criterion, challenge_metrics, logger, result_dir, device=device) write_json(hype_space, '{}/{}_{:.5f}.json'.format(save_path, run_id, test_metric)) return -test_metric
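# train_challenge2020() returns -test_metric, the usual convention for
# objective functions handed to a minimizer such as hyperopt's fmin. A hedged
# usage sketch; the search space below is purely illustrative and far smaller
# than the real hype_space this script expects.
from hyperopt import fmin, hp, tpe

space = {'lr': hp.loguniform('lr', -9, -4)}
# best = fmin(train_challenge2020, space, algo=tpe.suggest, max_evals=20)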
# -*- coding:utf-8 -*-
import torch
import torchvision
from tensorboardX import SummaryWriter
from models.I2T import Encoder_Image, Generator_I2T
from models.T2I import Generator_T2I
from models.cyclegan_TI import CycleGAN_TI

net1 = Encoder_Image(num_channels=100).cuda()
net2 = Generator_I2T(vocab_size=1000, input_size=200, hidden_size=100).cuda()
net3 = Generator_T2I(
    embedding_size=100,
    filter_sizes=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20],
    num_filters=[100, 200, 200, 200, 200, 100, 100, 100, 100, 100, 160,
                 160]).cuda()
model = CycleGAN_TI().cuda()

writer1 = SummaryWriter(log_dir='../logs1', comment='Encoder_Image')
writer2 = SummaryWriter(log_dir='../logs2', comment='Generator_T2I')

images = torch.ones([9, 3, 64, 64]).cuda()
text_input = torch.ones([9, 1, 20, 100]).cuda()

with writer1:
    writer1.add_graph(net1, input_to_model=(images, ), verbose=True)
with writer2:
    writer2.add_graph(net3, input_to_model=(text_input, ), verbose=True)
    def forward(self, x):
        x = F.relu(F.max_pool2d(self.conv1(x), 2))
        x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))
        x = x.view(-1, 44180)
        x = F.relu(self.fc1(x))
        # x = F.dropout(x, p=0.1, training=self.training)
        x = self.fc2(x)
        return F.log_softmax(x, dim=1)

# -----------------------------------------------------------------------------------
cnn = CNN()

# ---- write the model on tensorboard
writer_train.add_graph(
    cnn,
    Variable((torch.Tensor(train_loader.dataset.dataImages[0:1])).cpu(), ))

if is_cuda:
    cnn.cuda()

# -----------------------------------------------------------------------------------
# Loss and Optimizer
learning_rate = 0.001
criterion = nn.CrossEntropyLoss()
# criterion = F.cross_entropy()
optimizer = torch.optim.Adam(cnn.parameters(), lr=learning_rate)
# -----------------------------------------------------------------------------------
targets = autograd.Variable(targets_tens)

# Set up network
model = nn.Sequential(
    nn.Linear(3, 4, bias=False),
    nn.Sigmoid(),
    nn.Linear(4, 1, bias=False),
    nn.Sigmoid()
)
loss_func = nn.MSELoss(size_average=False)
params = list(model.parameters())

# For tensorboard, evaluate the loss once to make the graph (old
# tensorboardX API: the graph is traced backwards from an output Variable)
writer = SummaryWriter(comment='_ann_basic')
loss = loss_func(model(inputs), targets)
writer.add_graph(model, loss)

# add random image just for testing
image = torch.Tensor(scipy.misc.ascent())
writer.add_image('ascent', image)

# Parameters for learning
learning_rate = 1.0
niter = 5000

# Function mapping output to predicted labels
def predicted_labels(x):
    if isinstance(x, autograd.Variable):
        x = x.data
    out = torch.zeros_like(x)
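# Note: writer.add_graph(model, loss) above relies on the old tensorboardX
# behaviour of tracing backwards from an output Variable. Current
# tensorboardX / torch.utils.tensorboard expect the *input* instead; a
# minimal, self-contained equivalent:
import torch
import torch.nn as nn
from tensorboardX import SummaryWriter

net = nn.Sequential(nn.Linear(3, 4, bias=False), nn.Sigmoid(),
                    nn.Linear(4, 1, bias=False), nn.Sigmoid())
with SummaryWriter(comment='_ann_basic_graph') as writer:
    writer.add_graph(net, torch.rand(8, 3))  # dummy (batch, features) input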
class TensorBoardLogger(Callback):
    """Callback that logs epoch results to a TensorBoard file."""

    def __init__(self, log_dir=None, comment='', ignores=None,
                 log_model_graph=False, log_param_interval=0,
                 *args, **kwargs):
        """Initialization for TensorBoardLogger.

        Parameters
        ----------
        log_dir: str
            Path to save the tensorboard file.
            Default: 'runs/{fmt_datetime}_{hostname}{comment}'.
        comment: str
            Comment appended to log_dir. Default: ''.
        ignores: list
            A list of names that will not be logged. Default: None.
        log_model_graph: bool
            Whether to save the model graph definition. Default: False.
        log_param_interval: int
            Number of epochs between logging parameter histograms.
            Default: 0 (no logging).
        """
        super(TensorBoardLogger, self).__init__(*args, **kwargs)
        self.writer = SummaryWriter(log_dir, comment=comment)
        if ignores is None:
            ignores = []
        self.ignores = ignores
        self.log_model_graph = log_model_graph
        self.log_param_interval = log_param_interval
        self.epochs_since_logged_params = 0

    def _teardown(self):
        self.writer.close()

    def log(self, step, meter):
        log_type = meter.meter_type
        method = getattr(self, 'log_' + log_type, None)
        if not method:
            return
        method(meter.alias, meter.value, step)

    def log_image(self, tag, img_tensor, step=None):
        self.writer.add_image(tag, img_tensor, step)

    def log_scalar(self, tag, scalar_value, step=None):
        self.writer.add_scalar(tag, scalar_value, step)

    def log_graph(self, model, input):
        self.writer.add_graph(model, input)

    def log_hist(self, tag, value, step=None, bins='tensorflow'):
        self.writer.add_histogram(tag, value, step, bins)

    def log_text(self):
        pass

    def log_audio(self):
        pass

    def _log_model_and_params(self, trainer, state):
        if state['mode'] != TRAIN_MODE:
            return
        if self.log_model_graph:
            model = state['model']
            input = state['input']
            self.log_graph(model, input)
            self.log_model_graph = False
        if self.log_param_interval == 0:
            return
        self.epochs_since_logged_params += 1
        if self.epochs_since_logged_params < self.log_param_interval:
            return
        self.epochs_since_logged_params = 0
        model = state['model']
        epochs = state['epochs']
        for name, params in model.named_parameters():
            self.log_hist(name, params.clone().cpu().data.numpy(), epochs)

    def __on_batch_end(self, trainer, state):
        """Deprecated"""
        iters = state['iters']
        mode = state['mode']
        for name, meter in state['meters'].items():
            if meter.meter_mode != mode:
                continue
            if meter.reset_mode == BATCH_RESET and \
                    name not in self.ignores and meter.can_call:
                self.log(iters, meter)

    def on_epoch_end(self, trainer, state):
        self._log_model_and_params(trainer, state)
        epochs = state['epochs']
        mode = state['mode']
        for meter in state['meters'].values():
            if meter.mode != mode:
                continue
            alias = meter.alias
            if (meter.reset_mode == EPOCH_RESET
                    and alias not in self.ignores):
                self.log(epochs, meter)

    def on_validate_end(self, trainer, state):
        self.on_epoch_end(trainer, state)
# coding=utf-8
from mypackage.utils import Model
from tensorboardX import SummaryWriter
import torch
import torchvision

model = Model(50).model
writer = SummaryWriter(log_dir="log/network")
input = torch.autograd.Variable(torch.Tensor(1, 1, 256, 256),
                                requires_grad=True)
writer.add_graph(model=model, input_to_model=(input, ))

# model = torchvision.models.AlexNet(num_classes=10)
# # Create the SummaryWriter and trace the graph *before* calling
# # '.to(device)', otherwise add_graph raises an error
# writer = SummaryWriter(log_dir="log/network")
# dummy_input = torch.autograd.Variable(torch.rand(1, 3, 227, 227))
# writer.add_graph(model=model, input_to_model=dummy_input)
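# A self-contained version of the pattern the commented lines describe:
# trace the graph while both the model and the dummy input are still on the
# CPU, and only move the model to the GPU afterwards, so add_graph never
# sees mismatched devices.
import torch
import torchvision
from tensorboardX import SummaryWriter

model = torchvision.models.AlexNet(num_classes=10)
writer = SummaryWriter(log_dir="log/network")
dummy_input = torch.rand(1, 3, 227, 227)
writer.add_graph(model, (dummy_input, ))  # both on CPU here
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)                  # safe to move afterwards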
    model = save["model"]
    params_list = changegrad(model)
    model.cuda()
    optimizer = torch.optim.Adam(params_list, lr=lr, weight_decay=0.00008)
    try:
        optimizer.load_state_dict(save["optimizer"])
    except Exception:
        pass  # optimizer state may not match the current parameter groups
    epoch = save["epoch"]
    print(f"load from './model/{NetTitle}Newest'")
else:
    print("training from scratch")
    params_list = changegrad(modelNet)
    optimizer = torch.optim.Adam(params_list, lr=lr, weight_decay=0.00008)
    epoch = 0
writer.add_graph(modelNet, (torch.rand([1, 1, 224, 224])))
model = modelNet.cuda()
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode='min', factor=0.96, patience=2 * len(train_loader),
    verbose=True, threshold=0.0001, threshold_mode='rel', cooldown=1,
    min_lr=0, eps=1e-08)
rate1, rate2 = test()
class Trainer(object): def __init__(self, model, datasets, criterion, args): """ :param model: `torch.nn.Module` to be trained :param datasets: dict of datasets including 'train', 'valid', and 'test' :param criterion: callable loss function, returns dict of losses :param args: parsed results of `ArgumentParser` """ self.is_main_process = args.local_rank is None or args.local_rank == 0 self.datasets = datasets log_dir = Path(args.log_dir) # iteration counters self.iteration = 0 self.start_epoch = 0 self.min_epoch_loss = float('inf') self.max_metric_score = 0 optimizer_state = None lr_scheduler_state = None self.args = args if self.is_main_process: # print args args_yaml = yaml.dump((vars(args))) terminal_columns = shutil.get_terminal_size().columns self.println("=" * terminal_columns) self.println(args_yaml + ("=" * terminal_columns)) if args.local_rank is not None: assert args.device in ('auto', 'cuda') torch.cuda.set_device(args.local_rank) args.device = 'cuda' # only support GPU if args.resume: self.println('resume checkpoint ...') resume_checkpoint = torch.load( args.resume_checkpoint_file, map_location=lambda storage, loc: storage) model = model.load(resume_checkpoint['model_file']) self.start_epoch = resume_checkpoint['epoch'] self.min_epoch_loss = resume_checkpoint.get( 'min_epoch_loss', self.min_epoch_loss) self.max_metric_score = resume_checkpoint.get( 'max_metric_score', self.max_metric_score) self.iteration = resume_checkpoint['iteration'] optimizer_state = resume_checkpoint['optimizer'] lr_scheduler_state = resume_checkpoint['lr_scheduler'] self.println('resume epoch {} iteration {}'.format( self.start_epoch, self.iteration)) device = choose_device(args.device) self.use_cuda = device.type == 'cuda' self.mixup_epochs = args.no_mixup_epochs if args.no_mixup_epochs > 1.0 else ( 1 - args.no_mixup_epochs) * args.max_epochs self.criterion = criterion self.model = model self.net, self.optimizer = create_optimizer( model, args.optim, args.learning_rate, args.weight_decay, args.momentum, args.apex_opt_level, optimizer_state=optimizer_state, device=device, no_bn_wd=args.no_bn_wd, local_rank=args.local_rank, sync_bn=args.sync_bn) self.data_loaders = { k: create_dataset_loaders(d, args, self.use_cuda, shuffle=(k == 'train')) for k, d in datasets.items() if k != 'test' } self.lr_scheduler = create_lr_scheduler(self.optimizer, **vars(args)) if lr_scheduler_state: print(f'resume lr_scheduler_state {lr_scheduler_state}') self.lr_scheduler.load_state_dict(lr_scheduler_state) print( f'resumed lr_scheduler_state{self.lr_scheduler.state_dict()}') self.checkpoints_folder = log_dir / 'checkpoints' self.checkpoints_folder.mkdir(parents=True, exist_ok=True) datasets_text = '\n '.join( ['{} {}'.format(k, v) for k, v in datasets.items()]) self.println("datasets:\n") self.println(datasets_text) self.tb_writer = None if self.is_main_process: print("logging into {}".format(log_dir)) self.tb_writer = SummaryWriter(log_dir=str(log_dir)) self.tb_writer.add_text('args', repr(args_yaml)[1:-1], 0) self.tb_writer.add_text('datasets', repr(datasets_text)[1:-1], 0) with (log_dir / 'args.yml').open('w') as f: f.write(args_yaml) #tb_writer.add_text('cfg', str(ssd_net) + "\n" + str(ssd_net.cfg), 0) #if not use_fp16: if args.write_graph: # write graph with torch.no_grad(): images = next(iter(self.data_loaders['train']))['image'] images = images.to(device) model.trace_mode = True self.tb_writer.add_graph(model, images) model.trace_mode = False def println(self, *args, **kwargs): if self.is_main_process: print(*args, 
**kwargs) def run_epoch(self, epoch, phase): data_loader = self.data_loaders[phase] if isinstance(data_loader.sampler, DistributedSampler): data_loader.sampler.set_epoch(epoch) # loss counters epoch_loss_dict = {} is_train = phase == 'train' if is_train: if self.lr_scheduler.name != 'plateau': self.lr_scheduler.step(epoch=epoch) self.optimizer.zero_grad() if self.tb_writer: self.tb_writer.add_scalar('learning_rate', self.get_lr(), epoch) self.net.train(is_train) torch.set_grad_enabled(is_train) desc = f"Epoch {epoch} {phase}" if self.args.local_rank is not None: desc = f"[{self.args.local_rank}]" + desc pbar_disable = False if epoch == self.start_epoch + 1 else None pbar = tqdm(data_loader, desc=desc, unit="images", unit_scale=data_loader.batch_size, leave=False, disable=pbar_disable, mininterval=10, smoothing=1) it = 0 # for logging images min_loss_in_epoch = float("inf") max_loss_in_epoch = 0 batch_of_min_loss_in_epoch = None batch_of_max_loss_in_epoch = None for batch in pbar: inputs = batch.pop('input', None) targets = batch if inputs is None: warnings.warn( f'no input, skip (data in batch are {batch.keys()})') assert False continue if self.use_cuda: inputs = inputs.cuda(non_blocking=True) targets = targets_to_cuda(targets) criterion = self.criterion if phase == 'train' and self.args.mixup > 0: if epoch < self.mixup_epochs: inputs, criterion = mixup(inputs, alpha=self.args.mixup, criterion=criterion) # forward outputs = self.net(inputs) losses = criterion(outputs, targets) # compute overall loss if multi losses is returned if isinstance(losses, dict): if 'All' not in losses: losses['All'] = sum(losses.values()) elif isinstance(losses, torch.Tensor): losses = dict(All=losses) else: raise RuntimeError(type(losses)) loss = losses['All'] optimize_step = False if phase == 'train': self.optimizer.backward(loss / self.args.gradient_accumulation) if self.iteration % self.args.gradient_accumulation == 0: optimize_step = True it += self.args.gradient_accumulation if self.args.clip_grad_norm > 0: clip_grad_norm_(self.net.parameters(), self.args.clip_grad_norm) self.optimizer.step() self.optimizer.zero_grad() self.iteration += 1 if self.lr_scheduler and self.lr_scheduler.name == 'findlr': self.lr_scheduler.step(self.iteration) if self.tb_writer: self.tb_writer.add_scalar('learning_rate', self.get_lr(), self.iteration) elif phase == 'valid': it += 1 if self.args.local_rank is not None: # sync loss between processes world_size = torch.distributed.get_world_size() for l in losses.values(): torch.distributed.reduce(l, dst=0) if self.is_main_process: l /= world_size if not self.is_main_process: continue # Below are logging in optimization step batch_loss_dict = {k: v.item() for k, v in losses.items()} if self.tb_writer and optimize_step and self.args.log_loss_interval > 0 and self.iteration % self.args.log_loss_interval == 0: # tb_writer.add_scalars('Loss', batch_loss_dict, iteration) for k, v in batch_loss_dict.items(): self.tb_writer.add_scalar(phase + '/Loss/' + k, v, epoch) epoch_loss_dict = { k: epoch_loss_dict.get(k, 0) + v for k, v in batch_loss_dict.items() } batch_loss = batch_loss_dict['All'] if batch_loss < min_loss_in_epoch: min_loss_in_epoch = batch_loss batch_of_min_loss_in_epoch = (inputs, targets) if batch_loss > max_loss_in_epoch: max_loss_in_epoch = batch_loss batch_of_max_loss_in_epoch = (inputs, targets) if it > 0: # update the progress bar scalars = { k: "%.03f" % (v / it) for k, v in epoch_loss_dict.items() } pbar.set_postfix(scalars, refresh=False) if not self.is_main_process: 
return 0 epoch_loss_dict = {k: v / it for k, v in epoch_loss_dict.items()} if self.tb_writer: if self.args.log_images: name_batch = { "min_loss": batch_of_min_loss_in_epoch, "max_loss": batch_of_max_loss_in_epoch } for name, batch in name_batch.items(): if batch is not None: images = self.visualize_batch(*batch) images_grid = vutils.make_grid(images, normalize=False) self.tb_writer.add_image('/'.join([phase, name]), images_grid, epoch) #scalars = {phase + k: v for k, v in epoch_loss_dict.items()} #tb_writer.add_scalars('EpochLoss', scalars, epoch) for k, v in epoch_loss_dict.items(): self.tb_writer.add_scalar(phase + '/EpochLoss/' + k, v, epoch) return epoch_loss_dict['All'] def get_lr(self): return self.optimizer.param_groups[0]['lr'] def save_checkpoint(self, epoch, model_filename, checkpoint_filename=None): if not checkpoint_filename: checkpoint_filename = model_filename model_filename = str( self.checkpoints_folder / model_filename) + '.model.pth' checkpoint_filename = str( self.checkpoints_folder / checkpoint_filename) + '.checkpoint.pth' self.model.save(model_filename) optimizer_state_dict = optimizer_cpu_state_dict(self.optimizer) torch.save( { 'epoch': epoch, 'min_epoch_loss': self.min_epoch_loss, 'max_metric_score': self.max_metric_score, 'iteration': self.iteration, 'optimizer': optimizer_state_dict, 'lr_scheduler': self.lr_scheduler.state_dict(), 'model_file': model_filename, 'args': self.args }, checkpoint_filename) checkpoint_saved = Path(checkpoint_filename) last_checkpoint_file = self.checkpoints_folder / 'last.checkpoint' if last_checkpoint_file.exists(): last_checkpoint_file.unlink() last_checkpoint_file.symlink_to( checkpoint_saved.relative_to(self.checkpoints_folder)) def run(self): self.println('Training', repr(self.model), 'Epochs:', self.start_epoch, '/', self.args.max_epochs) pbar_epoch = trange(self.start_epoch + 1, self.args.max_epochs + 1, unit="epoch", disable=not self.is_main_process) for epoch in pbar_epoch: epoch_state = {} for phase in self.data_loaders: if phase == 'valid' and epoch % self.args.validation_interval != 0: continue epoch_loss = self.run_epoch(epoch, phase) evaluation = None if 'test' in self.datasets and phase == 'valid': evaluation = self.test() if not self.is_main_process: continue # Below are processing between epoch, e.g. save checkpoints, logging, etc. 
early_stopping = False if evaluation is not None: epoch_state['metric'] = metric_score = evaluation['score'] for k, v in evaluation.items(): if isinstance(v, dict) and 'score' in v: self.tb_writer.add_scalar( 'test/' + k.replace(' ', '_'), v['score'], epoch) if metric_score > self.max_metric_score: self.max_metric_score = metric_score print( '\nsave checkpoint at epoch {} with best {} metric {}' .format(epoch, phase, self.max_metric_score)) self.save_checkpoint(epoch, "best_metric") if phase == 'valid' or 'valid' not in self.data_loaders: if self.min_epoch_loss > epoch_loss: self.min_epoch_loss = epoch_loss print( '\nsave checkpoint at epoch {} with best {} loss {}' .format(epoch, phase, self.min_epoch_loss)) self.save_checkpoint(epoch, 'best_loss') if epoch % self.args.validation_interval == 0: if self.args.lr_scheduler == 'plateau': self.lr_scheduler.step(metrics=epoch_loss) early_stopping = (self.get_lr() < self.args.stopping_learning_rate) if (early_stopping or (epoch == self.args.max_epochs) or (phase == 'valid') or (self.args.checkpoints_interval > 0 and epoch % self.args.checkpoints_interval == 0 and epoch % self.args.validation_interval != 0)): print( '\nsave checkpoint at epoch {} with {} loss {}'.format( epoch, phase, epoch_loss)) self.save_checkpoint(epoch, "last") epoch_state[phase + '_loss'] = epoch_loss if early_stopping: print('early stopping!') print('Metric Score = {}'.format(self.max_metric_score)) return if self.args.lr_scheduler == 'findlr': print('finish find lr') return if self.is_main_process: epoch_state['time'] = datetime.now().strftime('%d%b%H:%M') epoch_state['min_loss'] = self.min_epoch_loss epoch_state['lr'] = self.get_lr() pbar_epoch.set_postfix(epoch_state, refresh=False) self.println('Metric Score = {}'.format(self.max_metric_score)) def test(self): raise NotImplementedError() def visualize_batch(self, inputs, targets): raise NotImplementedError()
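# Sketch of the 'last.checkpoint' symlink trick save_checkpoint() uses
# above: every checkpoint is kept under its own name, while a stable
# relative symlink always points at the newest one.
from pathlib import Path

def link_last(checkpoint: Path, folder: Path) -> None:
    last = folder / 'last.checkpoint'
    if last.is_symlink() or last.exists():  # also covers dangling symlinks
        last.unlink()
    last.symlink_to(checkpoint.relative_to(folder))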
def pre_trained(judge):
    writer = SummaryWriter(
        log_dir='./loss/pre_train_loss_model1/pre_train_loss_SMI_2020_07_31_0%d'
        % date_num)
    if judge == 0:
        model = BSsequential_net_lstm().to(device)
        print("Total number of parameters in networks is {} ".format(
            sum(x.numel() for x in model.parameters())))
        # model.apply(weights_init)
        temp = 10000000000000
        epoch_num = 1
    else:
        model = BSsequential_net_lstm().to(device)
        mode_patch = './model_file/pre_trained_network_model_model1/pre_trained_network_model_SMI_2020_07_24_02.pth'
        model.load_state_dict(torch.load(mode_patch))
        temp = 10000000000000
        epoch_num = 1
        # path_temp = './Temporary_parameters/pre_temp_model1.mat'
        # temp = scipio.loadmat(path_temp)
        # temp = temp['temp'].item()
        # path_epoch = './Temporary_parameters/pre_epoch_num_model1.mat'
        # epoch_num = scipio.loadmat(path_epoch)
        # epoch_num = epoch_num['epoch_num'].item()+1
    if is_consistent == 0:
        map_xline = np.zeros(0)
        map_inline = np.zeros(0)
    else:
        is_path = './SMI_out/map_number_2020_05_14_06.mat'
        Random_path = scipio.loadmat(is_path)
        map_xline = Random_path['map_xline']
        map_inline = Random_path['map_inline']
    count = 0  # counter that checks whether the network weights change
    lr = 0.001  # learning rate
    for epoch in range(epoch_num, EPOCHS + 1):
        print(epoch, count)
        # initial parameters used to check whether the weights change:
        # temp_weight = model.fc60.weight
        # temp_a = torch.sum(temp_weight.data)
        # print(temp_weight)
        temp_weight = model.lstm60
        temp_a = torch.sum(temp_weight.weight_hh_l0.data) + \
            torch.sum(temp_weight.weight_ih_l0.data)
        # print(a)
        if np.mod(epoch + 1, 200) == 0:
            lr = lr * 0.99
        optimizer = optim.Adam(model.parameters(), lr=lr)
        if is_consistent == 1:
            trace_number = np.int(map_xline[0, epoch - 1] * 142 +
                                  map_inline[0, epoch - 1])
        else:
            temp_1 = np.random.randint(0, 142, 1)  # 29
            temp_2 = np.random.randint(0, 110, 1)  # 22
            trace_number = temp_2 * 142 + temp_1
            map_xline = np.append(map_xline, temp_2)
            map_inline = np.append(map_inline, temp_1)
            # trace_number = temp_2*5*142+temp_1*5
            # map_xline = np.append(map_xline, temp_2 * 5)
            # map_inline = np.append(map_inline, temp_1 * 5)
            # trace_number = np.random.randint(0, 142*110*data_rate, 1)
            # print(trace_number)
        # compute correlation coefficients
        coef_seismic = np.zeros((105, Xline1_110_label_impedance.shape[1]))
        coef_seismic[0, :] = train1_110_seismic[trace_number, :]
        coef_seismic[1:105, :] = train_well_seismic[:, :]
        temp_coef = np.corrcoef(coef_seismic)
        # select wells whose correlation coefficient exceeds the threshold
        # and that lie within the search radius
        tempval_1 = np.zeros(0)
        temp_train_well_1 = np.zeros(0)
        temp_train_well_seisic_1 = np.zeros(0)
        absCORcoef = np.abs(temp_coef[0, 1:105])
        if which_choose_well == 1:
            num = 0
            for k in range(0, 104):
                if absCORcoef[k] > coefval:
                    # coordinates of the well data
                    wellxline = Xline_Inline_number[0, k]
                    wellinline = Xline_Inline_number[1, k]
                    # coordinates of the target seismic trace
                    seismicinline = np.mod(trace_number + 1, 142)
                    seismicxline = (trace_number + 1 - seismicinline) / 142 + 1
                    R = np.sqrt((seismicxline - wellxline) *
                                (seismicxline - wellxline) +
                                (seismicinline - wellinline) *
                                (seismicinline - wellinline))
                    if R < Rval:
                        tempval_1 = np.append(tempval_1, absCORcoef[k])
                        temp_train_well_1 = np.append(temp_train_well_1,
                                                      train_well[k, :])
                        temp_train_well_seisic_1 = np.append(
                            temp_train_well_seisic_1,
                            train_well_seismic[k, :])
                        num = num + 1
            temp_train_well = np.zeros(0)
            temp_train_well_seisic = np.zeros(0)
            if num < num_well:
                num = num_well
                tempval = np.zeros(0)
                for max_num in range(0, num):
                    temp_tempval = max(absCORcoef)
                    tempval = np.append(tempval, temp_tempval)
                    for max_num2 in range(0, 104):
                        if temp_tempval == absCORcoef[max_num2]:
                            absCORcoef[max_num2] = 0
                            temp_train_well = np.append(
                                temp_train_well, train_well[max_num2, :])
                            temp_train_well_seisic = np.append(
                                temp_train_well_seisic,
                                train_well_seismic[max_num2, :])
            else:
                tempval = np.zeros(0)
                temp_train_well_1 = torch.from_numpy(temp_train_well_1)
                temp_train_well_1 = temp_train_well_1.view(num, -1)
                temp_train_well_1 = temp_train_well_1.cpu().detach().numpy()
                temp_train_well_seisic_1 = torch.from_numpy(
                    temp_train_well_seisic_1)
                temp_train_well_seisic_1 = temp_train_well_seisic_1.view(
                    num, -1)
                temp_train_well_seisic_1 = \
                    temp_train_well_seisic_1.cpu().detach().numpy()
                for max_num in range(0, num_well):
                    temp_tempval = max(tempval_1)
                    tempval = np.append(tempval, temp_tempval)
                    for max_num2 in range(0, num):
                        if temp_tempval == tempval_1[max_num2]:
                            tempval_1[max_num2] = 0
                            temp_train_well = np.append(
                                temp_train_well,
                                temp_train_well_1[max_num2, :])
                            temp_train_well_seisic = np.append(
                                temp_train_well_seisic,
                                temp_train_well_seisic_1[max_num2, :])
        else:
            num = num_well
            tempval = np.zeros(0)
            temp_train_well = np.zeros(0)
            temp_train_well_seisic = np.zeros(0)
            for max_num in range(0, num):
                temp_tempval = max(absCORcoef)
                tempval = np.append(tempval, temp_tempval)
                for max_num2 in range(0, 104):
                    if temp_tempval == absCORcoef[max_num2]:
                        absCORcoef[max_num2] = 0
                        temp_train_well = np.append(temp_train_well,
                                                    train_well[max_num2, :])
                        temp_train_well_seisic = np.append(
                            temp_train_well_seisic,
                            train_well_seismic[max_num2, :])
        num = num_well
        maxval = max(tempval)
        minval = min(tempval)
        max_minlen = maxval - minval
        tempval = (tempval - minval) / max_minlen
        valsum = sum(tempval)
        tempval = tempval / valsum
        tempval = torch.from_numpy(tempval)
        tempval = tempval.view(1, -1)
        tempval = tempval.float()
        tempval = tempval.to(device)
        temp_train_well = torch.from_numpy(temp_train_well)
        temp_train_well = temp_train_well.view(num, -1)
        temp_train_well = temp_train_well.float()
        # temp_train_well = temp_train_well.to(device)
        # temp_train_well = temp_train_well.view(num, -1)
        # temp_train_well_seisic = torch.from_numpy(temp_train_well_seisic)
        # temp_train_well_seisic = temp_train_well_seisic.float()
        # temp_train_well_seisic = temp_train_well_seisic.to(device)
        # temp_train_well_seisic = temp_train_well_seisic.view(num, -1)
        # temp_seismic = torch.from_numpy(train1_75_seismic[trace_number, :])
        # temp_seismic = temp_seismic.float()
        # temp_seismic = temp_seismic.to(device)
        # temp_seismic = temp_seismic.view(1, -1)
        temp_lable = torch.from_numpy(
            Xline1_110_label_impedance[trace_number, :])
        temp_lable = temp_lable.float()
        # temp_lable = temp_lable.to(device)
        temp_lable = temp_lable.view(1, -1)
        # for rand in range(0, 60 - BATCH_LEN + 1):
        for num_rand in range(0, number):
            rand = np.random.randint(0, 60 - BATCH_LEN + 1, 1)
            temp_train_seismic = train1_110_seismic[
                trace_number, rand[0]:rand[0] + BATCH_LEN]
            temp_train_seismic = torch.from_numpy(temp_train_seismic)
            temp_train_seismic = temp_train_seismic.float()
            temp_train_seismic = temp_train_seismic.to(device)
            temp_train_seismic = temp_train_seismic.view(1, -1)
            # the network input combines the selected well data and
            # near-well traces with one target trace
            train_dataset = MyDataset2(
                temp_train_well[:, rand[0]:rand[0] + BATCH_LEN],
                temp_lable[:, rand[0]:rand[0] + BATCH_LEN])
            train_dataloader = DataLoader(train_dataset,
                                          batch_size=BATCH_SIZE,
                                          num_workers=1,
                                          shuffle=True,
                                          drop_last=False)
            epoch_loss = []
            for itr, (train_dt, train_lable) in enumerate(train_dataloader):
                train_dt, train_lable = train_dt.to(device), \
                    train_lable.to(device)
                train_dt = train_dt.float()
                train_lable = train_lable.float()
                model.train()
                optimizer.zero_grad()
                output = model(train_dt, temp_train_seismic)
                if is_synseismic == 1:
                    syn_seismic = syn_seismic_fun2(output, wavelet)
                    syn_seismic = 
syn_seismic.float() loss = F.mse_loss(syn_seismic, temp_train_seismic) + F.mse_loss(output, train_lable) else: loss = F.mse_loss(output, train_lable) loss.backward() optimizer.step() # print(model.conv1.weight) # print(model.conv2.weight) # print(model.lstm.weight) # print(model.fc1.weight.data[:, 0]) epoch_loss.append(loss.item()) # temp_b = torch.sum(model.fc60.weight.data) temp_b = torch.sum(model.lstm60.weight_hh_l0.data) + torch.sum(model.lstm60.weight_ih_l0.data) # print(b) if temp_a == temp_b: count = count + 1 else: count = 0 if count > 50: break epoch_loss = np.sum(np.array(epoch_loss)) writer.add_scalar('Train/MSE', epoch_loss, epoch) epoch_num = epoch print('Train set: Average loss: {:.15f}'.format(epoch_loss)) if epoch_loss < temp: path = './model_file/pre_trained_network_model_model1/pre_trained_network_model_SMI_2020_07_31_0%d.pth' % date_num torch.save(model.state_dict(), path) path_loss = './Temporary_parameters/pre_temp_model1.mat' path_epoch = './Temporary_parameters/pre_epoch_num_model1.mat' scipio.savemat(path_loss, {'epoch_loss': epoch_loss}) scipio.savemat(path_epoch, {'epoch_num': epoch_num}) if is_consistent == 0: pathmat = './SMI_out/map_number_2020_07_31_0%d.mat' % date_num scipio.savemat(pathmat, {'map_xline': map_xline, 'map_inline': map_inline}) writer.add_graph(model, (train_dt, temp_train_seismic)) writer.close()
# uphi/ephi, DatasetFromFolder, and the module-level flags criterion_s,
# u_initial, u_num, edge_initial and t_dir are defined elsewhere in this repo.
import random
import time

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from munkres import Munkres
from torch.utils.data import DataLoader
from tensorboardX import SummaryWriter


class GN(object):
    def __init__(self, lr=1e-3, batchs=8, cuda=True):
        '''
        :param lr: learning rate for Adam
        :param batchs: batch size
        :param cuda: run on the GPU if True
        Per sequence, self.train_test holds the number of frames to train on and
        self.tag selects whether to also evaluate on the testing data
        (1 - evaluate on testing data, 0 - skip it).
        '''
        self.hungarian = Munkres()
        self.device = torch.device("cuda" if cuda else "cpu")
        self.nEpochs = 999
        self.lr = lr
        self.batchsize = batchs
        self.numWorker = 4
        self.show_process = 0  # interactive stepping through the training loop
        self.step_input = 1

        print(' Preparing the model...')
        self.resetU()
        self.Uphi = uphi().to(self.device)
        self.Ephi = ephi().to(self.device)
        self.criterion = nn.MSELoss() if criterion_s else nn.CrossEntropyLoss()
        self.criterion = self.criterion.to(self.device)
        self.optimizer = optim.Adam([{'params': self.Uphi.parameters()},
                                     {'params': self.Ephi.parameters()}],
                                    lr=lr)

        # seqs = [2, 4, 5, 9, 10, 11, 13]
        # lengths = [600, 1050, 837, 525, 654, 900, 750]
        seqs = [2, 4, 5, 10]
        lengths = [600, 1050, 837, 654]

        for i in range(len(seqs)):
            self.writer = SummaryWriter()
            seq = seqs[i]
            self.seq_index = seq
            start = time.time()
            sequence_dir = 'MOT16/train/MOT16-%02d' % seq
            self.outName = t_dir + 'result_%02d.txt' % seq
            self.train_set = DatasetFromFolder(sequence_dir, self.outName)
            self.train_test = lengths[i]
            self.tag = 0
            self.loss_threshold = 0.03
            self.update()
            print(' Logging...')
            t_data = time.time() - start
            self.log(t_data)

    def getEdges(self):
        # statistics of the graph built between two frames' detections
        self.train_set.setBuffer(1)
        step = 1
        edge_counter = 0.0
        for head in range(1, self.train_test):
            self.train_set.loadNext()  # get the next frame
            edge_counter += self.train_set.m * self.train_set.n
            step += 1
            self.train_set.swapFC()
        with open(self.outName, 'a') as out:
            print('Average edge:', edge_counter / step, file=out)

    def showNetwork(self):
        # add the Uphi and Ephi graphs to tensorboard;
        # passing a tuple traces a multi-argument forward()
        E = torch.rand(1, 2).to(self.device)
        V = torch.rand(1, 512).to(self.device)
        u = torch.rand(1, 100).to(self.device)
        self.writer.add_graph(self.Uphi, (E, V, u))

        E = torch.rand(1, 2).to(self.device)
        V1 = torch.rand(1, 512).to(self.device)
        V2 = torch.rand(1, 512).to(self.device)
        u = torch.rand(1, 100).to(self.device)
        self.writer.add_graph(self.Ephi, (E, V1, V2, u))

    def log(self, t_data):
        with open(self.outName, 'w') as out:
            print(self.criterion, file=out)
            print('lr:{}'.format(self.lr), file=out)
            print(self.optimizer.state_dict(), file=out)
            print(self.Uphi, file=out)
            print(self.Ephi, file=out)
            print('Time consumed loading the datasets:', t_data, file=out)
        # self.showNetwork()

    def resetU(self):
        if u_initial:
            self.u = torch.FloatTensor([random.random() for _ in range(u_num)]).view(1, -1)
        else:
            self.u = torch.FloatTensor([0.0 for _ in range(u_num)]).view(1, -1)
        self.u = self.u.to(self.device)

    def updateNetwork(self):
        self.train_set.setBuffer(1)
        step = 1
        average_epoch = 0
        edge_counter = 0.0
        for head in range(1, self.train_test):
            self.train_set.loadNext()  # get the next frame
            edge_counter += self.train_set.m * self.train_set.n
            start = time.time()
            show_name = 'LOSS_{}'.format(step)
            data_loader = DataLoader(dataset=self.train_set,
                                     num_workers=self.numWorker,
                                     batch_size=self.batchsize,
                                     shuffle=True)
            for epoch in range(1, self.nEpochs):
                num = 0
                epoch_loss = 0.0
                arpha_loss = 0.0
                for index, (e, gt, vs_index, vr_index) in enumerate(data_loader, 1):
                    e = e.to(self.device)
                    gt = gt.to(self.device)

                    self.optimizer.zero_grad()
                    u_ = self.Uphi(self.train_set.E, self.train_set.V, self.u)
                    v1 = self.train_set.getApp(1, vs_index)
                    v2 = self.train_set.getApp(0, vr_index)
                    e_ = self.Ephi(e, v1, v2, u_)

                    if self.show_process:
                        print('-' * 66)
                        print(vs_index, vr_index)
                        print('e:', e.cpu().data.numpy()[0][0], end=' ')
                        print('e_:', e_.cpu().data.numpy()[0][0], end=' ')
                        if criterion_s:
                            print('GT:', gt.cpu().data.numpy()[0][0])
                        else:
                            print('GT:', gt.cpu().data.numpy()[0])

                    # penalize u so that its magnitude does not grow too large
                    arpha = torch.mean(torch.abs(u_))
                    arpha_loss += arpha.item()
                    arpha.backward(retain_graph=True)

                    # the regular loss
                    loss = self.criterion(e_, gt.squeeze(1))
                    epoch_loss += loss.item()
                    loss.backward()

                    # update the networks: Uphi and Ephi
                    self.optimizer.step()

                    # inspect the Uphi/Ephi parameters to check the optimiser's progress:
                    # print(self.Uphi.features[0].weight.data)
                    # print(self.Ephi.features[0].weight.data)

                    num += self.batchsize
                    if self.show_process and self.step_input:
                        a = input('Continue(0-step, 1-run, 2-run with showing)?')
                        if a == '1':
                            self.show_process = 0
                        elif a == '2':
                            self.step_input = 0

                epoch_loss /= num
                # print(' Loss of epoch {}: {}.'.format(epoch, epoch_loss))
                self.writer.add_scalars(show_name,
                                        {'regular': epoch_loss,
                                         'u': arpha_loss / num * self.batchsize},
                                        epoch)
                if epoch_loss < self.loss_threshold:
                    break

            self.updateUE()
            self.train_set.showE()
            self.showU()
            average_epoch += epoch
            self.writer.add_scalar('epoch', epoch, step)
            step += 1
            self.train_set.swapFC()

        with open(self.outName, 'a') as out:
            print('Average edge:', edge_counter / step, '.',
                  'Average epoch:', average_epoch / step, 'for',
                  'Random' if edge_initial else 'IoU', file=out)

    def saveModel(self):
        print('Saving the Uphi model...')
        torch.save(self.Uphi, t_dir + 'uphi_%02d.pth' % self.seq_index)
        print('Saving the Ephi model...')
        torch.save(self.Ephi, t_dir + 'ephi_%02d.pth' % self.seq_index)
        print('Saving the global variable u...')
        torch.save(self.u, t_dir + 'u_%02d.pth' % self.seq_index)
        print('Done!')

    def updateUE(self):
        u_ = self.Uphi(self.train_set.E, self.train_set.V, self.u)
        self.u = u_.data
        # update the edges
        for edge in self.train_set:
            e, gt, vs_index, vr_index = edge
            e = e.to(self.device).view(1, -1)
            v1 = self.train_set.getApp(1, vs_index)
            v2 = self.train_set.getApp(0, vr_index)
            e_ = self.Ephi(e, v1, v2, u_)
            self.train_set.edges[vs_index][vr_index] = e_.data.view(-1)

    def update(self):
        start = time.time()
        self.evaluation(1)
        if self.tag:
            self.evaluation(self.train_test)
        self.updateNetwork()
        self.saveModel()
        self.evaluation(1)
        if self.tag:
            self.evaluation(self.train_test)
        with open(self.outName, 'a') as out:
            print('The final time consuming:{}\n\n'.format((time.time() - start) / 60),
                  file=out)
        self.outputScalars()

    def outputScalars(self):
        self.writer.export_scalars_to_json(t_dir + 'scalars_%02d.json' % self.seq_index)
        self.writer.close()

    def evaluation(self, head):
        self.train_set.setBuffer(head)
        total_gt = 0.0
        total_ed = 0.0
        for step in range(1, self.train_test):
            self.train_set.loadNext()
            u_ = self.Uphi(self.train_set.E, self.train_set.V, self.u)
            m = self.train_set.m
            n = self.train_set.n
            ret = [[0.0 for i in range(n)] for j in range(m)]
            step_gt = self.train_set.step_gt
            total_gt += step_gt

            # update the edges
            for edge in self.train_set.candidates:
                e, gt, vs_index, vr_index = edge
                e = e.to(self.device).view(1, -1)
                v1 = self.train_set.getApp(1, vs_index)
                v2 = self.train_set.getApp(0, vr_index)
                e_ = self.Ephi(e, v1, v2, u_)
                self.train_set.edges[vs_index][vr_index] = e_.data.view(-1)
                tmp = F.softmax(e_, dim=1)
                tmp = tmp.cpu().data.numpy()[0]
                ret[vs_index][vr_index] = float(tmp[0])

            self.train_set.showE()
            self.showU()

            results = self.hungarian.compute(ret)
            step_ed = 0.0
            for (j, k) in results:
                step_ed += self.train_set.gts[j][k].numpy()[0]
            total_ed += step_ed
            # print('Step ACC:{}/{}({}%)'.format(int(step_ed), int(step_gt),
            #                                    step_ed / step_gt * 100))
            self.train_set.swapFC()

        tra_tst = 'training sets' if head == 1 else 'testing sets'
        with open(self.outName, 'a') as out:
            print('Final {} ACC:{}/{}({}%)'.format(tra_tst, int(total_ed), int(total_gt),
                                                   total_ed / total_gt * 100), file=out)

    def showU(self):
        with open(self.outName, 'a') as out:
            print(' u', file=out)
            print(self.u.view(10, -1), file=out)  # reshape u to a 10 x 10 view
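# --- usage sketch (not part of the repo above) -----------------------------
# showNetwork() passes a tuple of tensors to add_graph so that multi-input
# modules like Uphi/Ephi can be traced. A minimal self-contained sketch of
# the same pattern; the two-input Gate module here is hypothetical.
import torch
import torch.nn as nn
from tensorboardX import SummaryWriter

class Gate(nn.Module):  # hypothetical stand-in for a multi-input module
    def __init__(self):
        super(Gate, self).__init__()
        self.fc = nn.Linear(4, 2)

    def forward(self, e, u):
        # both inputs are concatenated before the linear layer
        return self.fc(torch.cat([e, u], dim=1))

writer = SummaryWriter()
e, u = torch.rand(1, 2), torch.rand(1, 2)
writer.add_graph(Gate(), (e, u))  # each tuple element becomes a forward() argument
writer.close()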
    num_workers=args.workers, pin_memory=True)

if not args.ckpt:
    # save graph and clips_order samples
    for i, data in enumerate(train_dataloader):
        tuple_clips, targets = data
        for t in range(args.tl):
            # slice one clip per tuple position out of the (N, tuple_len, T, C, H, W) batch
            writer.add_video('train/tuple_clips', tuple_clips[:, t, :, :, :, :], t, fps=8)
        tuple_clips = tuple_clips.to(device)
        writer.add_graph(vcpn, tuple_clips)
        break
    # save init params at step 0
    for name, param in vcpn.named_parameters():
        writer.add_histogram('params/{}'.format(name), param, 0)

### loss function, optimizer and scheduler ###
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(vcpn.parameters(), lr=args.lr, momentum=args.momentum,
                      weight_decay=args.wd)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', min_lr=1e-5, patience=50,
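# --- usage sketch (not part of the repo above) -----------------------------
# add_video expects a (N, T, C, H, W) tensor (float values in [0, 1]) and
# requires moviepy to be installed. The shapes below are assumptions for
# illustration only.
import torch
from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter()
# a batch of 2 tuples, each holding 3 clips of 8 RGB frames at 64x64
tuple_clips = torch.rand(2, 3, 8, 3, 64, 64)
for t in range(tuple_clips.size(1)):
    writer.add_video('train/tuple_clips', tuple_clips[:, t], t, fps=8)
writer.close()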
# Assumes the usual imports for this file: torch, torch.nn as nn, os,
# DataLoader, transforms, lr_scheduler, SummaryWriter, the repo's CNNDriver
# model and data transforms, plus torchvision.utils.make_grid.
class Train:
    def __init__(self, gpu='0'):
        # Device configuration
        self.__device = torch.device('cuda:' + gpu if torch.cuda.is_available() else 'cpu')
        self.__writer = SummaryWriter('logs')

        self.__model = CNNDriver()
        # Set model to train mode
        self.__model.train()
        print(self.__model)
        # trace the graph while the model and the dummy batch are both on the CPU
        self.__writer.add_graph(self.__model, torch.rand(10, 3, 66, 200))
        # Put model on GPU
        self.__model = self.__model.to(self.__device)

    def train(self, num_epochs=100, batch_size=400, lr=0.0001, l2_norm=0.001,
              save_dir='./save', input='./DataLMDB'):
        # Create log/save directories if they do not exist
        if not os.path.exists('./logs'):
            os.makedirs('./logs')
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)

        self.__transformations = transforms.Compose([AugmentDrivingTransform(),
                                                     RandomBrightness(),
                                                     ConvertToGray(),
                                                     ConvertToSepia(),
                                                     AddNoise(),
                                                     DrivingDataToTensor()])
        self.__dataset_train = DriveData_LMDB(input, self.__transformations)
        self.__train_loader = DataLoader(self.__dataset_train, batch_size=batch_size,
                                         shuffle=True, num_workers=4)

        # Loss and Optimizer
        self.__loss_func = nn.MSELoss()
        # self.__loss_func = nn.SmoothL1Loss()
        self.__optimizer = torch.optim.Adam(self.__model.parameters(), lr=lr,
                                            weight_decay=l2_norm)
        # Decay LR by a factor of 0.1 every 15 epochs
        self.__exp_lr_scheduler = lr_scheduler.StepLR(self.__optimizer, step_size=15,
                                                      gamma=0.1)

        print('Train size:', len(self.__dataset_train), 'Batch size:', batch_size)
        print('Batches per epoch:', len(self.__dataset_train) // batch_size)

        # Train the Model
        iteration_count = 0
        for epoch in range(num_epochs):
            for batch_idx, samples in enumerate(self.__train_loader):
                # Send inputs/labels to GPU
                images = samples['image'].to(self.__device)
                labels = samples['label'].to(self.__device)

                self.__optimizer.zero_grad()

                # Forward + Backward + Optimize
                outputs = self.__model(images)
                loss = self.__loss_func(outputs, labels.unsqueeze(dim=1))
                loss.backward()
                self.__optimizer.step()

                # Send loss to tensorboard
                self.__writer.add_scalar('loss/', loss.item(), iteration_count)
                self.__writer.add_histogram('steering_out',
                                            outputs.clone().detach().cpu().numpy(),
                                            iteration_count, bins='doane')
                self.__writer.add_histogram('steering_in',
                                            labels.unsqueeze(dim=1).clone().detach().cpu().numpy(),
                                            iteration_count, bins='doane')

                # Get current learning rate (to display on tensorboard)
                for param_group in self.__optimizer.param_groups:
                    curr_learning_rate = param_group['lr']
                self.__writer.add_scalar('learning_rate/', curr_learning_rate,
                                         iteration_count)

                # Display on each epoch
                if batch_idx == 0:
                    # Send image to tensorboard; add_image expects a single CHW
                    # image, so tile the batch into a grid first
                    self.__writer.add_image('Image', make_grid(images), epoch)
                    self.__writer.add_text('Steering',
                                           'Steering:' + str(outputs[batch_idx].item()),
                                           epoch)
                    # Print epoch and loss
                    print('Epoch [%d/%d] Loss: %.4f' % (epoch + 1, num_epochs, loss.item()))
                    # Save the trained model parameters
                    torch.save(self.__model.state_dict(),
                               save_dir + '/cnn_' + str(epoch) + '.pkl')

                iteration_count += 1

            # StepLR advances once per epoch, after the optimizer updates
            self.__exp_lr_scheduler.step()
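# --- usage sketch (not part of the repo above) -----------------------------
# The scheduler call was moved out of the batch loop in the class above:
# StepLR should advance once per epoch, after the optimizer updates. A
# minimal sketch of that cadence with a placeholder model and random data.
import torch
import torch.nn as nn
from torch.optim import lr_scheduler

model = nn.Linear(10, 1)  # placeholder model
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
scheduler = lr_scheduler.StepLR(optimizer, step_size=15, gamma=0.1)

for epoch in range(30):
    for _ in range(5):  # stand-in for the batch loop
        optimizer.zero_grad()
        loss = model(torch.rand(4, 10)).pow(2).mean()
        loss.backward()
        optimizer.step()
    scheduler.step()  # once per epoch; the LR drops by 10x every 15 epochs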
# Assumes the module-level config defined elsewhere in this file: Net, Logger,
# LR, EPOCH, BATCH_SIZE, BATCH_SIZE_test, Dir_training, Dir_pkl, plus the usual
# imports (os, shutil, numpy as np, pandas as pd, torch, torch.nn as nn,
# torch.utils.data as Data, matplotlib.pyplot as plt, SummaryWriter).
def train(h5file, h5key, pklfile, validationh5, trainedlossplot, train_target, train_lossh5):
    # ******* read the dataset from h5, then split it into train and test sets (16:1)
    print("Let's use", torch.cuda.device_count(), "GPUs!")
    net = Net(n_feature=75, n_output=1)
    # pklfile6 = 'train6/NN_train_params_3975284924_2.pkl'
    # net.load_state_dict(torch.load(pklfile6))
    net.cuda()
    net = net.double()
    print(net)

    # optimizer = torch.optim.SGD(net.parameters(), lr=LR, weight_decay=0.01, momentum=0.9)
    # optimizer = torch.optim.SGD(net.parameters(), lr=LR, momentum=0.5)
    # optimizer = torch.optim.Adagrad(net.parameters(), lr=LR, lr_decay=0.01)
    optimizer = torch.optim.Adam(net.parameters(), lr=LR)
    # optimizer = torch.optim.RMSprop(net.parameters(), lr=LR, weight_decay=5e-2)
    loss_func = nn.MSELoss()

    train_mode_file = Dir_training + "train_mode.txt"
    with open(train_mode_file, "w") as train_mode:
        train_mode.write(str(net) + '\n')
        train_mode.write("Activation: " + "Relu" + '\n')
        train_mode.write("Optimizer: " + str(optimizer) + '\n')
        train_mode.write("EPOCH: " + str(EPOCH) + '\n')
        train_mode.write("BATCH_SIZE: " + str(BATCH_SIZE) + '\n')
        train_mode.write("Learning rate: " + str(LR) + '\n')
        train_mode.write("Training data set: " + h5file + '\n')
        train_mode.write("Test data size: " + "1000" + '\n')
        train_mode.write("Additional: " + "For crystal 2626. And wide layer." + '\n')

    logdir = Dir_training + 'NN_logs_' + h5key
    if os.path.isdir(logdir):
        shutil.rmtree(logdir)
    logger = Logger(logdir)

    if os.path.exists(train_lossh5):
        print("The file", train_lossh5, "exists, will remove it!")
        os.remove(train_lossh5)
    else:
        print("The file", train_lossh5, "does not exist!")

    plt.ion()
    plt.figure(figsize=(10, 4))
    loss_list_train = []
    loss_list_test = []
    step_list = []
    Step = 0
    lri = LR

    # ****** test dataset
    mydf_test = pd.read_hdf(h5file, h5key, start=0, stop=400)
    test_data_np = mydf_test.iloc[:, 4:].replace(np.nan, 0.0).values
    test_data_tensor = torch.from_numpy(test_data_np).double()
    if train_target == 'phi':
        test_labels_np = mydf_test.mcPhi.values.reshape((mydf_test.shape[0], 1))
        test_rec_np = mydf_test.phi.values.reshape((mydf_test.shape[0], 1))
    elif train_target == 'theta':
        test_labels_np = mydf_test.mcTheta.values.reshape((mydf_test.shape[0], 1))
        test_rec_np = mydf_test.theta.values.reshape((mydf_test.shape[0], 1))
    else:
        raise ValueError("Wrong train target: " + str(train_target))
    test_labels_tensor = torch.from_numpy(test_labels_np).double()
    test_rec_tensor = torch.from_numpy(test_rec_np).double()
    test_dataset = Data.TensorDataset(test_data_tensor, test_labels_tensor)
    test_loader = Data.DataLoader(test_dataset, batch_size=BATCH_SIZE_test)

    res = test_data_tensor.cuda()
    writer = SummaryWriter(logdir)
    writer.add_graph(net, (res, ))
    writer.close()

    for epoch in range(EPOCH):
        print('EPOCH: ', epoch)
        loss_df_EPOCH_i = pd.DataFrame(columns=['step', 'train', 'test'])
        reader = pd.read_hdf(h5file, h5key, chunksize=BATCH_SIZE * 2, start=400)
        for mydf_readd5 in reader:
            mydf_train = mydf_readd5
            # mydf_train = mydf_readd5.iloc[:int(mydf_readd5.shape[0] * 15 / 16)]
            # mydf_test = mydf_readd5.iloc[int(mydf_readd5.shape[0] * 15 / 16):]

            # ****** train dataset
            train_data_np = mydf_train.iloc[:, 4:].replace(np.nan, 0.0).values
            train_data_tensor = torch.from_numpy(train_data_np).double()
            if train_target == 'phi':
                train_labels_np = mydf_train.mcPhi.values.reshape((mydf_train.shape[0], 1))
            elif train_target == 'theta':
                train_labels_np = mydf_train.mcTheta.values.reshape((mydf_train.shape[0], 1))
            else:
                raise ValueError("Wrong train target: " + str(train_target))
            train_labels_tensor = torch.from_numpy(train_labels_np).double()
            train_dataset = Data.TensorDataset(train_data_tensor, train_labels_tensor)
            train_loader = Data.DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE,
                                           shuffle=True, num_workers=8)

            for step, data in enumerate(train_loader):
                b_X, b_Y = data
                b_x = b_X.cuda()
                b_y = b_Y.cuda()

                # ****** L2 regularization (currently disabled below)
                reg_lambda = torch.tensor(0.2)
                l2_reg = torch.tensor(0.)
                for param in net.parameters():
                    l2_reg += param.cpu().float().norm(2)

                prediction = net(b_x).cuda()
                loss = loss_func(prediction, b_y)
                # loss += (reg_lambda * l2_reg).cuda().double()

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                Step += 1

                if (Step + 1) % 100 == 0:
                    test_output = net(test_data_tensor.cuda())
                    test_pred_y = test_output.cpu().data.numpy()
                    # summed residual on the test set, used as a rough accuracy proxy
                    accuracy_test = sum(test_pred_y - test_labels_np)
                    loss_test = loss_func(test_output, test_labels_tensor.cuda())
                    # loss_rec = loss_func(test_rec_tensor.cuda(), test_labels_tensor.cuda())
                    print('Epoch:', epoch, '|step:', Step,
                          '|train loss:%.8f' % loss.item(),
                          '|test loss:%.8f' % loss_test.item())
                    step_list.append(Step)
                    loss_list_train.append(loss.item())
                    loss_list_test.append(loss_test.item())

                    loss_df = pd.DataFrame.from_dict({'step': [Step],
                                                      'train': [loss.item()],
                                                      'test': [loss_test.item()]})
                    loss_df.to_hdf(train_lossh5, key=h5key + 'step', append=True, mode='a')
                    loss_df_EPOCH_i = pd.DataFrame.from_dict({'epoch': [epoch],
                                                              'train': [loss.item()],
                                                              'test': [loss_test.item()]})

                    plt.subplot(131)
                    plt.cla()
                    plt.plot(step_list, loss_list_train, 'b-', lw=1, label='train')
                    plt.plot(step_list, loss_list_test, 'r-', lw=3, label='test')
                    plt.xlabel('step')
                    plt.ylabel('loss')
                    plt.text(10, 0.027, 'Loss_train=%.8f' % loss.item(),
                             fontdict={'size': 10, 'color': 'blue'})
                    plt.text(10, 0.025, 'Loss_test=%.8f' % loss_test.item(),
                             fontdict={'size': 10, 'color': 'red'})
                    # plt.text(10, 0.023, 'Loss_rec=%.8f' % loss_rec.item(),
                    #          fontdict={'size': 10, 'color': 'red'})
                    legend = plt.legend(loc="best")
                    frame = legend.get_frame()
                    frame.set_facecolor('none')  # make the legend background transparent

                    Theta1 = 0.8336485385269553
                    Theta2 = 0.8647267287924316
                    if train_target == 'phi':
                        Range = [-3.2, 3.2]
                    elif train_target == 'theta':
                        Range = [Theta1 * 0.995, Theta2 * 1.005]  # [0.4, 2.4]

                    plt.subplot(133)
                    plt.cla()
                    plt.hist(test_labels_np, bins=200, range=Range, color='red',
                             alpha=0.7, fill=False, histtype='step', label='test_truth')
                    plt.hist(test_pred_y, bins=200, range=Range, color='blue',
                             alpha=0.7, fill=False, histtype='step', label='test_pre')
                    plt.hist(test_rec_np, bins=200, range=Range, color='green',
                             alpha=0.7, fill=False, histtype='step', label='test_rec')
                    plt.xlabel(r'$' + '\\' + train_target + '$')
                    legend = plt.legend(loc="best")
                    frame = legend.get_frame()
                    frame.set_facecolor('none')  # make the legend background transparent

                    plt.subplot(132)
                    plt.cla()
                    plt.hist(b_y.cpu().data.numpy(), bins=200, range=Range, color='red',
                             alpha=0.7, fill=False, histtype='step', label='train_truth')
                    plt.hist(prediction.cpu().data.numpy(), bins=200, range=Range,
                             color='blue', alpha=0.7, fill=False, histtype='step',
                             label='train_pre')
                    plt.xlabel(r'$' + '\\' + train_target + '$')
                    legend = plt.legend(loc="best")
                    frame = legend.get_frame()
                    frame.set_facecolor('none')  # make the legend background transparent
                    plt.pause(0.1)

                    # ====================================================== #
                    #                  Tensorboard Logging                   #
                    # ====================================================== #
                    # 1. Log scalar values (scalar summary)
                    info = {'loss': loss.item(),
                            'loss_test': loss_test.item(),
                            'accuracy': accuracy_test.item()}
                    for tag, value in info.items():
                        logger.scalar_summary(tag, value, Step + 1)

                    # 2. Log values and gradients of the parameters (histogram summary)
                    for tag, value in net.named_parameters():
                        tag = tag.replace('.', '/')
                        logger.histo_summary(tag, value.data.cpu().numpy(), Step + 1)
                        logger.histo_summary(tag + '/grad', value.grad.data.cpu().numpy(),
                                             Step + 1)

                    # 3. Log training images (image summary); the 75 input
                    # features reshape into 5x5 patches
                    info = {'images': b_x.view(-1, 5, 5)[:10].cpu().numpy()}
                    for tag, images in info.items():
                        logger.image_summary(tag, images, Step + 1)

                # optional manual LR decay, currently disabled:
                # lri = lri / (1 + 0.005)
                # for param_group in optimizer.param_groups:
                #     param_group['lr'] = lri

        loss_df_EPOCH_i.to_hdf(train_lossh5, key=h5key + 'epoch', append=True, mode='a')
        if (epoch + 1) % 50 == 0:
            pklfile_epoch = Dir_pkl + 'NN_train_params_epoch' + str(epoch) + '.pkl'
            torch.save(net.state_dict(), pklfile_epoch)

    plt.ioff()
    plt.savefig(trainedlossplot, dpi=300)
    plt.show()

    test_output = net(test_data_tensor[:10].cuda())
    test_pred_y = test_output.cpu().data.numpy()
    print('prediction number: ', test_pred_y)
    print('real number: ', test_labels_np[:10])

    # ****** the model after training
    for name, param in net.state_dict().items():
        print(name, param.size())

    # ****** save only the parameters (recommended);
    # torch.save(model_object, 'model.pkl') would save the whole model
    torch.save(net.state_dict(), pklfile)

    test_pred_y = np.empty((0, 1))
    for step, data in enumerate(test_loader):
        t_X, t_Y = data
        t_x = t_X.cuda()
        t_y = t_Y.cuda()
        test_output = net(t_x).cuda()
        test_pred_y = np.vstack([test_pred_y, test_output.cpu().data.numpy()])
    print("shapes: ", test_pred_y.shape)

    pred_df = pd.DataFrame(mydf_test[['mcPhi', 'phi', 'mcTheta', 'theta']])
    print("shapes: ", test_pred_y.shape, pred_df.shape)
    if train_target == 'phi':
        pred_df['prePhi'] = test_pred_y
    elif train_target == 'theta':
        pred_df['preTheta'] = test_pred_y
    pred_df.to_hdf(validationh5, key=h5key, mode='w')
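# --- usage sketch (not part of the repo above) -----------------------------
# The custom Logger used above wraps scalar/histogram/image summaries; the
# same three calls map directly onto SummaryWriter from tensorboardX or
# torch.utils.tensorboard. The names below mirror the loop above, but the
# log dir, model, and values are stand-ins.
import torch
import torch.nn as nn
from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter('NN_logs_demo')  # hypothetical log dir
net = nn.Linear(75, 1)                  # stand-in for Net(n_feature=75, n_output=1)
step = 100

writer.add_scalar('loss', 0.01, step)   # 1. scalar summary
for name, param in net.named_parameters():  # 2. histogram summary
    tag = name.replace('.', '/')
    writer.add_histogram(tag, param.data.cpu().numpy(), step)
    if param.grad is not None:          # gradients exist only after backward()
        writer.add_histogram(tag + '/grad', param.grad.data.cpu().numpy(), step)
writer.add_images('images', torch.rand(10, 1, 5, 5), step)  # 3. image summary (N, C, H, W)
writer.close()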