import time

import mxnet as mx
from mxnet import gluon, nd
from mxboard import SummaryWriter

'''
Load a network, run a forward pass, then draw the network graph.
'''

ctx = mx.gpu(0)
net = gluon.model_zoo.vision.AlexNet(classes=10)
net.hybridize()
net.initialize(ctx=ctx, init=mx.init.Xavier())
# Note: the net lives on the GPU, so the input data must also be placed on the GPU.
net.forward(nd.ones((1, 3, 227, 227)).as_in_context(ctx))

sw = SummaryWriter('./log/%s' % time.strftime("%Y_%m_%d_%H_%M_%S", time.localtime()))
sw.add_graph(net)
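# mxboard's SummaryWriter buffers events before writing, so closing the writer
# once logging is done ensures the graph event is flushed to disk. A minimal
# follow-up sketch; flush() alone also works if the writer is still needed later.
sw.close()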
# s = net.tojson()
# print(s)
# net.export("model10")

# Rearrange the output axes so they line up with the label for the loss.
# output = output.transpose((2, 1, 0))
output = nd.expand_dims(output, axis=1)
output = nd.transpose(output, (2, 1, 0))
# label = nd.expand_dims(label, axis=1)
# print("output ", output.shape, label.shape)
L = loss(output, label)
L.backward()

train_loss = nd.mean(L).asscalar()
sw.add_scalar(tag="loss", value=train_loss, global_step=global_step)
global_step += 1

if epoch == 1:
    sw.add_graph(lstm)

trainer.step(1)
lstm.save_parameters("mo1.params")

if epoch % 100 == 0:
    print('train_loss %.4f' % train_loss)
    # print('output max', output.argmax(axis=2))
    # print(" result ", predict(lstm, data, state))

# Export the model: writes mod1-symbol.json and mod1-0000.params.
lstm.export("mod1")

# Re-import the exported model as a SymbolBlock. Passing the parameter file here
# loads the trained weights directly; calling net.initialize() instead would
# overwrite them with random values.
net = gluon.nn.SymbolBlock.imports('mod1-symbol.json', ['data0'],
                                   'mod1-0000.params', ctx=mx.cpu())
net.hybridize()
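# A quick smoke test on the re-imported SymbolBlock; a minimal sketch. The input
# shape below is a hypothetical placeholder - substitute the
# (seq_len, batch, feature) shape the LSTM was actually trained with.
dummy = mx.nd.ones((32, 1, 10))
print(net(dummy).shape)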
class TrainerAgentGluon:  # Probably needs refactoring
    """Main training loop"""

    def __init__(
        self,
        net,
        val_data,
        train_config: TrainConfig,
        train_objects: TrainObjects,
        use_rtpt: bool,
    ):
        """
        Class for training the neural network.
        :param net: The NN with loaded parameters that shall be trained.
        :param val_data: The validation data loaded with gluon DataLoader.
        :param train_config: An instance of the TrainConfig data class.
        :param train_objects: An instance of the TrainObjects data class.
        :param use_rtpt: If True, an RTPT object will be created and modified within this class.
        """
        # Too many instance attributes (29/7) - Too many arguments (24/5) - Too many local variables (25/15)
        # Too few public methods (1/2)
        self.tc = train_config
        self.to = train_objects
        if self.to.metrics is None:
            self.to.metrics = {}
        self._ctx = get_context(train_config.context, train_config.device_id)
        self._net = net
        self._graph_exported = False
        self._val_data = val_data
        # define a summary writer that logs data and flushes to the file every 5 seconds
        if self.tc.log_metrics_to_tensorboard:
            self.sum_writer = SummaryWriter(logdir=self.tc.export_dir + "logs", flush_secs=5, verbose=False)
        # define the two loss functions
        self._softmax_cross_entropy = gluon.loss.SoftmaxCrossEntropyLoss(sparse_label=self.tc.sparse_policy_label)
        self._l2_loss = gluon.loss.L2Loss()
        if self.tc.optimizer_name != "nag":
            raise NotImplementedError("The requested optimizer %s isn't supported yet." % self.tc.optimizer_name)
        self._trainer = gluon.Trainer(
            self._net.collect_params(),
            "nag",
            {
                "learning_rate": self.to.lr_schedule(0),
                "momentum": self.to.momentum_schedule(0),
                "wd": self.tc.wd,
            },
        )
        # collect parameter names for logging the gradients of parameters in each epoch
        self._params = self._net.collect_params()
        self._param_names = self._params.keys()
        # define a list which describes the order of the processed batches
        self.ordering = list(range(self.tc.nb_parts))
        self.use_rtpt = use_rtpt
        self.rtpt = None  # set this later in the training function

    def _log_metrics(self, metric_values, global_step, prefix="train_"):
        """
        Logs a dictionary object of metric values to the console and to tensorboard
        if log_metrics_to_tensorboard is set to true.
        :param metric_values: Dictionary object storing the current metrics
        :param global_step: X-position point of all metric entries
        :param prefix: Used for labelling the metrics
        :return:
        """
        for name in metric_values.keys():
            # show the metric stats
            print(" - %s%s: %.4f" % (prefix, name, metric_values[name]), end="")
            # add the metrics to the tensorboard event file
            if self.tc.log_metrics_to_tensorboard:
                self.sum_writer.add_scalar(name, [prefix.replace("_", ""), metric_values[name]], global_step)

    def _process_on_data_plane_file(self, train_data, batch_proc_tmp):
        for _, (data, value_label, policy_label) in enumerate(train_data):
            data = data.as_in_context(self._ctx)
            value_label = value_label.as_in_context(self._ctx)
            policy_label = policy_label.as_in_context(self._ctx)

            # update a dummy metric to see a proper progress bar
            # (the metrics will get evaluated at the end of 100k steps)
            # if self.batch_proc_tmp > 0:
            #     self._metrics['value_loss'].update(old_label, value_out)
            # old_label = value_label

            with autograd.record():
                [value_out, policy_out] = self._net(data)
                if self.tc.select_policy_from_plane and not self.tc.is_policy_from_plane_data:
                    policy_out = policy_out[:, FLAT_PLANE_IDX]
                value_loss = self._l2_loss(value_out, value_label)
                policy_loss = self._softmax_cross_entropy(policy_out, policy_label)
                # weight the components of the combined loss
                combined_loss = self.tc.val_loss_factor * value_loss.sum() \
                    + self.tc.policy_loss_factor * policy_loss.sum()

            # update a dummy metric to see a proper progress bar
            self.to.metrics["value_loss"].update(preds=value_out, labels=value_label)
            combined_loss.backward()
            self._trainer.step(data.shape[0])
            batch_proc_tmp += 1
        return batch_proc_tmp, self.to.metrics["value_loss"].get()[1]

    def train(self, cur_it=None):  # Probably needs refactoring
        """
        Trains the model.
        :param cur_it: Current iteration which is used for the learning rate and momentum schedule.
                       If set to None it will be initialized from the training config.
        """
        # Too many local variables (44/15) - Too many branches (18/12) - Too many statements (108/50)
        # set a custom seed for reproducibility
        random.seed(self.tc.seed)
        # define and initialize the variables which will be used
        t_s = time()
        # predefine the local variables that will be used in the training loop
        val_loss_best = val_p_acc_best = k_steps_best = val_metric_values_best = old_label = value_out = None
        patience_cnt = epoch = batch_proc_tmp = 0  # track how many batches have been processed in this epoch
        k_steps = self.tc.k_steps_initial  # counter for thousands of steps
        # calculate how many log states will be processed
        k_steps_end = round(self.tc.total_it / self.tc.batch_steps)  # we use k-steps instead of epochs here
        if k_steps_end == 0:
            k_steps_end = 1

        if self.use_rtpt:
            self.rtpt = RTPT(name_initials=self.tc.name_initials, experiment_name='crazyara',
                             max_iterations=k_steps_end - self.tc.k_steps_initial)
        if cur_it is None:
            cur_it = self.tc.k_steps_initial * 1000
        nb_spikes = 0  # count the number of spikes that have been detected
        # initialize the loss to compare with, with a very high value
        old_val_loss = np.inf
        graph_exported = False  # state variable which tracks whether the net architecture has been reported yet

        if not self.ordering:  # safety check to prevent an eternal loop
            raise Exception("You must have at least one part file in your planes-dataset directory!")

        if self.use_rtpt:
            # start the RTPT tracking
            self.rtpt.start()

        while True:  # Too many nested blocks (7/5)
            # reshuffle the ordering of the training game batches (shuffle works in place)
            random.shuffle(self.ordering)
            epoch += 1
            logging.info("EPOCH %d", epoch)
            logging.info("=========================")
            t_s_steps = time()

            for part_id in tqdm_notebook(self.ordering):
                # load one chunk of the dataset from memory
                _, x_train, yv_train, yp_train, _, _ = load_pgn_dataset(
                    dataset_type="train",
                    part_id=part_id,
                    normalize=self.tc.normalize,
                    verbose=False,
                    q_value_ratio=self.tc.q_value_ratio)

                yp_train = prepare_policy(
                    y_policy=yp_train,
                    select_policy_from_plane=self.tc.select_policy_from_plane,
                    sparse_policy_label=self.tc.sparse_policy_label,
                    is_policy_from_plane_data=self.tc.is_policy_from_plane_data)

                # update the train_data object
                train_dataset = gluon.data.ArrayDataset(nd.array(x_train), nd.array(yv_train), nd.array(yp_train))
                train_data = gluon.data.DataLoader(train_dataset, batch_size=self.tc.batch_size, shuffle=True,
                                                   num_workers=self.tc.cpu_count)

                for _, (data, value_label, policy_label) in enumerate(train_data):
                    data = data.as_in_context(self._ctx)
                    value_label = value_label.as_in_context(self._ctx)
                    policy_label = policy_label.as_in_context(self._ctx)

                    # update a dummy metric to see a proper progress bar
                    # (the metrics will get evaluated at the end of 100k steps)
                    if batch_proc_tmp > 0:
                        self.to.metrics["value_loss"].update(old_label, value_out)
                    old_label = value_label

                    with autograd.record():
                        [value_out, policy_out] = self._net(data)
                        value_loss = self._l2_loss(value_out, value_label)
                        policy_loss = self._softmax_cross_entropy(policy_out, policy_label)
                        # weight the components of the combined loss
                        combined_loss = (self.tc.val_loss_factor * value_loss +
                                         self.tc.policy_loss_factor * policy_loss)
                        # update a dummy metric to see a proper progress bar
                        # self._metrics['value_loss'].update(preds=value_out, labels=value_label)

                    combined_loss.backward()
                    learning_rate = self.to.lr_schedule(cur_it)  # update the learning rate
                    self._trainer.set_learning_rate(learning_rate)
                    momentum = self.to.momentum_schedule(cur_it)  # update the momentum
                    self._trainer._optimizer.momentum = momentum
                    self._trainer.step(data.shape[0])
                    cur_it += 1
                    batch_proc_tmp += 1

                    # add the graph representation of the network to the tensorboard log file
                    if not graph_exported and self.tc.log_metrics_to_tensorboard:
                        self.sum_writer.add_graph(self._net)
                        graph_exported = True

                    if batch_proc_tmp >= self.tc.batch_steps:  # show metrics every thousand steps
                        # update the batch_proc_tmp counter by subtracting the batch_steps
                        batch_proc_tmp = batch_proc_tmp - self.tc.batch_steps
                        ms_step = ((time() - t_s_steps) / self.tc.batch_steps) * 1000  # measure elapsed time
                        # update the counters
                        k_steps += 1
                        patience_cnt += 1
                        logging.info("Step %dK/%dK - %dms/step", k_steps, k_steps_end, ms_step)
                        logging.info("-------------------------")
                        logging.debug("Iteration %d/%d", cur_it, self.tc.total_it)
                        logging.debug("lr: %.7f - momentum: %.7f", learning_rate, momentum)

                        train_metric_values = evaluate_metrics(
                            self.to.metrics,
                            train_data,
                            self._net,
                            nb_batches=10,  # 25,
                            ctx=self._ctx,
                            sparse_policy_label=self.tc.sparse_policy_label,
                            apply_select_policy_from_plane=self.tc.select_policy_from_plane
                            and not self.tc.is_policy_from_plane_data)
                        val_metric_values = evaluate_metrics(
                            self.to.metrics,
                            self._val_data,
                            self._net,
                            nb_batches=None,
                            ctx=self._ctx,
                            sparse_policy_label=self.tc.sparse_policy_label,
                            apply_select_policy_from_plane=self.tc.select_policy_from_plane
                            and not self.tc.is_policy_from_plane_data)

                        if self.use_rtpt:
                            # update the process title according to the loss
                            self.rtpt.step(subtitle=f"loss={val_metric_values['loss']:2.2f}")

                        if self.tc.use_spike_recovery and (
                                old_val_loss * self.tc.spike_thresh < val_metric_values["loss"]
                                or np.isnan(val_metric_values["loss"])):  # check for spikes
                            nb_spikes += 1
                            logging.warning(
                                "Spike %d/%d occurred - val_loss: %.3f",
                                nb_spikes,
                                self.tc.max_spikes,
                                val_metric_values["loss"],
                            )
                            if nb_spikes >= self.tc.max_spikes:
                                val_loss = val_metric_values["loss"]
                                val_p_acc = val_metric_values["policy_acc"]
                                # finally stop training because the maximum number of spikes has been reached
                                logging.debug("The maximum number of spikes has been reached. Stop training.")
                                print()
                                print("Elapsed time for training(hh:mm:ss): " +
                                      str(datetime.timedelta(seconds=round(time() - t_s))))
                                if self.tc.log_metrics_to_tensorboard:
                                    self.sum_writer.close()
                                return return_metrics_and_stop_training(k_steps, val_metric_values, k_steps_best,
                                                                        val_metric_values_best)

                            logging.debug("Recover to latest checkpoint")
                            model_path = self.tc.export_dir + "weights/model-%.5f-%.3f-%04d.params" % (
                                val_loss_best,
                                val_p_acc_best,
                                k_steps_best,
                            )
                            # load the best model once again
                            logging.debug("load current best model:%s", model_path)
                            self._net.load_parameters(model_path, ctx=self._ctx)
                            k_steps = k_steps_best
                            logging.debug("k_steps is back at %d", k_steps_best)
                            # print the elapsed time
                            t_delta = time() - t_s_steps
                            print(" - %.ds" % t_delta)
                            t_s_steps = time()
                        else:
                            # update the val_loss value to compare with when using spike recovery
                            old_val_loss = val_metric_values["loss"]
                            # log the metric values to tensorboard
                            self._log_metrics(train_metric_values, global_step=k_steps, prefix="train_")
                            self._log_metrics(val_metric_values, global_step=k_steps, prefix="val_")

                            if self.tc.export_grad_histograms:
                                grads = []
                                # log the gradients of the parameters for checking convergence
                                for _, name in enumerate(self._param_names):
                                    if "bn" not in name and "batch" not in name and name != "policy_flat_plane_idx":
                                        grads.append(self._params[name].grad())
                                        self.sum_writer.add_histogram(tag=name, values=grads[-1],
                                                                      global_step=k_steps, bins=20)

                            # check if a new checkpoint shall be created
                            if val_loss_best is None or val_metric_values["loss"] < val_loss_best:
                                # update val_loss_best
                                val_loss_best = val_metric_values["loss"]
                                val_p_acc_best = val_metric_values["policy_acc"]
                                val_metric_values_best = val_metric_values
                                k_steps_best = k_steps

                                if self.tc.export_weights:
                                    prefix = self.tc.export_dir + "weights/model-%.5f-%.3f" \
                                             % (val_loss_best, val_p_acc_best)
                                    # the export function saves both the architecture and the weights
                                    self._net.export(prefix, epoch=k_steps_best)
                                    print()
                                    logging.info("Saved checkpoint to %s-%04d.params", prefix, k_steps_best)

                                patience_cnt = 0  # reset the patience counter

                            # print the elapsed time
                            t_delta = time() - t_s_steps
                            print(" - %.ds" % t_delta)
                            t_s_steps = time()

                            # log the samples per second metric to tensorboard
                            self.sum_writer.add_scalar(
                                tag="samples_per_second",
                                value={"hybrid_sync": data.shape[0] * self.tc.batch_steps / t_delta},
                                global_step=k_steps,
                            )
                            # log the current learning rate
                            self.sum_writer.add_scalar(tag="lr", value=self.to.lr_schedule(cur_it),
                                                       global_step=k_steps)
                            # log the current momentum value
                            self.sum_writer.add_scalar(tag="momentum", value=self.to.momentum_schedule(cur_it),
                                                       global_step=k_steps)

                            if cur_it >= self.tc.total_it:
                                val_loss = val_metric_values["loss"]
                                val_p_acc = val_metric_values["policy_acc"]
                                # finally stop training because the given number of iterations has been reached
                                logging.debug("The number of given iterations has been reached")
                                print()
                                print("Elapsed time for training(hh:mm:ss): " +
                                      str(datetime.timedelta(seconds=round(time() - t_s))))
                                if self.tc.log_metrics_to_tensorboard:
                                    self.sum_writer.close()
                                return return_metrics_and_stop_training(k_steps, val_metric_values, k_steps_best,
                                                                        val_metric_values_best)
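# The spike-recovery check inside train() compresses into one rule; a standalone
# restatement (a minimal sketch - spike_thresh and the two loss values are free
# inputs, matching how train() uses them):
import math

def is_loss_spike(old_val_loss, val_loss, spike_thresh):
    """A spike is a validation loss exceeding the previous one by the factor
    spike_thresh, or a NaN validation loss."""
    return old_val_loss * spike_thresh < val_loss or math.isnan(val_loss)

# Example: with spike_thresh=1.5, a jump from 0.80 to 1.30 counts as a spike.
assert is_loss_spike(0.80, 1.30, 1.5)
assert not is_loss_spike(0.80, 1.10, 1.5)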
def run(mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
        graphviz=True,
        epoch=100,
        input_size=[512, 512],
        batch_size=16,
        batch_log=100,
        batch_interval=10,
        subdivision=4,
        train_dataset_path="Dataset/train",
        valid_dataset_path="Dataset/valid",
        multiscale=True,
        factor_scale=[8, 5],
        data_augmentation=True,
        num_workers=4,
        optimizer="ADAM",
        lambda_off=1,
        lambda_size=0.1,
        save_period=5,
        load_period=10,
        learning_rate=0.001,
        decay_lr=0.999,
        decay_step=10,
        GPU_COUNT=0,
        base=18,
        pretrained_base=True,
        pretrained_path="modelparam",
        AMP=True,
        valid_size=8,
        eval_period=5,
        tensorboard=True,
        valid_graph_path="valid_Graph",
        using_mlflow=True,
        topk=100,
        plot_class_thresh=0.5):
    '''
    AMP does not support every operator; modulated convolution is not supported.
    '''
    if GPU_COUNT == 0:
        ctx = mx.cpu(0)
        AMP = False
    elif GPU_COUNT == 1:
        ctx = mx.gpu(0)
    else:
        ctx = [mx.gpu(i) for i in range(GPU_COUNT)]

    # check the operating system
    if platform.system() == "Linux":
        logging.info(f"{platform.system()} OS")
    elif platform.system() == "Windows":
        logging.info(f"{platform.system()} OS")
    else:
        logging.info(f"{platform.system()} OS")

    if isinstance(ctx, (list, tuple)):
        for i, c in enumerate(ctx):
            free_memory, total_memory = mx.context.gpu_memory_info(i)
            free_memory = round(free_memory / (1024 * 1024 * 1024), 2)
            total_memory = round(total_memory / (1024 * 1024 * 1024), 2)
            logging.info(f'Running on {c} / free memory : {free_memory}GB / total memory {total_memory}GB')
    else:
        if GPU_COUNT == 1:
            free_memory, total_memory = mx.context.gpu_memory_info(0)
            free_memory = round(free_memory / (1024 * 1024 * 1024), 2)
            total_memory = round(total_memory / (1024 * 1024 * 1024), 2)
            logging.info(f'Running on {ctx} / free memory : {free_memory}GB / total memory {total_memory}GB')
        else:
            logging.info(f'Running on {ctx}')

    if GPU_COUNT > 0 and batch_size < GPU_COUNT:
        logging.info("batch size must be greater than the number of gpus")
        exit(0)

    if AMP:
        amp.init()

    if multiscale:
        logging.info("Using MultiScale")

    if data_augmentation:
        logging.info("Using Data Augmentation")

    logging.info("training Center Detector")
    input_shape = (1, 3) + tuple(input_size)

    scale_factor = 4  # fixed
    logging.info(f"scale factor {scale_factor}")

    try:
        train_dataloader, train_dataset = traindataloader(multiscale=multiscale,
                                                          factor_scale=factor_scale,
                                                          augmentation=data_augmentation,
                                                          path=train_dataset_path,
                                                          input_size=input_size,
                                                          batch_size=batch_size,
                                                          batch_interval=batch_interval,
                                                          num_workers=num_workers,
                                                          shuffle=True,
                                                          mean=mean,
                                                          std=std,
                                                          scale_factor=scale_factor,
                                                          make_target=True)
        valid_dataloader, valid_dataset = validdataloader(path=valid_dataset_path,
                                                          input_size=input_size,
                                                          batch_size=valid_size,
                                                          num_workers=num_workers,
                                                          shuffle=True,
                                                          mean=mean,
                                                          std=std,
                                                          scale_factor=scale_factor,
                                                          make_target=True)
    except Exception as E:
        logging.info(E)
        exit(0)

    train_update_number_per_epoch = len(train_dataloader)
    if train_update_number_per_epoch < 1:
        logging.warning("train batch size is larger than the number of training samples")
        exit(0)

    valid_list = glob.glob(os.path.join(valid_dataset_path, "*"))
    if valid_list:
        valid_update_number_per_epoch = len(valid_dataloader)
        if valid_update_number_per_epoch < 1:
            logging.warning("valid batch size is larger than the number of validation samples")
            exit(0)

    num_classes = train_dataset.num_class  # number of classes
    name_classes = train_dataset.classes

    optimizer = optimizer.upper()
    if pretrained_base:
        model = str(input_size[0]) + "_" + str(input_size[1]) + "_" + optimizer + "_P" + "CENTER_RES" + str(base)
    else:
        model = str(input_size[0]) + "_" + str(input_size[1]) + "_" + optimizer + "_CENTER_RES" + str(base)

    weight_path = f"weights/{model}"
    sym_path = os.path.join(weight_path, f'{model}-symbol.json')
    param_path = os.path.join(weight_path, f'{model}-{load_period:04d}.params')

    if os.path.exists(param_path) and os.path.exists(sym_path):
        start_epoch = load_period
        logging.info(f"loading {os.path.basename(param_path)} weights\n")
        net = gluon.SymbolBlock.imports(sym_path, ['data'], param_path, ctx=ctx)
    else:
        start_epoch = 0
        net = CenterNet(base=base,
                        heads=OrderedDict([('heatmap', {'num_output': num_classes, 'bias': -2.19}),
                                           ('offset', {'num_output': 2}),
                                           ('wh', {'num_output': 2})]),
                        head_conv_channel=64,
                        pretrained=pretrained_base,
                        root=pretrained_path,
                        use_dcnv2=False,
                        ctx=ctx)

    if isinstance(ctx, (list, tuple)):
        net.summary(mx.nd.ones(shape=input_shape, ctx=ctx[0]))
    else:
        net.summary(mx.nd.ones(shape=input_shape, ctx=ctx))

    '''
    active (bool, default True) - Whether to turn hybrid on or off.
    static_alloc (bool, default False) - Statically allocate memory to improve speed. Memory usage may increase.
    static_shape (bool, default False) - Optimize for invariant input shapes between iterations. Must also set
    static_alloc to True. Change of input shapes is still allowed but slower.
    '''
    if multiscale:
        net.hybridize(active=True, static_alloc=True, static_shape=False)
    else:
        net.hybridize(active=True, static_alloc=True, static_shape=True)

    if start_epoch + 1 >= epoch + 1:
        logging.info("this model has already been optimized")
        exit(0)

    if tensorboard:
        summary = SummaryWriter(logdir=os.path.join("mxboard", model), max_queue=10, flush_secs=10, verbose=False)
        if isinstance(ctx, (list, tuple)):
            net.forward(mx.nd.ones(shape=input_shape, ctx=ctx[0]))
        else:
            net.forward(mx.nd.ones(shape=input_shape, ctx=ctx))
        summary.add_graph(net)
    if graphviz:
        gluoncv.utils.viz.plot_network(net, shape=input_shape, save_prefix=model)

    # optimizer
    unit = 1 if (len(train_dataset) // batch_size) < 1 else len(train_dataset) // batch_size
    step = unit * decay_step
    lr_sch = mx.lr_scheduler.FactorScheduler(step=step, factor=decay_lr, stop_factor_lr=1e-12, base_lr=learning_rate)

    # accumulate gradients across the subdivisions; they are cleared manually after each trainer.step
    for p in net.collect_params().values():
        if p.grad_req != "null":
            p.grad_req = 'add'

    if AMP:
        '''
        update_on_kvstore : bool, default None
        Whether to perform parameter updates on kvstore. If None, then trainer will choose the more
        suitable option depending on the type of kvstore. If the `update_on_kvstore` argument is
        provided, the environment variable `MXNET_UPDATE_ON_KVSTORE` will be ignored.
        '''
        if optimizer.upper() == "ADAM":
            trainer = gluon.Trainer(net.collect_params(),
                                    optimizer,
                                    optimizer_params={"learning_rate": learning_rate,
                                                      "lr_scheduler": lr_sch,
                                                      "beta1": 0.9,
                                                      "beta2": 0.999,
                                                      'multi_precision': False},
                                    update_on_kvstore=False)  # for dynamic loss scaling
        elif optimizer.upper() == "RMSPROP":
            trainer = gluon.Trainer(net.collect_params(),
                                    optimizer,
                                    optimizer_params={"learning_rate": learning_rate,
                                                      "lr_scheduler": lr_sch,
                                                      "gamma1": 0.9,
                                                      "gamma2": 0.999,
                                                      'multi_precision': False},
                                    update_on_kvstore=False)  # for dynamic loss scaling
        elif optimizer.upper() == "SGD":
            trainer = gluon.Trainer(net.collect_params(),
                                    optimizer,
                                    optimizer_params={"learning_rate": learning_rate,
                                                      "lr_scheduler": lr_sch,
                                                      "wd": 0.0001,
                                                      "momentum": 0.9,
                                                      'multi_precision': False},
                                    update_on_kvstore=False)  # for dynamic loss scaling
        else:
            logging.error("optimizer not selected")
            exit(0)
        amp.init_trainer(trainer)
    else:
        if optimizer.upper() == "ADAM":
            trainer = gluon.Trainer(net.collect_params(),
                                    optimizer,
                                    optimizer_params={"learning_rate": learning_rate,
                                                      "lr_scheduler": lr_sch,
                                                      "beta1": 0.9,
                                                      "beta2": 0.999,
                                                      'multi_precision': False})
        elif optimizer.upper() == "RMSPROP":
            trainer = gluon.Trainer(net.collect_params(),
                                    optimizer,
                                    optimizer_params={"learning_rate": learning_rate,
                                                      "lr_scheduler": lr_sch,
                                                      "gamma1": 0.9,
                                                      "gamma2": 0.999,
                                                      'multi_precision': False})
        elif optimizer.upper() == "SGD":
            trainer = gluon.Trainer(net.collect_params(),
                                    optimizer,
                                    optimizer_params={"learning_rate": learning_rate,
                                                      "lr_scheduler": lr_sch,
                                                      "wd": 0.0001,
                                                      "momentum": 0.9,
                                                      'multi_precision': False})
        else:
            logging.error("optimizer not selected")
            exit(0)

    heatmapfocalloss = HeatmapFocalLoss(from_sigmoid=True, alpha=2, beta=4)
    normedl1loss = NormedL1Loss()
    prediction = Prediction(batch_size=valid_size, topk=topk, scale=scale_factor)
    precision_recall = Voc_2007_AP(iou_thresh=0.5, class_names=name_classes)

    start_time = time.time()
    for i in tqdm(range(start_epoch + 1, epoch + 1, 1), initial=start_epoch + 1, total=epoch):
        heatmap_loss_sum = 0
        offset_loss_sum = 0
        wh_loss_sum = 0
        time_stamp = time.time()

        '''
        Building the targets inside the train dataloader (the target generator)
        makes training considerably faster.
        '''
        for batch_count, (image, _, heatmap, offset_target, wh_target, mask_target, _) in enumerate(
                train_dataloader, start=1):
            td_batch_size = image.shape[0]

            image_split = mx.nd.split(data=image, num_outputs=subdivision, axis=0)
            heatmap_split = mx.nd.split(data=heatmap, num_outputs=subdivision, axis=0)
            offset_target_split = mx.nd.split(data=offset_target, num_outputs=subdivision, axis=0)
            wh_target_split = mx.nd.split(data=wh_target, num_outputs=subdivision, axis=0)
            mask_target_split = mx.nd.split(data=mask_target, num_outputs=subdivision, axis=0)

            if subdivision == 1:
                image_split = [image_split]
                heatmap_split = [heatmap_split]
                offset_target_split = [offset_target_split]
                wh_target_split = [wh_target_split]
                mask_target_split = [mask_target_split]

            '''
            autograd tutorial:
            https://mxnet.apache.org/api/python/docs/tutorials/getting-started/crash-course/3-autograd.html
            '''
            with autograd.record(train_mode=True):
                heatmap_all_losses = []
                offset_all_losses = []
                wh_all_losses = []
                for image_part, heatmap_part, offset_target_part, wh_target_part, mask_target_part in zip(
                        image_split, heatmap_split, offset_target_split, wh_target_split, mask_target_split):
                    if GPU_COUNT <= 1:
                        image_part = gluon.utils.split_and_load(image_part, [ctx], even_split=False)
                        heatmap_part = gluon.utils.split_and_load(heatmap_part, [ctx], even_split=False)
                        offset_target_part = gluon.utils.split_and_load(offset_target_part, [ctx], even_split=False)
                        wh_target_part = gluon.utils.split_and_load(wh_target_part, [ctx], even_split=False)
                        mask_target_part = gluon.utils.split_and_load(mask_target_part, [ctx], even_split=False)
                    else:
                        image_part = gluon.utils.split_and_load(image_part, ctx, even_split=False)
                        heatmap_part = gluon.utils.split_and_load(heatmap_part, ctx, even_split=False)
                        offset_target_part = gluon.utils.split_and_load(offset_target_part, ctx, even_split=False)
                        wh_target_part = gluon.utils.split_and_load(wh_target_part, ctx, even_split=False)
                        mask_target_part = gluon.utils.split_and_load(mask_target_part, ctx, even_split=False)

                    # prediction, target space for data parallelism
                    heatmap_losses = []
                    offset_losses = []
                    wh_losses = []
                    total_loss = []

                    # code prepared for N GPUs (data parallelism)
                    for img, heatmap_target, offset_target, wh_target, mask_target in zip(
                            image_part, heatmap_part, offset_target_part, wh_target_part, mask_target_part):
                        heatmap_pred, offset_pred, wh_pred = net(img)
                        heatmap_loss = heatmapfocalloss(heatmap_pred, heatmap_target)
                        offset_loss = normedl1loss(offset_pred, offset_target, mask_target) * lambda_off
                        wh_loss = normedl1loss(wh_pred, wh_target, mask_target) * lambda_size

                        heatmap_losses.append(heatmap_loss.asscalar())
                        offset_losses.append(offset_loss.asscalar())
                        wh_losses.append(wh_loss.asscalar())

                        total_loss.append(heatmap_loss + offset_loss + wh_loss)

                    if AMP:
                        with amp.scale_loss(total_loss, trainer) as scaled_loss:
                            autograd.backward(scaled_loss)
                    else:
                        autograd.backward(total_loss)

                    heatmap_all_losses.append(sum(heatmap_losses))
                    offset_all_losses.append(sum(offset_losses))
                    wh_all_losses.append(sum(wh_losses))

            trainer.step(batch_size=td_batch_size, ignore_stale_grad=False)

            # clear the accumulated gradients
            for p in net.collect_params().values():
                p.zero_grad()

            heatmap_loss_sum += sum(heatmap_all_losses) / td_batch_size
            offset_loss_sum += sum(offset_all_losses) / td_batch_size
            wh_loss_sum += sum(wh_all_losses) / td_batch_size

            if batch_count % batch_log == 0:
                logging.info(f'[Epoch {i}][Batch {batch_count}/{train_update_number_per_epoch}],'
                             f'[Speed {td_batch_size / (time.time() - time_stamp):.3f} samples/sec],'
                             f'[Lr = {trainer.learning_rate}]'
                             f'[heatmap loss = {sum(heatmap_all_losses) / td_batch_size:.3f}]'
                             f'[offset loss = {sum(offset_all_losses) / td_batch_size:.3f}]'
                             f'[wh loss = {sum(wh_all_losses) / td_batch_size:.3f}]')
                time_stamp = time.time()

        train_heatmap_loss_mean = np.divide(heatmap_loss_sum, train_update_number_per_epoch)
        train_offset_loss_mean = np.divide(offset_loss_sum, train_update_number_per_epoch)
        train_wh_loss_mean = np.divide(wh_loss_sum, train_update_number_per_epoch)
        train_total_loss_mean = train_heatmap_loss_mean + train_offset_loss_mean + train_wh_loss_mean

        logging.info(
            f"train heatmap loss : {train_heatmap_loss_mean} / train offset loss : {train_offset_loss_mean} / "
            f"train wh loss : {train_wh_loss_mean} / train total loss : {train_total_loss_mean}")

        if i % eval_period == 0 and valid_list:
            heatmap_loss_sum = 0
            offset_loss_sum = 0
            wh_loss_sum = 0

            # compute the validation losses
            for image, label, heatmap_all, offset_target_all, wh_target_all, mask_target_all, _ in valid_dataloader:
                vd_batch_size = image.shape[0]
                if GPU_COUNT <= 1:
                    image = gluon.utils.split_and_load(image, [ctx], even_split=False)
                    label = gluon.utils.split_and_load(label, [ctx], even_split=False)
                    heatmap_split = gluon.utils.split_and_load(heatmap_all, [ctx], even_split=False)
                    offset_target_split = gluon.utils.split_and_load(offset_target_all, [ctx], even_split=False)
                    wh_target_split = gluon.utils.split_and_load(wh_target_all, [ctx], even_split=False)
                    mask_target_split = gluon.utils.split_and_load(mask_target_all, [ctx], even_split=False)
                else:
                    image = gluon.utils.split_and_load(image, ctx, even_split=False)
                    label = gluon.utils.split_and_load(label, ctx, even_split=False)
                    heatmap_split = gluon.utils.split_and_load(heatmap_all, ctx, even_split=False)
                    offset_target_split = gluon.utils.split_and_load(offset_target_all, ctx, even_split=False)
                    wh_target_split = gluon.utils.split_and_load(wh_target_all, ctx, even_split=False)
                    mask_target_split = gluon.utils.split_and_load(mask_target_all, ctx, even_split=False)

                # prediction, target space for data parallelism
                heatmap_losses = []
                offset_losses = []
                wh_losses = []

                # code prepared for N GPUs (data parallelism)
                for img, lb, heatmap_target, offset_target, wh_target, mask_target in zip(
                        image, label, heatmap_split, offset_target_split, wh_target_split, mask_target_split):
                    gt_box = lb[:, :, :4]
                    gt_id = lb[:, :, 4:5]

                    heatmap_pred, offset_pred, wh_pred = net(img)
                    id, score, bbox = prediction(heatmap_pred, offset_pred, wh_pred)
                    precision_recall.update(pred_bboxes=bbox,
                                            pred_labels=id,
                                            pred_scores=score,
                                            gt_boxes=gt_box * scale_factor,
                                            gt_labels=gt_id)

                    heatmap_loss = heatmapfocalloss(heatmap_pred, heatmap_target)
                    offset_loss = normedl1loss(offset_pred, offset_target, mask_target) * lambda_off
                    wh_loss = normedl1loss(wh_pred, wh_target, mask_target) * lambda_size

                    heatmap_losses.append(heatmap_loss.asscalar())
                    offset_losses.append(offset_loss.asscalar())
                    wh_losses.append(wh_loss.asscalar())

                heatmap_loss_sum += sum(heatmap_losses) / vd_batch_size
                offset_loss_sum += sum(offset_losses) / vd_batch_size
                wh_loss_sum += sum(wh_losses) / vd_batch_size

            valid_heatmap_loss_mean = np.divide(heatmap_loss_sum, valid_update_number_per_epoch)
            valid_offset_loss_mean = np.divide(offset_loss_sum, valid_update_number_per_epoch)
            valid_wh_loss_mean = np.divide(wh_loss_sum, valid_update_number_per_epoch)
            valid_total_loss_mean = valid_heatmap_loss_mean + valid_offset_loss_mean + valid_wh_loss_mean

            logging.info(
                f"valid heatmap loss : {valid_heatmap_loss_mean} / valid offset loss : {valid_offset_loss_mean} / "
                f"valid wh loss : {valid_wh_loss_mean} / valid total loss : {valid_total_loss_mean}")

            AP_appender = []
            round_position = 2
            class_name, precision, recall, true_positive, false_positive, threshold = precision_recall.get_PR_list()
            for j, c, p, r in zip(range(len(recall)), class_name, precision, recall):
                name, AP = precision_recall.get_AP(c, p, r)
                logging.info(f"class {j}'s {name} AP : {round(AP * 100, round_position)}%")
                AP_appender.append(AP)

            mAP_result = np.mean(AP_appender)
            logging.info(f"mAP : {round(mAP_result * 100, round_position)}%")
            precision_recall.get_PR_curve(name=class_name,
                                          precision=precision,
                                          recall=recall,
                                          threshold=threshold,
                                          AP=AP_appender,
                                          mAP=mAP_result,
                                          folder_name=valid_graph_path,
                                          epoch=i)
            precision_recall.reset()

            if tensorboard:
                # code prepared for N GPUs (data parallelism)
                dataloader_iter = iter(valid_dataloader)
                image, label, _, _, _, _, _ = next(dataloader_iter)
                if GPU_COUNT <= 1:
                    image = gluon.utils.split_and_load(image, [ctx], even_split=False)
                    label = gluon.utils.split_and_load(label, [ctx], even_split=False)
                else:
                    image = gluon.utils.split_and_load(image, ctx, even_split=False)
                    label = gluon.utils.split_and_load(label, ctx, even_split=False)

                ground_truth_colors = {}
                for k in range(num_classes):
                    ground_truth_colors[k] = (0, 0, 1)

                batch_image = []
                heatmap_image = []
                for img, lb in zip(image, label):
                    gt_boxes = lb[:, :, :4]
                    gt_ids = lb[:, :, 4:5]
                    heatmap_pred, offset_pred, wh_pred = net(img)
                    ids, scores, bboxes = prediction(heatmap_pred, offset_pred, wh_pred)

                    for ig, gt_id, gt_box, heatmap, id, score, bbox in zip(img, gt_ids, gt_boxes, heatmap_pred,
                                                                           ids, scores, bboxes):
                        ig = ig.transpose((1, 2, 0)) * mx.nd.array(std, ctx=ig.context) + mx.nd.array(
                            mean, ctx=ig.context)
                        ig = (ig * 255).clip(0, 255)

                        # draw the heatmap
                        heatmap = mx.nd.multiply(heatmap, 255.0)  # scale to the 0 ~ 255 range
                        heatmap = mx.nd.max(heatmap, axis=0, keepdims=True)  # take the max over the channel axis
                        heatmap = mx.nd.transpose(heatmap, axes=(1, 2, 0))  # (height, width, channel=1)
                        heatmap = mx.nd.repeat(heatmap, repeats=3, axis=-1)  # (height, width, channel=3)
                        heatmap = heatmap.asnumpy()  # mxnet.ndarray -> numpy.ndarray
                        heatmap = cv2.resize(heatmap, dsize=(input_size[1], input_size[0]))  # restore the size
                        heatmap = heatmap.astype("uint8")  # float32 -> uint8
                        heatmap = cv2.applyColorMap(heatmap, cv2.COLORMAP_JET)
                        heatmap[:, :, (0, 1, 2)] = heatmap[:, :, (2, 1, 0)]  # BGR -> RGB
                        heatmap = np.transpose(heatmap, axes=(2, 0, 1))  # (channel=3, height, width)

                        # draw the ground-truth boxes
                        ground_truth = plot_bbox(ig, gt_box * scale_factor,
                                                 scores=None,
                                                 labels=gt_id,
                                                 thresh=None,
                                                 reverse_rgb=True,
                                                 class_names=valid_dataset.classes,
                                                 absolute_coordinates=True,
                                                 colors=ground_truth_colors)
                        # draw the predicted boxes
                        prediction_box = plot_bbox(ground_truth, bbox,
                                                   scores=score,
                                                   labels=id,
                                                   thresh=plot_class_thresh,
                                                   reverse_rgb=False,
                                                   class_names=valid_dataset.classes,
                                                   absolute_coordinates=True)

                        # for TensorBoard: BGR -> RGB and (height, width, channel) -> (channel, height, width)
                        prediction_box = cv2.cvtColor(prediction_box, cv2.COLOR_BGR2RGB)
                        prediction_box = np.transpose(prediction_box, axes=(2, 0, 1))
                        batch_image.append(prediction_box)  # (batch, channel, height, width)
                        heatmap_image.append(heatmap)

                all_image = np.concatenate([np.array(batch_image), np.array(heatmap_image)], axis=-1)
                summary.add_image(tag="valid_result", image=all_image, global_step=i)

                summary.add_scalar(tag="heatmap_loss",
                                   value={"train_heatmap_loss_mean": train_heatmap_loss_mean,
                                          "valid_heatmap_loss_mean": valid_heatmap_loss_mean},
                                   global_step=i)
                summary.add_scalar(tag="offset_loss",
                                   value={"train_offset_loss_mean": train_offset_loss_mean,
                                          "valid_offset_loss_mean": valid_offset_loss_mean},
                                   global_step=i)
                summary.add_scalar(tag="wh_loss",
                                   value={"train_wh_loss_mean": train_wh_loss_mean,
                                          "valid_wh_loss_mean": valid_wh_loss_mean},
                                   global_step=i)
                summary.add_scalar(tag="total_loss",
                                   value={"train_total_loss": train_total_loss_mean,
                                          "valid_total_loss": valid_total_loss_mean},
                                   global_step=i)

                params = net.collect_params().values()
                if GPU_COUNT > 1:
                    for c in ctx:
                        for p in params:
                            summary.add_histogram(tag=p.name, values=p.data(ctx=c), global_step=i, bins='default')
                else:
                    for p in params:
                        summary.add_histogram(tag=p.name, values=p.data(), global_step=i, bins='default')

        if i % save_period == 0:
            if not os.path.exists(weight_path):
                os.makedirs(weight_path)

            '''
            Hybrid models can be serialized as JSON files using the export function.
            Export HybridBlock to json format that can be loaded by SymbolBlock.imports,
            mxnet.mod.Module or the C++ interface.
            When there is only one input, it will be named `data`. When there are multiple
            inputs, they will be named `data0`, `data1`, etc.
            '''
            if GPU_COUNT >= 1:
                context = mx.gpu(0)
            else:
                context = mx.cpu(0)

            postnet = PostNet(net=net, auxnet=prediction)  # a new object is created
            try:
                net.export(os.path.join(weight_path, f"{model}"), epoch=i, remove_amp_cast=True)
                net.save_parameters(os.path.join(weight_path, f"{i}.params"))  # for ONNX export
                # covers network inference, decoding, and NMS - convenient for the MXNet C++ API
                export_block_for_cplusplus(path=os.path.join(weight_path, f"{model}_prepost"),
                                           block=postnet,
                                           data_shape=tuple(input_size) + tuple((3,)),
                                           epoch=i,
                                           preprocess=True,  # in C++, feed in an image read by OpenCV as-is
                                           layout='HWC',
                                           ctx=context,
                                           remove_amp_cast=True)
            except Exception as E:
                logging.error(f"json, param model export failed : {E}")
            else:
                logging.info("json, param model export succeeded")
            net.collect_params().reset_ctx(ctx)

    end_time = time.time()
    learning_time = end_time - start_time
    logging.info(f"learning time : approx. {learning_time / 3600:0.2f}H")
    logging.info("optimization completed")

    if using_mlflow:
        ml.log_metric("learning time", round(learning_time / 3600, 2))
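# The subdivision mechanic above rests on two pieces: setting grad_req='add' so
# backward() accumulates gradients instead of overwriting them, and calling
# zero_grad() manually after each trainer.step(). A self-contained sketch of just
# that pattern (the toy network and data here are assumptions for illustration,
# not part of the training script above):
import mxnet as mx
from mxnet import autograd, gluon

toy = gluon.nn.Dense(1)
toy.initialize(ctx=mx.cpu())
for p in toy.collect_params().values():
    p.grad_req = 'add'  # accumulate gradients across sub-batches
toy_trainer = gluon.Trainer(toy.collect_params(), 'sgd', {'learning_rate': 0.1})

full_batch = mx.nd.ones((8, 4))
labels = mx.nd.ones((8, 1))
l2 = gluon.loss.L2Loss()
# process the batch in 4 sub-batches, backpropagating each one separately
for sub_x, sub_y in zip(mx.nd.split(full_batch, 4, axis=0), mx.nd.split(labels, 4, axis=0)):
    with autograd.record():
        sub_loss = l2(toy(sub_x), sub_y)
    sub_loss.backward()  # adds into the existing gradient buffers
toy_trainer.step(full_batch.shape[0])  # normalize by the full batch size
for p in toy.collect_params().values():
    p.zero_grad()  # required with grad_req='add', otherwise gradients keep growing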
import cv2
def run(mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
        anchor_alloc_size=[256, 256],
        anchor_sizes=[32, 64, 128, 256, 512],
        anchor_size_ratios=[1, pow(2, 1 / 3), pow(2, 2 / 3)],
        anchor_aspect_ratios=[0.5, 1, 2],
        anchor_box_clip=True,
        graphviz=True,
        epoch=100,
        input_size=[512, 512],
        batch_log=100,
        batch_size=16,
        batch_interval=10,
        subdivision=4,
        train_dataset_path="Dataset/train",
        valid_dataset_path="Dataset/valid",
        multiscale=True,
        factor_scale=[8, 5],
        foreground_iou_thresh=0.5,
        background_iou_thresh=0.4,
        data_augmentation=True,
        num_workers=4,
        optimizer="ADAM",
        weight_decay=0.000001,
        save_period=5,
        load_period=10,
        learning_rate=0.001,
        decay_lr=0.999,
        decay_step=10,
        GPU_COUNT=0,
        base=0,
        AMP=True,
        valid_size=8,
        eval_period=5,
        tensorboard=True,
        valid_graph_path="valid_Graph",
        valid_html_auto_open=True,
        using_mlflow=True,
        decode_number=5000,
        multiperclass=True,
        nms_thresh=0.5,
        nms_topk=500,
        iou_thresh=0.5,
        except_class_thresh=0.05,
        plot_class_thresh=0.5):
    if GPU_COUNT == 0:
        ctx = mx.cpu(0)
        AMP = False
    elif GPU_COUNT == 1:
        ctx = mx.gpu(0)
    else:
        ctx = [mx.gpu(i) for i in range(GPU_COUNT)]

    # check the operating system
    if platform.system() == "Linux":
        logging.info(f"{platform.system()} OS")
    elif platform.system() == "Windows":
        logging.info(f"{platform.system()} OS")
    else:
        logging.info(f"{platform.system()} OS")

    if isinstance(ctx, (list, tuple)):
        for i, c in enumerate(ctx):
            free_memory, total_memory = mx.context.gpu_memory_info(i)
            free_memory = round(free_memory / (1024 * 1024 * 1024), 2)
            total_memory = round(total_memory / (1024 * 1024 * 1024), 2)
            logging.info(f'Running on {c} / free memory : {free_memory}GB / total memory {total_memory}GB')
    else:
        if GPU_COUNT == 1:
            free_memory, total_memory = mx.context.gpu_memory_info(0)
            free_memory = round(free_memory / (1024 * 1024 * 1024), 2)
            total_memory = round(total_memory / (1024 * 1024 * 1024), 2)
            logging.info(f'Running on {ctx} / free memory : {free_memory}GB / total memory {total_memory}GB')
        else:
            logging.info(f'Running on {ctx}')

    if GPU_COUNT > 0 and batch_size < GPU_COUNT:
        logging.info("batch size must be greater than the number of gpus")
        exit(0)

    if AMP:
        amp.init()

    if multiscale:
        logging.info("Using MultiScale")

    if data_augmentation:
        logging.info("Using Data Augmentation")

    logging.info("training Efficient Detector")
    input_shape = (1, 3) + tuple(input_size)

    net = Efficient(version=base,
                    anchor_sizes=anchor_sizes,
                    anchor_size_ratios=anchor_size_ratios,
                    anchor_aspect_ratios=anchor_aspect_ratios,
                    anchor_box_clip=anchor_box_clip,
                    alloc_size=anchor_alloc_size,
                    ctx=mx.cpu())

    train_dataloader, train_dataset = traindataloader(multiscale=multiscale,
                                                      factor_scale=factor_scale,
                                                      augmentation=data_augmentation,
                                                      path=train_dataset_path,
                                                      input_size=input_size,
                                                      batch_size=batch_size,
                                                      batch_interval=batch_interval,
                                                      num_workers=num_workers,
                                                      shuffle=True,
                                                      mean=mean,
                                                      std=std,
                                                      net=net,
                                                      foreground_iou_thresh=foreground_iou_thresh,
                                                      background_iou_thresh=background_iou_thresh,
                                                      make_target=True)

    train_update_number_per_epoch = len(train_dataloader)
    if train_update_number_per_epoch < 1:
        logging.warning("train batch size is larger than the number of training samples")
        exit(0)

    valid_list = glob.glob(os.path.join(valid_dataset_path, "*"))
    if valid_list:
        valid_dataloader, valid_dataset = validdataloader(path=valid_dataset_path,
                                                          input_size=input_size,
                                                          batch_size=valid_size,
                                                          num_workers=num_workers,
                                                          shuffle=True,
                                                          mean=mean,
                                                          std=std,
                                                          net=net,
                                                          foreground_iou_thresh=foreground_iou_thresh,
                                                          background_iou_thresh=background_iou_thresh,
                                                          make_target=True)
        valid_update_number_per_epoch = len(valid_dataloader)
        if valid_update_number_per_epoch < 1:
            logging.warning("valid batch size is larger than the number of validation samples")
            exit(0)

    num_classes = train_dataset.num_class  # number of classes
    name_classes = train_dataset.classes
    optimizer = optimizer.upper()

    model = str(input_size[0]) + "_" + str(input_size[1]) + "_" + optimizer + "_EFF_" + str(base)
    weight_path = os.path.join("weights", f"{model}")
    sym_path = os.path.join(weight_path, f'{model}-symbol.json')
    param_path = os.path.join(weight_path, f'{model}-{load_period:04d}.params')
    optimizer_path = os.path.join(weight_path, f'{model}-{load_period:04d}.opt')

    if os.path.exists(param_path) and os.path.exists(sym_path):
        start_epoch = load_period
        logging.info(f"loading {os.path.basename(param_path)}\n")
        net = gluon.SymbolBlock.imports(sym_path, ['data'], param_path, ctx=ctx)
    else:
        start_epoch = 0
        net = Efficient(version=base,
                        input_size=input_size,
                        anchor_sizes=anchor_sizes,
                        anchor_size_ratios=anchor_size_ratios,
                        anchor_aspect_ratios=anchor_aspect_ratios,
                        num_classes=num_classes,  # foreground classes only
                        anchor_box_clip=anchor_box_clip,
                        alloc_size=anchor_alloc_size,
                        ctx=ctx)

    if isinstance(ctx, (list, tuple)):
        net.summary(mx.nd.ones(shape=input_shape, ctx=ctx[0]))
    else:
        net.summary(mx.nd.ones(shape=input_shape, ctx=ctx))

    '''
    active (bool, default True) - Whether to turn hybrid on or off.
    static_alloc (bool, default False) - Statically allocate memory to improve speed. Memory usage may increase.
    static_shape (bool, default False) - Optimize for invariant input shapes between iterations. Must also set
    static_alloc to True. Change of input shapes is still allowed but slower.
    '''
    if multiscale:
        net.hybridize(active=True, static_alloc=True, static_shape=False)
    else:
        net.hybridize(active=True, static_alloc=True, static_shape=True)

    if start_epoch + 1 >= epoch + 1:
        logging.info("this model has already been optimized")
        exit(0)

    if tensorboard:
        summary = SummaryWriter(logdir=os.path.join("mxboard", model), max_queue=10, flush_secs=10, verbose=False)
        if isinstance(ctx, (list, tuple)):
            net.forward(mx.nd.ones(shape=input_shape, ctx=ctx[0]))
        else:
            net.forward(mx.nd.ones(shape=input_shape, ctx=ctx))
        summary.add_graph(net)
    if graphviz:
        gluoncv.utils.viz.plot_network(net, shape=input_shape, save_prefix=model)

    # optimizer
    unit = 1 if (len(train_dataset) // batch_size) < 1 else len(train_dataset) // batch_size
    step = unit * decay_step
    lr_sch = mx.lr_scheduler.FactorScheduler(step=step, factor=decay_lr, stop_factor_lr=1e-12, base_lr=learning_rate)

    # accumulate gradients across the subdivisions; cleared manually after each trainer.step
    for p in net.collect_params().values():
        if p.grad_req != "null":
            p.grad_req = 'add'

    '''
    update_on_kvstore : bool, default None
    Whether to perform parameter updates on kvstore. If None, then trainer will choose the more
    suitable option depending on the type of kvstore. If the `update_on_kvstore` argument is
    provided, the environment variable `MXNET_UPDATE_ON_KVSTORE` will be ignored.
    '''
    if optimizer.upper() == "ADAM":
        trainer = gluon.Trainer(net.collect_params(),
                                optimizer,
                                optimizer_params={"learning_rate": learning_rate,
                                                  "lr_scheduler": lr_sch,
                                                  "wd": weight_decay,
                                                  "beta1": 0.9,
                                                  "beta2": 0.999,
                                                  'multi_precision': False},
                                update_on_kvstore=False if AMP else None)  # for dynamic loss scaling
    elif optimizer.upper() == "RMSPROP":
        trainer = gluon.Trainer(net.collect_params(),
                                optimizer,
                                optimizer_params={"learning_rate": learning_rate,
                                                  "lr_scheduler": lr_sch,
                                                  "wd": weight_decay,
                                                  "gamma1": 0.9,
                                                  "gamma2": 0.999,
                                                  'multi_precision': False},
                                update_on_kvstore=False if AMP else None)  # for dynamic loss scaling
    elif optimizer.upper() == "SGD":
        trainer = gluon.Trainer(net.collect_params(),
                                optimizer,
                                optimizer_params={"learning_rate": learning_rate,
                                                  "lr_scheduler": lr_sch,
                                                  "wd": weight_decay,
                                                  "momentum": 0.9,
                                                  'multi_precision': False},
                                update_on_kvstore=False if AMP else None)  # for dynamic loss scaling
    else:
        logging.error("optimizer not selected")
        exit(0)

    if AMP:
        amp.init_trainer(trainer)

    # load the optimizer state
    if os.path.exists(optimizer_path):
        try:
            trainer.load_states(optimizer_path)
        except Exception as E:
            logging.info(E)
        else:
            logging.info(f"loading {os.path.basename(optimizer_path)}\n")

    '''
    localization loss -> Smooth L1 loss
    confidence loss -> Focal loss
    '''
    confidence_loss = FocalLoss(alpha=0.25,
                                gamma=2,
                                sparse_label=True,
                                from_sigmoid=False,
                                batch_axis=None,
                                num_class=num_classes,
                                reduction="sum",
                                exclude=False)
    localization_loss = HuberLoss(rho=1, batch_axis=None, reduction="sum", exclude=False)

    prediction = Prediction(batch_size=batch_size,
                            from_sigmoid=False,
                            num_classes=num_classes,
                            decode_number=decode_number,
                            nms_thresh=nms_thresh,
                            nms_topk=nms_topk,
                            except_class_thresh=except_class_thresh,
                            multiperclass=multiperclass)
    precision_recall = Voc_2007_AP(iou_thresh=iou_thresh, class_names=name_classes)

    ctx_list = ctx if isinstance(ctx, (list, tuple)) else [ctx]
    start_time = time.time()

    for i in tqdm(range(start_epoch + 1, epoch + 1, 1), initial=start_epoch + 1, total=epoch):
        conf_loss_sum = 0
        loc_loss_sum = 0
        time_stamp = time.time()

        for batch_count, (image, _, cls_all, box_all, _) in enumerate(train_dataloader, start=1):
            td_batch_size = image.shape[0]

            image = mx.nd.split(data=image, num_outputs=subdivision, axis=0)
            cls_all = mx.nd.split(data=cls_all, num_outputs=subdivision, axis=0)
            box_all = mx.nd.split(data=box_all, num_outputs=subdivision, axis=0)

            if subdivision == 1:
                image = [image]
                cls_all = [cls_all]
                box_all = [box_all]

            '''
            autograd tutorial:
            https://mxnet.apache.org/api/python/docs/tutorials/getting-started/crash-course/3-autograd.html
            '''
            with autograd.record(train_mode=True):
                cls_all_losses = []
                box_all_losses = []
                for image_split, cls_split, box_split in zip(image, cls_all, box_all):
                    image_split = gluon.utils.split_and_load(image_split, ctx_list, even_split=False)
                    cls_split = gluon.utils.split_and_load(cls_split, ctx_list, even_split=False)
                    box_split = gluon.utils.split_and_load(box_split, ctx_list, even_split=False)

                    # prediction, target space for data parallelism
                    cls_losses = []
                    box_losses = []
                    total_loss = []

                    # code prepared for N GPUs (data parallelism)
                    for img, cls_target, box_target in zip(image_split, cls_split, box_split):
                        cls_pred, box_pred, anchor = net(img)
                        except_ignore_samples = cls_target > -1
                        positive_samples = cls_target > 0
                        positive_numbers = positive_samples.sum()

                        conf_loss = confidence_loss(cls_pred, cls_target,
                                                    except_ignore_samples.expand_dims(axis=-1))
                        conf_loss = mx.nd.divide(conf_loss, positive_numbers + 1)
                        cls_losses.append(conf_loss.asscalar())

                        loc_loss = localization_loss(box_pred, box_target,
                                                     positive_samples.expand_dims(axis=-1))
                        box_losses.append(loc_loss.asscalar())

                        total_loss.append(conf_loss + loc_loss)

                    if AMP:
                        with amp.scale_loss(total_loss, trainer) as scaled_loss:
                            autograd.backward(scaled_loss)
                    else:
                        autograd.backward(total_loss)

                    cls_all_losses.append(sum(cls_losses))
                    box_all_losses.append(sum(box_losses))

            trainer.step(batch_size=td_batch_size, ignore_stale_grad=False)

            # clear the accumulated gradients
            for p in net.collect_params().values():
                p.zero_grad()

            conf_loss_sum += sum(cls_all_losses) / td_batch_size
            loc_loss_sum += sum(box_all_losses) / td_batch_size

            if batch_count % batch_log == 0:
                logging.info(f'[Epoch {i}][Batch {batch_count}/{train_update_number_per_epoch}],'
                             f'[Speed {td_batch_size / (time.time() - time_stamp):.3f} samples/sec],'
                             f'[Lr = {trainer.learning_rate}]'
                             f'[confidence loss = {sum(cls_all_losses) / td_batch_size:.3f}]'
                             f'[localization loss = {sum(box_all_losses) / td_batch_size:.3f}]')
                time_stamp = time.time()

        train_conf_loss_mean = np.divide(conf_loss_sum, train_update_number_per_epoch)
        train_loc_loss_mean = np.divide(loc_loss_sum, train_update_number_per_epoch)
        train_total_loss_mean = train_conf_loss_mean + train_loc_loss_mean
        logging.info(
            f"train confidence loss : {train_conf_loss_mean} / train localization loss : {train_loc_loss_mean} / "
            f"train total loss : {train_total_loss_mean}")

        if i % save_period == 0:
            weight_epoch_path = os.path.join(weight_path, str(i))
            if not os.path.exists(weight_epoch_path):
                os.makedirs(weight_epoch_path)

            # save the optimizer state
            try:
                trainer.save_states(os.path.join(weight_path, f'{model}-{i:04d}.opt'))
            except Exception as E:
                logging.error(f"optimizer weight export failed : {E}")
            else:
                logging.info("optimizer weight export succeeded")

            '''
            Hybrid models can be serialized as JSON files using the export function.
            Export HybridBlock to json format that can be loaded by SymbolBlock.imports,
            mxnet.mod.Module or the C++ interface.
            When there is only one input, it will be named `data`. When there are multiple
            inputs, they will be named `data0`, `data1`, etc.
            '''
            if GPU_COUNT >= 1:
                context = mx.gpu(0)
            else:
                context = mx.cpu(0)

            '''
            In MXNet 1.6.0 with AMP, reusing the `prediction` object declared above can cause
            problems - it does in yolo v3 and gaussian yolo v3. In MXNet 1.5.x it works without
            re-declaring. Blocks passed as function arguments are passed by reference, not copied,
            so when export_block_for_cplusplus hybridizes its prediction argument, the previously
            declared `prediction` is also hybridized into symbol form. Given this behavior,
            re-declaring it as below seems to be the right approach.
            '''
            auxnet = Prediction(from_sigmoid=False,
                                num_classes=num_classes,
                                decode_number=decode_number,
                                nms_thresh=nms_thresh,
                                nms_topk=nms_topk,
                                except_class_thresh=except_class_thresh,
                                multiperclass=multiperclass)
            postnet = PostNet(net=net, auxnet=auxnet)

            try:
                net.export(os.path.join(weight_path, f"{model}"), epoch=i, remove_amp_cast=True)
                net.save_parameters(os.path.join(weight_path, f"{i}.params"))  # for ONNX export
                # covers network inference, decoding, and NMS - convenient for the MXNet C++ API
                export_block_for_cplusplus(path=os.path.join(weight_epoch_path, f"{model}_prepost"),
                                           block=postnet,
                                           data_shape=tuple(input_size) + tuple((3,)),
                                           epoch=i,
                                           preprocess=True,  # in C++, feed in an image read by OpenCV as-is
                                           layout='HWC',
                                           ctx=context,
                                           remove_amp_cast=True)
            except Exception as E:
                logging.error(f"json, param model export failed : {E}")
            else:
                logging.info("json, param model export succeeded")
            net.collect_params().reset_ctx(ctx)

        if i % eval_period == 0 and valid_list:
            conf_loss_sum = 0
            loc_loss_sum = 0

            # compute the validation losses
            for image, label, cls_all, box_all, _ in valid_dataloader:
                vd_batch_size = image.shape[0]
                image = gluon.utils.split_and_load(image, ctx_list, even_split=False)
                label = gluon.utils.split_and_load(label, ctx_list, even_split=False)
                cls_all = gluon.utils.split_and_load(cls_all, ctx_list, even_split=False)
                box_all = gluon.utils.split_and_load(box_all, ctx_list, even_split=False)

                # prediction, target space for data parallelism
                cls_losses = []
                box_losses = []

                # code prepared for N GPUs (data parallelism)
                for img, lb, cls_target, box_target in zip(image, label, cls_all, box_all):
                    gt_box = lb[:, :, :4]
                    gt_id = lb[:, :, 4:5]

                    cls_pred, box_pred, anchor = net(img)
                    id, score, bbox = prediction(cls_pred, box_pred, anchor)
                    precision_recall.update(pred_bboxes=bbox,
                                            pred_labels=id,
                                            pred_scores=score,
                                            gt_boxes=gt_box,
                                            gt_labels=gt_id)

                    except_ignore_samples = cls_target > -1
                    positive_samples = cls_target > 0
                    positive_numbers = positive_samples.sum()

                    conf_loss = confidence_loss(cls_pred, cls_target,
                                                except_ignore_samples.expand_dims(axis=-1))
                    conf_loss = mx.nd.divide(conf_loss, positive_numbers + 1)
                    cls_losses.append(conf_loss.asscalar())

                    loc_loss = localization_loss(box_pred, box_target,
                                                 positive_samples.expand_dims(axis=-1))
                    box_losses.append(loc_loss.asscalar())

                conf_loss_sum += sum(cls_losses) / vd_batch_size
                loc_loss_sum += sum(box_losses) / vd_batch_size

            valid_conf_loss_mean = np.divide(conf_loss_sum, valid_update_number_per_epoch)
            valid_loc_loss_mean = np.divide(loc_loss_sum, valid_update_number_per_epoch)
            valid_total_loss_mean = valid_conf_loss_mean + valid_loc_loss_mean
            logging.info(
                f"valid confidence loss : {valid_conf_loss_mean} / valid localization loss : {valid_loc_loss_mean} / "
                f"valid total loss : {valid_total_loss_mean}")

            AP_appender = []
            round_position = 2
            class_name, precision, recall, true_positive, false_positive, threshold = precision_recall.get_PR_list()
            for j, c, p, r in zip(range(len(recall)), class_name, precision, recall):
                name, AP = precision_recall.get_AP(c, p, r)
                logging.info(f"class {j}'s {name} AP : {round(AP * 100, round_position)}%")
                AP_appender.append(AP)

            AP_appender = np.nan_to_num(AP_appender)
            mAP_result = np.mean(AP_appender)
            logging.info(f"mAP : {round(mAP_result * 100, round_position)}%")
            precision_recall.get_PR_curve(name=class_name,
                                          precision=precision,
                                          recall=recall,
                                          threshold=threshold,
                                          AP=AP_appender,
                                          mAP=mAP_result,
                                          folder_name=valid_graph_path,
                                          epoch=i,
                                          auto_open=valid_html_auto_open)
            precision_recall.reset()

            if tensorboard:
                # code prepared for N GPUs (data parallelism)
                dataloader_iter = iter(valid_dataloader)
                image, label, _, _, _ = next(dataloader_iter)
                image = gluon.utils.split_and_load(image, ctx_list, even_split=False)
                label = gluon.utils.split_and_load(label, ctx_list, even_split=False)

                ground_truth_colors = {}
                for k in range(num_classes):
                    ground_truth_colors[k] = (0, 1, 0)

                batch_image = []
                for img, lb in zip(image, label):
                    gt_boxes = lb[:, :, :4]
                    gt_ids = lb[:, :, 4:5]
                    cls_pred, box_pred, anchor = net(img)
                    ids, scores, bboxes = prediction(cls_pred, box_pred, anchor)

                    for ig, gt_id, gt_box, id, score, bbox in zip(img, gt_ids, gt_boxes, ids, scores, bboxes):
                        ig = ig.transpose((1, 2, 0)) * mx.nd.array(std, ctx=ig.context) + mx.nd.array(
                            mean, ctx=ig.context)
                        ig = (ig * 255).clip(0, 255)
                        ig = ig.astype(np.uint8)

                        # draw the ground-truth boxes
                        ground_truth = plot_bbox(ig, gt_box,
                                                 scores=None,
                                                 labels=gt_id,
                                                 thresh=None,
                                                 reverse_rgb=False,
                                                 class_names=valid_dataset.classes,
                                                 absolute_coordinates=True,
                                                 colors=ground_truth_colors)
                        # draw the predicted boxes
                        prediction_box = plot_bbox(ground_truth, bbox,
                                                   scores=score,
                                                   labels=id,
                                                   thresh=plot_class_thresh,
                                                   reverse_rgb=False,
                                                   class_names=valid_dataset.classes,
                                                   absolute_coordinates=True)

                        # for TensorBoard: (height, width, channel) -> (channel, height, width)
                        prediction_box = np.transpose(prediction_box, axes=(2, 0, 1))
                        batch_image.append(prediction_box)  # (batch, channel, height, width)

                summary.add_image(tag="valid_result", image=np.array(batch_image), global_step=i)

                summary.add_scalar(tag="conf_loss",
                                   value={"train_conf_loss": train_conf_loss_mean,
                                          "valid_conf_loss": valid_conf_loss_mean},
                                   global_step=i)
                summary.add_scalar(tag="loc_loss",
                                   value={"train_loc_loss": train_loc_loss_mean,
                                          "valid_loc_loss": valid_loc_loss_mean},
                                   global_step=i)
                summary.add_scalar(tag="total_loss",
                                   value={"train_total_loss": train_total_loss_mean,
                                          "valid_total_loss": valid_total_loss_mean},
                                   global_step=i)

                for p in net.collect_params().values():
                    summary.add_histogram(tag=p.name, values=p.data(ctx=ctx_list[0]), global_step=i, bins='default')

    end_time = time.time()
    learning_time = end_time - start_time
    logging.info(f"learning time : approx. {learning_time / 3600:0.2f}H")
    logging.info("optimization completed")

    if using_mlflow:
        ml.log_metric("learning time", round(learning_time / 3600, 2))
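# All three training scripts decay the learning rate with mx.lr_scheduler.FactorScheduler:
# roughly every `step` updates, the rate is multiplied by `factor`, floored at
# stop_factor_lr. A small check of that arithmetic (the numbers are illustrative
# only; the scheduler is stateful and expects increasing update counts):
import mxnet as mx

sched = mx.lr_scheduler.FactorScheduler(step=100, factor=0.5, stop_factor_lr=1e-12, base_lr=0.001)
print(sched(1))    # 0.001
print(sched(101))  # 0.0005   (decayed once after the first 100 updates)
print(sched(201))  # 0.00025  (decayed twice)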
def run(mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
        offset_alloc_size=(64, 64),
        anchors={"shallow": [(10, 13), (16, 30), (33, 23)],
                 "middle": [(30, 61), (62, 45), (59, 119)],
                 "deep": [(116, 90), (156, 198), (373, 326)]},
        graphviz=False,
        epoch=100,
        input_size=[416, 416],
        batch_log=100,
        batch_size=16,
        batch_interval=10,
        subdivision=4,
        train_dataset_path="Dataset/train",
        valid_dataset_path="Dataset/valid",
        multiscale=False,
        factor_scale=[13, 5],
        ignore_threshold=0.5,
        dynamic=False,
        data_augmentation=True,
        num_workers=4,
        optimizer="ADAM",
        save_period=5,
        load_period=10,
        learning_rate=0.001,
        decay_lr=0.999,
        decay_step=10,
        GPU_COUNT=0,
        Darknetlayer=53,
        pretrained_base=True,
        pretrained_path="modelparam",
        AMP=True,
        valid_size=8,
        eval_period=5,
        tensorboard=True,
        valid_graph_path="valid_Graph",
        using_mlflow=True,
        multiperclass=True,
        nms_thresh=0.5,
        nms_topk=500,
        iou_thresh=0.5,
        except_class_thresh=0.05,
        plot_class_thresh=0.5):
    if GPU_COUNT == 0:
        ctx = mx.cpu(0)
        AMP = False
    elif GPU_COUNT == 1:
        ctx = mx.gpu(0)
    else:
        ctx = [mx.gpu(i) for i in range(GPU_COUNT)]

    # check the operating system
    if platform.system() == "Linux":
        logging.info(f"{platform.system()} OS")
    elif platform.system() == "Windows":
        logging.info(f"{platform.system()} OS")
    else:
        logging.info(f"{platform.system()} OS")

    if isinstance(ctx, (list, tuple)):
        for i, c in enumerate(ctx):
            free_memory, total_memory = mx.context.gpu_memory_info(i)
            free_memory = round(free_memory / (1024 * 1024 * 1024), 2)
            total_memory = round(total_memory / (1024 * 1024 * 1024), 2)
            logging.info(f'Running on {c} / free memory : {free_memory}GB / total memory {total_memory}GB')
    else:
        if GPU_COUNT == 1:
            free_memory, total_memory = mx.context.gpu_memory_info(0)
            free_memory = round(free_memory / (1024 * 1024 * 1024), 2)
            total_memory = round(total_memory / (1024 * 1024 * 1024), 2)
            logging.info(f'Running on {ctx} / free memory : {free_memory}GB / total memory {total_memory}GB')
        else:
            logging.info(f'Running on {ctx}')

    # require the input size to be a multiple of 32 so the strides stay consistent
    if input_size[0] % 32 != 0 or input_size[1] % 32 != 0:
        logging.info("The input size must be a multiple of 32")
        exit(0)

    if GPU_COUNT > 0 and batch_size < GPU_COUNT:
        logging.info("batch size must be greater than the number of gpus")
        exit(0)

    if AMP:
        amp.init()

    if multiscale:
        logging.info("Using MultiScale")

    if data_augmentation:
        logging.info("Using Data Augmentation")

    logging.info("training YoloV3 Detector")
    input_shape = (1, 3) + tuple(input_size)

    try:
        net = Yolov3(Darknetlayer=Darknetlayer,
                     anchors=anchors,
                     pretrained=False,
                     ctx=mx.cpu())
        train_dataloader, train_dataset = traindataloader(multiscale=multiscale,
                                                          factor_scale=factor_scale,
                                                          augmentation=data_augmentation,
                                                          path=train_dataset_path,
                                                          input_size=input_size,
                                                          batch_size=batch_size,
                                                          batch_interval=batch_interval,
                                                          num_workers=num_workers,
                                                          shuffle=True,
                                                          mean=mean,
                                                          std=std,
                                                          net=net,
                                                          ignore_threshold=ignore_threshold,
                                                          dynamic=dynamic,
                                                          from_sigmoid=False,
                                                          make_target=True)
        valid_dataloader, valid_dataset = validdataloader(path=valid_dataset_path,
                                                          input_size=input_size,
                                                          batch_size=valid_size,
                                                          num_workers=num_workers,
                                                          shuffle=True,
                                                          mean=mean,
                                                          std=std,
                                                          net=net,
                                                          ignore_threshold=ignore_threshold,
                                                          dynamic=dynamic,
                                                          from_sigmoid=False,
                                                          make_target=True)
    except Exception:
        logging.info("dataset not found")
        exit(0)

    train_update_number_per_epoch = len(train_dataloader)
    if train_update_number_per_epoch < 1:
        logging.warning("train batch size is larger than the number of training samples")
        exit(0)

    valid_list = glob.glob(os.path.join(valid_dataset_path, "*"))
    if valid_list:
        valid_update_number_per_epoch = len(valid_dataloader)
        if valid_update_number_per_epoch < 1:
            logging.warning("valid batch size is larger than the number of validation samples")
            exit(0)

    num_classes = train_dataset.num_class  # number of classes
    name_classes = train_dataset.classes
    optimizer = optimizer.upper()

    if pretrained_base:
        model = str(input_size[0]) + "_" + str(input_size[1]) + "_" + optimizer + "_P" + "Dark_" + str(Darknetlayer)
    else:
        model = str(input_size[0]) + "_" + str(input_size[1]) + "_" + optimizer + "_Dark_" + str(Darknetlayer)

    weight_path = f"weights/{model}"
    sym_path = os.path.join(weight_path, f'{model}-symbol.json')
    param_path = os.path.join(weight_path, f'{model}-{load_period:04d}.params')

    if os.path.exists(param_path) and os.path.exists(sym_path):
        start_epoch = load_period
        logging.info(f"loading {os.path.basename(param_path)} weights\n")
        net = gluon.SymbolBlock.imports(sym_path, ['data'], param_path, ctx=ctx)
    else:
        start_epoch = 0
        '''
        Strategy for accepting arbitrary input images in the MXNet C++ API.
        alloc_size : tuple of int, default is (128, 128)
        For advanced users. Define `alloc_size` to generate large enough offset maps,
        which will later be saved in the parameters. During inference, we support
        arbitrary input images by cropping the corresponding area of the anchor map.
        This allows us to export to symbol so we can run it in C++, Scala, etc.
        '''
        net = Yolov3(Darknetlayer=Darknetlayer,
                     input_size=input_size,
                     anchors=anchors,
                     num_classes=num_classes,  # foreground classes only
                     pretrained=pretrained_base,
                     pretrained_path=pretrained_path,
                     alloc_size=offset_alloc_size,
                     ctx=ctx)

    if isinstance(ctx, (list, tuple)):
        net.summary(mx.nd.ones(shape=input_shape, ctx=ctx[0]))
    else:
        net.summary(mx.nd.ones(shape=input_shape, ctx=ctx))

    '''
    active (bool, default True) - Whether to turn hybrid on or off.
    static_alloc (bool, default False) - Statically allocate memory to improve speed. Memory usage may increase.
    static_shape (bool, default False) - Optimize for invariant input shapes between iterations. Must also set
    static_alloc to True. Change of input shapes is still allowed but slower.
    '''
    if multiscale:
        net.hybridize(active=True, static_alloc=True, static_shape=False)
    else:
        net.hybridize(active=True, static_alloc=True, static_shape=True)

    if start_epoch + 1 >= epoch + 1:
        logging.info("this model has already been optimized")
        exit(0)

    if tensorboard:
        summary = SummaryWriter(logdir=os.path.join("mxboard", model), max_queue=10, flush_secs=10, verbose=False)
        if isinstance(ctx, (list, tuple)):
            net.forward(mx.nd.ones(shape=input_shape, ctx=ctx[0]))
        else:
            net.forward(mx.nd.ones(shape=input_shape, ctx=ctx))
        summary.add_graph(net)
    if graphviz:
        gluoncv.utils.viz.plot_network(net, shape=input_shape, save_prefix=model)

    # optimizer
    unit = 1 if (len(train_dataset) // batch_size) < 1 else len(train_dataset) // batch_size
    step = unit * decay_step
    lr_sch = mx.lr_scheduler.FactorScheduler(step=step, factor=decay_lr, stop_factor_lr=1e-12, base_lr=learning_rate)

    # accumulate gradients across the subdivisions; cleared manually after each trainer.step
    for p in net.collect_params().values():
        if p.grad_req != "null":
            p.grad_req = 'add'

    if AMP:
        '''
        update_on_kvstore : bool, default None
        Whether to perform parameter updates on kvstore. If None, then trainer will choose the more
        suitable option depending on the type of kvstore. If the `update_on_kvstore` argument is
        provided, the environment variable `MXNET_UPDATE_ON_KVSTORE` will be ignored.
        '''
        if optimizer.upper() == "ADAM":
            trainer = gluon.Trainer(net.collect_params(),
                                    optimizer,
                                    optimizer_params={"learning_rate": learning_rate,
                                                      "lr_scheduler": lr_sch,
                                                      "beta1": 0.9,
                                                      "beta2": 0.999,
                                                      'multi_precision': False},
                                    update_on_kvstore=False)  # for dynamic loss scaling
        elif optimizer.upper() == "RMSPROP":
            trainer = gluon.Trainer(net.collect_params(),
                                    optimizer,
                                    optimizer_params={"learning_rate": learning_rate,
                                                      "lr_scheduler": lr_sch,
                                                      "gamma1": 0.9,
                                                      "gamma2": 0.999,
                                                      'multi_precision': False},
                                    update_on_kvstore=False)  # for dynamic loss scaling
        elif optimizer.upper() == "SGD":
            trainer = gluon.Trainer(net.collect_params(),
                                    optimizer,
                                    optimizer_params={"learning_rate": learning_rate,
                                                      "lr_scheduler": lr_sch,
                                                      "wd": 0.0005,
                                                      "momentum": 0.9,
                                                      'multi_precision': False},
                                    update_on_kvstore=False)  # for dynamic loss scaling
        else:
            logging.error("optimizer not selected")
            exit(0)
        amp.init_trainer(trainer)
    else:
        if optimizer.upper() == "ADAM":
            trainer = gluon.Trainer(net.collect_params(),
                                    optimizer,
                                    optimizer_params={"learning_rate": learning_rate,
                                                      "lr_scheduler": lr_sch,
                                                      "beta1": 0.9,
                                                      "beta2": 0.999,
                                                      'multi_precision': False})
        elif optimizer.upper() == "RMSPROP":
            trainer = gluon.Trainer(net.collect_params(),
                                    optimizer,
                                    optimizer_params={"learning_rate": learning_rate,
                                                      "lr_scheduler": lr_sch,
                                                      "gamma1": 0.9,
                                                      "gamma2": 0.999,
                                                      'multi_precision': False})
        elif optimizer.upper() == "SGD":
            trainer = gluon.Trainer(net.collect_params(),
                                    optimizer,
                                    optimizer_params={"learning_rate": learning_rate,
                                                      "lr_scheduler": lr_sch,
                                                      "wd": 0.0005,
                                                      "momentum": 0.9,
                                                      'multi_precision': False})
        else:
            logging.error("optimizer not selected")
            exit(0)

    loss = Yolov3Loss(sparse_label=True,
                      from_sigmoid=False,
                      batch_axis=None,
                      num_classes=num_classes,
                      reduction="sum",
                      exclude=False)
    prediction = Prediction(from_sigmoid=False,
                            num_classes=num_classes,
                            nms_thresh=nms_thresh,
                            nms_topk=nms_topk,
                            except_class_thresh=except_class_thresh,
                            multiperclass=multiperclass)
    precision_recall = Voc_2007_AP(iou_thresh=iou_thresh, class_names=name_classes)

    start_time = time.time()
    for i in tqdm(range(start_epoch + 1, epoch + 1, 1), initial=start_epoch + 1, total=epoch):
        xcyc_loss_sum = 0
        wh_loss_sum = 0
        object_loss_sum = 0
        class_loss_sum = 0
        time_stamp = time.time()

        for batch_count, (image, _, xcyc_all, wh_all, objectness_all, class_all, weights_all, _) in enumerate(
                train_dataloader, start=1):
            td_batch_size = image.shape[0]

            image = mx.nd.split(data=image, num_outputs=subdivision, axis=0)
            xcyc_all = mx.nd.split(data=xcyc_all, num_outputs=subdivision, axis=0)
            wh_all = mx.nd.split(data=wh_all, num_outputs=subdivision, axis=0)
            objectness_all = mx.nd.split(data=objectness_all, num_outputs=subdivision, axis=0)
            class_all = mx.nd.split(data=class_all, num_outputs=subdivision, axis=0)
            weights_all = mx.nd.split(data=weights_all, num_outputs=subdivision, axis=0)

            if subdivision == 1:
                image = [image]
                xcyc_all = [xcyc_all]
                wh_all = [wh_all]
                objectness_all = [objectness_all]
                class_all = [class_all]
                weights_all = [weights_all]

            '''
            autograd tutorial:
            https://mxnet.apache.org/api/python/docs/tutorials/getting-started/crash-course/3-autograd.html
            '''
            with autograd.record(train_mode=True):
                xcyc_all_losses = []
                wh_all_losses = []
                object_all_losses = []
                class_all_losses = []
                for image_split, xcyc_split, wh_split, objectness_split, class_split, weights_split in zip(
                        image, xcyc_all, wh_all, objectness_all, class_all, weights_all):
                    if GPU_COUNT <= 1:
                        image_split = gluon.utils.split_and_load(image_split, [ctx], even_split=False)
                        xcyc_split = gluon.utils.split_and_load(xcyc_split,
[ctx], even_split=False) wh_split = gluon.utils.split_and_load(wh_split, [ctx], even_split=False) objectness_split = gluon.utils.split_and_load(objectness_split, [ctx], even_split=False) class_split = gluon.utils.split_and_load(class_split, [ctx], even_split=False) weights_split = gluon.utils.split_and_load(weights_split, [ctx], even_split=False) else: image_split = gluon.utils.split_and_load(image_split, ctx, even_split=False) xcyc_split = gluon.utils.split_and_load(xcyc_split, ctx, even_split=False) wh_split = gluon.utils.split_and_load(wh_split, ctx, even_split=False) objectness_split = gluon.utils.split_and_load(objectness_split, ctx, even_split=False) class_split = gluon.utils.split_and_load(class_split, ctx, even_split=False) weights_split = gluon.utils.split_and_load(weights_split, ctx, even_split=False) xcyc_losses = [] wh_losses = [] object_losses = [] class_losses = [] total_loss = [] # gpu N 개를 대비한 코드 (Data Parallelism) for img, xcyc_target, wh_target, objectness, class_target, weights in zip(image_split, xcyc_split, wh_split, objectness_split, class_split, weights_split): output1, output2, output3, anchor1, anchor2, anchor3, offset1, offset2, offset3, stride1, stride2, stride3 = net( img) xcyc_loss, wh_loss, object_loss, class_loss = loss(output1, output2, output3, xcyc_target, wh_target, objectness, class_target, weights) xcyc_losses.append(xcyc_loss.asscalar()) wh_losses.append(wh_loss.asscalar()) object_losses.append(object_loss.asscalar()) class_losses.append(class_loss.asscalar()) total_loss.append(xcyc_loss + wh_loss + object_loss + class_loss) if AMP: with amp.scale_loss(total_loss, trainer) as scaled_loss: autograd.backward(scaled_loss) else: autograd.backward(total_loss) xcyc_all_losses.append(sum(xcyc_losses)) wh_all_losses.append(sum(wh_losses)) object_all_losses.append(sum(object_losses)) class_all_losses.append(sum(class_losses)) trainer.step(batch_size=td_batch_size, ignore_stale_grad=False) # 비우기 for p in net.collect_params().values(): p.zero_grad() xcyc_loss_sum += sum(xcyc_all_losses) / td_batch_size wh_loss_sum += sum(wh_all_losses) / td_batch_size object_loss_sum += sum(object_all_losses) / td_batch_size class_loss_sum += sum(class_all_losses) / td_batch_size if batch_count % batch_log == 0: logging.info(f'[Epoch {i}][Batch {batch_count}/{train_update_number_per_epoch}],' f'[Speed {td_batch_size / (time.time() - time_stamp):.3f} samples/sec],' f'[Lr = {trainer.learning_rate}]' f'[xcyc loss = {sum(xcyc_all_losses) / td_batch_size:.3f}]' f'[wh loss = {sum(wh_all_losses) / td_batch_size:.3f}]' f'[obj loss = {sum(object_all_losses) / td_batch_size:.3f}]' f'[class loss = {sum(class_all_losses) / td_batch_size:.3f}]') time_stamp = time.time() train_xcyc_loss_mean = np.divide(xcyc_loss_sum, train_update_number_per_epoch) train_wh_loss_mean = np.divide(wh_loss_sum, train_update_number_per_epoch) train_object_loss_mean = np.divide(object_loss_sum, train_update_number_per_epoch) train_class_loss_mean = np.divide(class_loss_sum, train_update_number_per_epoch) train_total_loss_mean = train_xcyc_loss_mean + train_wh_loss_mean + train_object_loss_mean + train_class_loss_mean logging.info( f"train xcyc loss : {train_xcyc_loss_mean} / " f"train wh loss : {train_wh_loss_mean} / " f"train object loss : {train_object_loss_mean} / " f"train class loss : {train_class_loss_mean} / " f"train total loss : {train_total_loss_mean}" ) if i % eval_period == 0 and valid_list: xcyc_loss_sum = 0 wh_loss_sum = 0 object_loss_sum = 0 class_loss_sum = 0 # loss 구하기 for image, label, xcyc_all, 
wh_all, objectness_all, class_all, weights_all, _ in valid_dataloader: vd_batch_size, _, height, width = image.shape if GPU_COUNT <= 1: image = gluon.utils.split_and_load(image, [ctx], even_split=False) label = gluon.utils.split_and_load(label, [ctx], even_split=False) xcyc_all = gluon.utils.split_and_load(xcyc_all, [ctx], even_split=False) wh_all = gluon.utils.split_and_load(wh_all, [ctx], even_split=False) objectness_all = gluon.utils.split_and_load(objectness_all, [ctx], even_split=False) class_all = gluon.utils.split_and_load(class_all, [ctx], even_split=False) weights_all = gluon.utils.split_and_load(weights_all, [ctx], even_split=False) else: image = gluon.utils.split_and_load(image, ctx, even_split=False) label = gluon.utils.split_and_load(label, ctx, even_split=False) xcyc_all = gluon.utils.split_and_load(xcyc_all, ctx, even_split=False) wh_all = gluon.utils.split_and_load(wh_all, ctx, even_split=False) objectness_all = gluon.utils.split_and_load(objectness_all, ctx, even_split=False) class_all = gluon.utils.split_and_load(class_all, ctx, even_split=False) weights_all = gluon.utils.split_and_load(weights_all, ctx, even_split=False) xcyc_losses = [] wh_losses = [] object_losses = [] class_losses = [] total_loss = [] # gpu N 개를 대비한 코드 (Data Parallelism) for img, lb, xcyc_target, wh_target, objectness, class_target, weights in zip(image, label, xcyc_all, wh_all, objectness_all, class_all, weights_all): gt_box = lb[:, :, :4] gt_id = lb[:, :, 4:5] output1, output2, output3, anchor1, anchor2, anchor3, offset1, offset2, offset3, stride1, stride2, stride3 = net( img) id, score, bbox = prediction(output1, output2, output3, anchor1, anchor2, anchor3, offset1, offset2, offset3, stride1, stride2, stride3) precision_recall.update(pred_bboxes=bbox, pred_labels=id, pred_scores=score, gt_boxes=gt_box, gt_labels=gt_id) xcyc_loss, wh_loss, object_loss, class_loss = loss(output1, output2, output3, xcyc_target, wh_target, objectness, class_target, weights) xcyc_losses.append(xcyc_loss.asscalar()) wh_losses.append(wh_loss.asscalar()) object_losses.append(object_loss.asscalar()) class_losses.append(class_loss.asscalar()) total_loss.append(xcyc_losses + wh_losses + object_losses + class_losses) xcyc_loss_sum += sum(xcyc_losses) / vd_batch_size wh_loss_sum += sum(wh_losses) / vd_batch_size object_loss_sum += sum(object_losses) / vd_batch_size class_loss_sum += sum(class_losses) / vd_batch_size valid_xcyc_loss_mean = np.divide(xcyc_loss_sum, valid_update_number_per_epoch) valid_wh_loss_mean = np.divide(wh_loss_sum, valid_update_number_per_epoch) valid_object_loss_mean = np.divide(object_loss_sum, valid_update_number_per_epoch) valid_class_loss_mean = np.divide(class_loss_sum, valid_update_number_per_epoch) valid_total_loss_mean = valid_xcyc_loss_mean + valid_wh_loss_mean + valid_object_loss_mean + valid_class_loss_mean logging.info( f"valid xcyc loss : {valid_xcyc_loss_mean} / " f"valid wh loss : {valid_wh_loss_mean} / " f"valid object loss : {valid_object_loss_mean} / " f"valid class loss : {valid_class_loss_mean} / " f"valid total loss : {valid_total_loss_mean}" ) AP_appender = [] round_position = 2 class_name, precision, recall, true_positive, false_positive, threshold = precision_recall.get_PR_list() for j, c, p, r in zip(range(len(recall)), class_name, precision, recall): name, AP = precision_recall.get_AP(c, p, r) logging.info(f"class {j}'s {name} AP : {round(AP * 100, round_position)}%") AP_appender.append(AP) mAP_result = np.mean(AP_appender) logging.info(f"mAP : {round(mAP_result * 100, 
round_position)}%") precision_recall.get_PR_curve(name=class_name, precision=precision, recall=recall, threshold=threshold, AP=AP_appender, mAP=mAP_result, folder_name=valid_graph_path, epoch=i) precision_recall.reset() if tensorboard: # gpu N 개를 대비한 코드 (Data Parallelism) dataloader_iter = iter(valid_dataloader) image, label, _, _, _, _, _, _ = next(dataloader_iter) if GPU_COUNT <= 1: image = gluon.utils.split_and_load(image, [ctx], even_split=False) label = gluon.utils.split_and_load(label, [ctx], even_split=False) else: image = gluon.utils.split_and_load(image, ctx, even_split=False) label = gluon.utils.split_and_load(label, ctx, even_split=False) ground_truth_colors = {} for k in range(num_classes): ground_truth_colors[k] = (0, 0, 1) batch_image = [] for img, lb in zip(image, label): gt_boxes = lb[:, :, :4] gt_ids = lb[:, :, 4:5] output1, output2, output3, anchor1, anchor2, anchor3, offset1, offset2, offset3, stride1, stride2, stride3 = net( img) ids, scores, bboxes = prediction(output1, output2, output3, anchor1, anchor2, anchor3, offset1, offset2, offset3, stride1, stride2, stride3) for ig, gt_id, gt_box, id, score, bbox in zip(img, gt_ids, gt_boxes, ids, scores, bboxes): ig = ig.transpose( (1, 2, 0)) * mx.nd.array(std, ctx=ig.context) + mx.nd.array(mean, ctx=ig.context) ig = (ig * 255).clip(0, 255) # ground truth box 그리기 ground_truth = plot_bbox(ig, gt_box, scores=None, labels=gt_id, thresh=None, reverse_rgb=True, class_names=valid_dataset.classes, absolute_coordinates=True, colors=ground_truth_colors) # prediction box 그리기 prediction_box = plot_bbox(ground_truth, bbox, scores=score, labels=id, thresh=plot_class_thresh, reverse_rgb=False, class_names=valid_dataset.classes, absolute_coordinates=True) # Tensorboard에 그리기 위해 BGR -> RGB / (height, width, channel) -> (channel, height, width) 를한다. prediction_box = cv2.cvtColor(prediction_box, cv2.COLOR_BGR2RGB) prediction_box = np.transpose(prediction_box, axes=(2, 0, 1)) batch_image.append(prediction_box) # (batch, channel, height, width) summary.add_image(tag="valid_result", image=np.array(batch_image), global_step=i) summary.add_scalar(tag="xy_loss", value={"train_xcyc_loss": train_xcyc_loss_mean, "valid_xcyc_loss": valid_xcyc_loss_mean}, global_step=i) summary.add_scalar(tag="wh_loss", value={"train_wh_loss": train_wh_loss_mean, "valid_wh_loss": valid_wh_loss_mean}, global_step=i) summary.add_scalar(tag="object_loss", value={"train_object_loss": train_object_loss_mean, "valid_object_loss": valid_object_loss_mean}, global_step=i) summary.add_scalar(tag="class_loss", value={"train_class_loss": train_class_loss_mean, "valid_class_loss": valid_class_loss_mean}, global_step=i) summary.add_scalar(tag="total_loss", value={ "train_total_loss": train_total_loss_mean, "valid_total_loss": valid_total_loss_mean}, global_step=i) params = net.collect_params().values() if GPU_COUNT > 1: for c in ctx: for p in params: summary.add_histogram(tag=p.name, values=p.data(ctx=c), global_step=i, bins='default') else: for p in params: summary.add_histogram(tag=p.name, values=p.data(), global_step=i, bins='default') if i % save_period == 0: weight_epoch_path = os.path.join(weight_path, str(i)) if not os.path.exists(weight_epoch_path): os.makedirs(weight_epoch_path) ''' Hybrid models can be serialized as JSON files using the export function Export HybridBlock to json format that can be loaded by SymbolBlock.imports, mxnet.mod.Module or the C++ interface. When there are only one input, it will have name data. 
When there are more than one input, they will be named data0, data1, etc. ''' if GPU_COUNT >= 1: context = mx.gpu(0) else: context = mx.cpu(0) postnet = PostNet(net=net, auxnet=prediction) try: net.export(os.path.join(weight_path, f"{model}"), epoch=i, remove_amp_cast=True) # for onnx net.save_parameters(os.path.join(weight_path, f"{i}.params")) # for ONNX export # handles network inference, decoding and NMS in one block - convenient from the MXNet C++ API / cannot be exported to ONNX. export_block_for_cplusplus(path=os.path.join(weight_epoch_path, f"{model}_prepost"), block=postnet, data_shape=tuple(input_size) + tuple((3,)), epoch=i, preprocess=True, # at C++ inference time the image read by OpenCV can be fed in as-is layout='HWC', ctx=context, remove_amp_cast=True) except Exception as E: logging.error(f"json, param model export failed : {E}") else: logging.info("json, param model export succeeded") net.collect_params().reset_ctx(ctx) end_time = time.time() learning_time = end_time - start_time logging.info(f"learning time : approx. {learning_time / 3600:0.2f}H") logging.info("optimization completed") if using_mlflow: ml.log_metric("learning time", round(learning_time / 3600, 2))
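The export/import naming convention documented above can be exercised directly. A minimal sketch of reloading the pair that net.export() just wrote (reusing the model, weight_path, i and input_size variables from the surrounding function; the CPU deployment context is an assumption):

import os
import mxnet as mx
from mxnet import gluon

deploy_net = gluon.SymbolBlock.imports(
    os.path.join(weight_path, f"{model}-symbol.json"),    # architecture
    ['data'],                                             # single input -> named 'data'
    os.path.join(weight_path, f"{model}-{i:04d}.params"), # weights of epoch i
    ctx=mx.cpu())
out = deploy_net(mx.nd.ones(shape=(1, 3) + tuple(input_size)))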
def fit(args, network, data_loader, **kwargs): """ Train a model. args : the parsed argparse namespace network : the symbol definition of the neural network data_loader : function that returns the train and val data iterators """ # kvstore kv = mx.kvstore.create(args.kv_store) if args.gc_type != 'none': kv.set_gradient_compression({ 'type': args.gc_type, 'threshold': args.gc_threshold }) # logging head = '%(asctime)-15s Node[' + str(kv.rank) + '] %(message)s' logging.basicConfig(level=logging.DEBUG, format=head) logging.info('start with arguments %s', args) epoch_size = get_epoch_size(args, kv) # data iterators (train, val) = data_loader(args, kv) if 'dist' in args.kv_store and 'async' not in args.kv_store: logging.info('Resizing training data to %d batches per machine', epoch_size) # resize train iter to ensure each machine has the same number of batches per epoch # if not, dist_sync can hang at the end with one machine waiting for the others train = mx.io.ResizeIter(train, epoch_size) if args.test_io: tic = time.time() for i, batch in enumerate(train): if isinstance(batch, list): for b in batch: for j in b.data: j.wait_to_read() else: for j in batch.data: j.wait_to_read() if (i + 1) % args.disp_batches == 0: logging.info( 'Batch [%d]\tSpeed: %.2f samples/sec', i, args.disp_batches * args.batch_size / (time.time() - tic)) tic = time.time() return # define a summary writer that logs data and flushes to the file every 5 seconds if args.summarywriter: shutil.rmtree('/opt/incubator-mxnet/logs') # clear the previous logs os.mkdir('/opt/incubator-mxnet/logs') sw = SummaryWriter(logdir='/opt/incubator-mxnet/logs', flush_secs=args.flush_secs) # load model if 'arg_params' in kwargs and 'aux_params' in kwargs: arg_params = kwargs['arg_params'] aux_params = kwargs['aux_params'] else: sym, arg_params, aux_params = _load_model(args, kv.rank) if sym is not None: assert sym.tojson() == network.tojson() network = sym # log the network if args.summarywriter: sw.add_graph(network) # save model checkpoint = _save_model(args, kv.rank) # convert mean.bin to mean.npy _convert_mean_numpy(args, kv.rank) # devices for training devs = mx.cpu() if args.gpus is None or args.gpus == "" else [ mx.gpu(int(i)) for i in args.gpus.split(',') ] # learning rate lr, lr_scheduler = _get_lr_scheduler(args, kv) # create model model = mx.mod.Module(context=devs, symbol=network) optimizer_params = { 'learning_rate': lr, 'wd': args.wd, 'lr_scheduler': lr_scheduler, 'multi_precision': True } # Only a limited number of optimizers have a 'momentum' property has_momentum = {'sgd', 'dcasgd', 'nag'} if args.optimizer in has_momentum: optimizer_params['momentum'] = args.mom monitor = mx.mon.Monitor(args.monitor, pattern=".*") if args.monitor > 0 else None # A limited number of optimizers have a warmup period has_warmup = {'lbsgd', 'lbnag'} if args.optimizer in has_warmup: nworkers = kv.num_workers if epoch_size < 1: epoch_size = 1 macrobatch_size = args.macrobatch_size if macrobatch_size < args.batch_size * nworkers: macrobatch_size = args.batch_size * nworkers #batch_scale = round(float(macrobatch_size) / args.batch_size / nworkers +0.4999) batch_scale = math.ceil( float(macrobatch_size) / args.batch_size / nworkers) optimizer_params['updates_per_epoch'] = epoch_size optimizer_params[ 'begin_epoch'] = args.load_epoch if args.load_epoch else 0 optimizer_params['batch_scale'] = batch_scale optimizer_params['warmup_strategy'] = args.warmup_strategy optimizer_params['warmup_epochs'] = args.warmup_epochs
optimizer_params['num_epochs'] = args.num_epochs if args.initializer == 'default': if args.network == 'alexnet': # AlexNet will not converge using Xavier initializer = mx.init.Normal() # VGG will not tend to converge using Xavier-Gaussian elif args.network and 'vgg' in args.network: initializer = mx.init.Xavier() else: initializer = mx.init.Xavier(rnd_type='gaussian', factor_type="in", magnitude=2) # initializer = mx.init.Xavier(factor_type="in", magnitude=2.34), elif args.initializer == 'xavier': initializer = mx.init.Xavier() elif args.initializer == 'msra': initializer = mx.init.MSRAPrelu() elif args.initializer == 'orthogonal': initializer = mx.init.Orthogonal() elif args.initializer == 'normal': initializer = mx.init.Normal() elif args.initializer == 'uniform': initializer = mx.init.Uniform() elif args.initializer == 'one': initializer = mx.init.One() elif args.initializer == 'zero': initializer = mx.init.Zero() # evaluation metrics eval_metrics = ['accuracy'] if args.top_k > 0: eval_metrics.append( mx.metric.create('top_k_accuracy', top_k=args.top_k)) supported_loss = ['ce', 'nll_loss'] if len(args.loss) > 0: # ce or nll loss is only applicable to softmax output loss_type_list = args.loss.split(',') if 'softmax_output' in network.list_outputs(): for loss_type in loss_type_list: loss_type = loss_type.strip() if loss_type == 'nll': loss_type = 'nll_loss' if loss_type not in supported_loss: logging.warning(loss_type + ' is not a valid loss type; only cross-entropy or ' \ 'negative log-likelihood loss is supported!') else: eval_metrics.append(mx.metric.create(loss_type)) else: logging.warning( "The output is not softmax_output, loss argument will be skipped!" ) # callbacks that run after each batch if args.summarywriter: # add the visualization callback; when several callbacks are registered, every callback except the last one must not reset the accuracy (i.e. its auto_reset argument must be set to False) batch_end_callbacks = [ mx.callback.Speedometer(args.batch_size, args.disp_batches, False), summary_writter_callback.summary_writter_eval_metric(sw) ] else: batch_end_callbacks = [ mx.callback.Speedometer(args.batch_size, args.disp_batches, True) ] if 'batch_end_callback' in kwargs: cbs = kwargs['batch_end_callback'] batch_end_callbacks += cbs if isinstance(cbs, list) else [cbs] # run model.fit(train, begin_epoch=args.load_epoch if args.load_epoch else 0, num_epoch=args.num_epochs, eval_data=val, eval_metric=eval_metrics, kvstore=kv, optimizer=args.optimizer, optimizer_params=optimizer_params, initializer=initializer, arg_params=arg_params, aux_params=aux_params, batch_end_callback=batch_end_callbacks, epoch_end_callback=checkpoint, allow_missing=True, monitor=monitor) # log the weights after training if args.summarywriter: arg_params, aux_params = model.get_params() for k, v in arg_params.items(): if v.ndim == 4: # only weight matrices have four dimensions weight = rescale_per_image(v) sw.add_image(tag=k, image=weight) sw.close()
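The summary_writter_callback module used above is project-specific and not shown here. A minimal stand-in with the same shape could look like the sketch below (the outer function name mirrors the call above; everything else is an assumption). mx.mod.Module passes every batch-end callback a BatchEndParam namedtuple (epoch, nbatch, eval_metric, locals) whose eval_metric field carries the running metrics:

def summary_writter_eval_metric(sw):
    def _callback(param):
        # param is an mx.model.BatchEndParam namedtuple
        if param.eval_metric is not None:
            for name, value in param.eval_metric.get_name_value():
                # a production version would track a global step across
                # epochs, since nbatch resets to 0 at every epoch
                sw.add_scalar(tag=name, value=value, global_step=param.nbatch)
    return _callback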
def train(train_data, net, loss, trainer, ctx, num_epochs): print("Start training on ", ctx) sw = SummaryWriter(logdir='./logs', flush_secs=2) global_step = 0 epoch_step = 0 if isinstance(ctx, mx.Context): ctx = [ctx] for epoch in range(num_epochs): train_loss, n = 0.0, 0.0 TP, TN, FP, FN = 0, 0, 0, 0 start = time() for i, batch in enumerate(train_data): data, label, batch_size = get_batch(batch, ctx) losses = [] with autograd.record(): outputs = [net(X) for X in data] losses = [loss(yhat, y) for yhat, y in zip(outputs, label)] for l in losses: l.backward() sw.add_scalar(tag='cross_entropy', value=l.mean().asscalar(), global_step=global_step) global_step += 1 train_loss += sum([l.sum().asscalar() for l in losses]) n += batch_size trainer.step(batch_size) for data, label in test_data: data = data.as_in_context(ctx[0]) label = label.as_in_context(ctx[0]) pred = net(data) nd.waitall() pred = nd.sigmoid(pred) pred = (pred > 0.5).reshape(-1, 256, 256) TPt = nd.sum(pred * label).asscalar() FPt = nd.sum(pred - (pred * label)).asscalar() FNt = nd.sum(label - (pred * label)).asscalar() TNt = nd.sum((1 - pred) * (1 - label)).asscalar() TP = TP + TPt FP = FP + FPt FN = FN + FNt TN = TN + TNt ACC = (TP + TN) / (TP + TN + FP + FN) TPR = TP / (TP + FN) TNR = TN / (FP + TN) PPV = TP / (TP + FP + 1e-15) F1 = 2 * PPV * TPR / (PPV + TPR + 1e-15) sw.add_scalar(tag='test_acc', value=ACC, global_step=epoch_step) sw.add_scalar(tag='test_TPR', value=TPR, global_step=epoch_step) sw.add_scalar(tag='test_TNR', value=TNR, global_step=epoch_step) sw.add_scalar(tag='test_PPV', value=PPV, global_step=epoch_step) sw.add_scalar(tag='F1', value=F1, global_step=epoch_step) epoch_step += 1 print('test_acc=', ACC) print('test_TPR=', TPR) print('test_TNR=', TNR) print('test_PPV=', PPV) print('F1=', F1) if F1 > 0.61: net.save_parameters('u_e.params') if epoch == 0: sw.add_graph(net) print('train_loss=', train_loss / n) print('time:', time() - start) sw.close() net.export("mynet", epoch)
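The pixel-wise counters above implement the standard confusion-matrix metrics; factored out for clarity (a sketch, the helper name is mine, with the same 1e-15 guards against division by zero as the loop above):

def confusion_metrics(TP, FP, FN, TN, eps=1e-15):
    ACC = (TP + TN) / (TP + TN + FP + FN)     # accuracy
    TPR = TP / (TP + FN)                      # recall / sensitivity
    TNR = TN / (FP + TN)                      # specificity
    PPV = TP / (TP + FP + eps)                # precision
    F1 = 2 * PPV * TPR / (PPV + TPR + eps)    # harmonic mean of PPV and TPR
    return ACC, TPR, TNR, PPV, F1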
class TrainerAgent: def __init__( self, net, val_data, nb_parts, lr_schedule, momentum_schedule, total_it, wd=0.0001, batch_steps=1000, k_steps_initial=0, cpu_count=16, batch_size=2048, normalize=True, export_weights=True, export_grad_histograms=True, log_metrics_to_tensorboard=True, ctx=mx.gpu(), metrics={}, # clip_gradient=60, use_spike_recovery=True, max_spikes=5, spike_thresh=1.5, seed=42, val_loss_factor=0.01, policy_loss_factor=0.99, ): # , lr_warmup_k_steps=30, lr_warmup_init=0.01): # patience=25, nb_lr_drops=3, nb_k_steps=200, self._log_metrics_to_tensorboard = log_metrics_to_tensorboard self._ctx = ctx # lr_drop_fac=0.1, self._metrics = metrics self._net = net self._graph_exported = False # self._lr = lr self._normalize = normalize # self._nb_k_steps = nb_k_steps # self._patience = patience # self._nb_lr_droups = nb_lr_drops self._lr_schedule = lr_schedule self._momentum_schedule = momentum_schedule self._total_it = total_it self._batch_size = batch_size self._export_grad_histograms = export_grad_histograms self._cpu_count = cpu_count # self._lr_drop_fac = lr_drop_fac self._k_steps_initial = k_steps_initial self._val_data = val_data self._export_weights = export_weights self._batch_steps = batch_steps self._use_spike_recovery = use_spike_recovery self._max_spikes = max_spikes self._spike_thresh = spike_thresh self._seed = seed self._val_loss_factor = val_loss_factor self._policy_loss_factor = policy_loss_factor # self._nb_lr_drops = nb_lr_drops # self._warmup_k_steps = lr_warmup_k_steps # self._lr_warmup_init = lr_warmup_init # define a summary writer that logs data and flushes to the file every 5 seconds if log_metrics_to_tensorboard is True: self.sw = SummaryWriter(logdir="./logs", flush_secs=5, verbose=False) # Define the two loss functions self._softmax_cross_entropy = gluon.loss.SoftmaxCrossEntropyLoss() self._l2_loss = gluon.loss.L2Loss() self._trainer = gluon.Trainer( self._net.collect_params(), "nag", { "learning_rate": lr_schedule(0), "momentum": momentum_schedule(0), #'clip_gradient': clip_gradient, "wd": wd, }, ) # collect parameter names for logging the gradients of parameters in each epoch self._params = self._net.collect_params() self._param_names = self._params.keys() # define a list which describes the order of the processed batches self.ordering = list(range(nb_parts)) def _log_metrics(self, metric_values, global_step, prefix="train_"): """ Logs a dictionary object of metric vlaue to the console and to tensorboard if _log_metrics_to_tensorboard is set to true :param metric_values: Dictionary object storing the current metrics :param global_step: X-Position point of all metric entries :param prefix: Used for labelling the metrics :return: """ for name in metric_values.keys(): # show the metric stats print(" - %s%s: %.4f" % (prefix, name, metric_values[name]), end="") # add the metrics to the tensorboard event file if self._log_metrics_to_tensorboard is True: self.sw.add_scalar( name, [prefix.replace("_", ""), metric_values[name]], global_step) def _process_on_data_plane_file(self, train_data, batch_proc_tmp): for i, (data, value_label, policy_label) in enumerate(train_data): data = data.as_in_context(self._ctx) value_label = value_label.as_in_context(self._ctx) policy_label = policy_label.as_in_context(self._ctx) # update a dummy metric to see a proper progress bar # (the metrics will get evaluated at the end of 100k steps) # if self.batch_proc_tmp > 0: # self._metrics['value_loss'].update(old_label, value_out) # old_label = value_label with autograd.record(): 
[value_out, policy_out] = self._net(data) value_loss = self._l2_loss(value_out, value_label) policy_loss = self._softmax_cross_entropy( policy_out, policy_label) # weight the components of the combined loss combined_loss = self._val_loss_factor * value_loss.sum( ) + self._policy_loss_factor * policy_loss.sum() # update a dummy metric to see a proper progress bar self._metrics["value_loss"].update(preds=value_out, labels=value_label) combined_loss.backward() self._trainer.step(data.shape[0]) batch_proc_tmp += 1 return batch_proc_tmp, self._metrics["value_loss"].get()[1] def train(self): """ :param net: Gluon network object :param val_data: Gluon dataloader object :param nb_parts: Sets how many different part files exist in the train directory :param lr: Initial learning rate :param momentum: :param wd: :param nb_k_steps: Number of steps after which to drop the learning rate (assuming the patience counter for early dropping hasn't activated beforehand) :param patience: Number of batches to wait until no progress on the validation loss has been achieved. If no progress has been made, the learning rate is multiplied by the drop factor. :param nb_lr_drops: Number of times to drop the learning rate in total. This defines the end of the train loop :param batch_steps: Number of batches after which the validation loss is evaluated :param k_steps_initial: Initial starting point of the network in terms of processed k batches (default 0) :param lr_drop_fac: Dropping factor to apply to the learning rate :param cpu_count: How many cpu threads are available on the current machine :param batch_size: Batch size to train the network with :param normalize: Whether to use data normalization after loading the data (recommended to set to True) :param export_weights: Sets if network checkpoints should be exported :param export_grad_histograms: Sets if the gradient updates of the weights should be logged to tensorboard :return: """ # set a custom seed for reproducibility random.seed(self._seed) # define and initialize the variables which will be used t_s = time() # predefine the local variables that will be used in the training loop val_loss_best = None val_p_acc_best = None k_steps_best = None patience_cnt = 0 epoch = 0 # keep track of how many batches have been processed in this epoch so far batch_proc_tmp = 0 # counter for thousands of steps k_steps = self._k_steps_initial # calculate how many log states will be processed k_steps_end = self._total_it / self._batch_steps cur_it = 0 # count the number of spikes that have been detected nb_spikes = 0 # initialize the loss to compare with, with a very high value old_val_loss = 9000 # self._lr = self._lr_warmup_init # logging.info('Warmup-Schedule') # logging.info('Initial learning rate: lr = %.5f', self._lr) # logging.info('=========================================') # set initial lr # self._trainer.set_learning_rate(self._lr) # log the current learning rate # self.sw.add_scalar(tag='lr', value=self._lr, global_step=k_steps) # create a state variable to check if the net architecture has been reported yet graph_exported = False old_label = None value_out = None # safety check to prevent an infinite loop if not self.ordering: raise Exception( "You must have at least one part file in your planes-dataset directory!"
) while True: # reshuffle the ordering of the training game batches (shuffle works in place) random.shuffle(self.ordering) epoch += 1 logging.info("EPOCH %d", epoch) logging.info("=========================") t_s_steps = time() for part_id in tqdm_notebook(self.ordering): # load one chunk of the dataset from memory s_idcs_train, x_train, yv_train, yp_train, pgn_datasets_train = load_pgn_dataset( dataset_type="train", part_id=part_id, normalize=self._normalize, verbose=False) # update the train_data object train_dataset = gluon.data.ArrayDataset( nd.array(x_train), nd.array(yv_train), nd.array(yp_train.argmax(axis=1))) train_data = gluon.data.DataLoader(train_dataset, batch_size=self._batch_size, shuffle=True, num_workers=self._cpu_count) # batch_proc_tmp, dummy = self._process_on_data_plane_file(train_data, batch_proc_tmp) for i, (data, value_label, policy_label) in enumerate(train_data): data = data.as_in_context(self._ctx) value_label = value_label.as_in_context(self._ctx) policy_label = policy_label.as_in_context(self._ctx) # update a dummy metric to see a proper progress bar # (the metrics will get evaluated at the end of 100k steps) if batch_proc_tmp > 0: self._metrics["value_loss"].update( old_label, value_out) old_label = value_label with autograd.record(): [value_out, policy_out] = self._net(data) value_loss = self._l2_loss(value_out, value_label) policy_loss = self._softmax_cross_entropy( policy_out, policy_label) # weight the components of the combined loss combined_loss = ( self._val_loss_factor * value_loss.sum() + self._policy_loss_factor * policy_loss.sum()) # update a dummy metric to see a proper progress bar # self._metrics['value_loss'].update(preds=value_out, labels=value_label) combined_loss.backward() # update the learning rate lr = self._lr_schedule(cur_it) self._trainer.set_learning_rate(lr) # update the momentum momentum = self._momentum_schedule(cur_it) self._trainer._optimizer.momentum = momentum self._trainer.step(data.shape[0]) cur_it += 1 batch_proc_tmp += 1 # add the graph representation of the network to the tensorboard log file if graph_exported is False and self._log_metrics_to_tensorboard is True: self.sw.add_graph(self._net) graph_exported = True # show metrics every thousands steps if batch_proc_tmp >= self._batch_steps: # if k_steps < self._warmup_k_steps: # update the learning rate # self._lr *= k_steps * ((self._lr_first - self._lr_warmup_init) / self._warmup_k_steps) + self._lr_warmup_init #self._lr_drop_fac # self._trainer.set_learning_rate(self._lr) # logging.info('Learning rate update: lr = %.5f', self._lr) # logging.info('=========================================') # log the current learning rate # update batch_proc_tmp counter by subtracting the batch_steps batch_proc_tmp = batch_proc_tmp - self._batch_steps # measure elapsed time ms_step = ( (time() - t_s_steps) / self._batch_steps) * 1000 # update the counters k_steps += 1 patience_cnt += 1 logging.info("Step %dK/%dK - %dms/step", k_steps, k_steps_end, ms_step) logging.info("-------------------------") logging.debug("Iteration %d/%d", cur_it, self._total_it) logging.debug("lr: %.7f - momentum: %.7f", lr, momentum) train_metric_values = evaluate_metrics(self._metrics, train_data, self._net, nb_batches=25, ctx=self._ctx) val_metric_values = evaluate_metrics(self._metrics, self._val_data, self._net, nb_batches=None, ctx=self._ctx) # spike_detected = False # spike_detected = old_val_loss * 1.5 < val_metric_values['loss'] # if np.isnan(val_metric_values['loss']): # spike_detected = True # check for 
spikes if self._use_spike_recovery is True and ( old_val_loss * self._spike_thresh < val_metric_values["loss"] or np.isnan(val_metric_values["loss"])): nb_spikes += 1 logging.warning( "Spike %d/%d occurred - val_loss: %.3f", nb_spikes, self._max_spikes, val_metric_values["loss"], ) if nb_spikes >= self._max_spikes: val_loss = val_metric_values["loss"] val_p_acc = val_metric_values["policy_acc"] logging.debug( "The maximum number of spikes has been reached. Stop training." ) # finally stop training because the number of lr drops has been achieved print() print("Elapsed time for training(hh:mm:ss): " + str( datetime.timedelta( seconds=round(time() - t_s)))) if self._log_metrics_to_tensorboard is True: self.sw.close() return (k_steps, val_loss, val_p_acc), (k_steps_best, val_loss_best, val_p_acc_best) logging.debug("Recover to latest checkpoint") # ## Load the best model once again model_path = "./weights/model-%.5f-%.3f-%04d.params" % ( val_loss_best, val_p_acc_best, k_steps_best, ) logging.debug("load current best model:%s" % model_path) self._net.load_parameters(model_path, ctx=self._ctx) k_steps = k_steps_best logging.debug("k_step is back at %d", k_steps_best) # print the elapsed time t_delta = time() - t_s_steps print(" - %.ds" % t_delta) t_s_steps = time() else: # update the val_loss_value to compare with using spike recovery old_val_loss = val_metric_values["loss"] # log the metric values to tensorboard self._log_metrics(train_metric_values, global_step=k_steps, prefix="train_") self._log_metrics(val_metric_values, global_step=k_steps, prefix="val_") if self._export_grad_histograms is True: grads = [] # logging the gradients of parameters for checking convergence for i_p, name in enumerate(self._param_names): if "bn" not in name and "batch" not in name: grads.append(self._params[name].grad()) self.sw.add_histogram( tag=name, values=grads[-1], global_step=k_steps, bins=20) # check if a new checkpoint shall be created if val_loss_best is None or val_metric_values[ "loss"] < val_loss_best: # update val_loss_best val_loss_best = val_metric_values["loss"] val_p_acc_best = val_metric_values[ "policy_acc"] k_steps_best = k_steps if self._export_weights is True: prefix = "./weights/model-%.5f-%.3f" % ( val_loss_best, val_p_acc_best) # the export function saves both the architecture and the weights self._net.export(prefix, epoch=k_steps_best) print() logging.info( "Saved checkpoint to %s-%04d.params" % (prefix, k_steps_best)) # reset the patience counter patience_cnt = 0 # print the elapsed time t_delta = time() - t_s_steps print(" - %.ds" % t_delta) t_s_steps = time() # log the samples per second metric to tensorbaord self.sw.add_scalar( tag="samples_per_second", value={ "hybrid_sync": data.shape[0] * self._batch_steps / t_delta }, global_step=k_steps, ) # log the current learning rate self.sw.add_scalar(tag="lr", value=self._lr_schedule(cur_it), global_step=k_steps) # log the current momentum value self.sw.add_scalar( tag="momentum", value=self._momentum_schedule(cur_it), global_step=k_steps) if cur_it >= self._total_it: val_loss = val_metric_values["loss"] val_p_acc = val_metric_values["policy_acc"] logging.debug( "The number of given iterations has been reached" ) # finally stop training because the number of lr drops has been achieved print() print("Elapsed time for training(hh:mm:ss): " + str( datetime.timedelta( seconds=round(time() - t_s)))) if self._log_metrics_to_tensorboard is True: self.sw.close() return (k_steps, val_loss, val_p_acc), (k_steps_best, val_loss_best, val_p_acc_best) 
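The spike-recovery test in the loop above reduces to a single predicate; as a standalone sketch (the function and argument names are mine):

import math

def is_spike(old_val_loss, new_val_loss, spike_thresh=1.5):
    # a spike is a sudden blow-up (or NaN) of the validation loss
    # relative to the previous evaluation
    return math.isnan(new_val_loss) or old_val_loss * spike_thresh < new_val_loss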
"""
def train(net, train_data, valid_data, num_epochs, lr, wd, momentum, ctx): trainer = gluon.Trainer(net.collect_params(), 'adam', {'learning_rate': lr, 'wd': wd}) #trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': lr, 'momentum': momentum}) metric = mx.metric.Accuracy()#用来记录训练过程中的参数 #自己画出训练曲线 train_loss = [] if valid_data is not None: test_loss = [] # collect parameter names for logging the gradients of parameters in each epoch params = net.collect_params() param_names = params.keys() # define a summary writer that logs data and flushes to the file every 5 seconds sw = SummaryWriter(logdir='./logs', flush_secs=2) global_step = 0 prev_time = datetime.datetime.now()#记录每一个epoch的时间 for epoch in range(num_epochs): trainer = update_learning_rate(lr, trainer, epoch, opt.lr_factor, lr_steps) #学习率衰减策略 _loss = 0. metric.reset() for i, (data, label) in enumerate(train_data): label = label.as_in_context(ctx) #标签和数据,放在gpu上 data = data.as_in_context(ctx) #开始记录计算图 with autograd.record(): output = net(data) #预测值 loss = softmax_cross_entropy(output, label) #和真实label对比,计算loss sw.add_scalar(tag='cross_entropy', value=loss.mean().asscalar(), global_step=global_step) global_step += 1 loss.backward() #反向传播梯度 trainer.step(opt.batch_size) metric.update([label], [output]) if i % 100 == 0 and i > 0: name, acc = metric.get() print('[Epoch %d Batch %d] Training: %s=%f' % (epoch, i, name, acc)) if i == 0: pass #sw.add_image('kaggleDog_first_minibatch', data.reshape((opt.batch_size, 2048, 1, 1)), epoch) _loss += nd.mean(loss).asscalar() ####################使用MXboard画出训练曲线################### if epoch == 0: sw.add_graph(net) grads = [i.grad() for i in net.collect_params().values()] assert len(grads) == len(param_names) # logging the gradients of parameters for checking convergence for i, name in enumerate(param_names): sw.add_histogram(tag=name, values=grads[i], global_step=epoch, bins=1000) #训练精度 name, acc = metric.get() print('[Epoch %d] Training: %s=%f' % (epoch, name, acc)) # logging training accuracy sw.add_scalar(tag='train_acc', value=acc, global_step=epoch) #得到测试精度 name, val_acc = test(valid_data, ctx, net) # logging the validation accuracy print('[Epoch %d] Validation: %s=%f' % (epoch, name, val_acc)) sw.add_scalar(tag='valid_acc', value=val_acc, global_step=epoch) ####################使用MXboard画出训练曲线################### cur_time = datetime.datetime.now() #转换为时分秒格式 h, remainder = divmod((cur_time - prev_time).seconds, 3600) m, s = divmod(remainder, 60) time_str = "Time %02d:%02d:%02d" % (h, m, s) __loss = _loss/len(train_data) train_loss.append(__loss) #如果有验证数据,则给出训练loss和验证loss if valid_data is not None: valid_loss = get_loss(valid_data, net, ctx) epoch_str = ("Epoch %d. Train loss: %f, Valid loss %f, " % (epoch, __loss, valid_loss)) test_loss.append(valid_loss) else: epoch_str = ("Epoch %d. Train loss: %f, " % (epoch, __loss)) #打印出一个epoch的时间和loss prev_time = cur_time print(epoch_str + time_str + ', lr ' + str(trainer.learning_rate)) sw.close() #训练完成则画出loss曲线,保存到本地train.png plt.plot(train_loss, 'r') if valid_data is not None: plt.plot(test_loss, 'g') plt.legend(['Train_Loss', 'Test_Loss'], loc=2) #保存训练参模型文件 plt.savefig(pngname, dpi=1000) net.collect_params().save(modelparams) net.export('model')
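update_learning_rate() is called at the top of each epoch above but is not defined in this snippet. A plausible step-decay implementation (an assumption on my part: multiply the base lr by lr_factor once for every boundary in lr_steps the epoch has passed):

def update_learning_rate(lr, trainer, epoch, lr_factor, lr_steps):
    # count how many decay boundaries this epoch has already crossed
    steps_passed = sum(1 for step in lr_steps if epoch >= step)
    trainer.set_learning_rate(lr * lr_factor ** steps_passed)
    return trainer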
def run(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225], anchor_alloc_size=[256, 256], box_sizes=[21, 51.2, 133.12, 215.04, 296.96, 378.88, 460.8, 542.72], box_ratios=[[1, 2, 0.5]] + [[1, 2, 0.5, 3, 1.0 / 3]] * 4 + [[1, 2, 0.5]] * 2, anchor_box_clip=True, graphviz=True, epoch=100, input_size=[400, 600], batch_log=100, batch_size=16, batch_interval=10, subdivision=4, train_dataset_path="Dataset/train", valid_dataset_path="Dataset/valid", multiscale=True, factor_scale=[8, 5], foreground_iou_thresh=0.5, data_augmentation=True, num_workers=4, optimizer="ADAM", save_period=10, load_period=10, learning_rate=0.001, decay_lr=0.999, decay_step=10, GPU_COUNT=0, base="VGG16_512", pretrained_base=True, pretrained_path="modelparam", classHardNegativeMining=True, boxHardNegativeMining=True, AMP=True, valid_size=8, eval_period=5, tensorboard=True, valid_graph_path="valid_Graph", using_mlflow=True, decode_number=-1, multiperclass=True, nms_thresh=0.45, nms_topk=500, iou_thresh=0.5, except_class_thresh=0.01, plot_class_thresh=0.5): if GPU_COUNT == 0: ctx = mx.cpu(0) AMP = False elif GPU_COUNT == 1: ctx = mx.gpu(0) else: ctx = [mx.gpu(i) for i in range(GPU_COUNT)] # 운영체제 확인 if platform.system() == "Linux": logging.info(f"{platform.system()} OS") elif platform.system() == "Windows": logging.info(f"{platform.system()} OS") else: logging.info(f"{platform.system()} OS") if isinstance(ctx, (list, tuple)): for i, c in enumerate(ctx): free_memory, total_memory = mx.context.gpu_memory_info(i) free_memory = round(free_memory / (1024 * 1024 * 1024), 2) total_memory = round(total_memory / (1024 * 1024 * 1024), 2) logging.info( f'Running on {c} / free memory : {free_memory}GB / total memory {total_memory}GB' ) else: if GPU_COUNT == 1: free_memory, total_memory = mx.context.gpu_memory_info(0) free_memory = round(free_memory / (1024 * 1024 * 1024), 2) total_memory = round(total_memory / (1024 * 1024 * 1024), 2) logging.info( f'Running on {ctx} / free memory : {free_memory}GB / total memory {total_memory}GB' ) else: logging.info(f'Running on {ctx}') if GPU_COUNT > 0 and batch_size < GPU_COUNT: logging.info("batch size must be greater than gpu number") exit(0) if AMP: amp.init() if multiscale: logging.info("Using MultiScale") if data_augmentation: logging.info("Using Data Augmentation") logging.info("training SSD Detector") input_shape = (1, 3) + tuple(input_size) try: if base.upper() == "VGG16_300": # 입력 사이즈 300 x 300 추천 net = SSD_VGG16(version=300, input_size=input_size, box_sizes=box_sizes, box_ratios=box_ratios, anchor_box_clip=anchor_box_clip, alloc_size=anchor_alloc_size, ctx=mx.cpu()) elif base.upper() == "VGG16_512": # 입력 사이즈 512 x 512 추천 net = SSD_VGG16(version=512, input_size=input_size, box_sizes=box_sizes, box_ratios=box_ratios, anchor_box_clip=anchor_box_clip, ctx=mx.cpu()) train_dataloader, train_dataset = traindataloader( multiscale=multiscale, factor_scale=factor_scale, augmentation=data_augmentation, path=train_dataset_path, input_size=input_size, batch_size=batch_size, batch_interval=batch_interval, num_workers=num_workers, shuffle=True, mean=mean, std=std, net=net, foreground_iou_thresh=foreground_iou_thresh, make_target=True) valid_dataloader, valid_dataset = validdataloader( path=valid_dataset_path, input_size=input_size, batch_size=valid_size, num_workers=num_workers, shuffle=True, mean=mean, std=std, net=net, foreground_iou_thresh=foreground_iou_thresh, make_target=True) except Exception: logging.info("dataset 없음") exit(0) train_update_number_per_epoch = len(train_dataloader) if 
train_update_number_per_epoch < 1: logging.warning("train batch size가 데이터 수보다 큼") exit(0) valid_list = glob.glob(os.path.join(valid_dataset_path, "*")) if valid_list: valid_update_number_per_epoch = len(valid_dataloader) if valid_update_number_per_epoch < 1: logging.warning("valid batch size가 데이터 수보다 큼") exit(0) num_classes = train_dataset.num_class # 클래스 수 name_classes = train_dataset.classes # 이름 다시 붙이기 optimizer = optimizer.upper() base = base.upper() if pretrained_base: model = str(input_size[0]) + "_" + str( input_size[1]) + "_" + optimizer + "_P" + base else: model = str(input_size[0]) + "_" + str( input_size[1]) + "_" + optimizer + "_" + base weight_path = f"weights/{model}" sym_path = os.path.join(weight_path, f'{model}-symbol.json') param_path = os.path.join(weight_path, f'{model}-{load_period:04d}.params') if os.path.exists(param_path) and os.path.exists(sym_path): start_epoch = load_period logging.info(f"loading {os.path.basename(param_path)} weights\n") net = gluon.SymbolBlock.imports(sym_path, ['data'], param_path, ctx=ctx) else: start_epoch = 0 if base.upper() == "VGG16_300": # 입력 사이즈 300 x 300 추천 net = SSD_VGG16( version=300, input_size=input_size, # box_sizes=[21, 45, 101.25, 157.5, 213.75, 270, 326.25], # box_ratios=[[1, 2, 0.5]] + # conv4_3 # [[1, 2, 0.5, 3, 1.0 / 3]] * 3 + # conv7, conv8_2, conv9_2, conv10_2 # [[1, 2, 0.5]] * 2, # conv11_2, conv12_2 box_sizes=box_sizes, box_ratios=box_ratios, num_classes=num_classes, pretrained=pretrained_base, pretrained_path=pretrained_path, anchor_box_clip=anchor_box_clip, alloc_size=anchor_alloc_size, ctx=ctx) elif base.upper() == "VGG16_512": # 입력 사이즈 512 x 512 추천 net = SSD_VGG16( version=512, input_size=input_size, # box_sizes=[21, 51.2, 133.12, 215.04, 296.96, 378.88, 460.8, 542.72], # box_ratios=[[1, 2, 0.5]] + # conv4_3 # [[1, 2, 0.5, 3, 1.0 / 3]] * 4 + # conv7, conv8_2, conv9_2, conv10_2 # [[1, 2, 0.5]] * 2, # conv11_2, conv12_2 box_sizes=box_sizes, box_ratios=box_ratios, num_classes=num_classes, pretrained=pretrained_base, pretrained_path=pretrained_path, anchor_box_clip=anchor_box_clip, ctx=ctx) else: logging.warning("backbone 없음") exit(0) if isinstance(ctx, (list, tuple)): net.summary(mx.nd.ones(shape=input_shape, ctx=ctx[0])) else: net.summary(mx.nd.ones(shape=input_shape, ctx=ctx)) ''' active (bool, default True) – Whether to turn hybrid on or off. static_alloc (bool, default False) – Statically allocate memory to improve speed. Memory usage may increase. static_shape (bool, default False) – Optimize for invariant input shapes between iterations. Must also set static_alloc to True. Change of input shapes is still allowed but slower. 
''' if multiscale: net.hybridize(active=True, static_alloc=True, static_shape=False) else: net.hybridize(active=True, static_alloc=True, static_shape=True) if start_epoch + 1 >= epoch + 1: logging.info("this model has already been optimized") exit(0) if tensorboard: summary = SummaryWriter(logdir=os.path.join("mxboard", model), max_queue=10, flush_secs=10, verbose=False) if isinstance(ctx, (list, tuple)): net.forward(mx.nd.ones(shape=input_shape, ctx=ctx[0])) else: net.forward(mx.nd.ones(shape=input_shape, ctx=ctx)) summary.add_graph(net) if graphviz: gluoncv.utils.viz.plot_network(net, shape=input_shape, save_prefix=model) # optimizer unit = 1 if (len(train_dataset) // batch_size) < 1 else len(train_dataset) // batch_size step = unit * decay_step lr_sch = mx.lr_scheduler.FactorScheduler(step=step, factor=decay_lr, stop_factor_lr=1e-12, base_lr=learning_rate) for p in net.collect_params().values(): if p.grad_req != "null": p.grad_req = 'add' if AMP: ''' update_on_kvstore : bool, default None Whether to perform parameter updates on kvstore. If None, then trainer will choose the more suitable option depending on the type of kvstore. If the `update_on_kvstore` argument is provided, environment variable `MXNET_UPDATE_ON_KVSTORE` will be ignored. ''' if optimizer.upper() == "ADAM": trainer = gluon.Trainer( net.collect_params(), optimizer, optimizer_params={ "learning_rate": learning_rate, "lr_scheduler": lr_sch, "beta1": 0.9, "beta2": 0.999, 'multi_precision': False }, update_on_kvstore=False) # for Dynamic loss scaling elif optimizer.upper() == "RMSPROP": trainer = gluon.Trainer( net.collect_params(), optimizer, optimizer_params={ "learning_rate": learning_rate, "lr_scheduler": lr_sch, "gamma1": 0.9, "gamma2": 0.999, 'multi_precision': False }, update_on_kvstore=False) # for Dynamic loss scaling elif optimizer.upper() == "SGD": trainer = gluon.Trainer( net.collect_params(), optimizer, optimizer_params={ "learning_rate": learning_rate, "lr_scheduler": lr_sch, "wd": 0.0005, "momentum": 0.9, 'multi_precision': False }, update_on_kvstore=False) # for Dynamic loss scaling else: logging.error("optimizer not selected") exit(0) amp.init_trainer(trainer) else: if optimizer.upper() == "ADAM": trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params={ "learning_rate": learning_rate, "lr_scheduler": lr_sch, "beta1": 0.9, "beta2": 0.999, 'multi_precision': False }) elif optimizer.upper() == "RMSPROP": trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params={ "learning_rate": learning_rate, "lr_scheduler": lr_sch, "gamma1": 0.9, "gamma2": 0.999, 'multi_precision': False }) elif optimizer.upper() == "SGD": trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params={ "learning_rate": learning_rate, "lr_scheduler": lr_sch, "wd": 0.0005, "momentum": 0.9, 'multi_precision': False }) else: logging.error("optimizer not selected") exit(0) ''' localization loss -> Smooth L1 loss confidence loss -> Softmax ''' if not classHardNegativeMining: confidence_loss = SoftmaxCrossEntropyLoss(axis=-1, sparse_label=True, from_log_softmax=False, batch_axis=None, reduction="sum", exclude=False) if not boxHardNegativeMining: localization_loss = HuberLoss(rho=1, batch_axis=None, reduction="sum", exclude=False) prediction = Prediction(from_softmax=False, num_classes=num_classes, decode_number=decode_number, nms_thresh=nms_thresh, nms_topk=nms_topk, except_class_thresh=except_class_thresh, multiperclass=multiperclass) precision_recall = Voc_2007_AP(iou_thresh=iou_thresh, 
class_names=name_classes) start_time = time.time() for i in tqdm(range(start_epoch + 1, epoch + 1, 1), initial=start_epoch + 1, total=epoch): conf_loss_sum = 0 loc_loss_sum = 0 time_stamp = time.time() for batch_count, (image, _, cls_all, box_all, _) in enumerate(train_dataloader, start=1): td_batch_size = image.shape[0] image = mx.nd.split(data=image, num_outputs=subdivision, axis=0) cls_all = mx.nd.split(data=cls_all, num_outputs=subdivision, axis=0) box_all = mx.nd.split(data=box_all, num_outputs=subdivision, axis=0) if subdivision == 1: image = [image] cls_all = [cls_all] box_all = [box_all] with autograd.record(train_mode=True): cls_all_losses = [] box_all_losses = [] for image_split, cls_split, box_split in zip( image, cls_all, box_all): if GPU_COUNT <= 1: image_split = gluon.utils.split_and_load( image_split, [ctx], even_split=False) cls_split = gluon.utils.split_and_load( cls_split, [ctx], even_split=False) box_split = gluon.utils.split_and_load( box_split, [ctx], even_split=False) else: image_split = gluon.utils.split_and_load( image_split, ctx, even_split=False) cls_split = gluon.utils.split_and_load( cls_split, ctx, even_split=False) box_split = gluon.utils.split_and_load( box_split, ctx, even_split=False) # prediction, target space for Data Parallelism cls_losses = [] box_losses = [] total_loss = [] # code that handles N GPUs (Data Parallelism) for img, cls_target, box_target in zip( image_split, cls_split, box_split): # 1. SSD network Inference cls_pred, box_pred, anchor = net(img) ''' 4. Hard negative mining (loss is computed on the class term only) Hard negative mining After the matching step, most of the default boxes are negatives, especially when the number of possible default boxes is large. This introduces a significant imbalance between the positive and negative training examples. Instead of using all the negative examples, we sort them using the highest confidence loss for each default box and pick the top ones so that the ratio between the negatives and positives is at most 3:1. We found that this leads to faster optimization and a more stable training ''' weight_term_alpha = 1 negative_mining_ratio = 3 positive_samples = cls_target > 0 # True or False positive_numbers = positive_samples.sum() if classHardNegativeMining: pred = mx.nd.log_softmax(cls_pred, axis=-1) negative_samples = 1 - positive_samples conf_loss = -mx.nd.pick( pred, cls_target, axis=-1) # (batch, all feature number) ''' we sort them using the highest confidence loss for each default box and pick the top ones so that the ratio between the negatives and positives is at most 3:1.
''' negative_samples_conf_loss = (conf_loss * negative_samples) # 아래 3줄의 코드 출처 : from gluoncv.loss import SSDMultiBoxLoss negative_samples_index = mx.nd.argsort( negative_samples_conf_loss, axis=-1, is_ascend=False) selection = mx.nd.argsort(negative_samples_index, axis=-1, is_ascend=True) hard_negative_samples = selection <= mx.nd.multiply( positive_numbers, negative_mining_ratio).expand_dims(-1) pos_hardnega = positive_samples + hard_negative_samples conf_loss = mx.nd.where( pos_hardnega > 0, conf_loss, mx.nd.zeros_like(conf_loss)) conf_loss = mx.nd.sum(conf_loss) if positive_numbers: conf_loss = mx.nd.divide( conf_loss, positive_numbers) else: conf_loss = mx.nd.multiply(conf_loss, 0) cls_losses.append(conf_loss.asscalar()) else: conf_loss = confidence_loss( cls_pred, cls_target, positive_samples.expand_dims(axis=-1)) if positive_numbers: conf_loss = mx.nd.divide( conf_loss, positive_numbers) else: conf_loss = mx.nd.multiply(conf_loss, 0) cls_losses.append(conf_loss.asscalar()) if boxHardNegativeMining: # loc loss에도 hard HardNegativeMining 적용해보자. pred = mx.nd.log_softmax(cls_pred, axis=-1) negative_samples = 1 - positive_samples conf_loss_for_box = -mx.nd.pick( pred, cls_target, axis=-1) # (batch, all feature number) negative_samples_conf_loss = (conf_loss_for_box * negative_samples) negative_samples_index = mx.nd.argsort( negative_samples_conf_loss, axis=-1, is_ascend=False) selection = mx.nd.argsort(negative_samples_index, axis=-1, is_ascend=True) hard_negative_samples = selection <= mx.nd.multiply( positive_numbers, negative_mining_ratio).expand_dims(-1) pos_hardnega = positive_samples + hard_negative_samples pos_hardnega = mx.nd.repeat( pos_hardnega.reshape(shape=(0, 0, 1)), repeats=4, axis=-1) loc_loss = mx.nd.abs(box_pred - box_target) loc_loss = mx.nd.where(loc_loss > 1, loc_loss - 0.5, (0.5 / 1) * mx.nd.square(loc_loss)) loc_loss = mx.nd.where(pos_hardnega > 0, loc_loss, mx.nd.zeros_like(loc_loss)) loc_loss = mx.nd.sum(loc_loss) if positive_numbers: loc_loss = mx.nd.divide( loc_loss, positive_numbers) else: loc_loss = mx.nd.multiply(loc_loss, 0) box_losses.append(loc_loss.asscalar()) else: loc_loss = localization_loss( box_pred, box_target, positive_samples.expand_dims(axis=-1)) if positive_numbers: loc_loss = mx.nd.divide( loc_loss, positive_numbers) else: loc_loss = mx.nd.multiply(loc_loss, 0) box_losses.append(loc_loss.asscalar()) total_loss.append(conf_loss + weight_term_alpha * loc_loss) if AMP: with amp.scale_loss(total_loss, trainer) as scaled_loss: autograd.backward(scaled_loss) else: autograd.backward(total_loss) cls_all_losses.append(sum(cls_losses)) box_all_losses.append(sum(box_losses)) trainer.step(batch_size=td_batch_size, ignore_stale_grad=False) # 비우기 for p in net.collect_params().values(): p.zero_grad() conf_loss_sum += sum(cls_all_losses) / td_batch_size loc_loss_sum += sum(box_all_losses) / td_batch_size if batch_count % batch_log == 0: logging.info( f'[Epoch {i}][Batch {batch_count}/{train_update_number_per_epoch}],' f'[Speed {td_batch_size / (time.time() - time_stamp):.3f} samples/sec],' f'[Lr = {trainer.learning_rate}]' f'[confidence loss = {sum(cls_all_losses) / td_batch_size:.3f}]' f'[localization loss = {sum(box_all_losses) / td_batch_size:.3f}]' ) time_stamp = time.time() train_conf_loss_mean = np.divide(conf_loss_sum, train_update_number_per_epoch) train_loc_loss_mean = np.divide(loc_loss_sum, train_update_number_per_epoch) train_total_loss_mean = train_conf_loss_mean + train_loc_loss_mean logging.info( f"train confidence loss : {train_conf_loss_mean} / 
train localization loss : {train_loc_loss_mean} / train total loss : {train_total_loss_mean}" ) if i % eval_period == 0 and valid_list: if classHardNegativeMining: confidence_loss = SoftmaxCrossEntropyLoss( axis=-1, sparse_label=True, from_log_softmax=False, batch_axis=None, reduction="sum", exclude=False) if boxHardNegativeMining: localization_loss = HuberLoss(rho=1, batch_axis=None, reduction="sum", exclude=False) conf_loss_sum = 0 loc_loss_sum = 0 for image, label, cls_all, box_all, _ in valid_dataloader: vd_batch_size = image.shape[0] if GPU_COUNT <= 1: image = gluon.utils.split_and_load(image, [ctx], even_split=False) label = gluon.utils.split_and_load(label, [ctx], even_split=False) cls_all = gluon.utils.split_and_load(cls_all, [ctx], even_split=False) box_all = gluon.utils.split_and_load(box_all, [ctx], even_split=False) else: image = gluon.utils.split_and_load(image, ctx, even_split=False) label = gluon.utils.split_and_load(label, ctx, even_split=False) cls_all = gluon.utils.split_and_load(cls_all, ctx, even_split=False) box_all = gluon.utils.split_and_load(box_all, ctx, even_split=False) # prediction, target space for Data Parallelism cls_losses = [] box_losses = [] # code to support N GPUs (Data Parallelism) for img, lb, cls_target, box_target in zip( image, label, cls_all, box_all): gt_box = lb[:, :, :4] gt_id = lb[:, :, 4:5] cls_pred, box_pred, anchor = net(img) id, score, bbox = prediction(cls_pred, box_pred, anchor) precision_recall.update(pred_bboxes=bbox, pred_labels=id, pred_scores=score, gt_boxes=gt_box, gt_labels=gt_id) positive_samples = cls_target > 0 positive_numbers = positive_samples.sum() conf_loss = confidence_loss( cls_pred, cls_target, positive_samples.expand_dims(axis=-1)) if positive_numbers: conf_loss = mx.nd.divide(conf_loss, positive_numbers) else: conf_loss = mx.nd.multiply(conf_loss, 0) cls_losses.append(conf_loss.asscalar()) loc_loss = localization_loss( box_pred, box_target, positive_samples.expand_dims(axis=-1)) if positive_numbers: loc_loss = mx.nd.divide(loc_loss, positive_numbers) else: loc_loss = mx.nd.multiply(loc_loss, 0) box_losses.append(loc_loss.asscalar()) conf_loss_sum += sum(cls_losses) / vd_batch_size loc_loss_sum += sum(box_losses) / vd_batch_size valid_conf_loss_mean = np.divide(conf_loss_sum, valid_update_number_per_epoch) valid_loc_loss_mean = np.divide(loc_loss_sum, valid_update_number_per_epoch) valid_total_loss_mean = valid_conf_loss_mean + valid_loc_loss_mean logging.info( f"valid confidence loss : {valid_conf_loss_mean} / valid localization loss : {valid_loc_loss_mean} / valid total loss : {valid_total_loss_mean}" ) AP_appender = [] round_position = 2 class_name, precision, recall, true_positive, false_positive, threshold = precision_recall.get_PR_list( ) for j, c, p, r in zip(range(len(recall)), class_name, precision, recall): name, AP = precision_recall.get_AP(c, p, r) logging.info( f"class {j}'s {name} AP : {round(AP * 100, round_position)}%" ) AP_appender.append(AP) mAP_result = np.mean(AP_appender) logging.info(f"mAP : {round(mAP_result * 100, round_position)}%") precision_recall.get_PR_curve(name=class_name, precision=precision, recall=recall, threshold=threshold, AP=AP_appender, mAP=mAP_result, folder_name=valid_graph_path, epoch=i) precision_recall.reset() if tensorboard: # code to support N GPUs (Data Parallelism) dataloader_iter = iter(valid_dataloader) image, label, _, _, _ = next(dataloader_iter) if GPU_COUNT <= 1: image = gluon.utils.split_and_load(image, [ctx], even_split=False) label = gluon.utils.split_and_load(label, [ctx], 
even_split=False) else: image = gluon.utils.split_and_load(image, ctx, even_split=False) label = gluon.utils.split_and_load(label, ctx, even_split=False) ground_truth_colors = {} for k in range(num_classes): ground_truth_colors[k] = (0, 0, 1) batch_image = [] for img, lb in zip(image, label): gt_boxes = lb[:, :, :4] gt_ids = lb[:, :, 4:5] cls_pred, box_pred, anchor = net(img) ids, scores, bboxes = prediction(cls_pred, box_pred, anchor) for ig, gt_id, gt_box, id, score, bbox in zip( img, gt_ids, gt_boxes, ids, scores, bboxes): ig = ig.transpose((1, 2, 0)) * mx.nd.array( std, ctx=ig.context) + mx.nd.array(mean, ctx=ig.context) ig = (ig * 255).clip(0, 255) # draw the ground-truth boxes ground_truth = plot_bbox( ig, gt_box, scores=None, labels=gt_id, thresh=None, reverse_rgb=True, class_names=valid_dataset.classes, absolute_coordinates=True, colors=ground_truth_colors) # draw the predicted boxes prediction_box = plot_bbox( ground_truth, bbox, scores=score, labels=id, thresh=plot_class_thresh, reverse_rgb=False, class_names=valid_dataset.classes, absolute_coordinates=True) # convert BGR -> RGB and (height, width, channel) -> (channel, height, width) so TensorBoard can draw it prediction_box = cv2.cvtColor(prediction_box, cv2.COLOR_BGR2RGB) prediction_box = np.transpose(prediction_box, axes=(2, 0, 1)) batch_image.append( prediction_box) # (batch, channel, height, width) summary.add_image(tag="valid_result", image=np.array(batch_image), global_step=i) summary.add_scalar(tag="conf_loss", value={ "train_conf_loss": train_conf_loss_mean, "valid_conf_loss": valid_conf_loss_mean }, global_step=i) summary.add_scalar(tag="loc_loss", value={ "train_loc_loss": train_loc_loss_mean, "valid_loc_loss": valid_loc_loss_mean }, global_step=i) summary.add_scalar(tag="total_loss", value={ "train_total_loss": train_total_loss_mean, "valid_total_loss": valid_total_loss_mean }, global_step=i) params = net.collect_params().values() if GPU_COUNT > 1: for c in ctx: for p in params: summary.add_histogram(tag=p.name, values=p.data(ctx=c), global_step=i, bins='default') else: for p in params: summary.add_histogram(tag=p.name, values=p.data(), global_step=i, bins='default') if i % save_period == 0: weight_epoch_path = os.path.join(weight_path, str(i)) if not os.path.exists(weight_epoch_path): os.makedirs(weight_epoch_path) ''' Hybrid models can be serialized as JSON files using the export function. Export a HybridBlock to json format so that it can be loaded by SymbolBlock.imports, mxnet.mod.Module or the C++ interface. When there is only one input, it will have the name data. When there are more than one input, they will be named data0, data1, etc. ''' if GPU_COUNT >= 1: context = mx.gpu(0) else: context = mx.cpu(0) postnet = PostNet(net=net, auxnet=prediction) try: net.export(os.path.join(weight_path, f"{model}"), epoch=i, remove_amp_cast=True) net.save_parameters(os.path.join(weight_path, f"{i}.params")) # for ONNX export # covers network inference, decoding and NMS - convenient from the MXNet C++ API / cannot be exported to ONNX. 
export_block_for_cplusplus( path=os.path.join(weight_epoch_path, f"{model}_prepost"), block=postnet, data_shape=tuple(input_size) + tuple((3, )), epoch=i, preprocess= True, # at C++ inference time, an image read with OpenCV can be fed in as-is layout='HWC', ctx=context, remove_amp_cast=True) except Exception as E: logging.error(f"json, param model export failed : {E}") else: logging.info("json, param model export succeeded") net.collect_params().reset_ctx(ctx) end_time = time.time() learning_time = end_time - start_time logging.info(f"learning time : approx. {learning_time / 3600:0.2f}H") logging.info("optimization completed") if using_mlflow: ml.log_metric("learning time", round(learning_time / 3600, 2))
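# Standalone, hedged sketch of the double-argsort hard negative mining used in the
# training loop above (all values below are made up; the real loop works on network
# outputs). Sorting the per-anchor negative losses in descending order and then
# argsort-ing the resulting index array gives each anchor's rank by loss, so keeping
# ranks <= num_positives * ratio selects the hardest negatives.
import mxnet as mx

conf_loss = mx.nd.array([[0.10, 0.90, 0.30, 0.70, 0.05]])  # per-anchor loss, batch of 1
positive_numbers = mx.nd.array([1.0])                      # positive anchors per image
negative_mining_ratio = 3

order = mx.nd.argsort(conf_loss, axis=-1, is_ascend=False) # anchor indices by descending loss
rank = mx.nd.argsort(order, axis=-1, is_ascend=True)       # loss rank of every anchor
hard_negative_samples = rank <= (positive_numbers * negative_mining_ratio).expand_dims(-1)
print(hard_negative_samples.asnumpy())                     # highest-loss anchors come out as 1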
def train_net(net, train_path, num_classes, batch_size, data_shape, mean_img, mean_img_dir, resume, finetune, pretrained, epoch, prefix, ctx, begin_epoch, end_epoch, frequent, learning_rate, momentum, weight_decay, lr_refactor_step, lr_refactor_ratio, convert_numpy=1, freeze_layer_pattern='', num_example=10000, label_pad_width=350, nms_thresh=0.45, force_nms=False, ovp_thresh=0.5, use_difficult=False, class_names=None, voc07_metric=False, nms_topk=400, force_suppress=False, train_list="", val_path="", val_list="", iter_monitor=0, monitor_pattern=".*", log_file=None, summarywriter=0, flush_secs=180): """ Wrapper for the training phase. Parameters: ---------- net : str symbol name for the network structure train_path : str record file path for training num_classes : int number of object classes, not including background batch_size : int training batch size data_shape : int or tuple width/height as integer or (3, height, width) tuple mean_img : str path of the mean image file used for input normalization mean_img_dir : str directory containing the mean image file resume : int resume from previous checkpoint if > 0 finetune : int fine-tune from previous checkpoint if > 0 pretrained : str prefix of pretrained model, including path epoch : int load epoch of either resume/finetune/pretrained model prefix : str prefix for saving checkpoints ctx : [mx.cpu()] or [mx.gpu(x)] list of mxnet contexts begin_epoch : int starting epoch for training, should be 0 if not otherwise specified end_epoch : int end epoch of training frequent : int frequency to print out training status learning_rate : float training learning rate momentum : float training momentum weight_decay : float training weight decay param lr_refactor_ratio : float multiplier for reducing learning rate lr_refactor_step : comma separated integers at which epoch to rescale learning rate, e.g. 
'30, 60, 90' freeze_layer_pattern : str regex pattern for layers that need to be fixed num_example : int number of training images label_pad_width : int force padding training and validation labels to sync their label widths nms_thresh : float non-maximum suppression threshold for validation force_nms : boolean suppress overlapped objects from different classes train_list : str list file path for training, this will replace the embedded labels in record val_path : str record file path for validation val_list : str list file path for validation, this will replace the embedded labels in record iter_monitor : int monitor internal stats in networks if > 0, specified by monitor_pattern monitor_pattern : str regex pattern for monitoring network stats log_file : str log to file if enabled """ # set up logger logging.basicConfig() logger = logging.getLogger() logger.setLevel(logging.INFO) if log_file: fh = logging.FileHandler(log_file) logger.addHandler(fh) # check args if isinstance(data_shape, int): data_shape = (3, data_shape, data_shape) assert len(data_shape) == 3 and data_shape[0] == 3 prefix += '_' + net + '_' + str(data_shape[1]) # if isinstance(mean_pixels, (int, float)): # mean_pixels = [mean_pixels, mean_pixels, mean_pixels] # assert len(mean_pixels) == 3, "must provide all RGB mean values" train_iter = DetRecordIter(train_path, batch_size, data_shape, mean_img=mean_img, label_pad_width=label_pad_width, path_imglist=train_list, **cfg.train) if val_path: val_iter = DetRecordIter(val_path, batch_size, data_shape, mean_img=mean_img, label_pad_width=label_pad_width, path_imglist=val_list, **cfg.valid) else: val_iter = None # convert mean.bin to mean.npy _convert_mean_numpy(convert_numpy, mean_img_dir, mean_img) # load symbol net = get_symbol_train(net, data_shape[1], num_classes=num_classes, nms_thresh=nms_thresh, force_suppress=force_suppress, nms_topk=nms_topk) if summarywriter: if os.path.exists('/opt/incubator-mxnet/example/ssd/logs'): shutil.rmtree('/opt/incubator-mxnet/example/ssd/logs' ) # clear the previous logs os.mkdir('/opt/incubator-mxnet/example/ssd/logs') sw = SummaryWriter(logdir='/opt/incubator-mxnet/example/ssd/logs', flush_secs=flush_secs) sw.add_graph(net) else: sw = None # mx.viz.plot_network(net, shape={"data":(64, 3, 320, 320)}, node_attrs={"shape":'rect',"fixedsize":'false'}).view() # define layers with fixed weight/bias if freeze_layer_pattern.strip(): re_prog = re.compile(freeze_layer_pattern) fixed_param_names = [ name for name in net.list_arguments() if re_prog.match(name) ] else: fixed_param_names = None # load pretrained or resume from previous state ctx_str = '(' + ','.join([str(c) for c in ctx]) + ')' if resume > 0: logger.info("Resume training with {} from epoch {}".format( ctx_str, resume)) _, args, auxs = mx.model.load_checkpoint(prefix, resume) begin_epoch = resume elif finetune > 0: logger.info("Start finetuning with {} from epoch {}".format( ctx_str, finetune)) _, args, auxs = mx.model.load_checkpoint(prefix, finetune) begin_epoch = finetune # the prediction convolution layers' names start with relu, so it's fine fixed_param_names = [name for name in net.list_arguments() \ if name.startswith('conv')] elif pretrained: logger.info("Start training with {} from pretrained model {}".format( ctx_str, pretrained)) _, args, auxs = mx.model.load_checkpoint(pretrained, epoch) args = convert_pretrained(pretrained, args) else: logger.info("Experimental: start training from scratch with {}".format( ctx_str)) args = None auxs = None fixed_param_names = None # helper 
information if fixed_param_names: logger.info("Frozen parameters: [" + ','.join(fixed_param_names) + ']') # init training module mod = mx.mod.Module(net, label_names=('label', ), logger=logger, context=ctx, fixed_param_names=fixed_param_names) # fit parameters if summarywriter: # add a visualization callback; when there are several callbacks, every callback except the last one must not reset the accuracy (i.e. its auto_reset argument must be set to False) batch_end_callbacks = [ mx.callback.Speedometer(train_iter.batch_size, frequent=frequent, auto_reset=False), summary_writter_callback.summary_writter_eval_metric(sw) ] else: batch_end_callbacks = [ mx.callback.Speedometer(train_iter.batch_size, frequent=frequent, auto_reset=False) ] # batch_end_callback = mx.callback.Speedometer(train_iter.batch_size, frequent=frequent) epoch_end_callback = mx.callback.do_checkpoint(prefix) learning_rate, lr_scheduler = get_lr_scheduler(learning_rate, lr_refactor_step, lr_refactor_ratio, num_example, batch_size, begin_epoch) optimizer_params = { 'learning_rate': learning_rate, 'momentum': momentum, 'wd': weight_decay, 'lr_scheduler': lr_scheduler, 'clip_gradient': None, 'rescale_grad': 1.0 / len(ctx) if len(ctx) > 0 else 1.0 } monitor = mx.mon.Monitor( iter_monitor, pattern=monitor_pattern) if iter_monitor > 0 else None # run fit net, every n epochs we run evaluation network to get mAP if voc07_metric: valid_metric = VOC07MApMetric(ovp_thresh, use_difficult, class_names, pred_idx=3) else: valid_metric = MApMetric(ovp_thresh, use_difficult, class_names, pred_idx=3) mod.fit(train_iter, val_iter, eval_metric=MultiBoxMetric(), validation_metric=valid_metric, batch_end_callback=batch_end_callbacks, epoch_end_callback=epoch_end_callback, optimizer='sgd', optimizer_params=optimizer_params, begin_epoch=begin_epoch, num_epoch=end_epoch, initializer=mx.init.Xavier(), arg_params=args, aux_params=auxs, allow_missing=True, monitor=monitor) if summarywriter: sw.close()
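# Hedged sketch of the batch-end callback mechanism used by mod.fit above: Module
# calls each callback with an mx.model.BatchEndParam namedtuple, so a small closure
# can forward the current metric values to an mxboard SummaryWriter (names here are
# illustrative, not the actual summary_writter_callback used above).
import mxnet as mx
from mxboard import SummaryWriter

def make_metric_logger(sw, frequent=50):
    def _callback(param):  # param is an mx.model.BatchEndParam
        if param.eval_metric is None or param.nbatch % frequent != 0:
            return
        for name, value in param.eval_metric.get_name_value():
            # nbatch is the per-epoch batch index; a real logger would keep a global counter
            sw.add_scalar(tag=name, value=value, global_step=param.nbatch)
    return _callback

# usage sketch: mod.fit(..., batch_end_callback=[mx.callback.Speedometer(32, 50), make_metric_logger(sw)])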
mean_img=args.mean_img) cqsym, qarg_params, aux_params = quantize_model( sym=sym, arg_params=arg_params, aux_params=aux_params, ctx=ctx, excluded_sym_names=excluded_sym_names, calib_mode=calib_mode, calib_data=data, num_calib_examples=num_calib_batches * batch_size, calib_layer=calib_layer, quantized_dtype=args.quantized_dtype, logger=logger) if calib_mode == 'entropy': suffix = '-quantized-%dbatches-entropy' % num_calib_batches elif calib_mode == 'naive': suffix = '-quantized-%dbatches-naive' % num_calib_batches else: raise ValueError( 'unknown calibration mode %s received, only supports `none`, `naive`, and `entropy`' % calib_mode) sym_name = '%s-symbol.json' % (model_prefix + suffix) save_symbol(sym_name, cqsym, logger) sw.add_graph(cqsym) param_name = '%s-%04d.params' % (model_prefix + '-quantized', args.epoch) save_params(param_name, qarg_params, aux_params, logger) sw.close()
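# Hedged sketch of what save_symbol() above boils down to: an MXNet Symbol serializes
# to a JSON file that mx.sym.load can read back (network and file name illustrative).
import mxnet as mx

sym = mx.sym.softmax(mx.sym.FullyConnected(mx.sym.Variable('data'), num_hidden=10))
sym.save('demo-symbol.json')       # same JSON format as the quantized symbol saved above
restored = mx.sym.load('demo-symbol.json')
print(restored.list_arguments())   # ['data', 'fullyconnected0_weight', 'fullyconnected0_bias']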
def fit(args, network, data_loader, **kwargs): """ train a model args : the namespace returned by argparse network : the symbol definition of the neural network data_loader : function that returns the train and val data iterators """ # mxboard sw_train = SummaryWriter(logdir='./logs/train', flush_secs=20) sw_val = SummaryWriter(logdir='./logs/val', flush_secs=20) sw_image = SummaryWriter(logdir='./logs/image', flush_secs=20) sw_symbol = SummaryWriter(logdir='./logs/symbol', flush_secs=20) sw_symbol.add_graph(network) args.summary_writer_image = None # kvstore kv = mx.kvstore.create(args.kv_store) if args.gc_type != 'none': kv.set_gradient_compression({ 'type': args.gc_type, 'threshold': args.gc_threshold }) # logging head = '%(asctime)-15s Node[' + str(kv.rank) + '] %(message)s' logging.basicConfig(level=logging.DEBUG, format=head) logging.info('start with arguments %s', args) #epoch_size epoch_size = get_epoch_size(args, kv) # data iterators (train, val) = data_loader(args, kv) if 'dist' in args.kv_store and not 'async' in args.kv_store: logging.info('Resizing training data to %d batches per machine', epoch_size) # resize train iter to ensure each machine has same number of batches per epoch # if not, dist_sync can hang at the end with one machine waiting for other machines train = mx.io.ResizeIter(train, epoch_size) if args.test_io: tic = time.time() for i, batch in enumerate(train): for j in batch.data: j.wait_to_read() if (i + 1) % args.disp_batches == 0: logging.info('Batch [%d]\tSpeed: %.2f samples/sec' % (i, args.disp_batches * args.batch_size / (time.time() - tic))) tic = time.time() return print('next_sample', train.next()) # load model if 'arg_params' in kwargs and 'aux_params' in kwargs: arg_params = kwargs['arg_params'] aux_params = kwargs['aux_params'] else: sym, arg_params, aux_params = _load_model(args, kv.rank) if sym is not None: assert sym.tojson() == network.tojson() # save model checkpoint = _save_model(args, kv.rank) # devices for training devs = mx.cpu() if args.gpus is None or args.gpus == '' else [ mx.gpu(int(i)) for i in args.gpus.split(',') ] # learning rate lr, lr_scheduler = _get_lr_scheduler(args, kv) # create model model = mx.mod.Module(context=devs, symbol=network, label_names={ 'gender_label', 'hat_label', 'bag_label', 'handbag_label', 'backpack_label', 'updress_label', 'downdress_label' }) model_mix = mx.mod.Module( context=devs, symbol=network, label_names={ 'gender_label', 'hat_label', 'bag_label', 'handbag_label', 'backpack_label', 'updress_label', 'downdress_label', 'gender_mix_label', 'hat_mix_label', 'bag_mix_label', 'handbag_mix_label', 'backpack_mix_label', 'updress_mix_label', 'downdress_mix_label' }) optimizer_params = { 'learning_rate': lr, 'wd': args.wd, 'lr_scheduler': lr_scheduler, 'multi_precision': True } # Only a limited number of optimizers have a 'momentum' property has_momentum = {'sgd', 'dcasgd', 'nag', 'signum', 'lbsgd'} if args.optimizer in has_momentum: optimizer_params['momentum'] = args.mom monitor = mx.mon.Monitor(args.monitor, pattern=".*") if args.monitor > 0 else None # A limited number of optimizers have a warmup period has_warmup = {'lbsgd', 'lbnag'} if args.optimizer in has_warmup: nworkers = kv.num_workers if epoch_size < 1: epoch_size = 1 macrobatch_size = args.macrobatch_size if macrobatch_size < args.batch_size * nworkers: macrobatch_size = args.batch_size * nworkers #batch_scale = round(float(macrobatch_size) / args.batch_size / nworkers +0.4999) batch_scale = math.ceil( float(macrobatch_size) / args.batch_size 
/ nworkers) optimizer_params['updates_per_epoch'] = epoch_size optimizer_params[ 'begin_epoch'] = args.load_epoch if args.load_epoch else 0 optimizer_params['batch_scale'] = batch_scale optimizer_params['warmup_strategy'] = args.warmup_strategy optimizer_params['warmup_epochs'] = args.warmup_epochs optimizer_params['num_epochs'] = args.num_epochs if args.initializer == 'default': if args.network == 'alexnet': # AlexNet will not converge using Xavier initializer = mx.init.Normal() # VGG will not tend to converge using Xavier-Gaussian elif args.network and 'vgg' in args.network: initializer = mx.init.Xavier() else: initializer = mx.init.Xavier(rnd_type='gaussian', factor_type="in", magnitude=2) # initializer = mx.init.Xavier(factor_type="in", magnitude=2.34), elif args.initializer == 'xavier': initializer = mx.init.Xavier() elif args.initializer == 'msra': initializer = mx.init.MSRAPrelu() elif args.initializer == 'orthogonal': initializer = mx.init.Orthogonal() elif args.initializer == 'normal': initializer = mx.init.Normal() elif args.initializer == 'uniform': initializer = mx.init.Uniform() elif args.initializer == 'one': initializer = mx.init.One() elif args.initializer == 'zero': initializer = mx.init.Zero() # evaluation metrics eval_metrics = ['accuracy'] if args.top_k > 0: eval_metrics.append( mx.metric.create('top_k_accuracy', top_k=args.top_k)) eval_metrics = Multi_Acc_Metric(num=7, label_names=[ 'gender_label', 'hat_label', 'bag_label', 'handbag_label', 'backpack_label', 'updress_label', 'downdress_label' ]) supported_loss = ['ce', 'nll_loss'] if len(args.loss) > 0: # ce or nll loss is only applicable to softmax output loss_type_list = args.loss.split(',') if 'gender_label_output' in network.list_outputs(): for loss_type in loss_type_list: loss_type = loss_type.strip() if loss_type == 'nll': loss_type = 'nll_loss' if loss_type not in supported_loss: logging.warning(loss_type + ' is not a valid loss type, only cross-entropy or ' \ 'negative log-likelihood loss is supported!') else: eval_metrics.append(mx.metric.create(loss_type)) else: logging.warning( "The output is not softmax_output, loss argument will be skipped!" ) # callbacks that run after each batch batch_end_callbacks = [ mx.callback.Speedometer(args.batch_size, args.disp_batches, False) ] batch_end_callback_with_sw = [ Speedometerboradwriter(args.batch_size, sw_train, args.disp_batches) ] eval_end_callback_with_sw = [LogMetricsCallback(sw_val)] if 'batch_end_callback' in kwargs: cbs = kwargs['batch_end_callback'] batch_end_callbacks += cbs if isinstance(cbs, list) else [cbs] batch_end_callbacks += batch_end_callback_with_sw print(batch_end_callbacks) # run model_mix.fit(train, begin_epoch=args.load_epoch if args.load_epoch else 0, num_epoch=args.num_epochs, eval_data=val, eval_metric=eval_metrics, kvstore=kv, optimizer=args.optimizer, optimizer_params=optimizer_params, initializer=initializer, arg_params=arg_params, aux_params=aux_params, batch_end_callback=batch_end_callbacks, eval_end_callback=eval_end_callback_with_sw, epoch_end_callback=checkpoint, allow_missing=True, monitor=monitor)
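# Hedged sketch of the has_momentum gating above: only some MXNet optimizers accept
# a momentum argument, so the key is added conditionally before the optimizer is
# created (values illustrative).
import mxnet as mx

optimizer = 'sgd'
optimizer_params = {'learning_rate': 0.01, 'wd': 1e-4}
if optimizer in {'sgd', 'dcasgd', 'nag', 'signum', 'lbsgd'}:
    optimizer_params['momentum'] = 0.9
opt = mx.optimizer.create(optimizer, **optimizer_params)
print(type(opt).__name__, opt.momentum)   # SGD 0.9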
class Model: """ Model - class encapsulating training and predicting functionality It is capable of performing training, selecting the best model according to a given metric, writing logs for tensorboard, relaunching training from the last snapshot, saving and loading the model and its state """ def __init__(self, symbol, name, **kwargs): """ :param symbol: model symbol :param name: model name :param kwargs: training related arguments like :param path: path to the models root directory (default: ./) :param loss_index: list of loss indices in the output (default: [-1]) :param data_names: name of the data inputs :param label_names: names of the label inputs :param context: execution context :param group2ctxs: group of contexts (optional) :param fixed_param_names: names of the fixed parameters (optional) :param arg_params: model states (optional) :param aux_params: model states (optional) :param rewrite_dir: rewrite model directory if it exists """ self.name = name self.path = kwargs.get('path', './') self.path = self.path + name + '/' self.logs_path = self.path + '/logs/' self.sw = SummaryWriter(logdir=self.logs_path, flush_secs=5) #training information self.symbol = symbol self.module = None self.epoch = 0 self.train_metric = dict() self.val_metric = dict() #arguments self.init_args = kwargs self.loss_index = kwargs.get('loss_index', [-1]) module_keys = [ 'data_names', 'label_names', 'context', 'group2ctxs', 'fixed_param_names' ] self.module_desc = dict( (k, kwargs[k]) for k in module_keys if k in kwargs) self.arg_params = kwargs.get('arg_params', None) self.aux_params = kwargs.get('aux_params', None) if 'rewrite_dir' in kwargs: if os.path.exists(self.path) and kwargs['rewrite_dir']: shutil.rmtree(self.path) os.mkdir(self.path) os.mkdir(self.logs_path) def train(self, train_iterator, val_iterator, **kwargs): """ Train model :param train_iterator: training iterator :param val_iterator: validation iterator :param kwargs: training related arguments like :param arg_params: model states (optional) :param aux_params: model states (optional) :param initializer: model weights initializer :param optimizer: optimization algorithm :param optimizer_params: parameters of the optimization algorithm :param train_metrics: list of training metrics :param val_metrics: list of validation metrics :param num_epoch: number of epochs to train :param default_val: default value of validation metric being tracked :param track_metric: name of the metric being tracked :param comparator: metric comparator :param epoch_end_callback: list of epoch end callbacks """ if 'arg_params' in kwargs: self.arg_params = kwargs['arg_params'] if 'aux_params' in kwargs: self.aux_params = kwargs['aux_params'] #create new module if self.module is None: self.module = mx.mod.Module(symbol=self.symbol, **self.module_desc) #initialize self.module.bind(data_shapes=train_iterator.provide_data, label_shapes=train_iterator.provide_label, for_training=True) self.module.init_params(initializer=kwargs['initializer']) if self.arg_params is not None: self.module.set_params(arg_params=self.arg_params, aux_params=self.aux_params, allow_missing=True, allow_extra=True) self.module.init_optimizer(optimizer=kwargs['optimizer'], optimizer_params=kwargs['optimizer_params']) print('model has been bound') #prepare dicts for metrics self.train_metric = dict() self.val_metric = dict() for m in kwargs['train_metrics']: self.train_metric[m.name] = [] for m in kwargs['val_metrics']: self.val_metric[m.name] = [] start_epoch = self.epoch num_epoch = kwargs['num_epoch'] 
self.sw.add_graph(self.module.symbol) #number of processed batches global_step = 0 #initialize best validation score self.best_val = kwargs['default_val'] self.save() for i in range(start_epoch, num_epoch): train_iterator.reset() val_iterator.reset() tic = time.time() global_step = self._train_one_epoch(train_iterator, kwargs['train_metrics'], self.train_metric, i, global_step) self.arg_params, self.aux_params = self.module.get_params() self.epoch = i self._evaluate_and_save(val_iterator, kwargs['val_metrics'], kwargs['track_metric'], self.val_metric, i, kwargs['comparator']) mx.model._multiple_callbacks(kwargs['epoch_end_callback'], i, self.module.symbol, self.arg_params, self.aux_params) tac = time.time() print('Epoch %d, time %s\n' % (i, tac - tic)) def predict(self, batch): """ Perform prediction :param batch: input data batch :return: model's output """ if self.module is None: desc = self.module_desc desc['label_names'] = None #print desc self.symbol = self.symbol.get_internals()[ self.symbol.list_outputs()[0]] self.module = mx.mod.Module(symbol=self.symbol, **desc) #bind data shape #print self.module._label_shapes, data_shapes data_shapes = [(name, batch.data[i].shape) for i, name in enumerate(desc['data_names'])] #label_shapes = [(name, batch.label[i].shape) for i, name in enumerate(desc['label_names'])] self.module.bind(for_training=False, data_shapes=data_shapes) self.module.set_params(arg_params=self.arg_params, aux_params=self.aux_params, allow_missing=True) self.module.forward(batch) return self.module.get_outputs() def clear(self): #clear module and release gpu memory self.module = None gc.collect() return def _train_one_epoch(self, train_iter, train_metrics, train_metrics_results, epoch, global_step): for m in train_metrics: m.reset() for batch in train_iter: self.module.forward_backward(batch) # compute predictions self.module.update() for m in train_metrics: self.module.update_metric( m, batch.label) # accumulate prediction accuracy outputs = self.module.get_outputs() for i in self.loss_index: outputs[i].wait_to_read() loss = np.mean(outputs[i].asnumpy()) utils.log_var(loss, 'loss' + str(i), global_step, self.sw) global_step += 1 for m in train_metrics: train_metrics_results[m.name].append(m.get()[1]) utils.log_var(m.get()[1], 'train_' + m.name, epoch, self.sw) print('Epoch %d, Training %s %s' % (epoch, m.name, m.get()[1])) return global_step def _evaluate_and_save(self, eval_iter, val_metrics, track_metric, val_metrics_results, epoch, comparator): for m in val_metrics: m.reset() for batch in eval_iter: self.module.forward(batch, is_train=False) # compute predictions for m in val_metrics: m.update(batch.label, self.module.get_outputs()) val = 0.0 for m in val_metrics: if m.name == track_metric: val = m.get()[1] utils.log_var(m.get()[1], 'val_' + m.name, epoch, self.sw) val_metrics_results[m.name].append(m.get()[1]) print('Epoch %d, Validation %s %s' % (epoch, m.name, m.get()[1])) if comparator(val, self.best_val): self.best_val = val self.save() print('model saved') def save(self): #save training state self.module.save_checkpoint(self.path + self.name, 0) #save training metadata pickle.dump(self.train_metric, open(os.path.join(self.path, 'train_metric.p'), 'wb')) pickle.dump(self.val_metric, open(os.path.join(self.path, 'val_metric.p'), 'wb')) pickle.dump(self.epoch, open(os.path.join(self.path, 'epoch.p'), 'wb')) pickle.dump(self.module_desc, open(os.path.join(self.path, 'module_desc.p'), 'wb')) def load(self, path, load_symbol=False): #load training state self.path = path 
model_prefix = os.path.join(path, self.name) model_number = 0 sym, self.arg_params, self.aux_params = mx.model.load_checkpoint( model_prefix, model_number) if load_symbol: self.symbol = sym #load training metadata #self.train_metric = pickle.load(open(os.path.join(path,'train_metric.p'), 'rb')) #self.val_metric = pickle.load(open(os.path.join(path, 'val_metric.p'), 'rb')) self.epoch = pickle.load(open(os.path.join(path, 'epoch.p'), 'rb')) self.module_desc = pickle.load( open(os.path.join(path, 'module_desc.p'), 'rb')) return def load_params(self, sym, arg_params, aux_params): if sym is not None: self.symbol = sym self.arg_params = arg_params self.aux_params = aux_params
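# Hedged sketch of the Module checkpoint round trip behind save()/load() above
# (network, prefix and epoch are illustrative): save_checkpoint writes
# prefix-symbol.json and prefix-%04d.params, and mx.model.load_checkpoint reads
# them back as (symbol, arg_params, aux_params).
import mxnet as mx

sym = mx.sym.FullyConnected(mx.sym.Variable('data'), num_hidden=2, name='fc')
mod = mx.mod.Module(sym, data_names=('data',), label_names=None)
mod.bind(data_shapes=[('data', (1, 4))], for_training=False)
mod.init_params()
mod.save_checkpoint('demo_ckpt', 0)   # demo_ckpt-symbol.json / demo_ckpt-0000.params

sym2, arg_params, aux_params = mx.model.load_checkpoint('demo_ckpt', 0)
print(sorted(arg_params))             # ['fc_bias', 'fc_weight']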
def train(): """training""" image_pool = ImagePool(pool_size) metric = mx.metric.CustomMetric(facc) stamp = datetime.now().strftime('%Y_%m_%d-%H_%M') logging.basicConfig(level=logging.DEBUG) # define a summary writer that logs data and flushes to the file every 5 seconds sw = SummaryWriter(logdir='%s' % dir_out_sw, flush_secs=5, verbose=False) global_step = 0 for epoch in range(epochs): if epoch == 0: netG.hybridize() netD.hybridize() # sw.add_graph(netG) # sw.add_graph(netD) tic = time.time() btic = time.time() train_data.reset() val_data.reset() iter = 0 for local_step, batch in enumerate(train_data): ############################ # (1) Update D network: maximize log(D(x, y)) + log(1 - D(x, G(x, z))) ########################### tmp = mx.nd.concat(batch.data[0], batch.data[1], batch.data[2], dim=1) tmp = augmenter(tmp, patch_size=128, offset=offset, aug_type=1, aug_methods=aug_methods, random_crop=False) real_in = tmp[:, :1].as_in_context(ctx) real_out = tmp[:, 1:2].as_in_context(ctx) m = tmp[:, 2:3].as_in_context(ctx) # mask fake_out = netG(real_in) * m # loss weight based on mask, applied on L1 loss if no_loss_weights: loss_weight = m else: loss_weight = m.asnumpy() loss_weight[loss_weight == 0] = .1 loss_weight = mx.nd.array(loss_weight, ctx=m.context) fake_concat = image_pool.query(nd.concat(real_in, fake_out, dim=1)) with autograd.record(): # Train with fake image # Use image pooling to utilize history images output = netD(fake_concat) fake_label = nd.zeros(output.shape, ctx=ctx) errD_fake = GAN_loss(output, fake_label) metric.update([ fake_label, ], [ output, ]) # Train with real image real_concat = nd.concat(real_in, real_out, dim=1) output = netD(real_concat) real_label = nd.ones(output.shape, ctx=ctx) errD_real = GAN_loss(output, real_label) errD = (errD_real + errD_fake) * 0.5 errD.backward() metric.update([ real_label, ], [ output, ]) trainerD.step(batch.data[0].shape[0]) ############################ # (2) Update G network: maximize log(D(x, G(x, z))) - lambda1 * L1(y, G(x, z)) ########################### with autograd.record(): fake_out = netG(real_in) fake_concat = nd.concat(real_in, fake_out, dim=1) output = netD(fake_concat) real_label = nd.ones(output.shape, ctx=ctx) errG = GAN_loss(output, real_label) + loss_2nd( real_out, fake_out, loss_weight) * lambda1 errG.backward() trainerG.step(batch.data[0].shape[0]) sw.add_scalar(tag='loss', value=('d_loss', errD.mean().asscalar()), global_step=global_step) sw.add_scalar(tag='loss', value=('g_loss', errG.mean().asscalar()), global_step=global_step) global_step += 1 if epoch + local_step == 0: sw.add_graph((netG)) img_in_list, img_out_list, m_val = val_data.next().data m_val = m_val.as_in_context(ctx) sw.add_image('first_minibatch_train_real', norm3(real_out)) sw.add_image('first_minibatch_val_real', norm3(img_out_list.as_in_context(ctx))) netG.export('%snetG' % dir_out_checkpoints) if local_step == 0: # Log the first batch of images of each epoch (training) sw.add_image('first_minibatch_train_fake', norm3(fake_out * m) * m, epoch) sw.add_image( 'first_minibatch_val_fake', norm3(netG(img_in_list.as_in_context(ctx)) * m_val) * m_val, epoch) # norm3(netG(img_in_list.as_in_context(ctx)) * m_val.as_in_context(ctx)), epoch) if (iter + 1) % 10 == 0: name, acc = metric.get() logging.info('speed: {} samples/s'.format( batch_size / (time.time() - btic))) logging.info( 'discriminator loss = %f, generator loss = %f, binary training acc = %f at iter %d epoch %d' % (nd.mean(errD).asscalar(), nd.mean(errG).asscalar(), acc, iter, epoch)) iter += 
1 btic = time.time() sw.add_scalar(tag='binary_training_acc', value=('acc', acc), global_step=epoch) name, acc = metric.get() metric.reset() fake_val = netG(val_data.data[0][1].as_in_context(ctx)) loss_val = loss_2nd(val_data.data[1][1].as_in_context(ctx), fake_val, val_data.data[2][1].as_in_context(ctx)) * lambda1 sw.add_scalar(tag='loss_val', value=('g_loss', loss_val.mean().asscalar()), global_step=epoch) if (epoch % check_point_interval == 0) | (epoch == epochs - 1): netD.save_params('%snetD-%04d' % (dir_out_checkpoints, epoch)) netG.save_params('%snetG-%04d' % (dir_out_checkpoints, epoch)) logging.info('\nbinary training acc at epoch %d: %s=%f' % (epoch, name, acc)) logging.info('time: %f' % (time.time() - tic)) sw.export_scalars('scalar_dict.json') sw.close()
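# Hedged sketch of the history buffer behind image_pool.query() in the GAN loop
# above: a pix2pix-style pool sometimes returns an older generated batch instead of
# the newest one, so the discriminator also sees stale generator outputs (minimal
# version; the real ImagePool class may differ, e.g. by mixing images within a batch).
import random

class SimpleImagePool:
    def __init__(self, pool_size=50):
        self.pool_size = pool_size
        self.images = []

    def query(self, image):
        if self.pool_size == 0:           # pooling disabled
            return image
        if len(self.images) < self.pool_size:
            self.images.append(image)     # still filling the buffer
            return image
        if random.random() > 0.5:         # swap with a random old batch half the time
            idx = random.randrange(self.pool_size)
            old, self.images[idx] = self.images[idx], image
            return old
        return image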
def train(epochs, ctx): # Collect all parameters from net and its children, then initialize them. net.initialize(mx.init.Xavier(magnitude=2.24), ctx=ctx) # Trainer is for updating parameters with gradient. trainer = gluon.Trainer(net.collect_params(), 'adam') metric = mx.metric.Accuracy() loss = gluon.loss.SoftmaxCrossEntropyLoss() # do a forward pass with dummy data, without a backward pass, to initialize binary layers with autograd.record(): data, label = dummy_data(ctx) output = net(data) L = loss(output, label) if opt.hybridize: net.hybridize() # collect parameter names for logging the gradients of parameters in each epoch log_param_filter = ".*weight|.*bias" params = net.collect_params(log_param_filter) param_names = params.keys() sw = SummaryWriter(logdir='./logs/{}-{}bits/'.format( "symbolic" if opt.hybridize else "gluon", opt.bits), flush_secs=5) global_step = 0 for epoch in range(epochs): # reset data iterator and metric at the beginning of each epoch. metric.reset() for i, (data, label) in enumerate(train_data): # Copy data to ctx if necessary data = data.as_in_context(ctx) label = label.as_in_context(ctx) # Start recording computation graph with record() section. # Recorded graphs can then be differentiated with backward. with autograd.record(): output = net(data) L = loss(output, label) L.backward() sw.add_scalar(tag='cross_entropy', value=L.mean().asscalar(), global_step=global_step) global_step += 1 # take a gradient step with batch_size equal to data.shape[0] trainer.step(data.shape[0]) # update metric at last. metric.update([label], [output]) if i % opt.log_interval == 0 and i > 0: name, acc = metric.get() print('[Epoch %d Batch %d] Training: %s=%f' % (epoch, i, name, acc)) if i == 0: sw.add_image('mnist_first_minibatch', data.reshape((opt.batch_size, 1, 28, 28)), epoch) grads = [ i.grad() for i in net.collect_params(log_param_filter).values() ] assert len(grads) == len(param_names) # logging the gradients of parameters for checking convergence for i, name in enumerate(param_names): sw.add_histogram(tag=name, values=grads[i], global_step=global_step, bins=1000) name, acc = metric.get() print('[Epoch %d] Training: %s=%f' % (epoch, name, acc)) sw.add_scalar(tag='train_acc', value=acc, global_step=global_step) name, val_acc = test(ctx) print('[Epoch %d] Validation: %s=%f' % (epoch, name, val_acc)) sw.add_scalar(tag='valid_acc', value=val_acc, global_step=global_step) if not opt.hybridize: net.hybridize() with autograd.record(): data, label = dummy_data(ctx) output = net(data) L = loss(output, label) net.export("mnist-lenet-{}-{}-bit".format( "symbolic" if opt.hybridize else "gluon", opt.bits), epoch=1) sw.add_graph(net) sw.close()
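# Hedged sketch of the dummy forward pass trick used above: Gluon uses deferred shape
# inference, so one throwaway forward materializes all parameter shapes before the
# network is hybridized or exported (tiny illustrative layer).
import mxnet as mx
from mxnet import gluon, nd

dense = gluon.nn.Dense(10)
dense.initialize()                 # shapes are still unknown at this point
out = dense(nd.ones((1, 20)))      # deferred init resolves the (10, 20) weight here
print(dense.weight.shape)          # -> (10, 20)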
def train(epochs, ctx): # Collect all parameters from net and its children, then initialize them. net.initialize(mx.init.Xavier(magnitude=2.24), ctx=ctx) net.hybridize() # Trainer is for updating parameters with gradient. trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': opt.lr, 'momentum': opt.momentum}) metric = mx.metric.Accuracy() loss = gluon.loss.SoftmaxCrossEntropyLoss() # collect parameter names for logging the gradients of parameters in each epoch params = net.collect_params() param_names = params.keys() # define a summary writer that logs data and flushes to the file every 5 seconds sw = SummaryWriter(logdir='./logs', flush_secs=5) global_step = 0 for epoch in range(epochs): # reset data iterator and metric at the beginning of each epoch. metric.reset() for i, (data, label) in enumerate(train_data): # Copy data to ctx if necessary data = data.as_in_context(ctx) label = label.as_in_context(ctx) # Start recording computation graph with record() section. # Recorded graphs can then be differentiated with backward. with autograd.record(): output = net(data) L = loss(output, label) sw.add_scalar(tag='cross_entropy', value=L.mean().asscalar(), global_step=global_step) global_step += 1 L.backward() # take a gradient step with batch_size equal to data.shape[0] trainer.step(data.shape[0]) # update metric at last. metric.update([label], [output]) if i % opt.log_interval == 0 and i > 0: name, train_acc = metric.get() print('[Epoch %d Batch %d] Training: %s=%f' % (epoch, i, name, train_acc)) # Log the first batch of images of each epoch if i == 0: sw.add_image('mnist_first_minibatch', data.reshape((opt.batch_size, 1, 28, 28)), epoch) if epoch == 0: sw.add_graph(net) grads = [i.grad() for i in net.collect_params().values()] assert len(grads) == len(param_names) # logging the gradients of parameters for checking convergence for i, name in enumerate(param_names): sw.add_histogram(tag=name, values=grads[i], global_step=epoch, bins=1000) name, train_acc = metric.get() print('[Epoch %d] Training: %s=%f' % (epoch, name, train_acc)) # logging training accuracy sw.add_scalar(tag='accuracy_curves', value=('train_acc', train_acc), global_step=epoch) name, val_acc = test(ctx) print('[Epoch %d] Validation: %s=%f' % (epoch, name, val_acc)) # logging the validation accuracy sw.add_scalar(tag='accuracy_curves', value=('valid_acc', val_acc), global_step=epoch) sw.export_scalars('scalar_dict.json') sw.close()
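# Hedged sketch of the grouped-curve logging used above: mxboard's add_scalar accepts
# a (name, value) pair, so several curves (e.g. train and validation accuracy) share
# one tag and are drawn in the same plot (values illustrative).
from mxboard import SummaryWriter

sw = SummaryWriter(logdir='./logs/demo_curves', flush_secs=5)
for step in range(3):
    sw.add_scalar(tag='accuracy_curves', value=('train_acc', 0.80 + 0.05 * step), global_step=step)
    sw.add_scalar(tag='accuracy_curves', value=('valid_acc', 0.70 + 0.05 * step), global_step=step)
sw.export_scalars('scalar_dict.json')  # dump all logged scalars to JSON, as above
sw.close()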
class BaseTrainer: def __init__(self, config, model, criterion, ctx): config['trainer']['output_dir'] = os.path.join( str(pathlib.Path(os.path.abspath(__name__)).parent), config['trainer']['output_dir']) config['name'] = config['name'] + '_' + model.model_name self.save_dir = os.path.join(config['trainer']['output_dir'], config['name']) self.checkpoint_dir = os.path.join(self.save_dir, 'checkpoint') if config['trainer']['resume_checkpoint'] == '' and config['trainer'][ 'finetune_checkpoint'] == '': shutil.rmtree(self.save_dir, ignore_errors=True) if not os.path.exists(self.checkpoint_dir): os.makedirs(self.checkpoint_dir) # save this experiment's alphabet alongside the saved models np.save(os.path.join(self.save_dir, 'alphabet.npy'), config['data_loader']['args']['dataset']['alphabet']) self.global_step = 0 self.start_epoch = 1 self.config = config self.model = model self.criterion = criterion # logger and tensorboard self.tensorboard_enable = self.config['trainer']['tensorboard'] self.epochs = self.config['trainer']['epochs'] self.display_interval = self.config['trainer']['display_interval'] if self.tensorboard_enable: from mxboard import SummaryWriter self.writer = SummaryWriter(self.save_dir, verbose=False) self.logger = setup_logger(os.path.join(self.save_dir, 'train_log')) self.logger.info(pformat(self.config)) self.logger.info(self.model) # device set self.ctx = ctx mx.random.seed(2) # set the random seed self.logger.info('train with mxnet: {} and device: {}'.format( mx.__version__, self.ctx)) self.metrics = { 'val_acc': 0, 'train_loss': float('inf'), 'best_model': '' } schedule = self._initialize('lr_scheduler', mx.lr_scheduler) optimizer = self._initialize('optimizer', mx.optimizer, lr_scheduler=schedule) self.trainer = gluon.Trainer(self.model.collect_params(), optimizer=optimizer) if self.config['trainer']['resume_checkpoint'] != '': self._load_checkpoint(self.config['trainer']['resume_checkpoint'], resume=True) elif self.config['trainer']['finetune_checkpoint'] != '': self._load_checkpoint( self.config['trainer']['finetune_checkpoint'], resume=False) if self.tensorboard_enable: try: # add graph from mxnet.gluon import utils as gutils dummy_input = gutils.split_and_load( nd.zeros(( 1, self.config['data_loader']['args']['dataset'] ['img_channel'], self.config['data_loader']['args']['dataset']['img_h'], self.config['data_loader']['args']['dataset']['img_w'] )), ctx) self.model(dummy_input[0]) self.writer.add_graph(model) except Exception: self.logger.error(traceback.format_exc()) self.logger.warning('add graph to tensorboard failed') def train(self): """ Full training logic """ try: for epoch in range(self.start_epoch, self.epochs + 1): self.epoch_result = self._train_epoch(epoch) self._on_epoch_finish() except Exception: self.logger.error(traceback.format_exc()) if self.tensorboard_enable: self.writer.close() self._on_train_finish() def _train_epoch(self, epoch): """ Training logic for an epoch :param epoch: Current epoch number """ raise NotImplementedError def _eval(self): """ eval logic for an epoch :param epoch: Current epoch number """ raise NotImplementedError def _on_epoch_finish(self): raise NotImplementedError def _on_train_finish(self): raise NotImplementedError def _save_checkpoint(self, epoch, file_name, save_best=False): """ Save the model and checkpoint info: model weights, trainer state and other metadata :param epoch: current epoch :param file_name: file name :param save_best: whether this is the best model so far :return: """ # save the weights params_filename = os.path.join(self.checkpoint_dir, file_name) self.model.save_parameters(params_filename) # save the trainer state trainer_filename = params_filename.replace('.params', '.train_states') 
self.trainer.save_states(trainer_filename) # other checkpoint metadata state = { 'epoch': epoch, 'global_step': self.global_step, 'config': self.config, 'metrics': self.metrics } other_filename = params_filename.replace('.params', '.info') pickle.dump(state, open(other_filename, 'wb')) if save_best: shutil.copy(params_filename, os.path.join(self.checkpoint_dir, 'model_best.params')) shutil.copy( trainer_filename, os.path.join(self.checkpoint_dir, 'model_best.train_states')) shutil.copy(other_filename, os.path.join(self.checkpoint_dir, 'model_best.info')) self.logger.info("Saving current best: {}".format( os.path.join(self.checkpoint_dir, 'model_best.params'))) else: self.logger.info("Saving checkpoint: {}".format(params_filename)) def _load_checkpoint(self, checkpoint_path, resume): """ Load the model from a checkpoint: model weights, trainer state and other metadata :param checkpoint_path: path to the checkpoint :param resume: whether to also restore the trainer state and training progress :return: """ self.logger.info("Loading checkpoint: {} ...".format(checkpoint_path)) # load the model parameters self.model.load_parameters(checkpoint_path, ctx=self.ctx, ignore_extra=True, allow_missing=True) if resume: # load the trainer state trainer_filename = checkpoint_path.replace('.params', '.train_states') if os.path.exists(trainer_filename): self.trainer.load_states(trainer_filename) # load the other checkpoint metadata other_filename = checkpoint_path.replace('.params', '.info') checkpoint = pickle.load(open(other_filename, 'rb')) self.start_epoch = checkpoint['epoch'] + 1 self.global_step = checkpoint['global_step'] self.metrics = checkpoint['metrics'] self.logger.info("resume from checkpoint {} (epoch {})".format( checkpoint_path, self.start_epoch)) else: self.logger.info( "finetune from checkpoint {}".format(checkpoint_path)) def _initialize(self, name, module, *args, **kwargs): module_name = self.config[name]['type'] module_args = self.config[name]['args'] assert all([ k not in module_args for k in kwargs ]), 'Overwriting kwargs given in config file is not allowed' module_args.update(kwargs) return getattr(module, module_name)(*args, **module_args)
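# Hedged sketch of the reflection pattern behind _initialize above: the config names
# a class inside a module, and getattr instantiates it with the configured kwargs
# (config values illustrative).
import mxnet as mx

config = {'lr_scheduler': {'type': 'FactorScheduler', 'args': {'step': 1000, 'factor': 0.9}}}

def initialize(name, module, **extra):
    spec = config[name]
    assert all(k not in spec['args'] for k in extra), 'config kwargs must not be overwritten'
    kwargs = dict(spec['args'], **extra)
    return getattr(module, spec['type'])(**kwargs)

schedule = initialize('lr_scheduler', mx.lr_scheduler)
print(schedule.factor)   # 0.9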
def train_net(args, ctx, pretrained, pretrained_flow, epoch, prefix, begin_epoch, end_epoch, lr, lr_step): sw = SummaryWriter(logdir=config.output_path, flush_secs=5) logger, final_output_path = create_logger(config.output_path, args.cfg, config.dataset.image_set) prefix = os.path.join(final_output_path, prefix) # load symbol shutil.copy2(os.path.join(curr_path, 'symbols', config.symbol + '.py'), final_output_path) sym_instance = eval(config.symbol + '.' + config.symbol)() sym = sym_instance.get_train_symbol(config) sw.add_graph(sym) feat_sym = sym.get_internals()['rpn_cls_score_output'] # setup multi-gpu batch_size = len(ctx) input_batch_size = config.TRAIN.BATCH_IMAGES * batch_size # print config pprint.pprint(config) logger.info('training config:{}\n'.format(pprint.pformat(config))) # load dataset and prepare imdb for training image_sets = [iset for iset in config.dataset.val_image_set.split('+')] roidbs = [load_gt_roidb(config.dataset.dataset, image_set, config.dataset.root_path, config.dataset.dataset_path, flip=config.TRAIN.FLIP) for image_set in image_sets] roidb = merge_roidb(roidbs) roidb = filter_roidb(roidb, config) # load training data train_data = AnchorLoader(feat_sym, roidb, config, batch_size=input_batch_size, shuffle=config.TRAIN.SHUFFLE, ctx=ctx, feat_stride=config.network.RPN_FEAT_STRIDE, anchor_scales=config.network.ANCHOR_SCALES, anchor_ratios=config.network.ANCHOR_RATIOS, aspect_grouping=config.TRAIN.ASPECT_GROUPING, normalize_target=config.network.NORMALIZE_RPN, bbox_mean=config.network.ANCHOR_MEANS, bbox_std=config.network.ANCHOR_STDS) roidbs_eval = [ load_gt_roidb(config.dataset.dataset, image_set, config.dataset.root_path, config.dataset.dataset_path, flip=False) for image_set in image_sets] roidb_eval = merge_roidb(roidbs_eval) # need? 
roidb_eval = filter_roidb(roidb_eval, config) eval_data = AnchorLoader(feat_sym, roidb_eval, config, batch_size=input_batch_size, shuffle=config.TRAIN.SHUFFLE, ctx=ctx, feat_stride=config.network.RPN_FEAT_STRIDE, anchor_scales=config.network.ANCHOR_SCALES, anchor_ratios=config.network.ANCHOR_RATIOS, aspect_grouping=config.TRAIN.ASPECT_GROUPING, normalize_target=config.network.NORMALIZE_RPN, bbox_mean=config.network.ANCHOR_MEANS, bbox_std=config.network.ANCHOR_STDS) # infer max shape max_data_shape = [('data', ( config.TRAIN.BATCH_IMAGES, 3, max([v[0] for v in config.SCALES]), max([v[1] for v in config.SCALES]))), ('data_ref', (config.TRAIN.BATCH_IMAGES, 3, max([v[0] for v in config.SCALES]), max([v[1] for v in config.SCALES]))), ('eq_flag', (1,))] max_data_shape, max_label_shape = train_data.infer_shape(max_data_shape) max_data_shape.append(('gt_boxes', (config.TRAIN.BATCH_IMAGES, 100, 5))) print('providing maximum shape', max_data_shape, max_label_shape) data_shape_dict = dict(train_data.provide_data_single + train_data.provide_label_single) pprint.pprint(data_shape_dict) sym_instance.infer_shape(data_shape_dict) # load and initialize params if config.TRAIN.RESUME: print('continue training from ', begin_epoch) arg_params, aux_params = load_param(prefix, begin_epoch, convert=True) else: arg_params, aux_params = load_param(pretrained, epoch, convert=True) arg_params_flow, aux_params_flow = load_param(pretrained_flow, epoch, convert=True) arg_params.update(arg_params_flow) aux_params.update(aux_params_flow) sym_instance.init_weight(config, arg_params, aux_params) # check parameter shapes sym_instance.check_parameter_shapes(arg_params, aux_params, data_shape_dict) # create solver fixed_param_prefix = config.network.FIXED_PARAMS data_names = [k[0] for k in train_data.provide_data_single] label_names = [k[0] for k in train_data.provide_label_single] mod = MutableModule(sym, data_names=data_names, label_names=label_names, logger=logger, context=ctx, max_data_shapes=[max_data_shape for _ in range(batch_size)], max_label_shapes=[max_label_shape for _ in range(batch_size)], fixed_param_prefix=fixed_param_prefix) if config.TRAIN.RESUME: mod._preload_opt_states = '%s-%04d.states' % (prefix, begin_epoch) # decide training params # metric rpn_eval_metric = metric.RPNAccMetric() rpn_cls_metric = metric.RPNLogLossMetric() rpn_bbox_metric = metric.RPNL1LossMetric() eval_metric = metric.RCNNAccMetric(config) cls_metric = metric.RCNNLogLossMetric(config) bbox_metric = metric.RCNNL1LossMetric(config) eval_metrics = mx.metric.CompositeEvalMetric() # rpn_eval_metric, rpn_cls_metric, rpn_bbox_metric, eval_metric, cls_metric, bbox_metric for child_metric in [rpn_eval_metric, rpn_cls_metric, rpn_bbox_metric, eval_metric, cls_metric, bbox_metric]: eval_metrics.add(child_metric) # callback batch_end_callback = [callback.Speedometer(train_data.batch_size, frequent=args.frequent, sw=sw), callback.SummaryMetric(sw, frequent=args.frequent, prefix='train')] eval_end_callback = callback.SummaryValMetric(sw, prefix='val') means = np.tile(np.array(config.TRAIN.BBOX_MEANS), 2 if config.CLASS_AGNOSTIC else config.dataset.NUM_CLASSES) stds = np.tile(np.array(config.TRAIN.BBOX_STDS), 2 if config.CLASS_AGNOSTIC else config.dataset.NUM_CLASSES) epoch_end_callback = [mx.callback.module_checkpoint(mod, prefix, period=1, save_optimizer_states=True), callback.do_checkpoint(prefix, means, stds)] # decide learning rate base_lr = lr lr_factor = config.TRAIN.lr_factor lr_epoch = [float(epoch) for epoch in lr_step.split(',')] 
lr_epoch_diff = [epoch - begin_epoch for epoch in lr_epoch if epoch > begin_epoch] lr = base_lr * (lr_factor ** (len(lr_epoch) - len(lr_epoch_diff))) lr_iters = [int(epoch * len(roidb) / batch_size) for epoch in lr_epoch_diff] print('lr', lr, 'lr_epoch_diff', lr_epoch_diff, 'lr_iters', lr_iters) lr_scheduler = WarmupMultiFactorScheduler(lr_iters, lr_factor, config.TRAIN.warmup, config.TRAIN.warmup_lr, config.TRAIN.warmup_step, sw=sw) # optimizer optimizer_params = {'momentum': config.TRAIN.momentum, 'wd': config.TRAIN.wd, 'learning_rate': lr, 'lr_scheduler': lr_scheduler, 'rescale_grad': 1.0, 'clip_gradient': None} if not isinstance(train_data, PrefetchingIter): train_data = PrefetchingIter(train_data) if not isinstance(eval_data, PrefetchingIter): eval_data = PrefetchingIter(eval_data) # train mod.fit(train_data, eval_data=None, eval_metric=eval_metrics, epoch_end_callback=epoch_end_callback, batch_end_callback=batch_end_callback, eval_end_callback=eval_end_callback, kvstore=config.default.kvstore, optimizer='sgd', optimizer_params=optimizer_params,eval_num_batch=config.TEST.EVAL_NUM_BATCH, arg_params=arg_params, aux_params=aux_params, begin_epoch=begin_epoch, num_epoch=end_epoch) sw.close()
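# Hedged sketch of the epoch-to-iteration LR arithmetic above, with made-up numbers:
# boundaries already passed are folded into the base LR, and the remaining epoch
# boundaries are converted to iteration counts for the scheduler.
base_lr, lr_factor, begin_epoch = 0.01, 0.1, 4
lr_epoch = [3.0, 8.0]                    # epochs at which to rescale the LR
num_images, batch_size = 1000, 4

lr_epoch_diff = [e - begin_epoch for e in lr_epoch if e > begin_epoch]  # [4.0]
lr = base_lr * lr_factor ** (len(lr_epoch) - len(lr_epoch_diff))        # one step already applied
lr_iters = [int(e * num_images / batch_size) for e in lr_epoch_diff]    # boundaries in iterations
print(lr, lr_iters)   # 0.001 [1000]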
def train(): if config.restart_training: shutil.rmtree(config.output_dir, ignore_errors=True) if config.output_dir is None: config.output_dir = 'output' if not os.path.exists(config.output_dir): os.makedirs(config.output_dir) logger = setup_logger(os.path.join(config.output_dir, 'train_log')) logger.info('train with gpu %s and mxnet %s' % (config.gpu_id, mx.__version__)) ctx = mx.gpu(config.gpu_id) # set the random seeds mx.random.seed(2) mx.random.seed(2, ctx=ctx) train_transforms = transforms.Compose( [transforms.RandomBrightness(0.5), transforms.ToTensor()]) train_dataset = ImageDataset(config.trainfile, (config.img_h, config.img_w), 3, 80, config.alphabet, phase='train') train_data_loader = DataLoader( train_dataset.transform_first(train_transforms), config.train_batch_size, shuffle=True, last_batch='keep', num_workers=config.workers) test_dataset = ImageDataset(config.testfile, (config.img_h, config.img_w), 3, 80, config.alphabet, phase='test') test_data_loader = DataLoader(test_dataset.transform_first( transforms.ToTensor()), config.eval_batch_size, shuffle=True, last_batch='keep', num_workers=config.workers) net = CRNN(len(config.alphabet), hidden_size=config.nh) net.hybridize() if not config.restart_training and config.checkpoint != '': logger.info('load pretrained net from {}'.format(config.checkpoint)) net.load_parameters(config.checkpoint, ctx=ctx) else: net.initialize(ctx=ctx) criterion = gluon.loss.CTCLoss() all_step = len(train_data_loader) logger.info('each epoch contains {} steps'.format(all_step)) schedule = mx.lr_scheduler.FactorScheduler(step=config.lr_decay_step * all_step, factor=config.lr_decay, stop_factor_lr=config.end_lr) # schedule = mx.lr_scheduler.MultiFactorScheduler(step=[15 * all_step, 30 * all_step, 60 * all_step,80 * all_step], # factor=0.1) adam_optimizer = mx.optimizer.Adam(learning_rate=config.lr, lr_scheduler=schedule) trainer = gluon.Trainer(net.collect_params(), optimizer=adam_optimizer) sw = SummaryWriter(logdir=config.output_dir) for epoch in range(config.start_epoch, config.end_epoch): loss = .0 train_acc = .0 tick = time.time() cur_step = 0 for i, (data, label) in enumerate(train_data_loader): data = data.as_in_context(ctx) label = label.as_in_context(ctx) with autograd.record(): output = net(data) loss_ctc = criterion(output, label) loss_ctc.backward() trainer.step(data.shape[0]) loss_c = loss_ctc.mean() cur_step = epoch * all_step + i sw.add_scalar(tag='ctc_loss', value=loss_c.asscalar(), global_step=cur_step // 2) sw.add_scalar(tag='lr', value=trainer.learning_rate, global_step=cur_step // 2) loss += loss_c acc = accuracy(output, label, config.alphabet) train_acc += acc if (i + 1) % config.display_interval == 0: acc /= len(label) sw.add_scalar(tag='train_acc', value=acc, global_step=cur_step) batch_time = time.time() - tick logger.info( '[{}/{}], [{}/{}],step: {}, Speed: {:.3f} samples/sec, ctc loss: {:.4f},acc: {:.4f}, lr:{},' ' time:{:.4f} s'.format( epoch, config.end_epoch, i, all_step, cur_step, config.display_interval * config.train_batch_size / batch_time, loss.asscalar() / config.display_interval, acc, trainer.learning_rate, batch_time)) loss = .0 tick = time.time() nd.waitall() if epoch == 0: sw.add_graph(net) logger.info('start val ....') train_acc /= train_dataset.__len__() validation_accuracy = evaluate_accuracy( net, test_data_loader, ctx, config.alphabet) / test_dataset.__len__() sw.add_scalar(tag='val_acc', value=validation_accuracy, global_step=cur_step) logger.info("Epoch {},train_acc {:.4f}, val_acc {:.4f}".format( epoch, train_acc, 
validation_accuracy)) net.save_parameters("{}/{}_{:.4f}_{:.4f}.params".format( config.output_dir, epoch, train_acc, validation_accuracy)) sw.close()
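# Hedged sketch of greedy CTC decoding for a CRNN like the one above: take the argmax
# per time step, collapse repeats, and drop the blank (assumed to be index 0 here;
# the accuracy()/alphabet handling above may differ, including the output layout).
import mxnet as mx

def ctc_greedy_decode(output, alphabet, blank=0):
    # output: (seq_len, batch, num_classes) scores; adjust if your layout is (N, T, C)
    best = output.argmax(axis=2).asnumpy().astype(int)
    results = []
    for b in range(best.shape[1]):
        chars, prev = [], -1
        for t in best[:, b]:
            if t != prev and t != blank:
                chars.append(alphabet[t - 1])  # alphabet indexed from label 1
            prev = t
        results.append(''.join(chars))
    return results

scores = mx.nd.random.uniform(shape=(10, 2, 5))  # fake (T, N, C) network output
print(ctc_greedy_decode(scores, alphabet='abcd'))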
def mytrain(net,num_classes,train_data,valid_data,ctx,start_epoch, end_epoch, \ arm_cls_loss=arm_cls_loss,cls_loss=cls_loss,box_loss=box_loss,trainer=None): if trainer is None: # trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.01,'momentum':0.9, 'wd':50.0}) trainer = gluon.Trainer(net.collect_params(), 'adam', { 'learning_rate': 0.001, 'clip_gradient': 2.0 }) # trainer = gluon.Trainer(net.collect_params(), 'adam', {'learning_rate': 0.003}) box_metric = metric.MAE() ## add visualization # collect parameter names for logging the gradients of parameters in each epoch params = net.collect_params() # param_names = params.keys() # define a summary writer that logs data and flushes to the file every 5 seconds sw = SummaryWriter(logdir='./logs', flush_secs=5) global_step = 0 for e in range(start_epoch, end_epoch): # print(e) train_data.reset() valid_data.reset() box_metric.reset() tic = time.time() _loss = [0, 0] arm_loss = [0, 0] # if e == 6 or e == 100: # trainer.set_learning_rate(trainer.learning_rate * 0.2) outs, labels = None, None for i, batch in enumerate(train_data): # print('----- batch {} start ----'.format(i)) data = batch.data[0].as_in_context(ctx) label = batch.label[0].as_in_context(ctx) # print('label shape: ',label.shape) with autograd.record(): # 1. generate results from the extraction network ssd_layers = net(data) arm_loc_preds, arm_cls_preds, arm_anchor_boxes, odm_loc_preds, odm_cls_preds = multibox_layer(ssd_layers,\ num_classes,sizes,ratios,normalizations) # arm_loc_preds, arm_cls_preds, arm_anchor_boxes, odm_loc_preds, odm_cls_preds = net(data) # print('---------1111-----------') # 2. ARM predict ## 2.1 modify label as [-1,0,..] label_arm = nd.Custom(label, op_type='modify_label') arm_tmp = MultiBoxTarget(arm_anchor_boxes,label_arm,arm_cls_preds,overlap_threshold=.5,\ negative_mining_ratio=3,negative_mining_thresh=.5) arm_loc_target = arm_tmp[0] # box offset arm_loc_target_mask = arm_tmp[1] # box mask (only 0,1) arm_cls_target = arm_tmp[2] # every anchor's idx # print(sum(arm_cls_target[0])) # print('---------2222-----------') # 3. 
ODM predict ## 3.1 refine anchor generator originate in ARM odm_anchor_boxes = refine_anchor_generator( arm_anchor_boxes, arm_loc_preds) #(batch,h*w*num_anchors[:layers],4) # ### debug backward err # odm_anchor_boxes = arm_anchor_boxes odm_anchor_boxes_bs = nd.split( data=odm_anchor_boxes, axis=0, num_outputs=label.shape[0]) # list # print('---3 : odm_anchor_boxes_bs shape : {}'.format(odm_anchor_boxes_bs[0].shape)) # print('---------3333-----------') ## 3.2 对当前所有batch的data计算 Target (多个gpu使用) odm_loc_target = [] odm_loc_target_mask = [] odm_cls_target = [] label_bs = nd.split(data=label, axis=0, num_outputs=label.shape[0]) odm_cls_preds_bs = nd.split(data=odm_cls_preds, axis=0, num_outputs=label.shape[0]) # print('---4 : odm_cls_preds_bs shape: {}'.format(odm_cls_preds_bs[0].shape)) # print('---4 : label_bs shape: {}'.format(label_bs[0].shape)) for j in range(label.shape[0]): if label.shape[0] == 1: odm_tmp = MultiBoxTarget(odm_anchor_boxes_bs[j].expand_dims(axis=0),label_bs[j].expand_dims(axis=0),\ odm_cls_preds_bs[j].expand_dims(axis=0),overlap_threshold=.5,negative_mining_ratio=2,negative_mining_thresh=.5) ## 多个batch else: odm_tmp = MultiBoxTarget(odm_anchor_boxes_bs[j],label_bs[j],\ odm_cls_preds_bs[j],overlap_threshold=.5,negative_mining_ratio=3,negative_mining_thresh=.5) odm_loc_target.append(odm_tmp[0]) odm_loc_target_mask.append(odm_tmp[1]) odm_cls_target.append(odm_tmp[2]) ### concat ,上面为什么会单独计算每张图,odm包含了batch,so需要拆 odm_loc_target = nd.concat(*odm_loc_target, dim=0) odm_loc_target_mask = nd.concat(*odm_loc_target_mask, dim=0) odm_cls_target = nd.concat(*odm_cls_target, dim=0) # 4. negitave filter group = nd.Custom(arm_cls_preds, odm_cls_target, odm_loc_target_mask, op_type='negative_filtering') odm_cls_target = group[0] #用ARM中的cls过滤后的odm_cls odm_loc_target_mask = group[1] #过滤掉的mask为0 # print('---------4444-----------') # 5. 
calc loss # TODO:add 1/N_arm, 1/N_odm (num of positive anchors) # arm_cls_loss = gluon.loss.SoftmaxCrossEntropyLoss() arm_loss_cls = arm_cls_loss(arm_cls_preds.transpose((0, 2, 1)), arm_cls_target) arm_loss_loc = box_loss(arm_loc_preds, arm_loc_target, arm_loc_target_mask) # print('55555 loss-> arm_loss_cls : {} arm_loss_loc {}'.format(arm_loss_cls.shape,arm_loss_loc.shape)) # print('arm_loss_cls loss : {}'.format(arm_loss_cls)) # odm_cls_prob = nd.softmax(odm_cls_preds,axis=2) tmp = odm_cls_preds.transpose((0, 2, 1)) odm_loss_cls = cls_loss(odm_cls_preds.transpose((0, 2, 1)), odm_cls_target) odm_loss_loc = box_loss(odm_loc_preds, odm_loc_target, odm_loc_target_mask) # print('66666 loss-> odm_loss_cls : {} odm_loss_loc {}'.format(odm_loss_cls.shape,odm_loss_loc.shape)) # print('odm_loss_cls loss :{} '.format(odm_loss_cls)) # print('odm_loss_loc loss :{} '.format(odm_loss_loc)) # print('N_arm: {} ; N_odm: {} '.format(nd.sum(arm_loc_target_mask,axis=1)/4.0,nd.sum(odm_loc_target_mask,axis=1)/4.0)) # loss = arm_loss_cls+arm_loss_loc+odm_loss_cls+odm_loss_loc loss = 1/(nd.sum(arm_loc_target_mask,axis=1)/4.0) *(arm_loss_cls+arm_loss_loc) + \ 1/(nd.sum(odm_loc_target_mask,axis=1)/4.0)*(odm_loss_cls+odm_loss_loc) sw.add_scalar(tag='loss', value=loss.mean().asscalar(), global_step=global_step) global_step += 1 loss.backward(retain_graph=False) # autograd.backward(loss) # print(net.collect_params().get('conv4_3_weight').data()) # print(net.collect_params().get('vgg0_conv9_weight').grad()) ### 单独测试梯度 # arm_loss_cls.backward(retain_graph=False) # arm_loss_loc.backward(retain_graph=False) # odm_loss_cls.backward(retain_graph=False) # odm_loss_loc.backward(retain_graph=False) trainer.step(data.shape[0]) _loss[0] += nd.mean(odm_loss_cls).asscalar() _loss[1] += nd.mean(odm_loss_loc).asscalar() arm_loss[0] += nd.mean(arm_loss_cls).asscalar() arm_loss[1] += nd.mean(arm_loss_loc).asscalar() # print(arm_loss) arm_cls_prob = nd.SoftmaxActivation(arm_cls_preds, mode='channel') odm_cls_prob = nd.SoftmaxActivation(odm_cls_preds, mode='channel') out = MultiBoxDetection(odm_cls_prob,odm_loc_preds,odm_anchor_boxes,\ force_suppress=True,clip=False,nms_threshold=.5,nms_topk=400) # print('out shape: {}'.format(out.shape)) if outs is None: outs = out labels = label else: outs = nd.concat(outs, out, dim=0) labels = nd.concat(labels, label, dim=0) box_metric.update([odm_loc_target], [odm_loc_preds * odm_loc_target_mask]) print('-------{} epoch end ------'.format(e)) train_AP = evaluate_MAP(outs, labels) valid_AP, val_box_metric = evaluate_acc(net, valid_data, ctx) info["train_ap"].append(train_AP) info["valid_ap"].append(valid_AP) info["loss"].append(_loss) print('odm loss: ', _loss) print('arm loss: ', arm_loss) if e == 0: sw.add_graph(net) # grads = [i.grad() for i in net.collect_params().values()] # grads_4_3 = net.collect_params().get('vgg0_conv9_weight').grad() # sw.add_histogram(tag ='vgg0_conv9_weight',values=grads_4_3,global_step=e, bins=1000 ) grads_4_2 = net.collect_params().get('vgg0_conv5_weight').grad() sw.add_histogram(tag='vgg0_conv5_weight', values=grads_4_2, global_step=e, bins=1000) # assert len(grads) == len(param_names) # logging the gradients of parameters for checking convergence # for i, name in enumerate(param_names): # sw.add_histogram(tag=name, values=grads[i], global_step=e, bins=1000) # net.export('./Model/RefineDet_MeterDetect') # net if (e + 1) % 5 == 0: print( "epoch: %d time: %.2f cls loss: %.4f,reg loss: %.4f lr: %.5f" % (e, time.time() - tic, _loss[0], _loss[1], trainer.learning_rate)) 
print("train mae: %.4f AP: %.4f" % (box_metric.get()[1], train_AP)) print("valid mae: %.4f AP: %.4f" % (val_box_metric.get()[1], valid_AP)) sw.add_scalar(tag='train_AP', value=train_AP, global_step=e) sw.add_scalar(tag='valid_AP', value=valid_AP, global_step=e) sw.close() if True: info["loss"] = np.array(info["loss"]) info["cls_loss"] = info["loss"][:, 0] info["box_loss"] = info["loss"][:, 1] plt.figure(figsize=(12, 4)) plt.subplot(121) plot("train_ap") plot("valid_ap") plt.legend(loc="upper right") plt.subplot(122) plot("cls_loss") plot("box_loss") plt.legend(loc="upper right") plt.savefig('loss_curve.png')
def train():
    image_pool = ImagePool(pool_size)
    metric = mx.metric.CustomMetric(facc)

    stamp = datetime.now().strftime('%Y_%m_%d-%H_%M')
    logging.basicConfig(level=logging.DEBUG)

    # define a summary writer that logs data and flushes to the file every 5 seconds
    sw = SummaryWriter(logdir='./logs_', flush_secs=5)
    global_step = 0
    # paramsG = netG.collect_params()
    # param_namesG = paramsG.keys()
    # paramsD = netD.collect_params()
    # param_namesD = paramsD.keys()

    for epoch in range(epochs):
        if epoch == 0:
            netG.hybridize()
            netD.hybridize()

        tic = time.time()
        btic = time.time()
        train_data.reset()
        iter_num = 0
        for local_step, batch in enumerate(train_data):
            ############################
            # (1) Update D network: maximize log(D(x, y)) + log(1 - D(x, G(x, z)))
            ############################
            real_in = batch.data[0].as_in_context(ctx)
            real_out = batch.data[1].as_in_context(ctx)

            fake_out = netG(real_in)
            fake_concat = image_pool.query(nd.concat(real_in, fake_out, dim=1))
            with autograd.record():
                # Train with fake image
                # Use image pooling to utilize history images
                output = netD(fake_concat)
                fake_label = nd.zeros(output.shape, ctx=ctx)
                errD_fake = GAN_loss(output, fake_label)
                metric.update([fake_label], [output])

                # Train with real image
                real_concat = nd.concat(real_in, real_out, dim=1)
                output = netD(real_concat)
                real_label = nd.ones(output.shape, ctx=ctx)
                errD_real = GAN_loss(output, real_label)
                errD = (errD_real + errD_fake) * 0.5
                errD.backward()
                metric.update([real_label], [output])

            trainerD.step(batch.data[0].shape[0])
            if epoch == 0 and local_step == 0:
                # add the graph once, after the first forward pass has built the
                # symbol (the original called add_graph every batch, which is
                # redundant and expensive)
                sw.add_graph(netG)

            ############################
            # (2) Update G network: maximize log(D(x, G(x, z))) - lambda1 * L1(y, G(x, z))
            ############################
            with autograd.record():
                fake_out = netG(real_in)
                fake_concat = nd.concat(real_in, fake_out, dim=1)
                output = netD(fake_concat)
                real_label = nd.ones(output.shape, ctx=ctx)
                errG = GAN_loss(output, real_label) + L1_loss(real_out, fake_out) * lambda1
                errG.backward()
            trainerG.step(batch.data[0].shape[0])

            sw.add_scalar(tag='loss', value=('d_loss', errD.mean().asscalar()),
                          global_step=global_step)
            sw.add_scalar(tag='loss', value=('g_loss', errG.mean().asscalar()),
                          global_step=global_step)
            global_step += 1

            # Log the first batch of images of each epoch
            if local_step == 0:
                # map the generator output from [-1, 1] to [0, 1] for display
                fake_out = ((fake_out + 1) * 127.5) / 255
                sw.add_image('minist_first_minibatch', fake_out, epoch)

            if iter_num % 10 == 0:
                name, acc = metric.get()
                logging.info('speed: {} samples/s'.format(batch_size / (time.time() - btic)))
                logging.info(
                    'discriminator loss = %f, generator loss = %f, binary training acc = %f at iter %d epoch %d'
                    % (nd.mean(errD).asscalar(), nd.mean(errG).asscalar(), acc, iter_num, epoch))
            iter_num += 1
            btic = time.time()

        sw.add_scalar(tag='binary_training_acc', value=('acc', acc), global_step=epoch)
        # gradsG = [i.grad() for i in netG.collect_params().values()]
        # gradsD = [i.grad() for i in netD.collect_params().values()]
        # # logging the gradients of parameters for checking convergence
        # for i, name in enumerate(param_namesG):
        #     sw.add_histogram(tag=name + 'G', values=gradsG[i], global_step=epoch, bins=1000)
        # for i, name in enumerate(param_namesD):
        #     sw.add_histogram(tag=name + 'D', values=gradsD[i], global_step=epoch, bins=1000)

        name, acc = metric.get()
        metric.reset()
        logging.info('\nbinary training acc at epoch %d: %s=%f' % (epoch, name, acc))
        logging.info('time: %f' % (time.time() - tic))
        # # Visualize one generated image for each epoch
        # fake_img = fake_out[0]
        # visualize(fake_img)
        # plt.show()
    sw.export_scalars('scalar_dict.json')
    sw.close()
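# export_scalars() is part of the mxboard SummaryWriter API: it dumps the scalars
# logged through this writer into a JSON file, so the curves can be inspected
# without launching TensorBoard. A minimal sketch of reading the file back (the
# exact key format depends on mxboard's export convention, so treat this as
# illustrative only):
import json

with open('scalar_dict.json') as f:
    scalars = json.load(f)
print(list(scalars.keys()))  # tags logged above, e.g. the 'loss' curves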