def main(): """Main function.""" # Parse arguments. args = parse_args() # Parse configurations. config = parse_config(args.config) config = update_config(config, args.options) config.work_dir = args.work_dir config.checkpoint = args.checkpoint config.launcher = args.launcher config.backend = args.backend if not os.path.isfile(config.checkpoint): raise FileNotFoundError(f'Checkpoint file `{config.checkpoint}` is ' f'missing!') # Set CUDNN. config.cudnn_benchmark = config.get('cudnn_benchmark', True) config.cudnn_deterministic = config.get('cudnn_deterministic', False) torch.backends.cudnn.benchmark = config.cudnn_benchmark torch.backends.cudnn.deterministic = config.cudnn_deterministic # Setting for launcher. config.is_distributed = True init_dist(config.launcher, backend=config.backend) config.num_gpus = dist.get_world_size() # Setup logger. if dist.get_rank() == 0: logger_type = config.get('logger_type', 'normal') logger = build_logger(logger_type, work_dir=config.work_dir) shutil.copy(args.config, os.path.join(config.work_dir, 'config.py')) commit_id = os.popen('git rev-parse HEAD').readline() logger.info(f'Commit ID: {commit_id}') else: logger = build_logger('dumb', work_dir=config.work_dir) # Start inference. runner = getattr(runners, config.runner_type)(config, logger) runner.load(filepath=config.checkpoint, running_metadata=False, learning_rate=False, optimizer=False, running_stats=False) if args.synthesis_num > 0: num = args.synthesis_num logger.print() logger.info(f'Synthesizing images ...') runner.synthesize(num, html_name=f'synthesis_{num}.html') logger.info(f'Finish synthesizing {num} images.') if args.fid_num > 0: num = args.fid_num logger.print() logger.info(f'Testing FID ...') fid_value = runner.fid(num, align_tf=not args.use_torchvision) logger.info(f'Finish testing FID on {num} samples. ' f'The result is {fid_value:.6f}.')
def main():
    """Main function."""
    # Parse arguments.
    args = parse_args()

    # Parse configurations.
    config = parse_config(args.config)
    os.environ['CUDA_VISIBLE_DEVICES'] = config.gpus

    # Build a versioned work directory from the checkpoint name and a timestamp.
    timestamp = datetime.datetime.now()
    version = '%d-%d-%d-%02d-%02d-%02d' % (
        timestamp.year, timestamp.month, timestamp.day,
        timestamp.hour, timestamp.minute, timestamp.second)
    config.work_dir = os.path.join(config.work_dir,
                                   config.checkpoint_path.split('/')[-3],
                                   version)

    # Setup logger.
    logger_type = config.get('logger_type', 'normal')
    logger = build_logger(logger_type, work_dir=config.work_dir)
    shutil.copy(args.config, os.path.join(config.work_dir, 'config.py'))
    commit_id = os.popen('git rev-parse HEAD').readline()
    logger.info(f'Commit ID: {commit_id}')

    # Run SeFa.
    runner = SefaRunner(config, logger)
    runner.run()
def main(): """Main function.""" # Parse arguments. args = parse_args() # Parse configurations. config = parse_config(args.config) config = update_config(config, args.options) os.environ['CUDA_VISIBLE_DEVICES'] = config.gpus timestamp = datetime.datetime.now() version = '%d-%d-%d-%02.0d-%02.0d-%02.0d' % \ (timestamp.year, timestamp.month, timestamp.day, timestamp.hour, timestamp.minute, timestamp.second) work_dir = os.path.join(args.work_dir, version) config.work_dir = work_dir config.resume_path = args.resume_path config.weight_path = args.weight_path config.seed = args.seed config.launcher = args.launcher config.backend = args.backend # Set CUDNN. config.cudnn_benchmark = config.get('cudnn_benchmark', True) config.cudnn_deterministic = config.get('cudnn_deterministic', False) torch.backends.cudnn.benchmark = config.cudnn_benchmark torch.backends.cudnn.deterministic = config.cudnn_deterministic # Set random seed. if config.seed is not None: random.seed(config.seed) np.random.seed(config.seed) torch.manual_seed(config.seed) config.cudnn_deterministic = True torch.backends.cudnn.deterministic = True warnings.warn('Random seed is set for training! ' 'This will turn on the CUDNN deterministic setting, ' 'which may slow down the training considerably! ' 'Unexpected behavior can be observed when resuming from ' 'checkpoints.') # Set launcher. config.is_distributed = True init_dist(config.launcher, backend=config.backend) config.num_gpus = dist.get_world_size() # Setup logger. if dist.get_rank() == 0: logger_type = config.get('logger_type', 'normal') logger = build_logger(logger_type, work_dir=config.work_dir) shutil.copy(args.config, os.path.join(config.work_dir, 'config.py')) commit_id = os.popen('git rev-parse HEAD').readline() logger.info(f'Commit ID: {commit_id}') else: logger = build_logger('dumb', work_dir=config.work_dir) # Start training. runner = getattr(runners, config.runner_type)(config, logger) if config.resume_path: runner.load(filepath=config.resume_path, running_metadata=True, learning_rate=True, optimizer=True, running_stats=False) if config.weight_path: runner.load(filepath=config.weight_path, running_metadata=False, learning_rate=False, optimizer=False, running_stats=False) runner.train()
def train(net, train_data, val_data, eval_metric, ctx, args):
    """Training pipeline"""
    net.collect_params().setattr('grad_req', 'null')
    net.collect_train_params().setattr('grad_req', 'write')
    trainer = gluon.Trainer(
        net.collect_train_params(),  # fix batchnorm, fix first stage, etc...
        'sgd',
        {'learning_rate': args.lr,
         'wd': args.wd,
         'momentum': args.momentum,
         'clip_gradient': 5})

    # lr decay policy
    lr_decay = float(args.lr_decay)
    lr_steps = sorted(
        [float(ls) for ls in args.lr_decay_epoch.split(',') if ls.strip()])
    lr_warmup = float(args.lr_warmup)  # avoid int division

    # losses
    rpn_cls_loss = mx.gluon.loss.SigmoidBinaryCrossEntropyLoss(
        from_sigmoid=False)
    rpn_box_loss = mx.gluon.loss.HuberLoss(rho=1 / 9.)  # == smoothl1
    rcnn_cls_loss = mx.gluon.loss.SoftmaxCrossEntropyLoss()
    rcnn_box_loss = mx.gluon.loss.HuberLoss()  # == smoothl1
    rcnn_mask_loss = mx.gluon.loss.SigmoidBinaryCrossEntropyLoss(
        from_sigmoid=False)
    metrics = [
        mx.metric.Loss('RPN_Conf'),
        mx.metric.Loss('RPN_SmoothL1'),
        mx.metric.Loss('RCNN_CrossEntropy'),
        mx.metric.Loss('RCNN_SmoothL1'),
        mx.metric.Loss('RCNN_Mask')
    ]

    rpn_acc_metric = RPNAccMetric()
    rpn_bbox_metric = RPNL1LossMetric()
    rcnn_acc_metric = RCNNAccMetric()
    rcnn_bbox_metric = RCNNL1LossMetric()
    rcnn_mask_metric = MaskAccMetric()
    rcnn_fgmask_metric = MaskFGAccMetric()
    metrics2 = [
        rpn_acc_metric, rpn_bbox_metric, rcnn_acc_metric, rcnn_bbox_metric,
        rcnn_mask_metric, rcnn_fgmask_metric
    ]

    # set up logger
    log_file_path = args.save_prefix + '_train.log'
    logger = build_logger(log_file_path)
    logger.info(args)
    if args.verbose:
        logger.info('Trainable parameters:')
        logger.info(net.collect_train_params().keys())
    logger.info('Start training from [Epoch {}]'.format(args.start_epoch))

    best_map = [0]
    for epoch in range(args.start_epoch, args.epochs):
        while lr_steps and epoch >= lr_steps[0]:
            new_lr = trainer.learning_rate * lr_decay
            lr_steps.pop(0)
            trainer.set_learning_rate(new_lr)
            logger.info("[Epoch {}] Set learning rate to {}".format(
                epoch, new_lr))
        for metric in metrics:
            metric.reset()
        tic = time.time()
        btic = time.time()
        if not args.disable_hybridization:
            net.hybridize(static_alloc=args.static_alloc)
        base_lr = trainer.learning_rate

        for i, batch in enumerate(train_data):
            if epoch == 0 and i <= lr_warmup:
                # adjust based on real percentage
                new_lr = base_lr * get_lr_at_iter(i / lr_warmup)
                if new_lr != trainer.learning_rate:
                    if i % args.log_interval == 0:
                        logger.info(
                            '[Epoch 0 Iteration {}] Set learning rate to {}'
                            .format(i, new_lr))
                    trainer.set_learning_rate(new_lr)
            batch = split_and_load(batch, ctx_list=ctx)
            batch_size = len(batch[0])
            losses = []
            metric_losses = [[] for _ in metrics]
            add_losses = [[] for _ in metrics2]
            with autograd.record():
                for (data, label, gt_mask, rpn_cls_targets, rpn_box_targets,
                     rpn_box_masks) in zip(*batch):
                    gt_label = label[:, :, 4:5]
                    gt_box = label[:, :, :4]
                    (cls_pred, box_pred, mask_pred, roi, samples, matches,
                     rpn_score, rpn_box, anchors) = net(data, gt_box)
                    # losses of rpn
                    rpn_score = rpn_score.squeeze(axis=-1)
                    num_rpn_pos = (rpn_cls_targets >= 0).sum()
                    rpn_loss1 = (rpn_cls_loss(rpn_score, rpn_cls_targets,
                                              rpn_cls_targets >= 0) *
                                 rpn_cls_targets.size / num_rpn_pos)
                    rpn_loss2 = (rpn_box_loss(rpn_box, rpn_box_targets,
                                              rpn_box_masks) *
                                 rpn_box.size / num_rpn_pos)
                    # rpn overall loss, use sum rather than average
                    rpn_loss = rpn_loss1 + rpn_loss2
                    # generate targets for rcnn
                    cls_targets, box_targets, box_masks = net.target_generator(
                        roi, samples, matches, gt_label, gt_box)
                    # losses of rcnn
                    num_rcnn_pos = (cls_targets >= 0).sum()
                    rcnn_loss1 = (rcnn_cls_loss(cls_pred, cls_targets,
                                                cls_targets >= 0) *
                                  cls_targets.size / cls_targets.shape[0] /
                                  num_rcnn_pos)
                    rcnn_loss2 = (rcnn_box_loss(box_pred, box_targets,
                                                box_masks) *
                                  box_pred.size / box_pred.shape[0] /
                                  num_rcnn_pos)
                    rcnn_loss = rcnn_loss1 + rcnn_loss2
                    # generate targets for mask
                    mask_targets, mask_masks = net.mask_target(
                        roi, gt_mask, matches, cls_targets)
                    # loss of mask
                    mask_loss = (rcnn_mask_loss(mask_pred, mask_targets,
                                                mask_masks) *
                                 mask_targets.size / mask_targets.shape[0] /
                                 mask_masks.sum())
                    # overall losses
                    losses.append(rpn_loss.sum() + rcnn_loss.sum() +
                                  mask_loss.sum())
                    metric_losses[0].append(rpn_loss1.sum())
                    metric_losses[1].append(rpn_loss2.sum())
                    metric_losses[2].append(rcnn_loss1.sum())
                    metric_losses[3].append(rcnn_loss2.sum())
                    metric_losses[4].append(mask_loss.sum())
                    add_losses[0].append(
                        [[rpn_cls_targets, rpn_cls_targets >= 0], [rpn_score]])
                    add_losses[1].append(
                        [[rpn_box_targets, rpn_box_masks], [rpn_box]])
                    add_losses[2].append([[cls_targets], [cls_pred]])
                    add_losses[3].append(
                        [[box_targets, box_masks], [box_pred]])
                    add_losses[4].append(
                        [[mask_targets, mask_masks], [mask_pred]])
                    add_losses[5].append(
                        [[mask_targets, mask_masks], [mask_pred]])
                autograd.backward(losses)
                for metric, record in zip(metrics, metric_losses):
                    metric.update(0, record)
                for metric, records in zip(metrics2, add_losses):
                    for pred in records:
                        metric.update(pred[0], pred[1])
            trainer.step(batch_size)

            # update metrics
            if args.log_interval and not (i + 1) % args.log_interval:
                msg = ','.join([
                    '{}={:.3f}'.format(*metric.get())
                    for metric in metrics + metrics2
                ])
                logger.info(
                    '[Epoch {}][Batch {}], Speed: {:.3f} samples/sec, {}'
                    .format(epoch, i,
                            args.log_interval * batch_size /
                            (time.time() - btic), msg))
                btic = time.time()

        msg = ','.join(
            ['{}={:.3f}'.format(*metric.get()) for metric in metrics])
        logger.info('[Epoch {}] Training cost: {:.3f}, {}'.format(
            epoch, (time.time() - tic), msg))
        if not (epoch + 1) % args.val_interval:
            # consider reduce the frequency of validation to save time
            map_name, mean_ap = validate(net, val_data, ctx, eval_metric, args)
            val_msg = '\n'.join(
                ['{}={}'.format(k, v) for k, v in zip(map_name, mean_ap)])
            logger.info('[Epoch {}] Validation: \n{}'.format(epoch, val_msg))
            current_map = float(mean_ap[-1])
        else:
            current_map = 0.
        save_params(net, logger, best_map, current_map, epoch,
                    args.save_interval, args.save_prefix)
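# NOTE: `get_lr_at_iter` is referenced in the warmup branch above but not
# defined in this snippet. GluonCV-style detection scripts commonly implement
# it as a linear warmup factor that starts at one third of the base learning
# rate; the sketch below assumes that behavior.
def get_lr_at_iter(alpha):
    """Returns the warmup multiplier: 1/3 at alpha=0, 1.0 at alpha=1."""
    return 1. / 3. * (1 - alpha) + alpha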
        self.logger.info('Epoch %d, pixAcc %.3f, mIoU %.3f' %
                         (epoch, pixAcc, mIoU))
        # best model
        if mIoU > self.best_mIoU:
            self.best_mIoU = mIoU
            self.is_best = True  # for save checkpoint


if __name__ == "__main__":
    from utils.argument import parse_args_for_segm as parse_args
    from utils.logger import build_logger
    from utils.custom_load import make_save_dir, save_checkpoint

    args = parse_args()
    save_dir = make_save_dir(args)
    logger = build_logger(os.path.join(save_dir, 'train.log'), True)
    logger.info(args)

    trainer = Trainer(args, logger)
    if args.eval:
        logger.info('Evaluating model: {}'.format(args.resume))
        trainer.validation(args.start_epoch)
    else:
        logger.info('Starting Epoch: {}'.format(args.start_epoch))
        logger.info('Total Epochs: {}'.format(args.epochs))
        for epoch in range(args.start_epoch, args.epochs):
            trainer.training(epoch)
            if not trainer.args.no_val:
                trainer.validation(epoch)
            # save every epoch
            save_checkpoint(trainer.net.module, save_dir, trainer.is_best,
                            epoch)
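# NOTE: `save_checkpoint` comes from `utils.custom_load` and is not shown here.
# The sketch below is an assumption about the behavior the loop above relies on
# (file names and paths are hypothetical): save the weights every epoch and
# keep a copy of the best-performing model.
def save_checkpoint_sketch(net, save_dir, is_best, epoch):
    """Saves per-epoch weights and a copy of the best model (hypothetical)."""
    import os
    import shutil
    import torch
    filename = os.path.join(save_dir, 'epoch_{:04d}.pth'.format(epoch))
    torch.save(net.state_dict(), filename)
    if is_best:
        shutil.copyfile(filename, os.path.join(save_dir, 'best.pth'))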
def main():
    # Arguments
    ###########################################################################
    try:
        args = get_args()
        config = process_config(args.config)
    except Exception:
        logging.error("Missing or invalid arguments.")
        exit(0)

    # Logging
    ###########################################################################
    logging.basicConfig(
        filename=os.path.join("logs", config.exp_name + ".log"),
        format="[%(asctime)s] - [%(levelname)s]: %(message)s",
        filemode="a",
        level=logging.DEBUG,
    )
    logging.info("Logging started.")
    logging.info("Keras version: {}".format(keras_version))

    # Session
    ###########################################################################
    sess = tf.Session()
    K.set_session(sess)

    # Create experiment-related directories
    ###########################################################################
    create_dirs([config.summary_dir, config.checkpoint_dir])

    # Initialize the model
    ###########################################################################
    model_formicID = load_model(config=config, num_species=97)
    model_formicID = compile_model(model=model_formicID, config=config)
    model_formicID = weights_load(
        model=model_formicID,
        weights="experiments/T97_CaAll_QuM_ShSti_AugM_D05_LR0001_E200_I4_def_clean/checkpoint/weights_55-1.76.hdf5",
    )

    # Training in batches with iterator
    ###########################################################################
    history = trainer_dir(
        model=model_formicID,
        config=config,
        callbacks=build_logger(config=config, model=model_formicID),
    )
    save_model(model=model_formicID, filename="final_weights.hdf5",
               config=config)

    # Evaluation
    ###########################################################################
    plot_history(history=history, config=config, theme="ggplot", save=None)
    evaluator(model=model_formicID, config=config, test_dir=None)

    # Testing
    ###########################################################################
    Y_true, Y_pred, labels, species_dict = predictor(
        model=model_formicID,
        config=config,
        # species_json="data/species_dict.json",
        plot=True,
        n_img=10,
        n_cols=3,
    )
    predictor_reports(
        Y_true=Y_true,
        Y_pred=Y_pred,
        config=config,
        species_dict=species_dict,
        target_names=labels,
        digits=5,
    )
    plot_confusion_matrix(
        Y_pred=Y_pred,
        Y_true=Y_true,
        config=config,
        target_names=labels,
        species_dict=species_dict,
        title=None,
        cmap="viridis",
        normalize=True,
        scores=True,
        score_size=8,
        save="confusion_matrix.png",
    )

    # Footer
    ###########################################################################
    K.clear_session()
    logging.info("Logging ended.")
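# NOTE: `create_dirs` is used above to prepare the experiment folders but is
# not shown in this snippet. A minimal sketch of what it is assumed to do:
def create_dirs(dirs):
    """Creates each directory in `dirs` if it does not already exist."""
    for d in dirs:
        os.makedirs(d, exist_ok=True)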
def main(): """Main function.""" # Parse arguments. args = parse_args() # Parse configurations. config = parse_config(args.config) config = update_config(config, args.options) config.work_dir = args.work_dir config.resume_path = args.resume_path config.weight_path = args.weight_path config.seed = args.seed config.launcher = args.launcher config.backend = args.backend if args.adv != None: config.loss['g_loss_kwargs']['adv'] = float(args.adv) if args.lamb != None: config.loss['g_loss_kwargs']['lamb'] = float(args.lamb) if args.metric != None: config.loss['g_loss_kwargs']['metric'] = args.metric if args.baseLR != None: config.modules['generator']['opt']['base_lr'] = float(args.baseLR) / 2 if args.nethz != None: config.nethz = args.nethz config.savename = args.adv + '_' + args.lamb.replace( '.', 'dot') + '_' + args.metric.replace( '.', 'dot') + '_' + args.baseLR.replace('.', 'dot') config.data['train'][ 'root_dir'] = '/cluster/scratch/' + config.nethz + '/data' config.data['val'][ 'root_dir'] = '/cluster/scratch/' + config.nethz + '/data' # Set CUDNN. config.cudnn_benchmark = config.get('cudnn_benchmark', True) config.cudnn_deterministic = config.get('cudnn_deterministic', False) torch.backends.cudnn.benchmark = config.cudnn_benchmark torch.backends.cudnn.deterministic = config.cudnn_deterministic # Set random seed. config.seed = 26 if config.seed is not None: random.seed(config.seed) np.random.seed(config.seed) torch.manual_seed(config.seed) config.cudnn_deterministic = True torch.backends.cudnn.deterministic = True warnings.warn('Random seed is set for training! ' 'This will turn on the CUDNN deterministic setting, ' 'which may slow down the training considerably! ' 'Unexpected behavior can be observed when resuming from ' 'checkpoints.') # Set launcher. config.is_distributed = True init_dist(config.launcher, backend=config.backend) config.num_gpus = dist.get_world_size() # Setup logger. if dist.get_rank() == 0: logger_type = config.get('logger_type', 'normal') logger = build_logger(logger_type, work_dir=config.work_dir) shutil.copy(args.config, os.path.join(config.work_dir, 'config.py')) commit_id = os.popen('git rev-parse HEAD').readline() logger.info(f'Commit ID: {commit_id}') else: logger = build_logger('dumb', work_dir=config.work_dir) # Start training. runner = getattr(runners, config.runner_type)(config, logger) if config.resume_path: runner.load(filepath=config.resume_path, running_metadata=True, learning_rate=True, optimizer=True, running_stats=False) if config.weight_path: runner.load(filepath=config.weight_path, running_metadata=False, learning_rate=False, optimizer=False, running_stats=False) runner.train()