def main(): rank, world_size = dist_init() logger.info("init done") # load cfg cfg.merge_from_file(args.cfg) if rank == 0: if not os.path.exists(cfg.TRAIN.LOG_DIR): os.makedirs(cfg.TRAIN.LOG_DIR) init_log('global', logging.INFO) if cfg.TRAIN.LOG_DIR: add_file_handler('global', os.path.join(cfg.TRAIN.LOG_DIR, 'logs.txt'), logging.INFO) logger.info("Version Information: \n{}\n".format(commit())) logger.info("config \n{}".format(json.dumps(cfg, indent=4))) # create model model = ModelBuilder().cuda().train() dist_model = DistModule(model) # load pretrained backbone weights if cfg.BACKBONE.PRETRAINED: cur_path = os.path.dirname(os.path.realpath(__file__)) backbone_path = os.path.join(cur_path, '../', cfg.BACKBONE.PRETRAINED) load_pretrain(model.backbone, backbone_path) # create tensorboard writer if rank == 0 and cfg.TRAIN.LOG_DIR: tb_writer = SummaryWriter(cfg.TRAIN.LOG_DIR) else: tb_writer = None # build dataset loader train_loader = build_data_loader() # build optimizer and lr_scheduler optimizer, lr_scheduler = build_opt_lr(dist_model.module, cfg.TRAIN.START_EPOCH) # resume training if cfg.TRAIN.RESUME: logger.info("resume from {}".format(cfg.TRAIN.RESUME)) assert os.path.isfile(cfg.TRAIN.RESUME), \ '{} is not a valid file.'.format(cfg.TRAIN.RESUME) model, optimizer, cfg.TRAIN.START_EPOCH = \ restore_from(model, optimizer, cfg.TRAIN.RESUME) dist_model = DistModule(model) logger.info(lr_scheduler) logger.info("model prepare done") # start training train(train_loader, dist_model, optimizer, lr_scheduler, tb_writer)
def main(): #(1) #rank, world_size = dist_init() rank = 0 logger.info("init done") # load cfg cfg.merge_from_file(args.cfg) #rank=0代表是单节点运行 if rank == 0: if not os.path.exists(cfg.TRAIN.LOG_DIR): os.makedirs(cfg.TRAIN.LOG_DIR) init_log('global', logging.INFO) if cfg.TRAIN.LOG_DIR: add_file_handler('global', os.path.join(cfg.TRAIN.LOG_DIR, 'logs.txt'), logging.INFO) logger.info("Version Information: \n{}\n".format(commit())) logger.info("config \n{}".format(json.dumps(cfg, indent=4))) #(2) # create model model = ModelBuilder().cuda().train() dist_model = nn.DataParallel(model, device_ids=[0, 1]) #dist_model = DistModule(model) # load pretrained backbone weights if cfg.BACKBONE.PRETRAINED: cur_path = os.path.dirname(os.path.realpath(__file__)) backbone_path = os.path.join(cur_path, '../', cfg.BACKBONE.PRETRAINED) load_pretrain(model.backbone, backbone_path) # create tensorboard writer if rank == 0 and cfg.TRAIN.LOG_DIR: tb_writer = SummaryWriter(cfg.TRAIN.LOG_DIR) else: tb_writer = None # build dataset loader 加载数据集 train_loader = build_data_loader() # build optimizer and lr_scheduler optimizer, lr_scheduler = build_opt_lr(dist_model.module, cfg.TRAIN.START_EPOCH) # resume training if cfg.TRAIN.RESUME: logger.info("resume from {}".format(cfg.TRAIN.RESUME)) assert os.path.isfile(cfg.TRAIN.RESUME), \ '{} is not a valid file.'.format(cfg.TRAIN.RESUME) # (1) 从某一个checkpoint开始训练 dist_model, optimizer, cfg.TRAIN.START_EPOCH = \ restore_from(model, optimizer, cfg.TRAIN.RESUME) # (2) 加载预训练模型 # device = torch.cuda.current_device() # ckpt = torch.load(cfg.TRAIN.RESUME, map_location=lambda storage, loc: storage.cuda(device)) # model.load_state_dict(ckpt, strict=False) logger.info(lr_scheduler) logger.info("model prepare done") # start training train(train_loader, dist_model, optimizer, lr_scheduler, tb_writer)