def build_dataloader(rank, world_size, data_root, ann_file):
    val_dataset = COCOJoints(
        data_root, ann_file, image_set="val2017", order=("image", "boxes", "info")
    )
    val_sampler = SequentialSampler(val_dataset, 1, world_size=world_size, rank=rank)
    val_dataloader = DataLoader(
        val_dataset,
        sampler=val_sampler,
        num_workers=4,
        transform=T.Compose(
            transforms=[
                T.Normalize(mean=cfg.img_mean, std=cfg.img_std),
                ExtendBoxes(
                    cfg.test_x_ext,
                    cfg.test_y_ext,
                    cfg.input_shape[1] / cfg.input_shape[0],
                    random_extend_prob=0,
                ),
                RandomBoxAffine(
                    degrees=(0, 0),
                    scale=(1, 1),
                    output_shape=cfg.input_shape,
                    rotate_prob=0,
                    scale_prob=0,
                ),
                T.ToMode(),
            ],
            order=("image", "boxes", "info"),
        ),
    )
    return val_dataloader
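# Usage sketch (assumption, not from the original file): drive build_dataloader
# in a single process. The COCO paths are illustrative, and batches are assumed
# to be yielded following the order ("image", "boxes", "info") declared above.
def run_val_loader_example(data_root="/path/to/coco"):
    ann_file = os.path.join(
        data_root, "annotations", "person_keypoints_val2017.json"
    )
    val_dataloader = build_dataloader(0, 1, data_root, ann_file)
    for images, boxes, info in val_dataloader:
        # run inference on the cropped person boxes here
        pass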
def worker(master_ip, port, rank, world_size, args):
    if world_size > 1:
        # Initialize distributed process group
        logger.info("init distributed process group {} / {}".format(rank, world_size))
        dist.init_process_group(
            master_ip=master_ip,
            port=port,
            world_size=world_size,
            rank=rank,
            device=rank,
        )

    model_name = "{}_{}x{}".format(args.arch, cfg.input_shape[0], cfg.input_shape[1])
    save_dir = os.path.join(args.save, model_name)

    model = getattr(kpm, args.arch)()
    model.train()
    start_epoch = 0
    if args.resume is not None:
        file = mge.load(args.resume)
        model.load_state_dict(file["state_dict"])
        start_epoch = file["epoch"]

    optimizer = optim.Adam(
        model.parameters(), lr=cfg.initial_lr, weight_decay=cfg.weight_decay
    )

    gm = GradManager()
    if dist.get_world_size() > 1:
        gm.attach(
            model.parameters(),
            callbacks=[dist.make_allreduce_cb("SUM", dist.WORLD)],
        )
    else:
        gm.attach(model.parameters())

    if dist.get_world_size() > 1:
        dist.bcast_list_(model.parameters(), dist.WORLD)  # sync parameters

    # Build train datasets
    logger.info("preparing dataset..")
    ann_file = os.path.join(
        cfg.data_root, "annotations", "person_keypoints_train2017.json"
    )
    train_dataset = COCOJoints(
        cfg.data_root,
        ann_file,
        image_set="train2017",
        order=("image", "keypoints", "boxes", "info"),
    )
    logger.info("Num of Samples: {}".format(len(train_dataset)))
    train_sampler = data.RandomSampler(
        train_dataset, batch_size=cfg.batch_size, drop_last=True
    )

    transforms = [
        T.Normalize(mean=cfg.img_mean, std=cfg.img_std),
        RandomHorizontalFlip(0.5, keypoint_flip_order=cfg.keypoint_flip_order),
    ]
    if cfg.half_body_transform:
        transforms.append(
            HalfBodyTransform(
                cfg.upper_body_ids, cfg.lower_body_ids, cfg.prob_half_body
            )
        )
    if cfg.extend_boxes:
        transforms.append(
            ExtendBoxes(cfg.x_ext, cfg.y_ext, cfg.input_shape[1] / cfg.input_shape[0])
        )
    transforms += [
        RandomBoxAffine(
            degrees=cfg.rotate_range,
            scale=cfg.scale_range,
            output_shape=cfg.input_shape,
            rotate_prob=cfg.rotation_prob,
            scale_prob=cfg.scale_prob,
        )
    ]
    transforms += [T.ToMode()]

    train_queue = data.DataLoader(
        train_dataset,
        sampler=train_sampler,
        num_workers=args.workers,
        transform=T.Compose(transforms=transforms, order=train_dataset.order),
        collator=HeatmapCollator(
            cfg.input_shape,
            cfg.output_shape,
            cfg.keypoint_num,
            cfg.heat_thr,
            cfg.heat_kernels if args.multi_scale_supervision else cfg.heat_kernels[-1:],
            cfg.heat_range,
        ),
    )

    # Start training
    for epoch in range(start_epoch, cfg.epochs):
        loss = train(model, train_queue, optimizer, gm, epoch=epoch)
        logger.info("Epoch %d Train %.6f", epoch, loss)

        if rank == 0 and epoch % cfg.save_freq == 0:
            # save checkpoint
            mge.save(
                {"epoch": epoch + 1, "state_dict": model.state_dict()},
                os.path.join(save_dir, "epoch_{}.pkl".format(epoch)),
            )
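# Minimal sketch of the per-epoch loop that the train() call above is assumed
# to follow; it is NOT the repository's actual implementation. It only
# illustrates the GradManager pattern: record the forward pass under `with gm:`,
# back-propagate with gm.backward(), then step and clear the optimizer. The
# mini-batch keys and the plain MSE-on-heatmaps loss are illustrative assumptions.
import megengine.functional as F

def train_sketch(model, data_queue, optimizer, gm, epoch=0):
    total_loss, steps = 0.0, 0
    for mini_batch in data_queue:
        image = mge.tensor(mini_batch["data"])         # assumed collator key
        heatmaps = mge.tensor(mini_batch["heatmaps"])  # assumed collator key
        with gm:  # record ops so gm.backward() can differentiate them
            pred = model(image)
            loss = F.mean((pred - heatmaps) ** 2)  # placeholder MSE loss
            gm.backward(loss)  # gradients are allreduced via the attached callback
        optimizer.step()
        optimizer.clear_grad()
        total_loss += loss.item()
        steps += 1
    return total_loss / max(steps, 1)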
def worker(rank, world_size, args):
    if world_size > 1:
        # Initialize distributed process group
        logger.info("init distributed process group {} / {}".format(rank, world_size))
        dist.init_process_group(
            master_ip="localhost",
            master_port=23456,
            world_size=world_size,
            rank=rank,
            dev=rank,
        )

    model_name = "{}_{}x{}".format(args.arch, cfg.input_shape[0], cfg.input_shape[1])
    save_dir = os.path.join(args.save, model_name)

    model = getattr(M, args.arch)(pretrained=args.pretrained)
    model.train()
    start_epoch = 0
    if args.c is not None:
        file = mge.load(args.c)
        model.load_state_dict(file["state_dict"])
        start_epoch = file["epoch"]

    optimizer = optim.Adam(
        model.parameters(requires_grad=True),
        lr=args.lr,
        weight_decay=cfg.weight_decay,
    )

    # Build train datasets
    logger.info("preparing dataset..")
    train_dataset = COCOJoints(
        args.data_root,
        args.ann_file,
        image_set="train",
        order=("image", "keypoints", "boxes", "info"),
    )
    train_sampler = data.RandomSampler(
        train_dataset, batch_size=args.batch_size, drop_last=True
    )

    transforms = [T.Normalize(mean=cfg.IMG_MEAN, std=cfg.IMG_STD)]
    if cfg.half_body_transform:
        transforms.append(
            HalfBodyTransform(
                cfg.upper_body_ids, cfg.lower_body_ids, cfg.prob_half_body
            )
        )
    if cfg.extend_boxes:
        transforms.append(
            ExtendBoxes(cfg.x_ext, cfg.y_ext, cfg.input_shape[1] / cfg.input_shape[0])
        )
    transforms += [
        RandomHorizontalFlip(0.5, keypoint_flip_order=cfg.keypoint_flip_order)
    ]
    transforms += [
        RandomBoxAffine(
            degrees=cfg.rotate_range,
            scale=cfg.scale_range,
            output_shape=cfg.input_shape,
            rotate_prob=cfg.rotation_prob,
            scale_prob=cfg.scale_prob,
        )
    ]
    transforms += [T.ToMode()]

    train_queue = data.DataLoader(
        train_dataset,
        sampler=train_sampler,
        num_workers=args.workers,
        transform=T.Compose(transforms=transforms, order=train_dataset.order),
        collator=HeatmapCollator(
            cfg.input_shape,
            cfg.output_shape,
            cfg.keypoint_num,
            cfg.heat_thre,
            cfg.heat_kernel if args.multi_scale_supervision else cfg.heat_kernel[-1:],
            cfg.heat_range,
        ),
    )

    # Start training
    for epoch in range(start_epoch, args.epochs):
        loss = train(model, train_queue, optimizer, args, epoch=epoch)
        logger.info("Epoch %d Train %.6f", epoch, loss)

        if rank == 0:
            # save checkpoint
            mge.save(
                {"epoch": epoch + 1, "state_dict": model.state_dict()},
                os.path.join(save_dir, "epoch_{}.pkl".format(epoch)),
            )
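# Launch sketch (assumption, not from the original file): one process per GPU,
# each calling the older-API worker above; `args.ngpus` is an assumed CLI flag
# and the argument parsing itself is omitted.
import multiprocessing as mp

def launch_sketch(args):
    world_size = args.ngpus
    if world_size == 1:
        worker(0, 1, args)
        return
    procs = []
    for rank in range(world_size):
        p = mp.Process(target=worker, args=(rank, world_size, args))
        p.start()
        procs.append(p)
    for p in procs:
        p.join()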