Example #1
# Assumed imports; COCOJoints, ExtendBoxes, RandomBoxAffine and cfg are
# project-local names from the surrounding keypoint repository.
from megengine.data import DataLoader, SequentialSampler
import megengine.data.transform as T


def build_dataloader(rank, world_size, data_root, ann_file):
    # COCO validation set; each sample yields the image, its person boxes,
    # and meta info, in the declared order.
    val_dataset = COCOJoints(data_root,
                             ann_file,
                             image_set="val2017",
                             order=("image", "boxes", "info"))
    # Deterministic sampler with batch size 1, sharded across ranks.
    val_sampler = SequentialSampler(val_dataset,
                                    1,
                                    world_size=world_size,
                                    rank=rank)
    # Evaluation-time pipeline: normalize, extend boxes to the input aspect
    # ratio, warp crops to the network input shape (augmentation disabled),
    # then convert to the expected tensor layout.
    val_dataloader = DataLoader(
        val_dataset,
        sampler=val_sampler,
        num_workers=4,
        transform=T.Compose(
            transforms=[
                T.Normalize(mean=cfg.img_mean, std=cfg.img_std),
                ExtendBoxes(
                    cfg.test_x_ext,
                    cfg.test_y_ext,
                    cfg.input_shape[1] / cfg.input_shape[0],
                    random_extend_prob=0,
                ),
                RandomBoxAffine(
                    degrees=(0, 0),
                    scale=(1, 1),
                    output_shape=cfg.input_shape,
                    rotate_prob=0,
                    scale_prob=0,
                ),
                T.ToMode(),
            ],
            order=("image", "boxes", "info"),
        ),
    )
    return val_dataloader
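
A minimal usage sketch (single process; the COCO paths below are assumptions, not part of the example):

val_loader = build_dataloader(
    rank=0,
    world_size=1,
    data_root="/data/coco",
    ann_file="/data/coco/annotations/person_keypoints_val2017.json",
)
for images, boxes, info in val_loader:
    pass  # run inference on each single-image batch here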
Example #2
# Assumed imports; kpm (the keypoint model zoo), cfg, logger, train() and
# the custom transforms/collator are project-local modules.
import os

import megengine as mge
import megengine.data as data
import megengine.data.transform as T
import megengine.distributed as dist
import megengine.optimizer as optim
from megengine.autodiff import GradManager


def worker(master_ip, port, rank, world_size, args):
    if world_size > 1:
        # Initialize distributed process group
        logger.info("init distributed process group {} / {}".format(rank, world_size))
        dist.init_process_group(
            master_ip=master_ip,
            port=port,
            world_size=world_size,
            rank=rank,
            device=rank,
        )

    model_name = "{}_{}x{}".format(args.arch, cfg.input_shape[0], cfg.input_shape[1])
    save_dir = os.path.join(args.save, model_name)

    model = getattr(kpm, args.arch)()
    model.train()
    start_epoch = 0
    # Optionally resume model weights and epoch counter from a checkpoint
    if args.resume is not None:
        file = mge.load(args.resume)
        model.load_state_dict(file["state_dict"])
        start_epoch = file["epoch"]

    optimizer = optim.Adam(
        model.parameters(), lr=cfg.initial_lr, weight_decay=cfg.weight_decay
    )

    # GradManager records the forward pass for autodiff; in distributed
    # mode, gradients are summed across workers after each backward pass.
    gm = GradManager()
    if dist.get_world_size() > 1:
        gm.attach(
            model.parameters(), callbacks=[dist.make_allreduce_cb("SUM", dist.WORLD)],
        )
    else:
        gm.attach(model.parameters())

    if dist.get_world_size() > 1:
        dist.bcast_list_(model.parameters(), dist.WORLD)  # sync parameters

    # Build train datasets
    logger.info("preparing dataset..")
    ann_file = os.path.join(
        cfg.data_root, "annotations", "person_keypoints_train2017.json"
    )
    train_dataset = COCOJoints(
        cfg.data_root,
        ann_file,
        image_set="train2017",
        order=("image", "keypoints", "boxes", "info"),
    )
    logger.info("Num of Samples: {}".format(len(train_dataset)))
    train_sampler = data.RandomSampler(
        train_dataset, batch_size=cfg.batch_size, drop_last=True
    )

    # Training-time augmentation pipeline
    transforms = [
        T.Normalize(mean=cfg.img_mean, std=cfg.img_std),
        RandomHorizontalFlip(0.5, keypoint_flip_order=cfg.keypoint_flip_order)
    ]

    if cfg.half_body_transform:
        transforms.append(
            HalfBodyTransform(
                cfg.upper_body_ids, cfg.lower_body_ids, cfg.prob_half_body
            )
        )
    if cfg.extend_boxes:
        transforms.append(
            ExtendBoxes(cfg.x_ext, cfg.y_ext, cfg.input_shape[1] / cfg.input_shape[0])
        )

    transforms += [
        RandomBoxAffine(
            degrees=cfg.rotate_range,
            scale=cfg.scale_range,
            output_shape=cfg.input_shape,
            rotate_prob=cfg.rotation_prob,
            scale_prob=cfg.scale_prob,
        )
    ]
    transforms += [T.ToMode()]

    # HeatmapCollator converts keypoints into target heatmaps per batch
    train_queue = data.DataLoader(
        train_dataset,
        sampler=train_sampler,
        num_workers=args.workers,
        transform=T.Compose(transforms=transforms, order=train_dataset.order),
        collator=HeatmapCollator(
            cfg.input_shape,
            cfg.output_shape,
            cfg.keypoint_num,
            cfg.heat_thr,
            cfg.heat_kernels if args.multi_scale_supervision else cfg.heat_kernels[-1:],
            cfg.heat_range,
        ),
    )

    # Start training
    for epoch in range(start_epoch, cfg.epochs):
        loss = train(model, train_queue, optimizer, gm, epoch=epoch)
        logger.info("Epoch %d Train %.6f ", epoch, loss)

        if rank == 0 and epoch % cfg.save_freq == 0:  # save checkpoint
            mge.save(
                {"epoch": epoch + 1, "state_dict": model.state_dict()},
                os.path.join(save_dir, "epoch_{}.pkl".format(epoch)),
            )
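
The train() helper is not shown here; the following is a hypothetical sketch of such a step loop, assuming each minibatch carries images and target heatmaps and using a mean-squared heatmap loss (both assumptions):

import megengine as mge
import megengine.functional as F

def train(model, data_queue, optimizer, gm, epoch=0):
    # Hypothetical reconstruction; the minibatch layout and the loss
    # function are assumptions, not taken from the example above.
    total_loss, steps = 0.0, 0
    for images, heatmaps in data_queue:
        images = mge.tensor(images)
        heatmaps = mge.tensor(heatmaps)
        with gm:  # record the forward pass for autodiff
            pred = model(images)
            loss = F.loss.square_loss(pred, heatmaps)
            gm.backward(loss)  # grads are all-reduced if callbacks attached
        optimizer.step()
        optimizer.clear_grad()
        total_loss += float(loss.item())
        steps += 1
    return total_loss / max(steps, 1)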
Example #3
# Imports as in Example #2; here M is the project-local model zoo module.
def worker(rank, world_size, args):
    if world_size > 1:
        # Initialize distributed process group
        logger.info("init distributed process group {} / {}".format(
            rank, world_size))
        dist.init_process_group(
            master_ip="localhost",
            master_port=23456,
            world_size=world_size,
            rank=rank,
            dev=rank,
        )

    model_name = "{}_{}x{}".format(args.arch, cfg.input_shape[0],
                                   cfg.input_shape[1])
    save_dir = os.path.join(args.save, model_name)

    model = getattr(M, args.arch)(pretrained=args.pretrained)
    model.train()
    start_epoch = 0
    # args.c points to a checkpoint to resume from
    if args.c is not None:
        file = mge.load(args.c)
        model.load_state_dict(file["state_dict"])
        start_epoch = file["epoch"]

    optimizer = optim.Adam(
        model.parameters(requires_grad=True),
        lr=args.lr,
        weight_decay=cfg.weight_decay,
    )
    # Build train datasets
    logger.info("preparing dataset..")
    train_dataset = COCOJoints(
        args.data_root,
        args.ann_file,
        image_set="train",
        order=("image", "keypoints", "boxes", "info"),
    )
    train_sampler = data.RandomSampler(train_dataset,
                                       batch_size=args.batch_size,
                                       drop_last=True)

    transforms = [T.Normalize(mean=cfg.IMG_MEAN, std=cfg.IMG_STD)]
    if cfg.half_body_transform:
        transforms.append(
            HalfBodyTransform(cfg.upper_body_ids, cfg.lower_body_ids,
                              cfg.prob_half_body))
    if cfg.extend_boxes:
        transforms.append(
            ExtendBoxes(cfg.x_ext, cfg.y_ext,
                        cfg.input_shape[1] / cfg.input_shape[0]))
    transforms += [
        RandomHorizontalFlip(0.5, keypoint_flip_order=cfg.keypoint_flip_order)
    ]
    transforms += [
        RandomBoxAffine(
            degrees=cfg.rotate_range,
            scale=cfg.scale_range,
            output_shape=cfg.input_shape,
            rotate_prob=cfg.rotation_prob,
            scale_prob=cfg.scale_prob,
        )
    ]
    transforms += [T.ToMode()]

    train_queue = data.DataLoader(
        train_dataset,
        sampler=train_sampler,
        num_workers=args.workers,
        transform=T.Compose(
            transforms=transforms,
            order=train_dataset.order,
        ),
        collator=HeatmapCollator(
            cfg.input_shape,
            cfg.output_shape,
            cfg.keypoint_num,
            cfg.heat_thre,
            cfg.heat_kernel
            if args.multi_scale_supervision else cfg.heat_kernel[-1:],
            cfg.heat_range,
        ),
    )

    # Start training
    for epoch in range(start_epoch, args.epochs):
        loss = train(model, train_queue, optimizer, args, epoch=epoch)
        logger.info("Epoch %d Train %.6f ", epoch, loss)

        if rank == 0:  # save checkpoint
            mge.save(
                {
                    "epoch": epoch + 1,
                    "state_dict": model.state_dict(),
                },
                os.path.join(save_dir, "epoch_{}.pkl".format(epoch)),
            )
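
This worker signature matches a manual multi-process launch; a minimal launcher sketch using only the standard library (the ngpus attribute is an assumption):

import multiprocessing as mp

def main(args):
    # Spawn one training process per device; world_size == 1 runs inline.
    world_size = args.ngpus
    if world_size == 1:
        worker(0, 1, args)
        return
    procs = []
    for rank in range(world_size):
        p = mp.Process(target=worker, args=(rank, world_size, args))
        p.start()
        procs.append(p)
    for p in procs:
        p.join()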