Example #1
def worker(args):
    current_network = import_from_file(args.file)

    model = current_network.Net(current_network.Cfg())
    model.train()

    if dist.get_rank() == 0:
        logger.info(get_config_info(model.cfg))
        logger.info(repr(model))

    params_with_grad = []
    for name, param in model.named_parameters():
        if "bottom_up.conv1" in name and model.cfg.backbone_freeze_at >= 1:
            continue
        if "bottom_up.layer1" in name and model.cfg.backbone_freeze_at >= 2:
            continue
        params_with_grad.append(param)

    opt = SGD(
        params_with_grad,
        lr=model.cfg.basic_lr * args.batch_size,
        momentum=model.cfg.momentum,
        weight_decay=model.cfg.weight_decay * dist.get_world_size(),
    )

    gm = GradManager()
    if dist.get_world_size() > 1:
        gm.attach(params_with_grad,
                  callbacks=[dist.make_allreduce_cb("SUM", dist.WORLD)])
    else:
        gm.attach(params_with_grad)

    if args.weight_file is not None:
        # model.backbone.bottom_up.load_state_dict(weights, strict=False)
        logger.info("Loading Base-Pretrain weights...")
        weights = mge.load(args.weight_file)
        weight_new = {k: v for k, v in weights.items() if 'pred_' not in k}
        model.load_state_dict(weight_new, strict=False)

    if dist.get_world_size() > 1:
        dist.bcast_list_(model.parameters(), dist.WORLD)  # sync parameters

    if dist.get_rank() == 0:
        logger.info("Prepare dataset")
    train_loader = iter(
        build_dataloader(args.batch_size, args.dataset_dir, model.cfg))

    for epoch in range(model.cfg.max_epoch):
        train_one_epoch(model, train_loader, opt, gm, epoch, args)
        if dist.get_rank() == 0:
            save_path = "logs/{}/epoch_{}.pkl".format(
                os.path.basename(args.file).split(".")[0], epoch)
            mge.save(
                {
                    "epoch": epoch,
                    "state_dict": model.state_dict()
                },
                save_path,
            )
            logger.info("dump weights to %s", save_path)
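For context, a hedged sketch of how a worker like this is usually launched, one process per GPU. The main/args names and the ngpus field are assumptions; dist.launcher is MegEngine's multi-process launch helper:

import megengine.distributed as dist

def main(args):
    # Spawn args.ngpus copies of worker(); dist.launcher sets up the
    # process group, so worker() can call dist.get_rank() right away.
    if args.ngpus > 1:
        train_func = dist.launcher(worker, n_gpus=args.ngpus)
        train_func(args)
    else:
        worker(args)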
Example #2
def valid_func(image, label):
    model.eval()
    logits = model(image)
    loss = F.loss.cross_entropy(logits, label, label_smooth=0.1)
    acc1, acc5 = F.topk_accuracy(logits, label, (1, 5))
    if dist.is_distributed():  # all_reduce_mean
        loss = dist.functional.all_reduce_sum(loss) / dist.get_world_size()
        acc1 = dist.functional.all_reduce_sum(acc1) / dist.get_world_size()
        acc5 = dist.functional.all_reduce_sum(acc5) / dist.get_world_size()
    return loss, acc1, acc5
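A minimal sketch of the loop that would drive such a valid_func, assuming the dataloader yields (image, label) numpy batches. The valid name echoes the valid(...) calls in later examples, which also take an args parameter; this simplified version omits it:

import megengine as mge

def valid(valid_func, valid_dataloader):
    loss_sum = acc1_sum = acc5_sum = n = 0
    for image, label in valid_dataloader:
        loss, acc1, acc5 = valid_func(mge.tensor(image), mge.tensor(label))
        loss_sum += loss.item()
        acc1_sum += acc1.item()
        acc5_sum += acc5.item()
        n += 1
    # simple average over batches; ignores a smaller last batch
    return loss_sum / n, acc1_sum / n, acc5_sum / n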
Example #3
File: test.py Project: wjfwzzc/Models
def valid_step(image, label):
    logits = model(image)
    loss = F.nn.cross_entropy(logits, label)
    acc1, acc5 = F.topk_accuracy(logits, label, topk=(1, 5))
    # calculate mean values
    if dist.get_world_size() > 1:
        loss = F.distributed.all_reduce_sum(loss) / dist.get_world_size()
        acc1 = F.distributed.all_reduce_sum(acc1) / dist.get_world_size()
        acc5 = F.distributed.all_reduce_sum(acc5) / dist.get_world_size()
    return loss, acc1, acc5

def valid_func(image, label):
    # note: this variant uses the older MegEngine 0.x-style API
    # (F.cross_entropy_with_softmax, F.accuracy, keyed dist.all_reduce_sum)
    model.eval()
    logits = model(image)
    loss = F.cross_entropy_with_softmax(logits, label, label_smooth=0.)
    acc1, acc5 = F.accuracy(logits, label, (1, 5))
    if dist.is_distributed():  # all_reduce_mean
        loss = dist.all_reduce_sum(loss, "valid_loss") / dist.get_world_size()
        acc1 = dist.all_reduce_sum(acc1, "valid_acc1") / dist.get_world_size()
        acc5 = dist.all_reduce_sum(acc5, "valid_acc5") / dist.get_world_size()
    return loss, acc1, acc5
Example #5
def train_func(image, label):
    model.train()
    logits = model(image)
    loss = F.cross_entropy_with_softmax(logits, label, label_smooth=0.1)
    acc1, acc5 = F.accuracy(logits, label, (1, 5))
    optimizer.backward(loss)  # compute gradients
    if dist.is_distributed():  # all_reduce_mean
        loss = dist.all_reduce_sum(loss) / dist.get_world_size()
        acc1 = dist.all_reduce_sum(acc1) / dist.get_world_size()
        acc5 = dist.all_reduce_sum(acc5) / dist.get_world_size()
    return loss, acc1, acc5
Example #6
def train_func(images, labels):
    opt.clear_grad()
    with gm:
        loss, accuracy, _ = model(images, labels)
        gm.backward(loss)
        if dist.is_distributed():
            # all_reduce_mean
            loss = dist.functional.all_reduce_sum(loss) / dist.get_world_size()
            accuracy = dist.functional.all_reduce_sum(accuracy) / dist.get_world_size()
    opt.step()
    return loss, accuracy
Example #7
def worker(current_network,
           weight_file,
           dataset_dir,
           result_list,
           master_ip=None,
           port=None,
           world_size=1,
           rank=0):
    if world_size > 1:
        dist.init_process_group(
            master_ip=master_ip,
            port=port,
            world_size=world_size,
            rank=rank,
            device=rank,
        )

    cfg = current_network.Cfg()
    cfg.backbone_pretrained = False
    model = current_network.Net(cfg)
    model.eval()

    state_dict = mge.load(weight_file)
    if "state_dict" in state_dict:
        state_dict = state_dict["state_dict"]
    model.load_state_dict(state_dict)

    evaluator = DetEvaluator(model)

    test_loader = build_dataloader(dataset_dir, model.cfg)
    if dist.get_world_size() == 1:
        test_loader = tqdm(test_loader)

    for data in test_loader:
        image, im_info = DetEvaluator.process_inputs(
            data[0][0],
            model.cfg.test_image_short_size,
            model.cfg.test_image_max_size,
        )
        pred_res = evaluator.predict(image=mge.tensor(image),
                                     im_info=mge.tensor(im_info))

        result = {
            "det_res": pred_res,
            "image_id": int(data[1][2][0]),
        }
        if dist.get_world_size() > 1:
            result_list.put_nowait(result)
        else:
            result_list.append(result)
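The result_list argument doubles as a queue or a plain list depending on the world size. A hedged sketch of the driver side of that contract (run_eval and all names below are assumptions, not the repository's actual entry point):

import multiprocessing as mp

def run_eval(current_network, weight_file, dataset_dir, world_size,
             master_ip="localhost", port=23333):
    results = []
    if world_size > 1:
        queue = mp.Manager().Queue()  # workers call put_nowait() on this
        procs = [
            mp.Process(target=worker,
                       args=(current_network, weight_file, dataset_dir,
                             queue, master_ip, port, world_size, rank))
            for rank in range(world_size)
        ]
        for p in procs:
            p.start()
        for p in procs:
            p.join()
        while not queue.empty():
            results.append(queue.get())
    else:
        worker(current_network, weight_file, dataset_dir, results)
    return results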
Example #8
File: train.py Project: lhaippp/GyroFlow
def train_and_evaluate(model, manager):
    rank = dist.get_rank()

    # reload weights from restore_file if specified
    if args.restore_file is not None:
        manager.load_checkpoints()

    world_size = dist.get_world_size()
    if world_size > 1:
        dist.bcast_list_(model.parameters())
        dist.bcast_list_(model.buffers())

    gm = GradManager().attach(
        model.parameters(),
        callbacks=dist.make_allreduce_cb("SUM") if world_size > 1 else None,
    )

    for epoch in range(manager.params.num_epochs):
        # compute number of batches in one epoch (one full pass over the training set)
        train(model, manager, gm)

        # Evaluate for one epoch on validation set
        evaluate(model, manager)

        # Save best model weights according to params.major_metric
        if rank == 0:
            manager.check_best_save_last_checkpoints(latest_freq=5)
Example #9
    def worker():
        rank = dist.get_rank()
        size = dist.get_world_size()
        x = mge.tensor(np.random.randn(1, rank * 2 + 2), dtype=np.float32)
        m = M.Linear(rank * 2 + 2, rank * 2 + 4)
        gm = GradManager().attach(m.parameters())
        opt = optim.SGD(m.parameters(), 1e-3, momentum=0.9)

        def train_func(x):
            with gm:
                if rank != 0:
                    x = dist.functional.remote_recv(rank - 1,
                                                    shape=(1, rank * 2 + 2),
                                                    dtype=np.float32)
                y = m(x)
                if rank != size - 1:
                    dist.functional.remote_send(y, dest_rank=rank + 1)
                    gm.backward()
                else:
                    y = y.mean()
                    gm.backward(y)
                opt.step().clear_grad()

        train_funcs = [
            train_func,
            trace(symbolic=False)(train_func),
            trace(symbolic=True)(train_func),
        ]

        for func in train_funcs:
            for i in range(3):
                func(x)
Example #10
def train_generator_batch(image, label, *, gm, netG, netloss):
    B, T, _, h, w = image.shape
    biup = get_bilinear(image)
    netG.train()
    with gm:
        forward_hiddens = []
        backward_hiddens = []
        res = []
        hidden = F.zeros((2 * B, netG.hidden_channels, h, w))
        for i in range(T):
            now_frame = F.concat([image[:, i, ...], image[:, T - i - 1, ...]],
                                 axis=0)
            if i == 0:
                flow = netG.flownet(now_frame, now_frame)
            else:
                ref = F.concat([image[:, i - 1, ...], image[:, T - i, ...]],
                               axis=0)
                flow = netG.flownet(now_frame, ref)
            hidden = netG(hidden, flow, now_frame)
            forward_hiddens.append(hidden[0:B, ...])
            backward_hiddens.append(hidden[B:2 * B, ...])
        for i in range(T):
            res.append(
                netG.do_upsample(forward_hiddens[i],
                                 backward_hiddens[T - i - 1]))
        res = F.stack(res, axis=1)  # [B,T,3,H,W]
        loss = netloss(res + biup, label)
        gm.backward(loss)
        if dist.is_distributed():
            loss = dist.functional.all_reduce_sum(loss) / dist.get_world_size()
    return loss
Example #11
def train_func():
    loss = model.calc_loss()
    optimizer.backward(loss)  # compute gradients
    if dist.is_distributed():  # all_reduce_mean
        loss = dist.all_reduce_sum(loss, "train_loss") / dist.get_world_size()
    return loss
Example #12
def worker(
    arch,
    model_file,
    data_root,
    ann_file,
):
    """
    :param net_file: network description file
    :param model_file: file of dump weights
    :param data_dir: the dataset directory
    :param worker_id: the index of the worker
    :param total_worker: number of gpu for evaluation
    """

    model = getattr(kpm, arch)()
    model.eval()
    weight = mge.load(model_file)
    weight = weight["state_dict"] if "state_dict" in weight.keys() else weight
    model.load_state_dict(weight)

    loader = build_dataloader(dist.get_rank(), dist.get_world_size(),
                              data_root, ann_file)
    if dist.get_rank() == 0:
        loader = tqdm(loader)

    result_list = []
    for data_dict in loader:
        img, bbox, info = data_dict

        flipped_img = img[:, :, :, ::-1] - np.zeros_like(img)  # copy to drop the negative stride
        data = np.concatenate([img, flipped_img], 0)
        data = np.ascontiguousarray(data).astype(np.float32)

        outs = model.predict(mge.tensor(data)).numpy()
        preds = outs[:img.shape[0]]
        preds_flipped = outs[img.shape[0]:, cfg.keypoint_flip_order, :, ::-1]
        preds = (preds + preds_flipped) / 2

        for i in range(preds.shape[0]):

            results = find_keypoints(preds[i], bbox[i, 0])

            final_score = float(results[:, -1].mean() * info[-1][i])
            image_id = int(info[-2][i])

            keypoints = results.copy()
            keypoints[:, -1] = 1
            keypoints = keypoints.reshape(-1, ).tolist()
            instance = {
                "image_id": image_id,
                "category_id": 1,
                "score": final_score,
                "keypoints": keypoints,
            }

            result_list.append(instance)
    return result_list
Example #13
def worker(rank, backend, q):
    if not mge.is_cuda_available():
        return
    _init_process_group_wrapper(world_size, rank, rank, backend, q)
    assert dist.is_distributed()
    assert dist.get_master_ip() == _LOCALHOST
    assert dist.get_master_port() > 0
    assert dist.get_world_size() == world_size
    assert dist.get_rank() == rank
    assert dist.get_backend() == backend
Example #14
File: test.py Project: zzh7982/Models
def worker(
    current_network, weight_file, dataset_dir, result_list,
    master_ip=None, port=None, world_size=None, rank=None
):
    if world_size > 1:
        dist.init_process_group(
            master_ip=master_ip,
            port=port,
            world_size=world_size,
            rank=rank,
            device=rank,
        )

    cfg = current_network.Cfg()
    cfg.backbone_pretrained = False
    model = current_network.Net(cfg)
    model.eval()

    state_dict = mge.load(weight_file)
    if "state_dict" in state_dict:
        state_dict = state_dict["state_dict"]
    model.load_state_dict(state_dict)

    def pred_func(data):
        pred = model(data)
        return pred

    test_loader = build_dataloader(dataset_dir, model.cfg)
    if dist.get_world_size() == 1:
        test_loader = tqdm(test_loader)

    for data in test_loader:
        img = data[0].squeeze()
        label = data[1].squeeze()
        im_info = data[2]
        pred = evaluate(pred_func, img, model.cfg)
        result = {"pred": pred, "gt": label, "name": im_info[2]}
        if dist.get_world_size() > 1:
            result_list.put_nowait(result)
        else:
            result_list.append(result)
Example #15
    def __init__(
        self,
        dataset,
        batch_size=1,
        drop_last=False,
        num_samples=None,
        world_size=None,
        rank=None,
        seed=None,
    ):
        if (
            not isinstance(batch_size, int)
            or isinstance(batch_size, bool)
            or batch_size <= 0
        ):
            raise ValueError(
                "batch_size should be a positive integer value, "
                "but got batch_size={}".format(batch_size)
            )
        if not isinstance(drop_last, bool):
            raise ValueError(
                "drop_last should be a boolean value, but got "
                "drop_last={}".format(drop_last)
            )
        if num_samples is not None and (
            not isinstance(num_samples, int)
            or isinstance(num_samples, bool)
            or num_samples <= 0
        ):
            raise ValueError(
                "num_samples should be a positive integer "
                "value, but got num_samples={}".format(num_samples)
            )

        self.batch_size = batch_size
        self.dataset = dataset
        self.drop_last = drop_last

        if world_size is None:
            world_size = dist.get_world_size() if dist.is_distributed() else 1
        self.world_size = world_size
        if rank is None:
            rank = dist.get_rank() if dist.is_distributed() else 0
        self.rank = rank

        if num_samples is None:
            num_samples = len(self.dataset)
        self.num_samples = int(math.ceil(num_samples / self.world_size))

        # Make sure seeds are the same at each rank
        if seed is None and self.world_size > 1:
            seed = 0
        self.rng = np.random.RandomState(seed)
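A worked example of the ceil division above, with hypothetical sizes: every rank gets the same quota, so a distributed sampler pads a few indices whenever the dataset does not divide evenly.

import math

dataset_len, world_size = 1000, 3
num_samples = int(math.ceil(dataset_len / world_size))
assert num_samples == 334                           # per-rank quota
assert num_samples * world_size - dataset_len == 2  # indices to pad/repeat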
Example #16
def train_generator_batch(image, label, *, gm, netG, netloss):
    B, T, _, h, w = image.shape
    biup = get_bilinear(image)
    # np_weight = [0,-1,0,-1,4,-1,0,-1,0]  # Laplacian edge kernel, reshaped to (1,1,3,3)
    # conv_weight = mge.tensor(np.array(np_weight).astype(np.float32)).reshape(1,1,3,3)
    # HR_mask = F.mean(label, axis=2, keepdims=False)  # [B,T,H,W], depthwise over T
    # HR_mask = HR_mask.reshape(B*T, 1, 4*h, 4*w)
    # HR_mask = F.conv2d(HR_mask, conv_weight, padding=1)
    # HR_mask = (F.abs(HR_mask) > 0.1).astype("float32")  # [B*T, 1, H, W]
    # HR_mask = HR_mask.reshape(B, T, 1, 4*h, 4*w)
    # HR_mask = 1 + HR_mask * 0.1
    HR_mask = 1
    netG.train()
    with gm:
        forward_hiddens = []
        backward_hiddens = []
        res = []
        # extract features from all frames
        image = image.reshape(B * T, 3, h, w)
        image = netG.rgb(image).reshape(B, T, -1, h, w)
        # T=0
        now_frame = image[:, 0, ...]
        hidden = now_frame
        forward_hiddens.append(now_frame)
        for i in range(1, T):
            now_frame = image[:, i, ...]
            hidden = netG.aggr(F.concat([hidden, now_frame], axis=1))
            forward_hiddens.append(hidden)
        # T=-1
        now_frame = image[:, T - 1, ...]
        hidden = now_frame
        backward_hiddens.append(now_frame)
        for i in range(T - 2, -1, -1):
            now_frame = image[:, i, ...]
            hidden = netG.aggr(F.concat([hidden, now_frame], axis=1))
            backward_hiddens.append(hidden)
        # do upsample for all frames
        for i in range(T):
            res.append(
                netG.upsample(
                    F.concat([forward_hiddens[i], backward_hiddens[T - i - 1]],
                             axis=1)))

        res = F.stack(res, axis=1)  # [B,T,3,H,W]
        res = res + biup
        loss = netloss(res, label, HR_mask)
        # add an edge loss
        # by detecting the edge map of the label
        gm.backward(loss)
        if dist.is_distributed():
            loss = dist.functional.all_reduce_sum(loss) / dist.get_world_size()
    return loss
Example #17
File: train.py Project: wjfwzzc/Models
def train_one_epoch(model, data_queue, opt, gm, epoch, args):
    def train_func(image, im_info, gt_boxes):
        with gm:
            loss_dict = model(image=image, im_info=im_info, gt_boxes=gt_boxes)
            gm.backward(loss_dict["total_loss"])
            loss_list = list(loss_dict.values())
        opt.step().clear_grad()
        return loss_list

    meter = AverageMeter(record_len=model.cfg.num_losses)
    time_meter = AverageMeter(record_len=2)
    log_interval = model.cfg.log_interval
    tot_step = model.cfg.nr_images_epoch // (args.batch_size * dist.get_world_size())
    for step in range(tot_step):
        adjust_learning_rate(opt, epoch, step, model.cfg, args)

        data_tik = time.time()
        mini_batch = next(data_queue)
        data_tok = time.time()

        tik = time.time()
        loss_list = train_func(
            image=mge.tensor(mini_batch["data"]),
            im_info=mge.tensor(mini_batch["im_info"]),
            gt_boxes=mge.tensor(mini_batch["gt_boxes"])
        )
        tok = time.time()

        time_meter.update([tok - tik, data_tok - data_tik])

        if dist.get_rank() == 0:
            info_str = "e%d, %d/%d, lr:%f, "
            loss_str = ", ".join(
                ["{}:%f".format(loss) for loss in model.cfg.losses_keys]
            )
            time_str = ", train_time:%.3fs, data_time:%.3fs"
            log_info_str = info_str + loss_str + time_str
            meter.update([loss.numpy() for loss in loss_list])
            if step % log_interval == 0:
                logger.info(
                    log_info_str,
                    epoch,
                    step,
                    tot_step,
                    opt.param_groups[0]["lr"],
                    *meter.average(),
                    *time_meter.average()
                )
                meter.reset()
                time_meter.reset()
Example #18
    def worker(rank):
        dist.init_process_group("localhost", port, world_size, rank, rank, backend)
        assert dist.is_distributed()
        assert dist.get_rank() == rank
        assert dist.get_world_size() == world_size
        assert dist.get_backend() == backend

        py_server_addr = dist.get_py_server_addr()
        assert py_server_addr[0] == "localhost"
        assert py_server_addr[1] == port

        mm_server_addr = dist.get_mm_server_addr()
        assert mm_server_addr[0] == "localhost"
        assert mm_server_addr[1] > 0

        assert isinstance(dist.get_client(), dist.Client)
Example #19
def evaluate(model, manager):
    rank = dist.get_rank()
    world_size = dist.get_world_size()

    # set model to evaluation mode
    model.eval()

    # compute metrics over the dataset
    if manager.dataloaders["val"] is not None:
        # loss status and val status initial
        manager.reset_loss_status()
        manager.reset_metric_status("val")
        for data_batch in manager.dataloaders["val"]:
            # compute the real batch size
            bs = data_batch["img1"].shape[0]
            # move to GPU if available
            data_batch = utils.tensor_mge(data_batch)

            data_batch["imgs"] = F.concat(
                [data_batch["img1"] / 255.0, data_batch["img2"] / 255.0], 1)
            # compute model output
            output_batch = model(data_batch)
            # compute all loss on this batch
            # loss = compute_losses(data_batch, output_batch, manager.params)
            metrics = {}
            metrics["EPE"] = compute_metrics(data_batch, output_batch)
            if world_size > 1:
                # loss['total'] = F.distributed.all_reduce_sum(loss['total']) / world_size
                metrics['EPE'] = F.distributed.all_reduce_sum(
                    metrics['EPE']) / world_size
            # manager.update_loss_status(loss, "val", bs)
            # compute all metrics on this batch

            manager.update_metric_status(metrics, "val", bs)
            # manager.print_metrics("val", title="Val", color="green")

        # update data to tensorboard
        if rank == 0:
            # manager.writer.add_scalar("Loss/val", manager.loss_status["total"].avg, manager.epoch)
            # manager.logger.info("Loss/valid epoch {}: {}".format(manager.epoch, manager.loss_status['total'].avg))

            for k, v in manager.val_status.items():
                manager.writer.add_scalar("Metric/val/{}".format(k), v.avg,
                                          manager.epoch)
                # manager.logger.info("Metric/valid epoch {}: {}".format(manager.epoch, v.avg))
            # For each epoch, print the metric
            manager.print_metrics("val", title="Val", color="green")
Example #20
def worker(rank, world_size, ngpus_per_node, args):
    if world_size > 1:
        # init process group
        dist.init_process_group(
            master_ip=args.dist_addr,
            port=args.dist_port,
            world_size=world_size,
            rank=rank,
            device=rank % ngpus_per_node,
            backend="nccl",
        )
        logging.info("init process group rank %d / %d", dist.get_rank(),
                     dist.get_world_size())

    # build dataset
    _, valid_dataloader = build_dataset(args)

    # build model
    model = resnet_model.__dict__[args.arch](pretrained=args.model is None)
    if args.model is not None:
        logging.info("load from checkpoint %s", args.model)
        checkpoint = megengine.load(args.model)
        if "state_dict" in checkpoint:
            state_dict = checkpoint["state_dict"]
        model.load_state_dict(state_dict)

    def valid_step(image, label):
        logits = model(image)
        loss = F.nn.cross_entropy(logits, label)
        acc1, acc5 = F.topk_accuracy(logits, label, topk=(1, 5))
        # calculate mean values
        if world_size > 1:
            loss = F.distributed.all_reduce_sum(loss) / world_size
            acc1 = F.distributed.all_reduce_sum(acc1) / world_size
            acc5 = F.distributed.all_reduce_sum(acc5) / world_size
        return loss, acc1, acc5

    model.eval()
    _, valid_acc1, valid_acc5 = valid(valid_step, valid_dataloader, args)
    logging.info(
        "Test Acc@1 %.3f, Acc@5 %.3f",
        valid_acc1,
        valid_acc5,
    )
Example #21
def train_one_epoch(model, data_queue, opt, gm, epoch):
    # @trace(symbolic=True)
    def train_func(data, label):
        with gm:
            pred = model(data)
            loss = cross_entropy(pred,
                                 label,
                                 ignore_label=model.cfg.ignore_label)
            gm.backward(loss)
        opt.step().clear_grad()
        return loss

    meter = AverageMeter(record_len=1)
    time_meter = AverageMeter(record_len=2)
    log_interval = model.cfg.log_interval
    tot_step = model.cfg.nr_images_epoch // (model.cfg.batch_size *
                                             dist.get_world_size())
    for step in range(tot_step):
        adjust_learning_rate(opt, epoch, step, tot_step, model.cfg)

        data_tik = time.time()
        inputs, labels = next(data_queue)
        labels = np.squeeze(labels, axis=1).astype(np.int32)
        data_tok = time.time()

        tik = time.time()
        loss = train_func(mge.tensor(inputs), mge.tensor(labels))
        tok = time.time()

        time_meter.update([tok - tik, data_tok - data_tik])

        if dist.get_rank() == 0:
            info_str = "e%d, %d/%d, lr:%f, "
            loss_str = ", ".join(["{}:%f".format(loss) for loss in ["loss"]])
            time_str = ", train_time:%.3fs, data_time:%.3fs"
            log_info_str = info_str + loss_str + time_str
            meter.update([loss.numpy()])
            if step % log_interval == 0:
                logger.info(log_info_str, epoch, step, tot_step,
                            opt.param_groups[1]["lr"], *meter.average(),
                            *time_meter.average())
                meter.reset()
                time_meter.reset()
Example #22
    def __init__(self, dataloader, **eval_kwargs):
        if not isinstance(dataloader, DataLoader):
            raise TypeError(
                'dataloader must be a mge DataLoader, but got {}'.format(
                    type(dataloader)))
        self.dataloader = dataloader
        self.eval_kwargs = eval_kwargs
        self.interval = self.eval_kwargs.pop('interval', 10000)
        self.save_image = self.eval_kwargs.pop('save_image', False)
        self.save_path = self.eval_kwargs.pop('save_path', None)
        self.log_path = self.eval_kwargs.pop('log_path', None)
        self.multi_process = self.eval_kwargs.pop('multi_process', False)
        self.ensemble = self.eval_kwargs.pop('ensemble', False)
        mkdir_or_exist(self.save_path)
        self.logger = get_logger(name="EvalIterHook",
                                 log_file=self.log_path)  # only for rank0

        if is_distributed():
            self.local_rank = dist.get_rank()
            self.nranks = dist.get_world_size()
        else:
            self.local_rank = 0
            self.nranks = 1
Example #23
File: train.py Project: zzh7982/Models
def worker(args):
    current_network = import_from_file(args.file)

    model = current_network.Net(current_network.Cfg())
    model.train()

    if dist.get_rank() == 0:
        logger.info(get_config_info(model.cfg))
        logger.info(repr(model))

    backbone_params = []
    head_params = []
    for name, param in model.named_parameters():
        if "backbone" in name:
            backbone_params.append(param)
        else:
            head_params.append(param)

    opt = SGD(
        [
            {
                "params": backbone_params,
                "lr": model.cfg.learning_rate * 0.1
            },
            {
                "params": head_params
            },
        ],
        lr=model.cfg.learning_rate,
        momentum=model.cfg.momentum,
        weight_decay=model.cfg.weight_decay * dist.get_world_size(),
    )

    gm = GradManager()
    if dist.get_world_size() > 1:
        gm.attach(model.parameters(),
                  callbacks=[dist.make_allreduce_cb("SUM", dist.WORLD)])
    else:
        gm.attach(model.parameters())

    cur_epoch = 0
    if args.resume is not None:
        pretrained = mge.load(args.resume)
        cur_epoch = pretrained["epoch"] + 1
        model.load_state_dict(pretrained["state_dict"])
        opt.load_state_dict(pretrained["opt"])
        if dist.get_rank() == 0:
            logger.info("load success: epoch %d", cur_epoch)

    if dist.get_world_size() > 1:
        dist.bcast_list_(model.parameters(), dist.WORLD)  # sync parameters

    if dist.get_rank() == 0:
        logger.info("Prepare dataset")
    train_loader = iter(
        build_dataloader(model.cfg.batch_size, args.dataset_dir, model.cfg))

    for epoch in range(cur_epoch, model.cfg.max_epoch):
        train_one_epoch(model, train_loader, opt, gm, epoch)
        if dist.get_rank() == 0:
            save_path = "log-of-{}/epoch_{}.pkl".format(
                os.path.basename(args.file).split(".")[0], epoch)
            mge.save(
                {
                    "epoch": epoch,
                    "state_dict": model.state_dict(),
                    "opt": opt.state_dict()
                }, save_path)
            logger.info("dump weights to %s", save_path)
Example #24
def build_gradmanager(module):
    world_size = dist.get_world_size()
    gm = GradManager().attach(
        module.parameters(),
        callbacks=dist.make_allreduce_cb("SUM") if world_size > 1 else None)
    return gm
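A hedged usage sketch for this helper; model, opt, and the data iterable are assumed to exist, and F stands for megengine.functional:

gm = build_gradmanager(model)
for image, label in data:
    with gm:
        loss = F.nn.cross_entropy(model(image), label)
        gm.backward(loss)  # gradients are all-reduced ("SUM") by the callback
    opt.step().clear_grad()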
Example #25
def all_reduce_mean(array: Tensor) -> Tensor:
    if dist.get_world_size() > 1:
        array = dist.functional.all_reduce_sum(array) / dist.get_world_size()
    return array
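Usage is a one-liner; because the helper is a no-op on a single process, logging code stays identical in both modes (a sketch with assumed loss/acc1 tensors):

loss = all_reduce_mean(loss)
acc1 = all_reduce_mean(acc1)
if dist.get_rank() == 0:
    print("mean loss %.4f, acc1 %.3f" % (loss.item(), acc1.item()))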
Example #26
File: train.py Project: zzh7982/Models
def worker(rank, world_size, ngpus_per_node, args):
    # pylint: disable=too-many-statements
    if rank == 0:
        os.makedirs(os.path.join(args.save, args.arch), exist_ok=True)
        megengine.logger.set_log_file(
            os.path.join(args.save, args.arch, "log.txt"))
    # init process group
    if world_size > 1:
        dist.init_process_group(
            master_ip=args.dist_addr,
            port=args.dist_port,
            world_size=world_size,
            rank=rank,
            device=rank % ngpus_per_node,
            backend="nccl",
        )
        logging.info("init process group rank %d / %d", dist.get_rank(),
                     dist.get_world_size())

    # build dataset
    train_dataloader, valid_dataloader = build_dataset(args)
    train_queue = iter(train_dataloader)  # infinite
    steps_per_epoch = 1280000 // (world_size * args.batch_size)

    # build model
    model = resnet_model.__dict__[args.arch]()

    # Sync parameters
    if world_size > 1:
        dist.bcast_list_(model.parameters(), dist.WORLD)

    # Autodiff gradient manager
    gm = autodiff.GradManager().attach(
        model.parameters(),
        callbacks=dist.make_allreduce_cb("SUM") if world_size > 1 else None,
    )

    # Optimizer
    opt = optim.SGD(
        model.parameters(),
        lr=args.lr,
        momentum=args.momentum,
        weight_decay=args.weight_decay * world_size,  # scale weight decay in "SUM" mode
    )

    # train and valid func
    def train_step(image, label):
        with gm:
            logits = model(image)
            loss = F.nn.cross_entropy(logits, label)
            acc1, acc5 = F.topk_accuracy(logits, label, topk=(1, 5))
            gm.backward(loss)
            opt.step().clear_grad()
        return loss, acc1, acc5

    def valid_step(image, label):
        logits = model(image)
        loss = F.nn.cross_entropy(logits, label)
        acc1, acc5 = F.topk_accuracy(logits, label, topk=(1, 5))
        # calculate mean values
        if world_size > 1:
            loss = F.distributed.all_reduce_sum(loss) / world_size
            acc1 = F.distributed.all_reduce_sum(acc1) / world_size
            acc5 = F.distributed.all_reduce_sum(acc5) / world_size
        return loss, acc1, acc5

    # multi-step learning rate scheduler with warmup
    def adjust_learning_rate(step):
        lr = args.lr * 0.1**bisect.bisect_right(
            [30 * steps_per_epoch, 60 * steps_per_epoch, 80 * steps_per_epoch],
            step)
        if step < 5 * steps_per_epoch:  # warmup
            lr = args.lr * (step / (5 * steps_per_epoch))
        for param_group in opt.param_groups:
            param_group["lr"] = lr
        return lr

    # start training
    objs = AverageMeter("Loss")
    top1 = AverageMeter("Acc@1")
    top5 = AverageMeter("Acc@5")
    clck = AverageMeter("Time")

    for step in range(0, args.epochs * steps_per_epoch):
        lr = adjust_learning_rate(step)

        t = time.time()

        image, label = next(train_queue)
        image = megengine.tensor(image, dtype="float32")
        label = megengine.tensor(label, dtype="int32")

        loss, acc1, acc5 = train_step(image, label)

        objs.update(loss.item())
        top1.update(100 * acc1.item())
        top5.update(100 * acc5.item())
        clck.update(time.time() - t)

        if step % args.print_freq == 0 and dist.get_rank() == 0:
            logging.info(
                "Epoch %d Step %d, LR %.4f, %s %s %s %s",
                step // steps_per_epoch,
                step,
                lr,
                objs,
                top1,
                top5,
                clck,
            )
            objs.reset()
            top1.reset()
            top5.reset()
            clck.reset()

        if (step + 1) % steps_per_epoch == 0:
            model.eval()
            _, valid_acc1, valid_acc5 = valid(valid_step, valid_dataloader,
                                              args)
            model.train()
            logging.info(
                "Epoch %d Test Acc@1 %.3f, Acc@5 %.3f",
                (step + 1) // steps_per_epoch,
                valid_acc1,
                valid_acc5,
            )
            megengine.save(
                {
                    "epoch": (step + 1) // steps_per_epoch,
                    "state_dict": model.state_dict(),
                },
                os.path.join(args.save, args.arch, "checkpoint.pkl"),
            )
Example #27
def worker(args):
    # pylint: disable=too-many-statements
    rank = dist.get_rank()
    world_size = dist.get_world_size()
    if rank == 0:
        os.makedirs(os.path.join(args.save, args.arch), exist_ok=True)
        megengine.logger.set_log_file(os.path.join(args.save, args.arch, "log.txt"))
    # init process group

    # build dataset
    train_dataloader, valid_dataloader = build_dataset(args)
    train_queue = iter(train_dataloader)  # infinite
    steps_per_epoch = args.steps_per_epoch

    # build model
    model = UNetD(3)
    # Sync parameters
    if world_size > 1:
        dist.bcast_list_(model.parameters(), dist.WORLD)

    # Autodiff gradient manager
    gm = autodiff.GradManager().attach(
        model.parameters(),
        callbacks=dist.make_allreduce_cb("SUM") if world_size > 1 else None,
    )

    # Optimizer
    opt = optim.Adam(
        model.parameters(),
        lr=args.lr,
        weight_decay=args.weight_decay * world_size,  # scale weight decay in "SUM" mode
    )

    # mixup
    def preprocess(image, label):
        if args.dnd:
            image, label = MixUp_AUG(image, label)
        return image, label

    # train and valid func
    def train_step(image, label):
        with gm:
            logits = model(image)
            logits = image - logits
            loss = F.nn.l1_loss(logits, label)
            gm.backward(loss)
            opt.step().clear_grad()
        return loss

    def valid_step(image, label):
        pred = model(image)
        pred = image - pred
        mae_iter = F.nn.l1_loss(pred, label)
        psnr_it = batch_PSNR(pred, label)
        #print(psnr_it.item())
        if world_size > 1:
            mae_iter = F.distributed.all_reduce_sum(mae_iter) / world_size
            psnr_it = F.distributed.all_reduce_sum(psnr_it) / world_size

        return mae_iter, psnr_it

    # multi-step learning rate scheduler with warmup
    def adjust_learning_rate(step):
        #lr = 1e-6 + 0.5 * (args.lr - 1e-6)*(1 + np.cos(step/(args.epochs*steps_per_epoch) * np.pi))
        lr = args.lr * (np.cos(step / (steps_per_epoch * args.epochs) * np.pi) + 1) / 2
        for param_group in opt.param_groups:
            param_group["lr"] = lr
        return lr

    # start training
    for step in range(0, int(args.epochs * steps_per_epoch)):
        #print(step)
        lr = adjust_learning_rate(step)

        t_step = time.time()

        image, label = next(train_queue)
        if step > steps_per_epoch:
            image, label = preprocess(image, label)
        image = megengine.tensor(image)
        label = megengine.tensor(label)
        t_data = time.time() - t_step
        loss = train_step(image, label)
        t_train = time.time() - t_step
        speed = 1. / t_train
        if step % args.print_freq == 0 and dist.get_rank() == 0:
            logging.info(
                "Epoch {} Step {}, Speed={:.2g} mb/s, dp_cost={:.2g}, "
                "Loss={:5.2e}, lr={:.2e}".format(
                    step // int(steps_per_epoch),
                    step,
                    speed,
                    t_data / t_train,
                    loss.item(),
                    lr,
                ))
        #print(steps_per_epoch)
        if (step + 1) % steps_per_epoch == 0:
            model.eval()
            loss, psnr_v = valid(valid_step, valid_dataloader)
            model.train()
            logging.info(
                "Epoch {} Test mae {:.3f}, psnr {:.3f}".format(
                    (step + 1) // steps_per_epoch,
                    loss.item(),
                    psnr_v.item(),
                ))
            if rank == 0:
                megengine.save(
                    {
                        "epoch": (step + 1) // steps_per_epoch,
                        "state_dict": model.state_dict(),
                    },
                    os.path.join(args.save, args.arch, "checkpoint.pkl"),
                )
Example #28
File: train.py Project: zzh7982/Models
def worker(master_ip, port, rank, world_size, args):
    if world_size > 1:
        # Initialize distributed process group
        logger.info("init distributed process group {} / {}".format(rank, world_size))
        dist.init_process_group(
            master_ip=master_ip,
            port=port,
            world_size=world_size,
            rank=rank,
            device=rank,
        )

    model_name = "{}_{}x{}".format(args.arch, cfg.input_shape[0], cfg.input_shape[1])
    save_dir = os.path.join(args.save, model_name)

    model = getattr(kpm, args.arch)()
    model.train()
    start_epoch = 0
    if args.resume is not None:
        file = mge.load(args.resume)
        model.load_state_dict(file["state_dict"])
        start_epoch = file["epoch"]

    optimizer = optim.Adam(
        model.parameters(), lr=cfg.initial_lr, weight_decay=cfg.weight_decay
    )

    gm = GradManager()
    if dist.get_world_size() > 1:
        gm.attach(
            model.parameters(), callbacks=[dist.make_allreduce_cb("SUM", dist.WORLD)],
        )
    else:
        gm.attach(model.parameters())

    if dist.get_world_size() > 1:
        dist.bcast_list_(model.parameters(), dist.WORLD)  # sync parameters

    # Build train datasets
    logger.info("preparing dataset..")
    ann_file = os.path.join(
        cfg.data_root, "annotations", "person_keypoints_train2017.json"
    )
    train_dataset = COCOJoints(
        cfg.data_root,
        ann_file,
        image_set="train2017",
        order=("image", "keypoints", "boxes", "info"),
    )
    logger.info("Num of Samples: {}".format(len(train_dataset)))
    train_sampler = data.RandomSampler(
        train_dataset, batch_size=cfg.batch_size, drop_last=True
    )

    transforms = [
        T.Normalize(mean=cfg.img_mean, std=cfg.img_std),
        RandomHorizontalFlip(0.5, keypoint_flip_order=cfg.keypoint_flip_order)
    ]

    if cfg.half_body_transform:
        transforms.append(
            HalfBodyTransform(
                cfg.upper_body_ids, cfg.lower_body_ids, cfg.prob_half_body
            )
        )
    if cfg.extend_boxes:
        transforms.append(
            ExtendBoxes(cfg.x_ext, cfg.y_ext, cfg.input_shape[1] / cfg.input_shape[0])
        )

    transforms += [
        RandomBoxAffine(
            degrees=cfg.rotate_range,
            scale=cfg.scale_range,
            output_shape=cfg.input_shape,
            rotate_prob=cfg.rotation_prob,
            scale_prob=cfg.scale_prob,
        )
    ]
    transforms += [T.ToMode()]

    train_queue = data.DataLoader(
        train_dataset,
        sampler=train_sampler,
        num_workers=args.workers,
        transform=T.Compose(transforms=transforms, order=train_dataset.order,),
        collator=HeatmapCollator(
            cfg.input_shape,
            cfg.output_shape,
            cfg.keypoint_num,
            cfg.heat_thr,
            cfg.heat_kernels if args.multi_scale_supervision else cfg.heat_kernels[-1:],
            cfg.heat_range,
        ),
    )

    # Start training
    for epoch in range(start_epoch, cfg.epochs):
        loss = train(model, train_queue, optimizer, gm, epoch=epoch)
        logger.info("Epoch %d Train %.6f ", epoch, loss)

        if rank == 0 and epoch % cfg.save_freq == 0:  # save checkpoint
            mge.save(
                {"epoch": epoch + 1, "state_dict": model.state_dict()},
                os.path.join(save_dir, "epoch_{}.pkl".format(epoch)),
            )
Example #29
def worker(master_ip, port, world_size, rank, configs):
    if world_size > 1:
        dist.init_process_group(
            master_ip=master_ip,
            port=port,
            world_size=world_size,
            rank=rank,
            device=rank,
        )
        logger.info("init process group for gpu{} done".format(rank))

    # set up logger
    os.makedirs(configs["base_dir"], exist_ok=True)
    worklog_path = os.path.join(configs["base_dir"], "worklog.txt")
    mge.set_log_file(worklog_path)

    # prepare model-related components
    model = FaceRecognitionModel(configs)

    # prepare data-related components
    preprocess = T.Compose([T.Normalize(mean=127.5, std=128), T.ToMode("CHW")])
    augment = T.Compose([T.RandomHorizontalFlip()])

    train_dataset = get_train_dataset(configs["dataset"],
                                      dataset_dir=configs["dataset_dir"])
    train_sampler = data.RandomSampler(train_dataset,
                                       batch_size=configs["batch_size"],
                                       drop_last=True)
    train_queue = data.DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  transform=T.Compose([augment, preprocess]))

    # prepare optimize-related components
    configs["learning_rate"] = configs["learning_rate"] * dist.get_world_size()
    if dist.get_world_size() > 1:
        dist.bcast_list_(model.parameters())
        gm = ad.GradManager().attach(
            model.parameters(), callbacks=[dist.make_allreduce_cb("mean")])
    else:
        gm = ad.GradManager().attach(model.parameters())
    opt = optim.SGD(
        model.parameters(),
        lr=configs["learning_rate"],
        momentum=configs["momentum"],
        weight_decay=configs["weight_decay"],
    )

    # try to load checkpoint
    model, start_epoch = try_load_latest_checkpoint(model, configs["base_dir"])

    # do training
    def train_one_epoch():
        def train_func(images, labels):
            opt.clear_grad()
            with gm:
                loss, accuracy, _ = model(images, labels)
                gm.backward(loss)
                if dist.is_distributed():
                    # all_reduce_mean
                    loss = dist.functional.all_reduce_sum(
                        loss) / dist.get_world_size()
                    accuracy = dist.functional.all_reduce_sum(
                        accuracy) / dist.get_world_size()
            opt.step()
            return loss, accuracy

        model.train()

        average_loss = AverageMeter("loss")
        average_accuracy = AverageMeter("accuracy")
        data_time = AverageMeter("data_time")
        train_time = AverageMeter("train_time")

        total_step = len(train_queue)
        data_iter = iter(train_queue)
        for step in range(total_step):
            # get next batch of data
            data_tic = time.time()
            images, labels = next(data_iter)
            data_toc = time.time()

            # forward pass & backward pass
            train_tic = time.time()
            images = mge.tensor(images, dtype="float32")
            labels = mge.tensor(labels, dtype="int32")
            loss, accuracy = train_func(images, labels)
            train_toc = time.time()

            # do the statistics and logging
            n = images.shape[0]
            average_loss.update(loss.item(), n)
            average_accuracy.update(accuracy.item() * 100, n)
            data_time.update(data_toc - data_tic)
            train_time.update(train_toc - train_tic)
            if step % configs["log_interval"] == 0 and dist.get_rank() == 0:
                logger.info(
                    "epoch: %d, step: %d, %s, %s, %s, %s",
                    epoch,
                    step,
                    average_loss,
                    average_accuracy,
                    data_time,
                    train_time,
                )

    for epoch in range(start_epoch, configs["num_epoch"]):
        adjust_learning_rate(opt, epoch, configs)
        train_one_epoch()

        if dist.get_rank() == 0:
            checkpoint_path = os.path.join(configs["base_dir"],
                                           f"epoch-{epoch+1}-checkpoint.pkl")
            mge.save(
                {
                    "epoch": epoch + 1,
                    "state_dict": model.state_dict()
                },
                checkpoint_path,
            )
Example #30
    def __init__(
        self,
        dataset,
        batch_size=1,
        drop_last=False,
        num_samples=None,
        world_size=None,
        rank=None,
        seed=None,
    ):
        r"""
        An abstract class for all sampler.

        :type dataset: `dataset`
        :param dataset: dataset to sample from.
        :type batch_size: positive integer
        :param batch_size: batch size for batch method.
        :type drop_last: bool
        :param drop_last: set ``True`` to drop the last incomplete batch
            when the dataset size is not divisible by the batch size; if ``False``,
            the last batch will be smaller. Default: False
        :type num_samples: positive integer
        :param num_samples: number of samples assigned to one rank.
        :type world_size: positive integer
        :param world_size: number of ranks.
        :type rank: non-negative integer
        :param rank: rank id, a non-negative integer in ``[0, world_size)``.
        :type seed: non-negative integer
        :param seed: seed for random operators.
        """
        if (not isinstance(batch_size, int) or isinstance(batch_size, bool)
                or batch_size <= 0):
            raise ValueError("batch_size should be a positive integer value, "
                             "but got batch_size={}".format(batch_size))
        if not isinstance(drop_last, bool):
            raise ValueError("drop_last should be a boolean value, but got "
                             "drop_last={}".format(drop_last))
        if num_samples is not None and (not isinstance(num_samples, int)
                                        or isinstance(num_samples, bool)
                                        or num_samples <= 0):
            raise ValueError(
                "num_samples should be a positive integer "
                "value, but got num_samples={}".format(num_samples))

        self.batch_size = batch_size
        self.dataset = dataset
        self.drop_last = drop_last

        if world_size is None:
            world_size = dist.get_world_size() if dist.is_distributed() else 1
        self.world_size = world_size
        if rank is None:
            rank = dist.get_rank() if dist.is_distributed() else 0
        self.rank = rank

        if num_samples is None:
            num_samples = len(self.dataset)
        self.num_samples = int(math.ceil(num_samples / self.world_size))

        # Make sure seeds are the same at each rank
        if seed is None and self.world_size > 1:
            seed = 0
        self.rng = np.random.RandomState(seed)
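To make the per-rank bookkeeping concrete, a hedged sketch of a subclass that slices a shared shuffle by rank. The Sampler base-class name and the sample_indices method are assumptions, not part of the snippet above:

class StridedDistributedSampler(Sampler):
    def sample_indices(self):
        # every rank draws the identical permutation, since the seed is synced
        indices = self.rng.permutation(len(self.dataset)).tolist()
        total = self.num_samples * self.world_size
        indices += indices[: total - len(indices)]  # pad to a multiple of world_size
        return indices[self.rank : total : self.world_size]  # num_samples per rank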