Example #1
    def train(self, **kwargs):
        config = self.config
        train_dataset = Dataset.create(
            config.train.dataset,
            split="train",
            data_root=config.system.data_root,
            transforms=self._transforms(is_train=True,
                                        crop_size=config.train.crop_size),
        )
        train_loader = create_loader(
            train_dataset,
            batch_size=config.train.batch_size,
            num_workers=config.system.workers,
            dryrun=config.system.dryrun,
        )

        val_dataset = Dataset.create(
            config.val.dataset,
            split="val",
            data_root=config.system.data_root,
            transforms=self._transforms(is_train=False),
        )
        val_loader = create_loader(
            val_dataset,
            batch_size=config.val.batch_size,
            num_workers=config.system.workers,
            dryrun=config.system.dryrun,
        )

        logger.info("Start training estimator: %s", type(self).__name__)
        self.model.to(self.device)
        n_epochs = config.train.epochs
        val_interval = config.system.val_interval
        for epoch in range(1, n_epochs + 1):
            logger.info(f"Training Epoch[{epoch}/{n_epochs}]")
            self._train_one_epoch(train_loader, epoch)

            if epoch % val_interval == 0:
                self._evaluate_one_epoch(val_loader, epoch)

            self.checkpointer.save(self, epoch=epoch)

        self.writer.close()
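
    # A minimal sketch of the `_transforms` helper called above. The body is
    # an assumption for illustration (the estimator's real transform pipeline
    # is not shown in this example); it uses torchvision-style transforms.
    def _transforms(self, is_train=True, crop_size=None):
        from torchvision import transforms

        ops = []
        if is_train and crop_size is not None:
            # Random crops only during training; keep full frames for eval.
            ops.append(transforms.RandomCrop(crop_size))
        ops.append(transforms.ToTensor())
        return transforms.Compose(ops)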
Example #2
def create_dataset(config, split):
    """download dataset from source.

    Args:
        config: (CfgNode): estimator config:
        split: train, val, test

    Returns dataset: dataset obj must have len and __get_item__

    """
    dataset = Dataset.create(
        config[split].dataset.name,
        data_root=config.system.data_root,
        transforms=FasterRCNN.get_transform(),
        **config[split].dataset.args,
    )
    return dataset
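
# Hypothetical usage of `create_dataset`, assuming a YACS-style CfgNode in
# which config.train.dataset carries a `name` and an `args` mapping, as the
# lookups above imply:
train_dataset = create_dataset(config, "train")
print(len(train_dataset))  # the returned dataset implements __len__
sample = train_dataset[0]  # ... and __getitem__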
Example #3
    def evaluate(self, **kwargs):
        config = self.config
        test_dataset = Dataset.create(
            config.test.dataset,
            split="test",
            data_root=config.system.data_root,
            transforms=self._NYU_transforms(is_train=False),
        )
        test_loader = create_loader(
            test_dataset,
            batch_size=config.test.batch_size,
            num_workers=config.system.workers,
            dryrun=config.system.dryrun,
        )

        logger.info("Start evaluating estimator: %s", type(self).__name__)
        self.model.to(self.device)
        self._evaluate_one_epoch(test_loader, 1, 1)
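
    # A minimal sketch of `_evaluate_one_epoch` with the (loader, epoch,
    # n_epochs) signature used above, assuming `torch` and `logger` are in
    # scope as elsewhere in these examples. The body is an assumption: the
    # real metrics and logging are estimator-specific and not shown here.
    def _evaluate_one_epoch(self, loader, epoch, n_epochs):
        self.model.eval()
        total_loss, n_batches = 0.0, 0
        with torch.no_grad():
            for image, target in loader:
                image = image.to(self.device)
                target = target.to(self.device)
                output = self.model(image)
                total_loss += self._loss_fn(output, target).item()
                n_batches += 1
        logger.info("Epoch[%d/%d] validation loss: %.4f",
                    epoch, n_epochs, total_loss / max(n_batches, 1))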
Example #4
def create_dataset(config, data_path, split):
    """download dataset from source.

    Args:
        config: (CfgNode): estimator config:
        data_path: Directory on localhost where datasets are located.
        split: train, val, test

    Returns dataset: dataset obj must have len and __get_item__

    """
    dataset = Dataset.create(
        config[split].dataset.name,
        data_path=data_path,
        transforms=FasterRCNN.get_transform(),
        **config[split].dataset.args,
    )
    return dataset
Example #5
def download(name, data_root, version):
    # TODO this method should be refactored once we have a clean download
    # CLI interface.
    dataset = Dataset.find(name)
    dataset.download(data_root, version)
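
# Hypothetical invocation of `download`; the dataset name, root and version
# below are placeholders for illustration, not values from the source:
download(name="synthetic-dataset", data_root="/data/datasets", version="v1")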
Example #6
    def train(self, **kwargs):
        # Training parameters
        config = self.config
        optimizer = self.optimizer
        val_interval = config.system.val_interval
        writer = self.writer

        # Load data
        train_dataset = Dataset.create(
            config.train.dataset,
            split="train",
            data_root=config.system.data_root,
            transforms=self._NYU_transforms(is_train=True),
        )

        train_loader = create_loader(
            train_dataset,
            batch_size=config.train.batch_size,
            num_workers=config.system.workers,
            dryrun=config.system.dryrun,
        )

        val_dataset = Dataset.create(
            config.val.dataset,
            split="test",
            data_root=config.system.data_root,
            transforms=self._NYU_transforms(is_train=False),
        )

        val_loader = create_loader(
            val_dataset,
            batch_size=config.val.batch_size,
            num_workers=config.system.workers,
            dryrun=config.system.dryrun,
        )

        # Logging
        logger.info("Start training estimator: %s", type(self).__name__)

        self.model.to(self.device)
        n_epochs = config.train.epochs

        # Start training
        for epoch in range(1, n_epochs + 1):
            logger.info(f"Epoch[{epoch}/{n_epochs}] training started.")
            loss_metric = Loss(self._loss_fn)
            self.model.train()
            N = len(train_loader)
            accumulation_steps = self.config.train.accumulation_steps
            optimizer.zero_grad()
            for i, (image, depth) in enumerate(train_loader):
                # Prepare sample and depth
                image = image.to(self.device)
                depth_n = depth.to(self.device)

                # Predict
                output = self.model(image)

                # Compute loss
                loss = self._loss_fn(output, depth_n)

                # Backward
                loss.backward()

                if (i + 1) % accumulation_steps == 0:
                    self.optimizer.step()
                    self.optimizer.zero_grad()

                loss_metric.update((output, depth_n))

                # Log progress
                logger.debug(f"[{i}/{N}] Loss: {loss:.4f}")

            epoch_loss = loss_metric.compute()
            if epoch % val_interval == 0:
                self._evaluate_one_epoch(val_loader, epoch, n_epochs)

            # Record epoch's intermediate results
            writer.add_scalar("Training/Loss", epoch_loss, epoch)
            self.checkpointer.save(self, epoch=epoch)
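
# Standalone sketch of the gradient-accumulation pattern used in the loop
# above: gradients from `accumulation_steps` mini-batches are summed before a
# single optimizer step, giving an effective batch size of
# batch_size * accumulation_steps. All names below are illustrative. Note the
# common convention of dividing the loss by `accumulation_steps` so the summed
# gradient matches one large-batch step; the snippet above skips this, which
# effectively scales gradients up by that factor.
import torch

model = torch.nn.Linear(8, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
loss_fn = torch.nn.MSELoss()
accumulation_steps = 4

optimizer.zero_grad()
for i in range(16):
    x, y = torch.randn(2, 8), torch.randn(2, 1)
    loss = loss_fn(model(x), y) / accumulation_steps
    loss.backward()                      # gradients accumulate in param.grad
    if (i + 1) % accumulation_steps == 0:
        optimizer.step()                 # one step per 4 mini-batches
        optimizer.zero_grad()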
Example #7
def run(command, cfg, args):
    if cfg.system.verbose:
        root_logger = logging.getLogger()
        root_logger.setLevel(logging.DEBUG)

    logger.info("Run command: %s with config: %s\n", command, cfg)

    if torch.cuda.is_available() and not cfg.system.no_cuda:
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")
    logdir = cfg.system.logdir
    if logdir == const.NULL_STRING:
        # Use logdir=None to force using SummaryWriter default logdir,
        # which points to ./runs/<model>_<timestamp>
        logdir = None

    # TODO: this discards the tensorboard writers of non-master processes,
    # which could make debugging harder.
    writer = SummaryWriter(logdir, write_to_disk=is_master())
    kfp_writer = KubeflowPipelineWriter(filename=cfg.system.metricsfilename,
                                        filepath=cfg.system.metricsdir)
    checkpointer = EstimatorCheckpoint(
        estimator_name=cfg.estimator,
        log_dir=writer.logdir,
        distributed=cfg.system.distributed,
    )
    estimator = Estimator.create(
        cfg.estimator,
        config=cfg,
        writer=writer,
        kfp_writer=kfp_writer,
        device=device,
        checkpointer=checkpointer,
        gpu=args.gpu,
        rank=args.rank,
    )

    if command == "train":
        estimator.train()
    elif command == "evaluate":
        estimator.evaluate()
    elif command == "download-train":
        # TODO (YC)
        # We should remove references to auth-token in various places to
        # enable downloading the synthetic dataset. Usim is working on a
        # solution that will let customers specify a cloud storage path to
        # store simulations. In the future, we should simply rely on GCS
        # service accounts to access simulation data for a given run
        # execution id.
        Dataset.create(
            cfg.train.dataset.name,
            data_root=cfg.system.data_root,
            auth_token=cfg.system.auth_token,  # XXX(YC) This should be removed
            **cfg.train.dataset.args,
        )
        Dataset.create(
            cfg.val.dataset.name,
            data_root=cfg.system.data_root,
            auth_token=cfg.system.auth_token,  # XXX(YC) This should be removed
            **cfg.val.dataset.args,
        )
    elif command == "download-evaluate":
        Dataset.create(
            cfg.test.dataset.name,
            data_root=cfg.system.data_root,
            auth_token=cfg.system.auth_token,  # XXX(YC) This should be removed
            **cfg.test.dataset.args,
        )

    writer.close()
    kfp_writer.write_metric()
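
# Hypothetical entrypoint wiring `run` to a command line. The flag names and
# the `load_config` helper are assumptions for illustration; the project's
# real CLI is not shown in these examples.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("command",
                        choices=["train", "evaluate",
                                 "download-train", "download-evaluate"])
    parser.add_argument("--config", required=True,
                        help="path to a YAML config file")
    parser.add_argument("--gpu", type=int, default=0)
    parser.add_argument("--rank", type=int, default=0)
    args = parser.parse_args()

    cfg = load_config(args.config)  # hypothetical config loader
    run(args.command, cfg, args)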