def train(self, **kwargs):
    config = self.config
    train_dataset = Dataset.create(
        config.train.dataset,
        split="train",
        data_root=config.system.data_root,
        transforms=self._transforms(
            is_train=True, crop_size=config.train.crop_size
        ),
    )
    train_loader = create_loader(
        train_dataset,
        batch_size=config.train.batch_size,
        num_workers=config.system.workers,
        dryrun=config.system.dryrun,
    )
    val_dataset = Dataset.create(
        config.val.dataset,
        split="val",
        data_root=config.system.data_root,
        transforms=self._transforms(is_train=False),
    )
    val_loader = create_loader(
        val_dataset,
        batch_size=config.val.batch_size,
        num_workers=config.system.workers,
        dryrun=config.system.dryrun,
    )

    logger.info("Start training estimator: %s", type(self).__name__)
    self.model.to(self.device)
    n_epochs = config.train.epochs
    val_interval = config.system.val_interval
    for epoch in range(1, n_epochs + 1):
        logger.info(f"Training Epoch[{epoch}/{n_epochs}]")
        self._train_one_epoch(train_loader, epoch)
        if epoch % val_interval == 0:
            self._evaluate_one_epoch(val_loader, epoch)
        self.checkpointer.save(self, epoch=epoch)
    self.writer.close()
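# A minimal sketch of the config layout that train() reads above, assuming a
# yacs CfgNode (the create_dataset docstring below mentions CfgNode). The
# dataset name and the concrete values are illustrative placeholders, not the
# repository's defaults.
from yacs.config import CfgNode as CN

cfg = CN()
cfg.system = CN()
cfg.system.data_root = "/data"   # root folder for datasets
cfg.system.workers = 4           # DataLoader worker processes
cfg.system.dryrun = False        # forwarded to create_loader
cfg.system.val_interval = 1      # run validation every N epochs

cfg.train = CN()
cfg.train.dataset = "ExampleDataset"  # hypothetical registered dataset name
cfg.train.batch_size = 8
cfg.train.crop_size = 480
cfg.train.epochs = 10

cfg.val = CN()
cfg.val.dataset = "ExampleDataset"
cfg.val.batch_size = 8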
def create_dataset(config, split):
    """Create the dataset for the given split, downloading it from source.

    Args:
        config (CfgNode): estimator config
        split (str): one of "train", "val" or "test"

    Returns:
        dataset: dataset object implementing ``__len__`` and ``__getitem__``
    """
    dataset = Dataset.create(
        config[split].dataset.name,
        data_root=config.system.data_root,
        transforms=FasterRCNN.get_transform(),
        **config[split].dataset.args,
    )
    return dataset
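# Hedged sketch of the contract stated in the docstring above: any object with
# __len__ and __getitem__ can stand in for the created dataset and be consumed
# by a torch DataLoader. ToyDataset and its (image, target) layout are
# hypothetical, not part of the repository.
import torch
from torch.utils.data import DataLoader, Dataset as TorchDataset


class ToyDataset(TorchDataset):
    def __len__(self):
        return 8

    def __getitem__(self, idx):
        # return an (image, target) pair in the style of a detection dataset
        return torch.zeros(3, 32, 32), {"boxes": torch.zeros((0, 4))}


loader = DataLoader(
    ToyDataset(), batch_size=2, collate_fn=lambda batch: tuple(zip(*batch))
)
for images, targets in loader:
    pass  # iterate exactly as a training loop would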
def evaluate(self, **kwargs):
    config = self.config
    test_dataset = Dataset.create(
        config.test.dataset,
        split="test",
        data_root=config.system.data_root,
        transforms=self._NYU_transforms(is_train=False),
    )
    test_loader = create_loader(
        test_dataset,
        batch_size=config.test.batch_size,
        num_workers=config.system.workers,
        dryrun=config.system.dryrun,
    )

    logger.info("Start evaluating estimator: %s", type(self).__name__)
    self.model.to(self.device)
    self._evaluate_one_epoch(test_loader, 1, 1)
def create_dataset(config, data_path, split):
    """Create the dataset for the given split, downloading it from source.

    Args:
        config (CfgNode): estimator config
        data_path (str): directory on localhost where datasets are located
        split (str): one of "train", "val" or "test"

    Returns:
        dataset: dataset object implementing ``__len__`` and ``__getitem__``
    """
    dataset = Dataset.create(
        config[split].dataset.name,
        data_path=data_path,
        transforms=FasterRCNN.get_transform(),
        **config[split].dataset.args,
    )
    return dataset
def download(name, data_root, version):
    # TODO: this method should be refactored once we have a clean download
    # CLI interface.
    dataset = Dataset.find(name)
    dataset.download(data_root, version)
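# Hypothetical invocation of download(); the dataset name and version string
# are placeholders, not names guaranteed to exist in the Dataset registry.
# download("ExampleDataset", data_root="/data", version="latest")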
def train(self, **kwargs):
    # Training parameters
    config = self.config
    optimizer = self.optimizer
    val_interval = config.system.val_interval
    writer = self.writer

    # Load data
    train_dataset = Dataset.create(
        config.train.dataset,
        split="train",
        data_root=config.system.data_root,
        transforms=self._NYU_transforms(is_train=True),
    )
    train_loader = create_loader(
        train_dataset,
        batch_size=config.train.batch_size,
        num_workers=config.system.workers,
        dryrun=config.system.dryrun,
    )
    val_dataset = Dataset.create(
        config.val.dataset,
        split="test",
        data_root=config.system.data_root,
        transforms=self._NYU_transforms(is_train=False),
    )
    val_loader = create_loader(
        val_dataset,
        batch_size=config.val.batch_size,
        num_workers=config.system.workers,
        dryrun=config.system.dryrun,
    )

    # Logging
    logger.info("Start training estimator: %s", type(self).__name__)
    self.model.to(self.device)
    n_epochs = config.train.epochs

    # Start training
    for epoch in range(1, n_epochs + 1):
        logger.info(f"Epoch[{epoch}/{n_epochs}] training started.")
        loss_metric = Loss(self._loss_fn)
        self.model.train()
        N = len(train_loader)
        accumulation_steps = self.config.train.accumulation_steps
        optimizer.zero_grad()
        for i, (image, depth) in enumerate(train_loader):
            # Prepare sample and depth
            image = image.to(self.device)
            depth_n = depth.to(self.device)

            # Predict
            output = self.model(image)

            # Compute loss
            loss = self._loss_fn(output, depth_n)

            # Backward
            loss.backward()
            if (i + 1) % accumulation_steps == 0:
                self.optimizer.step()
                self.optimizer.zero_grad()

            loss_metric.update((output, depth_n))

            # Log progress
            logger.debug(f"[{i}/{N}] Loss: {loss:.4f}")

        epoch_loss = loss_metric.compute()

        if epoch % val_interval == 0:
            self._evaluate_one_epoch(val_loader, epoch, n_epochs)

        # Record epoch's intermediate results
        writer.add_scalar("Training/Loss", epoch_loss, epoch)
        self.checkpointer.save(self, epoch=epoch)
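# Standalone sketch of the gradient-accumulation pattern used in the training
# loop above, on a toy model with random data so it runs anywhere. The model,
# loss, and accumulation_steps value are illustrative; note the loss scaling
# by accumulation_steps (which the loop above omits) makes the accumulated
# gradient match a single large-batch step.
import torch
from torch import nn

model = nn.Linear(16, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
loss_fn = nn.MSELoss()
accumulation_steps = 4

optimizer.zero_grad()
for i in range(16):
    x, y = torch.randn(8, 16), torch.randn(8, 1)
    loss = loss_fn(model(x), y)
    # accumulate gradients; only step every accumulation_steps batches
    (loss / accumulation_steps).backward()
    if (i + 1) % accumulation_steps == 0:
        optimizer.step()
        optimizer.zero_grad()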
def run(command, cfg):
    if cfg.system.verbose:
        root_logger = logging.getLogger()
        root_logger.setLevel(logging.DEBUG)

    logger.info("Run command: %s with config: %s\n", command, cfg)

    if torch.cuda.is_available() and not cfg.system.no_cuda:
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")

    logdir = cfg.system.logdir
    if logdir == const.NULL_STRING:
        # Use logdir=None to force using SummaryWriter default logdir,
        # which points to ./runs/<model>_<timestamp>
        logdir = None

    # TODO: this makes it so that we lose the tensorboard writer of non-master
    # processes, which could make debugging harder.
    writer = SummaryWriter(logdir, write_to_disk=is_master())
    kfp_writer = KubeflowPipelineWriter(
        filename=cfg.system.metricsfilename, filepath=cfg.system.metricsdir
    )
    checkpointer = EstimatorCheckpoint(
        estimator_name=cfg.estimator,
        log_dir=writer.logdir,
        distributed=cfg.system.distributed,
    )
    estimator = Estimator.create(
        cfg.estimator,
        config=cfg,
        writer=writer,
        kfp_writer=kfp_writer,
        device=device,
        checkpointer=checkpointer,
        gpu=args.gpu,
        rank=args.rank,
    )

    if command == "train":
        estimator.train()
    elif command == "evaluate":
        estimator.evaluate()
    elif command == "download-train":
        # TODO (YC)
        # We should remove references to auth-token in various places to
        # enable downloading the synthetic dataset. Usim is working on a
        # solution that will let customers specify a cloud storage path to
        # store simulations. In the future, we should simply rely on GCS
        # service accounts to access simulation data for a given run
        # execution id.
        Dataset.create(
            cfg.train.dataset.name,
            data_root=cfg.system.data_root,
            auth_token=cfg.system.auth_token,  # XXX(YC) This should be removed
            **cfg.train.dataset.args,
        )
        Dataset.create(
            cfg.val.dataset.name,
            data_root=cfg.system.data_root,
            auth_token=cfg.system.auth_token,  # XXX(YC) This should be removed
            **cfg.val.dataset.args,
        )
    elif command == "download-evaluate":
        Dataset.create(
            cfg.test.dataset.name,
            data_root=cfg.system.data_root,
            auth_token=cfg.system.auth_token,  # XXX(YC) This should be removed
            **cfg.test.dataset.args,
        )

    writer.close()
    kfp_writer.write_metric()
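# Hedged sketch of how run() might be dispatched from the command line. The
# argument names and the yacs-based config loading are assumptions for
# illustration, not the repository's actual CLI entry point.
import argparse

from yacs.config import CfgNode


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "command",
        choices=["train", "evaluate", "download-train", "download-evaluate"],
    )
    parser.add_argument("--config", required=True, help="path to a YAML config")
    cli_args = parser.parse_args()

    with open(cli_args.config) as f:
        cfg = CfgNode.load_cfg(f)
    run(cli_args.command, cfg)


if __name__ == "__main__":
    main()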