Example 1
    def __call__(self, optimizer=None, epochs=None, steps=None):
        """Call lr scheduler class."""
        params = obj2config(self.config).get("params", {})
        logging.debug("Call LrScheduler. name={}, params={}".format(
            self._cls.__name__, params))

        if self._cls.__name__ == "CosineAnnealingLR":
            if params.get("T_max", -1) == -1:
                if params.get("by_epoch", True):
                    params["T_max"] = epochs
                else:
                    params["T_max"] = epochs * steps

        if self._cls.__name__ == "WarmupScheduler":
            params["epochs"] = epochs
            params["steps"] = steps

        try:
            if params and optimizer:
                return self._cls(optimizer, **params)
            elif optimizer:
                return self._cls(optimizer)
            else:
                return self._cls(**params)
        except Exception as ex:
            logging.error(
                "Failed to call LrScheduler name={}, params={}".format(
                    self._cls.__name__, params))
            raise ex
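For context, a minimal standalone sketch of the same T_max-derivation pattern using plain PyTorch; the build_scheduler helper and the example config below are illustrative assumptions, not part of the original API:

import torch
from torch.optim.lr_scheduler import CosineAnnealingLR

def build_scheduler(optimizer, params=None, epochs=None, steps=None):
    # Illustrative helper (not the original API): fill in T_max from the
    # training length when the config does not set it explicitly.
    params = dict(params or {})
    if params.get("T_max", -1) == -1:
        by_epoch = params.pop("by_epoch", True)
        params["T_max"] = epochs if by_epoch else epochs * steps
    else:
        params.pop("by_epoch", None)
    return CosineAnnealingLR(optimizer, **params)

model = torch.nn.Linear(4, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
scheduler = build_scheduler(optimizer, params={"by_epoch": False}, epochs=10, steps=100)
print(scheduler.T_max)  # 1000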
Example 2
    def _do_horovod_fully_train(self):
        pwd_dir = os.path.dirname(os.path.abspath(__file__))
        cf_file = os.path.join(pwd_dir, 'cf.pickle')
        cf_content = {'configs': ClassFactory.__configs__,
                      'registry': ClassFactory.__registry__,
                      'data': UserConfig().__data__,
                      'network_registry': NetworkFactory.__network_registry__,
                      'general': obj2config(General)}
        with open(cf_file, 'wb') as f:
            pickle.dump(cf_content, f)
        cf_file_remote = os.path.join(self.task.local_base_path, 'cf.pickle')
        FileOps.copy_file(cf_file, cf_file_remote)
        if os.environ.get('DLS_TASK_NUMBER') is None:
            # local cluster
            worker_ips = '127.0.0.1'
            if General.cluster.master_ip is not None and General.cluster.master_ip != '127.0.0.1':
                worker_ips = General.cluster.master_ip
                for ip in General.cluster.slaves:
                    worker_ips = worker_ips + ',' + ip
            cmd = ['bash', '{}/horovod/run_cluster_horovod_train.sh'.format(pwd_dir),
                   str(self.world_device_size), cf_file_remote, worker_ips]
        else:
            # Roma
            cmd = ['bash', '{}/horovod/run_horovod_train.sh'.format(pwd_dir),
                   str(self.world_device_size), cf_file_remote]
        proc = subprocess.Popen(cmd, env=os.environ)
        proc.wait()
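The handoff above works by pickling the factory registries and letting the launch script hand the file to every Horovod worker. A minimal standalone sketch of that round trip, with placeholder dicts instead of the real registries (only the cf.pickle file name is taken from the example):

import os
import pickle
import tempfile

# Placeholder content standing in for ClassFactory/UserConfig/General state.
cf_content = {'configs': {'trainer': {'epochs': 1}}, 'general': {'backend': 'pytorch'}}

# Launcher side: dump the shared state where the workers can read it.
cf_file = os.path.join(tempfile.mkdtemp(), 'cf.pickle')
with open(cf_file, 'wb') as f:
    pickle.dump(cf_content, f)

# Worker side: restore the state before training starts.
with open(cf_file, 'rb') as f:
    restored = pickle.load(f)
assert restored == cf_content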
Example 3
    def __call__(self):
        """Call loss cls."""
        params = obj2config(self.config).get("params", {})
        logging.debug("Call Loss. name={}, params={}".format(self._cls.__name__, params))
        try:
            if params:
                cls_obj = self._cls(**params) if isclass(self._cls) else partial(self._cls, **params)
            else:
                cls_obj = self._cls() if isclass(self._cls) else partial(self._cls)
            if vega.is_torch_backend() and TrainerConfig().cuda:
                cls_obj = cls_obj.cuda()
            return cls_obj
        except Exception as ex:
            logging.error("Failed to call Loss name={}, params={}".format(self._cls.__name__, params))
            raise ex
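The class-or-function dispatch above can be reproduced on its own: classes are instantiated with their params, plain functions get the params bound via partial. A sketch under the assumption that torch is available; build_loss and both sample configs are illustrative, not part of the original API:

from functools import partial
from inspect import isclass

import torch
import torch.nn.functional as F

def build_loss(loss_cls, params=None, cuda=False):
    # Illustrative helper: instantiate classes, bind params onto plain functions.
    params = params or {}
    obj = loss_cls(**params) if isclass(loss_cls) else partial(loss_cls, **params)
    # Only class instances carry state worth moving to the GPU.
    if cuda and isclass(loss_cls):
        obj = obj.cuda()
    return obj

ce = build_loss(torch.nn.CrossEntropyLoss, {"reduction": "mean"})
bce = build_loss(F.binary_cross_entropy_with_logits, {"reduction": "sum"})
print(ce(torch.randn(4, 3), torch.tensor([0, 1, 2, 1])))
print(bce(torch.randn(4), torch.ones(4)))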
Example 4
    def __init__(self, **kwargs):
        super(Cityscapes, self).__init__(**kwargs)
        config = obj2config(getattr(self.config, self.mode))
        config.update(self.args)
        self.args = config
        self.root_dir = self.args['root_dir']
        self.image_size = self.args.Rescale.size
        self.list_file = self.args.list_file
        self.batch_size = self.args.get('batch_size', 1)
        self.num_parallel_batches = self.args.get('num_parallel_batches', 1)
        self.drop_remainder = self.args.get('drop_remainder', False)

        self.transforms = self._init_transforms()
        self.root_dir = FileOps.download_dataset(self.root_dir)
        self._init_data_files()
Example 5
    def __init__(self, hps=None, mode='train', **kwargs):
        """Construct method."""
        super(Dataset, self).__init__()
        if not hasattr(self, 'config'):
            raise ValueError("Dataset class should have attr config.")
        self.mode = mode
        if self.mode == "test" and not hasattr(self.config, "test"):
            self.mode = "val"
        self.args = deepcopy(obj2config(getattr(self.config, self.mode)))
        self._init_hps(hps)
        self.train = self.mode in ["train", "val"]
        self.num_images = self.args.get('num_images', 0)
        self.batch_size = self.args.batch_size
        self.world_size = 1
        self.rank = 0
Example 6
    def __init__(self, search_space=None, **kwargs):
        """Init SearchAlgorithm."""
        super(SearchAlgorithm, self).__init__()
        # modify config by kwargs, using local scope
        if self.config and kwargs:
            self.config = self.config()
            load_conf_from_desc(self.config, kwargs)
        self.search_space = search_space
        if hasattr(self.config, 'codec'):
            self.codec = Codec(search_space, type=self.config.codec)
        else:
            self.codec = None
        logging.debug("Config=%s", obj2config(self.config))
        self.report = Report()
        self.record = ReportRecord()
        self.record.step_name = self.step_name
Example 7
    def __init__(self, metric_cfg=None):
        """Init Metrics."""
        self.mdict = {}
        metric_config = obj2config(self.config)
        if not isinstance(metric_config, list):
            metric_config = [metric_config]
        for metric_item in metric_config:
            metric_name = metric_item.pop('type')
            metric_class = ClassFactory.get_cls(ClassType.METRIC, metric_name)
            if isfunction(metric_class):
                metric_class = partial(metric_class, **metric_item.get("params", {}))
            else:
                metric_class = metric_class(**metric_item.get("params", {}))
            self.mdict[metric_name] = metric_class
        self.mdict = Config(self.mdict)
        self.metric_results = dict()
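A standalone sketch of the registry lookup this constructor performs, with a plain dict standing in for ClassFactory's METRIC registry; the accuracy function and the config entry are illustrative:

from functools import partial
from inspect import isfunction

def accuracy(pred, target, topk=1):
    # Toy metric for illustration only; topk is accepted but ignored.
    return sum(p == t for p, t in zip(pred, target)) / len(target)

METRIC_REGISTRY = {"accuracy": accuracy}  # stand-in for ClassFactory

metric_config = [{"type": "accuracy", "params": {"topk": 1}}]
mdict = {}
for metric_item in metric_config:
    metric_name = metric_item.pop("type")
    metric_cls = METRIC_REGISTRY[metric_name]
    # Functions get their params bound with partial; classes would be instantiated.
    if isfunction(metric_cls):
        metric_cls = partial(metric_cls, **metric_item.get("params", {}))
    else:
        metric_cls = metric_cls(**metric_item.get("params", {}))
    mdict[metric_name] = metric_cls

print(mdict["accuracy"]([1, 0, 1], [1, 1, 1]))  # 0.666...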
Example 8
    def __call__(self,
                 model=None,
                 lr_scheduler=None,
                 epoch=None,
                 distributed=False):
        """Call Optimizer class.

        :param model: model, used in torch case
        :param lr_scheduler: learning rate scheduler, used in tf case
        :param epoch: epoch of training, used in tf case
        :param distributed: use distributed
        :return: optimizer
        """
        params = obj2config(self.config).get("params", {})
        logging.debug("Call Optimizer. name={}, params={}".format(
            self.optim_cls.__name__, params))
        optimizer = None
        try:
            if vega.is_torch_backend():
                learnable_params = [
                    param for param in model.parameters()
                    if param.requires_grad
                ]
                optimizer = self.optim_cls(learnable_params, **params)
                if distributed:
                    optimizer = hvd.DistributedOptimizer(
                        optimizer,
                        named_parameters=model.named_parameters(),
                        compression=hvd.Compression.none)
            elif vega.is_tf_backend():
                lr_scheduler.step(epoch)
                params['learning_rate'] = lr_scheduler.get_lr()[0]
                optimizer = self.optim_cls(**params)
                if distributed:
                    optimizer = hvd.DistributedOptimizer(optimizer) if vega.is_gpu_device() else \
                        NPUDistributedOptimizer(optimizer)
            return optimizer
        except Exception as ex:
            logging.error("Failed to call Optimizer name={}, params={}".format(
                self.optim_cls.__name__, params))
            raise ex
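For the torch branch, the core idea is to build the optimizer from config params over only the parameters that still require gradients. A minimal torch-only sketch (build_optimizer and the frozen-layer setup are illustrative; the Horovod wrapping is omitted):

import torch

def build_optimizer(optim_cls, model, params=None):
    # Illustrative helper (not the original API): keep only trainable parameters.
    learnable = [p for p in model.parameters() if p.requires_grad]
    return optim_cls(learnable, **(params or {}))

model = torch.nn.Sequential(torch.nn.Linear(8, 8), torch.nn.Linear(8, 2))
for p in model[0].parameters():
    p.requires_grad = False  # freeze the first layer

optimizer = build_optimizer(torch.optim.SGD, model, {"lr": 0.01, "momentum": 0.9})
print(len(optimizer.param_groups[0]["params"]))  # 2: weight and bias of the second layer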
Example 9
    def __init__(self, hps=None, mode='train', **kwargs):
        """Construct method."""
        super(Dataset, self).__init__()
        self.args = dict()
        self.mode = mode
        if mode == "val" and not hasattr(self.config, "val"):
            self.mode = "test"
        # modify config from kwargs, `Cifar10(mode='test', data_path='/cache/datasets')`
        if kwargs:
            self.args = Config(kwargs)
        if hasattr(self, 'config'):
            config = obj2config(getattr(self.config, self.mode))
            config.update(self.args)
            self.args = config
        self._init_hps(hps)
        self.train = self.mode in ["train", "val"]
        transforms_list = self._init_transforms()
        self._transforms = Transforms(transforms_list)
        if "transforms" in kwargs.keys():
            self._transforms.__transform__ = kwargs["transforms"]
        self.dataset_init()
        self.sampler = self._init_sampler()
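The precedence rule in this constructor is that keyword arguments override the per-mode config section. That merge can be shown with plain dicts; mode_config and its keys are illustrative defaults, not the actual Cifar10 configuration:

from copy import deepcopy

# Stand-in for obj2config(self.config.train).
mode_config = {"data_path": "/datasets/cifar10", "batch_size": 256, "shuffle": True}
# Stand-in for the kwargs, e.g. Cifar10(mode='train', batch_size=64).
kwargs = {"batch_size": 64}

args = deepcopy(mode_config)
args.update(kwargs)           # constructor kwargs win over the config defaults
print(args["batch_size"])     # 64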
Example 10
    def __init__(self, args=None):
        """Init DistributedWorker."""
        super(DistributedWorker, self).__init__()
        # privates
        DistributedWorker.__worker_id__ = DistributedWorker.__worker_id__ + 1
        self._worker_id = DistributedWorker.__worker_id__
        # publics
        self.rank = 0
        self.world_size = 1
        self.worker_addr = ""
        self.worker_nccl_port = 16666
        self.timeout = int(float(General.worker.timeout) * 60 * 60)
        self.__env_config__ = (copy.deepcopy(UserConfig().data),
                               copy.deepcopy(ClassFactory.__configs__),
                               copy.deepcopy(ClassFactory.__registry__))
        self.__network_config__ = copy.deepcopy(
            NetworkFactory.__network_registry__)
        self.__general__ = obj2config(General)
        self.__worker_device_folder__ = os.path.join(self.temp_path,
                                                     '.worker_device')
        if not os.path.exists(self.__worker_device_folder__):
            os.makedirs(self.__worker_device_folder__, exist_ok=True)
        return
Example 11
    def build(self,
              model=None,
              optimizer=None,
              loss=None,
              lr_scheduler=None,
              metrics=None,
              hps=None,
              callbacks=None,
              train_loader=None,
              valid_loader=None,
              make_batch=None,
              train_step=None,
              valid_step=None,
              model_fn=None,
              train_input_fn=None,
              valid_input_fn=None,
              load_ckpt_flag=False,
              checkpoint_file_name="checkpoint.pth",
              model_pickle_file_name="model.pkl"):
        """Build the trainer by assembling the necessary components."""
        # Initialize hyperparameters by parameters or configurations
        self._init_hps(hps)
        logging.debug("Trainer Config: {}".format(obj2config(self.config)))
        self.checkpoint_file_name = checkpoint_file_name
        self.model_pickle_file_name = model_pickle_file_name
        if vega.is_torch_backend():
            self._init_step_functions(make_batch, train_step, valid_step)
        elif vega.is_tf_backend():
            self._init_estimator_fn(model_fn, train_input_fn, valid_input_fn)
        self._init_tf_session()
        self._init_distributed_setting()
        self._init_cuda_setting()
        self._init_tf_estimator()
        self.do_validation = self.config.with_valid
        self.model = self._init_model(model)
        self.load_ckpt_flag = load_ckpt_flag
        if self.load_ckpt_flag:
            self.load_checkpoint()
        else:
            self._load_pretrained_model()
        self.use_syncbn = self.config.syncbn
        if self.use_syncbn and vega.is_torch_backend():
            self.model = apex.parallel.convert_syncbn_model(self.model)
        self.train_loader = self._init_dataloader(mode='train',
                                                  loader=train_loader)
        self.valid_loader = self._init_dataloader(mode='val',
                                                  loader=valid_loader)
        if vega.is_torch_backend():
            self.optimizer = Optimizer()(model=self.model, distributed=self.distributed) \
                if optimizer is None else optimizer
            self.loss = Loss()() if loss is None else loss
            self.lr_scheduler = LrScheduler()(
                self.optimizer) if lr_scheduler is None else lr_scheduler
        # Some trainers use a different batch size for train and valid
        self.train_metrics = self._init_metrics(
            metrics) if vega.is_torch_backend() else None
        self.valid_metrics = self._init_metrics(metrics)

        self._init_horovod_setting()
        if self.use_amp and vega.is_torch_backend():
            self.model, self.optimizer = amp.initialize(self.model,
                                                        self.optimizer,
                                                        opt_level='O1')
        if self.callbacks is None:
            self.callbacks = callbacks
        # self.output_model_desc()
        cur_working_dir = FileOps.join_path(self.local_output_path,
                                            self.step_name)
        FileOps.make_dir(cur_working_dir)
        # Make sure Trainer has been built for training
        self.has_built = True