def build(self):
    """Build the trainer by assembling the necessary components."""
    logging.debug("Trainer Config: {}".format(self.config))
    self.do_validation = self.config.with_valid
    self.use_syncbn = self.config.syncbn
    if self.use_syncbn and zeus.is_torch_backend():
        self.model = apex.parallel.convert_syncbn_model(self.model)
    self.train_loader = self._init_dataloader(mode='train')
    self.valid_loader = self._init_dataloader(mode='val')
    self.batch_num_train = self.train_loader.get_dataset_size() if zeus.is_ms_backend() else len(self.train_loader)
    self.batch_num_valid = self.valid_loader.get_dataset_size() if zeus.is_ms_backend() else len(self.valid_loader)
def _get_data_format():
    if zeus.is_torch_backend() or zeus.is_ms_backend():
        return 'channels_first'
    elif zeus.is_tf_backend():
        return 'channels_last'
    else:
        return None
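# Hedged usage sketch (illustrative, not from the source): _get_data_format() lets dummy-input
# shapes stay backend-agnostic. The helper name and the shape values below are assumptions
# made for illustration only.
def _example_dummy_input_shape(batch=1, channels=3, height=224, width=224):
    """Return a dummy input shape laid out for the active backend (hypothetical helper)."""
    if _get_data_format() == 'channels_first':
        return (batch, channels, height, width)  # torch / mindspore layout
    return (batch, height, width, channels)      # tensorflow layout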
def _train_loop(self):
    """Do the training with data, callbacks and step functions etc."""
    # Allow user to build trainer in before_train() callback, but they
    # should set lazy_built in configuration file to True
    self.callbacks.before_train()
    if self.skip_train:
        return

    if self.use_unsupervised_pretrain and zeus.is_torch_backend():
        from .trainer.simclr.transforms import TransformsSimCLR
        from .trainer.simclr.train import simclr_train
        train_loader = self._init_dataloader(mode="train", transforms=TransformsSimCLR())
        self.model = simclr_train(self.model, train_loader)

    repeat_time = 1 if zeus.is_ms_backend() else self.epochs
    for epoch in range(self._start_epoch, repeat_time):
        epoch_logs = {'train_num_batches': self.batch_num_train}
        if self.do_validation:
            epoch_logs.update({'valid_num_batches': self.batch_num_valid})
        self.callbacks.before_epoch(epoch, epoch_logs)
        self._train_epoch()
        if self.do_validation and self._should_run_validation(epoch):
            self._valid_epoch()
        self.callbacks.after_epoch(epoch)
    self.callbacks.after_train()
    if self.distributed:
        self._shutdown_distributed()
def get_cls(cls, type_name, t_cls_name=None):
    """Get class and bind config to class.

    :param type_name: type name of class registry
    :param t_cls_name: class name
    :return: t_cls
    """
    # lazy load class
    if not cls.is_exists(type_name, t_cls_name) and t_cls_name:
        cls._import_pkg(type_name, t_cls_name)
    # verify class
    if not cls.is_exists(type_name, t_cls_name):
        raise ValueError("can't find class type {} class name {} in class registry".format(type_name, t_cls_name))
    # create instance without configs
    if t_cls_name is None:
        from zeus.datasets.conf.dataset import DatasetConfig
        from zeus.evaluator.conf import EvaluatorConfig
        if type_name == ClassType.DATASET:
            t_cls_name = DatasetConfig.type
        elif type_name == ClassType.TRAINER:
            import zeus
            if zeus.is_torch_backend():
                t_cls_name = "TrainerTorch"
            elif zeus.is_tf_backend():
                t_cls_name = "TrainerTf"
            elif zeus.is_ms_backend():
                t_cls_name = "TrainerMs"
        elif type_name == ClassType.EVALUATOR:
            t_cls_name = EvaluatorConfig.type
        else:
            pass
    if t_cls_name is None:
        raise ValueError("can't find class. class type={}".format(type_name))
    t_cls = cls.__registry__.get(type_name).get(t_cls_name)
    return t_cls
def __call__(self, model=None, distributed=False):
    """Call Optimizer class.

    :param model: model, used in torch case
    :param distributed: use distributed
    :return: optimizer
    """
    params = self.map_config.get("params", {})
    logging.debug("Call Optimizer. name={}, params={}".format(self.optim_cls.__name__, params))
    optimizer = None
    try:
        if zeus.is_torch_backend():
            learnable_params = [param for param in model.parameters() if param.requires_grad]
            optimizer = self.optim_cls(learnable_params, **params)
            if distributed:
                optimizer = hvd.DistributedOptimizer(optimizer,
                                                     named_parameters=model.named_parameters(),
                                                     compression=hvd.Compression.none)
        elif zeus.is_tf_backend():
            optimizer = dynamic_optimizer(self.optim_cls, **params)
            if distributed:
                optimizer = hvd.DistributedOptimizer(optimizer) if zeus.is_gpu_device() else \
                    NPUDistributedOptimizer(optimizer)
        elif zeus.is_ms_backend():
            learnable_params = [param for param in model.trainable_params() if param.requires_grad]
            optimizer = self.optim_cls(learnable_params, **params)
        return optimizer
    except Exception as ex:
        logging.error("Failed to call Optimizer name={}, params={}".format(self.optim_cls.__name__, params))
        raise ex
def _train_epoch(self):
    if zeus.is_torch_backend():
        self.model.train()
        for batch_index, batch in enumerate(self.train_loader):
            batch = self.make_batch(batch)
            batch_logs = {'train_batch': batch}
            self.callbacks.before_train_step(batch_index, batch_logs)
            train_batch_output = self.train_step(batch)
            batch_logs.update(train_batch_output)
            if self.config.is_detection_trainer:
                batch_logs.update({'is_detection_trainer': True})
            self.callbacks.after_train_step(batch_index, batch_logs)
    elif zeus.is_tf_backend():
        self.estimator.train(input_fn=self.train_input_fn,
                             steps=len(self.train_loader),
                             hooks=self._init_logging_hook())
    elif zeus.is_ms_backend():
        self.ms_model = MsModel(network=self.model,
                                loss_fn=self.loss,
                                optimizer=self.optimizer,
                                metrics={self.metric_name: self.valid_metrics()})
        config_ck = CheckpointConfig(save_checkpoint_steps=self.config.save_steps)
        # save the network model and parameters for subsequent fine-tuning
        save_path = self.get_local_worker_path(self.step_name, self.worker_id)
        ckpoint_cb = ModelCheckpoint(config=config_ck, directory=save_path)
        loss_cb = LossMonitor(per_print_times=self.config.report_freq)
        eval_cb = EvalCallBack(self.ms_model, self.valid_loader)
        self.ms_model.train(epoch=self.epochs,
                            train_dataset=self.train_loader,
                            callbacks=[ckpoint_cb, loss_cb, eval_cb],
                            dataset_sink_mode=self.dataset_sink_mode)
def _save_best_model(self):
    """Save best model."""
    if zeus.is_torch_backend():
        torch.save(self.trainer.model.state_dict(), self.trainer.weights_file)
    elif zeus.is_tf_backend():
        worker_path = self.trainer.get_local_worker_path()
        model_id = "model_{}".format(self.trainer.worker_id)
        weights_folder = FileOps.join_path(worker_path, model_id)
        FileOps.make_dir(weights_folder)
        checkpoint_file = tf.train.latest_checkpoint(worker_path)
        ckpt_globs = glob.glob("{}.*".format(checkpoint_file))
        for _file in ckpt_globs:
            dst_file = model_id + os.path.splitext(_file)[-1]
            FileOps.copy_file(_file, FileOps.join_path(weights_folder, dst_file))
        FileOps.copy_file(FileOps.join_path(worker_path, 'checkpoint'), weights_folder)
    elif zeus.is_ms_backend():
        worker_path = self.trainer.get_local_worker_path()
        save_path = os.path.join(worker_path, "model_{}.ckpt".format(self.trainer.worker_id))
        for file in os.listdir(worker_path):
            if file.startswith("CKP") and file.endswith(".ckpt"):
                self.weights_file = FileOps.join_path(worker_path, file)
                os.rename(self.weights_file, save_path)
def _valid_epoch(self):
    self.callbacks.before_valid()
    valid_logs = None
    if zeus.is_torch_backend():
        self.model.eval()
        with torch.no_grad():
            for batch_index, batch in enumerate(self.valid_loader):
                batch = self.make_batch(batch)
                batch_logs = {'valid_batch': batch}
                self.callbacks.before_valid_step(batch_index, batch_logs)
                valid_batch_output = self.valid_step(batch)
                self.callbacks.after_valid_step(batch_index, valid_batch_output)
    elif zeus.is_tf_backend():
        eval_metrics = self.estimator.evaluate(input_fn=self.valid_input_fn,
                                               steps=len(self.valid_loader))
        self.valid_metrics.update(eval_metrics)
        valid_logs = dict()
        valid_logs['cur_valid_perfs'] = self.valid_metrics.results
    elif zeus.is_ms_backend():
        eval_metrics = self.ms_model.eval(valid_dataset=self.valid_loader,
                                          dataset_sink_mode=self.dataset_sink_mode)
        self.valid_metrics.update(eval_metrics)
        valid_logs = dict()
        valid_logs['cur_valid_perfs'] = self.valid_metrics.results
    self.callbacks.after_valid(valid_logs)
def load_records_from_model_folder(cls, model_folder):
    """Transfer json_file to records."""
    if not model_folder or not os.path.exists(model_folder):
        logging.error("Failed to load records from model folder, folder={}".format(model_folder))
        return []
    records = []
    pattern = FileOps.join_path(model_folder, "desc_*.json")
    files = glob.glob(pattern)
    for _file in files:
        try:
            with open(_file) as f:
                worker_id = _file.split(".")[-2].split("_")[-1]
                weights_file = os.path.join(os.path.dirname(_file), "model_{}".format(worker_id))
                if zeus.is_torch_backend():
                    weights_file = '{}.pth'.format(weights_file)
                elif zeus.is_ms_backend():
                    weights_file = '{}.ckpt'.format(weights_file)
                if not os.path.exists(weights_file):
                    weights_file = None
                sample = dict(worker_id=worker_id, desc=json.load(f), weights_file=weights_file)
                record = ReportRecord().load_dict(sample)
                records.append(record)
        except Exception as ex:
            logging.info('Can not read records from json because {}'.format(ex))
    return records
def _load_pretrained_model(cls, model, pretrained_model_file):
    pretrained_model_file = cls._get_abs_path(pretrained_model_file)
    logging.info("load model weights from file, weights file={}".format(pretrained_model_file))
    if zeus.is_torch_backend():
        if not os.path.isfile(pretrained_model_file):
            raise FileNotFoundError("Pretrained model does not exist, model={}".format(pretrained_model_file))
        import torch
        checkpoint = torch.load(pretrained_model_file)
        model.load_state_dict(checkpoint)
    if zeus.is_tf_backend():
        if pretrained_model_file.endswith('.pth'):
            checkpoint = convert_checkpoint_from_pytorch(pretrained_model_file, model)
            model.load_checkpoint_from_numpy(checkpoint)
        else:
            pretrained_model_file = cls._get_tf_model_file(pretrained_model_file)
            model.load_checkpoint(pretrained_model_file)
    elif zeus.is_ms_backend():
        from mindspore.train.serialization import load_checkpoint
        if hasattr(model, "pretrained"):
            pretrained_weight = model.pretrained(pretrained_model_file)
        else:
            if os.path.isfile(pretrained_model_file):
                pretrained_weight = pretrained_model_file
            else:
                for file in os.listdir(pretrained_model_file):
                    if file.endswith(".ckpt"):
                        pretrained_weight = os.path.join(pretrained_model_file, file)
                        break
        load_checkpoint(pretrained_weight, net=model)
    return model
def get_model(cls, model_desc=None, pretrained_model_file=None):
    """Get model from model zoo.

    :param model_desc: the description of network, e.g. ResNetVariant.
    :type model_desc: str or None.
    :param pretrained_model_file: path of model.
    :type pretrained_model_file: str.
    :return: model.
    :rtype: model.
    """
    try:
        network = NetworkDesc(model_desc)
        model = network.to_model()
    except Exception as e:
        logging.error("Failed to get model, model_desc={}, msg={}".format(model_desc, str(e)))
        raise e
    logging.info("Model was created.")
    if zeus.is_torch_backend() and pretrained_model_file:
        model = cls._load_pretrained_model(model, pretrained_model_file)
    elif zeus.is_ms_backend() and pretrained_model_file:
        model = cls._load_pretrained_model(model, pretrained_model_file)
    return model
def __call__(self, model=None, distributed=False, **kwargs):
    """Call Optimizer class.

    :param model: model, used in torch case
    :param distributed: use distributed
    :return: optimizer
    """
    params = self.map_config.get("params", {})
    logging.debug("Call Optimizer. name={}, params={}".format(self.optim_cls.__name__, params))
    optimizer = None
    try:
        if zeus.is_torch_backend():
            learnable_params = [param for param in model.parameters() if param.requires_grad]
            optimizer = self.optim_cls(learnable_params, **params)
            if distributed:
                optimizer = self.set_distributed(optimizer, model)
        elif zeus.is_tf_backend():
            optimizer = dynamic_optimizer(self.optim_cls, **params)
        elif zeus.is_ms_backend():
            if "dynamic_lr" in kwargs:
                params.update({"learning_rate": kwargs["dynamic_lr"]})
            learnable_params = [param for param in model.trainable_params() if param.requires_grad]
            optimizer = self.optim_cls(learnable_params, **params)
        return optimizer
    except Exception as ex:
        logging.error("Failed to call Optimizer name={}, params={}".format(self.optim_cls.__name__, params))
        raise ex
def _init_ms_context(self):
    if not zeus.is_ms_backend():
        return
    if zeus.is_npu_device():
        context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
    else:
        context.set_context(mode=context.GRAPH_MODE, device_target="CPU")
    self.dataset_sink_mode = True if zeus.is_npu_device() else False
def get_named_modules(layer):
    """Get named modules."""
    if zeus.is_tf_backend():
        return [(op.name, op) for op in layer]
    elif zeus.is_torch_backend():
        return layer.named_modules()
    elif zeus.is_ms_backend():
        return layer._children_scope_recursive()
def get_shape(layer):
    """Get weight shape."""
    if zeus.is_tf_backend():
        return layer.get_shape()
    elif zeus.is_torch_backend():
        return layer.weight.data.shape
    elif zeus.is_ms_backend():
        para_name = list(layer._params)[0]
        return getattr(layer, para_name).default_input.shape
def Adapter(dataset):
    """Adapter of dataset."""
    if zeus.is_torch_backend():
        from .pytorch import TorchAdapter as Adapter
    elif zeus.is_tf_backend():
        from .tensorflow import TfAdapter as Adapter
    elif zeus.is_ms_backend():
        from .mindspore import MsAdapter as Adapter
    else:
        raise ValueError("Cannot adapt dataset: unknown backend type.")
    return Adapter(dataset)
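# Hedged usage sketch (illustrative, not from the source): Adapter() is how a registered dataset
# is turned into a backend-specific loader, the same pattern used in is_filtered() further below.
#
#     dataset_cls = ClassFactory.get_cls(ClassType.DATASET)
#     train_loader = Adapter(dataset_cls()).loader   # .loader as used in is_filtered() below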
def _init_metrics(self, metrics=None):
    """Init metrics."""
    if metrics is not None:
        return metrics
    else:
        if zeus.is_torch_backend():
            from zeus.metrics.pytorch.metrics import Metrics
        elif zeus.is_tf_backend():
            from zeus.metrics.tensorflow.metrics import Metrics
        elif zeus.is_ms_backend():
            from zeus.metrics.mindspore.metrics import Metrics
        return Metrics()
def _load_pretrained_model(cls, model, pretrained_model_file):
    if zeus.is_torch_backend():
        import torch
        if not os.path.isfile(pretrained_model_file):
            raise FileNotFoundError("Pretrained model does not exist, model={}".format(pretrained_model_file))
        logging.info("load model weights from file, weights file={}".format(pretrained_model_file))
        checkpoint = torch.load(pretrained_model_file)
        model.load_state_dict(checkpoint)
    elif zeus.is_ms_backend():
        from mindspore.train.serialization import load_checkpoint
        load_checkpoint(pretrained_model_file, net=model)
    return model
def build(self):
    """Build the trainer by assembling the necessary components."""
    self._init_hps(self.hps)
    logging.debug("Trainer Config: {}".format(self.config))
    self.do_validation = self.config.with_valid
    self.use_syncbn = self.config.syncbn
    if self.use_syncbn and zeus.is_torch_backend():
        self.model = apex.parallel.convert_syncbn_model(self.model)
    self.train_loader = self._init_dataloader(mode='train')
    self.valid_loader = self._init_dataloader(mode='val')
    self.batch_num_train = self.train_loader.get_dataset_size() if zeus.is_ms_backend() else len(self.train_loader)
    self.batch_num_valid = self.valid_loader.get_dataset_size() if zeus.is_ms_backend() else len(self.valid_loader)
    if zeus.is_torch_backend():
        self.optimizer = Optimizer()(model=self.model, distributed=self.distributed)
        if hasattr(self.model, 'add_loss'):
            loss_cls = Loss()()
            self.model.add_loss(loss_cls)
            self.loss = self.model.overall_loss()
        else:
            self.loss = Loss()()
        self.lr_scheduler = LrScheduler()(self.optimizer)
    elif zeus.is_ms_backend():
        self.optimizer = Optimizer()(model=self.model)
        if hasattr(self.model, 'add_loss'):
            loss_cls = Loss()()
            self.model.add_loss(loss_cls)
            self.loss = self.model.overall_loss()
        else:
            self.loss = Loss()()
        self.metric_name = self.config.metric().type
    # Some trainers use a different batch size for training and validation.
    self.train_metrics = self._init_metrics() if zeus.is_torch_backend() else None
    self.valid_metrics = self._init_metrics()
    self._init_horovod_setting()
    if self.use_amp and zeus.is_torch_backend():
        self.model, self.optimizer = amp.initialize(self.model, self.optimizer, opt_level='O1')
def parse_module_name(name, module):
    """Parse the module name of mindspore."""
    if zeus.is_ms_backend():
        while list(module.cells()) != []:
            module = list(module.cells())[0]
        name_list = name.split("/")[1:]
        new_name = ""
        for name in name_list:
            name = "." + name.split("-")[0]
            new_name += name
        return new_name[1:], module
    else:
        return name, module
def load_model(self):
    """Load model."""
    self.saved_folder = self.get_local_worker_path(self.step_name, self.worker_id)
    if not self.model_desc:
        self.model_desc = FileOps.join_path(self.saved_folder, 'desc_{}.json'.format(self.worker_id))
    if not self.weights_file:
        if zeus.is_torch_backend():
            self.weights_file = FileOps.join_path(self.saved_folder, 'model_{}.pth'.format(self.worker_id))
        elif zeus.is_ms_backend():
            for file in os.listdir(self.saved_folder):
                if file.startswith("CKP") and file.endswith(".ckpt"):
                    self.weights_file = FileOps.join_path(self.saved_folder, file)
    if 'modules' not in self.model_desc:
        self.model_desc = ModelConfig.model_desc
    self.model = ModelZoo.get_model(self.model_desc, self.weights_file)
def is_filtered(self, desc=None):
    """Filter function of latency."""
    if zeus.is_ms_backend():
        return False
    try:
        if not self.dataloader:
            dataset_cls = ClassFactory.get_cls(ClassType.DATASET)
            self.dataset = dataset_cls()
            from zeus.datasets import Adapter
            self.dataloader = Adapter(self.dataset).loader
        model, count_input = self.get_model_input(desc)
        model(count_input)
        return False
    except Exception:
        encoding = desc['backbone']['encoding']
        logging.info('Invalid encoding: {}'.format(encoding))
        return True
def calc_model_flops_params(model, input, custom_hooks=None, verbose=False):
    """Pytorch model flops and parameters calculation.

    :param model: pytorch model
    :type model: torch.nn.Module
    :param input: pytorch input tensor
    :type input: torch.Tensor
    :param custom_hooks: hooks defined by outside customer
    :type custom_hooks: dict or None
    :param verbose: whether to print op types that are not in the collection
    :type verbose: bool, default False
    :return: flops and params
    :rtype: float, float
    """
    if zeus.is_torch_backend():
        from thop import profile
        try:
            _model = deepcopy(model)
        except Exception:
            _model = model
        if custom_hooks is None:
            custom_hooks = {}
        custom_hooks = add_new_hooks(custom_hooks)
        inputs = (input,)
        flops, params = profile(_model, inputs, custom_hooks, verbose)
        del _model
    elif zeus.is_tf_backend():
        import tensorflow.compat.v1 as tf
        with tf.Graph().as_default() as graph:
            dummy_input = tf.placeholder(dtype=tf.float32, shape=input.shape.as_list())
            model.training = False
            model(dummy_input)
            opts = tf.profiler.ProfileOptionBuilder.float_operation()
            flops = tf.profiler.profile(graph, cmd='op', options=opts).total_float_ops
            opts = tf.profiler.ProfileOptionBuilder.trainable_variables_parameter()
            params = tf.profiler.profile(graph, cmd='op', options=opts).total_parameters
            flops *= 0.5
    elif zeus.is_ms_backend():
        # TODO: flops/params calculation is not yet supported for mindspore.
        flops, params = 0, 0
    return flops, params
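# Hedged usage sketch (assumption, not from the source): with the torch backend, "input" is a
# single dummy batch; thop then profiles one forward pass. "model" and the shape are illustrative.
#
#     import torch
#     dummy_input = torch.ones(1, 3, 224, 224)
#     flops, params = calc_model_flops_params(model, dummy_input)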
def get_input_data(self):
    """Get input data."""
    count_input = None
    if zeus.is_torch_backend():
        data_iter = iter(self.dataloader)
        input_data, _ = next(data_iter)
        count_input = input_data[:1]
    elif zeus.is_tf_backend():
        import tensorflow as tf
        datasets = self.dataloader.input_fn()
        data_iter = tf.compat.v1.data.make_one_shot_iterator(datasets)
        input_data, _ = data_iter.get_next()
        count_input = input_data[:1]
    elif zeus.is_ms_backend():
        data_iter = self.dataloader.create_dict_iterator()
        for batch in data_iter:
            count_input = batch['image']
            break
    return count_input
def apply(self, mask_code):
    """Apply mask to batchNorm."""
    end_mask = np.asarray(mask_code)
    idx = np.squeeze(np.argwhere(np.asarray(np.ones(end_mask.shape) - end_mask))).tolist()
    self._make_mask(idx)
    if zeus.is_tf_backend():
        import tensorflow as tf
        return tf.assign(self.layer, self.layer * tf.constant(self.mask, dtype=self.layer.dtype))
    elif zeus.is_torch_backend():
        import torch
        self.layer.weight.data = self.layer.weight.data * torch.FloatTensor(self.mask)
        self.layer.bias.data = self.layer.bias.data * torch.FloatTensor(self.mask)
        self.layer.running_mean = self.layer.running_mean * torch.FloatTensor(self.mask)
        self.layer.running_var = self.layer.running_var * torch.FloatTensor(self.mask)
        self.layer.weight.data[idx].requires_grad = False
        self.layer.bias.data[idx].requires_grad = False
        self.layer.running_mean[idx].requires_grad = False
        self.layer.running_var[idx].requires_grad = False
    elif zeus.is_ms_backend():
        from mindspore import Tensor
        self.layer.moving_mean.default_input = self.layer.moving_mean.default_input * \
            Tensor(self.mask, self.layer.moving_mean.default_input.dtype)
        self.layer.moving_variance.default_input = self.layer.moving_variance.default_input * \
            Tensor(self.mask, self.layer.moving_variance.default_input.dtype)
        self.layer.gamma.default_input = self.layer.gamma.default_input * \
            Tensor(self.mask, self.layer.gamma.default_input.dtype)
        self.layer.beta.default_input = self.layer.beta.default_input * \
            Tensor(self.mask, self.layer.beta.default_input.dtype)
        for id in idx:
            self.layer.moving_mean.default_input[id].requires_grad = False
            self.layer.moving_variance.default_input[id].requires_grad = False
            self.layer.gamma.default_input[id].requires_grad = False
            self.layer.beta.default_input[id].requires_grad = False
def apply(self, end_mask_code, start_mask_code=None):
    """Apply mask to weight."""
    end_mask_code = np.array(end_mask_code)
    if start_mask_code is not None:
        start_mask_code = np.array(start_mask_code)
    start_channel_idx = None
    end_channel_idx = np.squeeze(
        np.argwhere(np.asarray(np.ones(end_mask_code.shape) - end_mask_code))).tolist()
    if start_mask_code is not None:
        start_channel_idx = np.squeeze(
            np.argwhere(np.asarray(np.ones(start_mask_code.shape) - start_mask_code))).tolist()
    self._make_mask(end_mask_code, start_mask_code)
    if zeus.is_tf_backend():
        import tensorflow as tf
        return tf.assign(self.layer, self.layer * tf.constant(self.mask, dtype=self.layer.dtype))
    elif zeus.is_torch_backend():
        import torch
        self.layer.weight.data = self.layer.weight.data * torch.FloatTensor(self.mask)
        self.layer.weight.data[end_channel_idx, :, :, :].requires_grad = False
        if start_channel_idx is not None:
            self.layer.weight.data[:, start_channel_idx, :, :].requires_grad = False
    elif zeus.is_ms_backend():
        from mindspore import Tensor
        self.layer.weight.default_input = self.layer.weight.default_input * \
            Tensor(self.mask, self.layer.weight.default_input.dtype)
        for idx in end_channel_idx:
            self.layer.weight.default_input[idx, :, :, :].requires_grad = False
        if start_channel_idx is not None:
            for idx in start_channel_idx:
                self.layer.weight.default_input[:, idx, :, :].requires_grad = False
def _train_loop(self):
    """Do the training with data, callbacks and step functions etc."""
    # Allow user to build trainer in before_train() callback, but they
    # should set lazy_built in configuration file to True
    self.callbacks.before_train()
    if self.skip_train:
        return
    repeat_time = 1 if zeus.is_ms_backend() else self.epochs
    for epoch in range(repeat_time):
        epoch_logs = {'train_num_batches': self.batch_num_train}
        if self.do_validation:
            epoch_logs.update({'valid_num_batches': self.batch_num_valid})
        self.callbacks.before_epoch(epoch, epoch_logs)
        self._train_epoch()
        if self.do_validation and self._should_run_validation(epoch):
            self._valid_epoch()
        self.callbacks.after_epoch(epoch)
    self.callbacks.after_train()
    if self.distributed:
        self._shutdown_distributed()
def _get_callbacks(self, customs, disables):
    defaults = []
    if zeus.is_torch_backend():
        defaults = ["ModelStatistics", "MetricsEvaluator", "ModelCheckpoint", "ModelBuilder",
                    "PerformanceSaver", "RuntimeCallback", "LearningRateScheduler", "ProgressLogger",
                    "ReportCallback", "VisualCallBack"]
    elif zeus.is_tf_backend():
        defaults = ["ModelStatistics", "MetricsEvaluator", "ModelCheckpoint", "ModelBuilder",
                    "PerformanceSaver", "RuntimeCallback", "ProgressLogger", "ReportCallback",
                    "VisualCallBack"]
    elif zeus.is_ms_backend():
        defaults = ["ModelStatistics", "MetricsEvaluator", "ModelCheckpoint", "ModelBuilder",
                    "PerformanceSaver", "ProgressLogger", "ReportCallback", "VisualCallBack"]
    custom_disables = []
    disables = disables if disables else []
    customs = customs if customs else []
    custom_enables = []
    if customs:
        if isinstance(customs, str):
            customs = [customs]
        for customs_name in customs:
            callback_class = ClassFactory.get_cls(ClassType.CALLBACK, customs_name)
            if hasattr(callback_class, "disable_callbacks"):
                _disables = callback_class.disable_callbacks
                if not isinstance(_disables, list):
                    _disables = [_disables]
                custom_disables += _disables
            if hasattr(callback_class, "enable_callbacks"):
                _enables = callback_class.enable_callbacks
                if not isinstance(_enables, list):
                    _enables = [_enables]
                custom_enables += _enables
    if custom_enables:
        callbacks = custom_enables
    else:
        callbacks = set([_cls for _cls in defaults + customs if _cls not in disables + custom_disables])
    callbacks = [ClassFactory.get_cls(ClassType.CALLBACK, _cls)() for _cls in callbacks]
    # Sort the callbacks by priority.
    callbacks = sorted(callbacks, key=lambda callback: callback.priority)
    return callbacks
def _calc_forward_latency_gpu(model, input, sess_config=None, num=100):
    """Model forward latency calculation.

    :param model: network model
    :type model: torch or tf module
    :param input: input tensor
    :type input: Tensor of torch or tf
    :param num: forward number
    :type num: int
    :return: forward latency
    :rtype: float
    """
    prepare_num = int(0.05 * num)
    if zeus.is_torch_backend():
        for _ in range(prepare_num):
            model(input)
        start_time = time.time()
        for _ in range(num):
            model(input)
        latency = (time.time() - start_time) / num
    elif zeus.is_tf_backend():
        import tensorflow.compat.v1 as tf
        with tf.Graph().as_default() as graph:
            input_holder = tf.placeholder(dtype=tf.float32, shape=input.shape.as_list())
            model.training = False
            output = model(input_holder)
            with tf.Session(config=sess_config) as sess:
                sess.run(tf.global_variables_initializer())
                input = tf.random.uniform(input.shape.as_list(), dtype=input.dtype)
                input_numpy = input.eval(session=sess)
                for _ in range(prepare_num):
                    sess.run(output, feed_dict={input_holder: input_numpy})
                start_time = time.time()
                for _ in range(num):
                    sess.run(output, feed_dict={input_holder: input_numpy})
                latency = (time.time() - start_time) / num
    elif zeus.is_ms_backend():
        latency = 0.
    return latency
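# Hedged usage sketch (assumption, not from the source): latency is averaged over "num" forward
# passes after a short warm-up (5% of "num"), so a representative single-sample batch is passed
# in, e.g. the count_input returned by get_input_data() above.
#
#     count_input = self.get_input_data()
#     latency = _calc_forward_latency_gpu(model, count_input, num=100)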
def before_train(self, logs=None):
    """Fetch trainer info before train stage."""
    self._fix_path = "_".join([self.trainer.step_name, str(self.trainer.worker_id)])
    self.summary = SummaryBoard(self._archive_root, self._fix_path)
    # add graph only once.
    if zeus.is_tf_backend():
        import tensorflow as tf
        datasets = self.trainer.valid_input_fn()
        data_iter = tf.compat.v1.data.make_one_shot_iterator(datasets)
        input_data, _ = data_iter.get_next()
        self.input = input_data[:1]
        graph = self.trainer.graph
        _graph_name_list = [n.name for n in graph.as_graph_def().node]
        if len(_graph_name_list) < 2:
            graph = _fetch_tf_graph(self.trainer.model, self.input)
        self.summary.add_graph(graph=graph, backend="tf")
    elif zeus.is_torch_backend():
        model = self.trainer.model
        data_iter = iter(self.trainer.train_loader)
        input_batch, _ = next(data_iter)
        input_data = input_batch[:1]
        if self.trainer.use_cuda and not self.trainer.config.is_detection_trainer:
            input_data = input_data.cuda()
        try:
            self.summary.add_graph(model=model, feed_data=input_data, backend="torch")
        except BaseException as err:
            logging.warning("Dump PyTorch model failed! with: \n{}".format(err))
    elif zeus.is_ms_backend():
        logging.debug("Don't support mindspore model dump yet.")
    else:
        logging.warning("Unknown backend.")