Example #1
File: trainer_ms.py Project: ylfzr/vega
 def _init_ms_context(self):
     if zeus.is_npu_device():
         context.set_context(mode=context.GRAPH_MODE,
                             device_target="Ascend")
     else:
         context.set_context(mode=context.GRAPH_MODE, device_target="CPU")
     self.dataset_sink_mode = zeus.is_npu_device()
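
A minimal usage sketch (not from the source): the dataset_sink_mode flag computed above would typically be forwarded to MindSpore's Model.train; network, loss_fn, opt, and ds are hypothetical placeholders.

 # Hypothetical follow-up to the snippet above.
 from mindspore.train import Model

 model = Model(network, loss_fn=loss_fn, optimizer=opt)
 # Sink mode streams batches to the Ascend device; it stays off on CPU.
 model.train(epoch=1, train_dataset=ds, dataset_sink_mode=zeus.is_npu_device())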
Example #2
 def _init_tf_estimator(self):
     """Init tensorflow estimator."""
     sess_config = self._init_session_config()
     if zeus.is_gpu_device():
         self._init_gpu_estimator(sess_config)
     elif zeus.is_npu_device():
         self._init_npu_estimator(sess_config)
Example #3
File: adapter.py Project: vineetrao25/vega
    def input_fn(self):
        """Return the next `batch_size` examples from this data set."""
        if hasattr(self.dataset, "input_fn"):
            return self.dataset.input_fn()
        self._get_dateset_info()
        dataset = tf.data.Dataset.from_tensor_slices(
            (self.data_index, self.data_index))
        if self.dataset.world_size > 1:
            dataset = dataset.shard(self.dataset.world_size, self.dataset.rank)
        if self.dataset.mode == 'train':
            dataset = dataset.repeat()
        if self.args.shuffle:
            dataset = dataset.shuffle(buffer_size=self._num_examples)

        if zeus.is_npu_device():
            # esr cannot adapt to num_parallel_calls on NPU
            dataset = dataset.map(self.data_map_func)
            dataset = dataset.batch(batch_size=self.args.batch_size,
                                    drop_remainder=self.args.drop_last)
        else:
            dataset = dataset.map(
                self.data_map_func,
                num_parallel_calls=tf.data.experimental.AUTOTUNE)
            dataset = dataset.batch(batch_size=self.args.batch_size,
                                    drop_remainder=self.args.drop_last)
            dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
        return dataset
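
A minimal usage sketch (not from the source): an input_fn like the one above is handed to a TF 1.x Estimator, which calls it to build the input pipeline; model_fn and adapter are hypothetical placeholders.

    # Hypothetical follow-up to the snippet above.
    import tensorflow as tf

    estimator = tf.estimator.Estimator(model_fn=model_fn)
    estimator.train(input_fn=adapter.input_fn, max_steps=1000)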
Example #4
def run_remote_worker(worker_id, worker_path, id):
    """Run worker on remote mochine."""
    from zeus.common.utils import init_log
    init_log(level="info",
             log_file=".temp_{}.log".format(worker_id),
             log_path=worker_path)

    config = _load_config(worker_id, worker_path, id)
    os.environ["LD_LIBRARY_PATH"] = config["env"]["LD_LIBRARY_PATH"]
    os.environ["PWD"] = config["env"]["PWD"]
    os.chdir(os.environ["PWD"])
    zeus.register_zeus(os.environ['BACKEND_TYPE'].lower())

    if zeus.is_gpu_device():
        sub_pid_list = call_in_gpu(config, id, worker_id, worker_path)
    elif zeus.is_npu_device():
        os.environ["PYTHONPATH"] = config["env"]["PYTHONPATH"]
        os.environ["PATH"] = config["env"]["PATH"]
        os.environ["ASCEND_OPP_PATH"] = config["env"]["ASCEND_OPP_PATH"]
        sub_pid_list = call_in_npu(config, id, worker_id, worker_path)
    logging.info("DistributedWorker finished!")
    for sub_pid in sub_pid_list:
        kill_proc_tree(pid=sub_pid)
    logging.info("DistributedWorker subprocess cleaned!")
    return 0
Example #5
 def _init_tf_estimator(self):
     """Init tensorflow estimator."""
     if not zeus.is_tf_backend():
         return
     sess_config = self._init_session_config()
     if zeus.is_gpu_device():
         self._init_gpu_estimator(sess_config)
     elif zeus.is_npu_device():
         self._init_npu_estimator(sess_config)
Example #6
 def _init_distributed_setting(self):
     if not self.distributed:
         return
     if zeus.is_npu_device():
         self.npu_init = npu_ops.initialize_system()
         self.npu_shutdown = npu_ops.shutdown_system()
         self.sess.run(self.npu_init)
     self._world_size = hvd.size() if zeus.is_gpu_device() else get_rank_size()
     self._rank_id = hvd.rank() if zeus.is_gpu_device() else get_rank_id()
     self._local_rank_id = hvd.local_rank() if zeus.is_gpu_device() else get_local_rank_id()
Example #7
 def exclude_ignore_index(self, logits, labels):
     """Ignore certain index."""
     logits = tf.transpose(logits, [0, 2, 3, 1])
     if zeus.is_gpu_device():
         indices = tf.where(tf.not_equal(labels, self.ignore_index))
         labels = tf.cast(tf.gather_nd(labels, indices), tf.int32)
         logits = tf.gather_nd(logits, indices)
         return logits, labels, 1.0
     elif zeus.is_npu_device():
         weights = tf.not_equal(labels, self.ignore_index)
         labels = tf.multiply(labels, tf.cast(weights, labels.dtype))
         return logits, labels, tf.to_float(weights)
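
A minimal usage sketch (not from the source): the (logits, labels, weights) triple returned above can be fed to a weighted cross-entropy so that ignored positions contribute zero loss; on the GPU path the scalar weight 1.0 leaves the loss unweighted (TF 1.x API).

 # Hypothetical follow-up to the snippet above.
 loss = tf.losses.sparse_softmax_cross_entropy(
     labels=labels, logits=logits, weights=weights)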
Example #8
    def _init_distributed_setting(self):
        if not self.distributed:
            return

        if zeus.is_npu_device():
            from npu_bridge.estimator import npu_ops
            self.npu_init = npu_ops.initialize_system()
            self.npu_shutdown = npu_ops.shutdown_system()
            self.sess.run(self.npu_init)

        import horovod.tensorflow as hvd
        if zeus.is_gpu_device():
            self._world_size = hvd.size()
            self._rank_id = hvd.rank()
            self._local_rank_id = hvd.local_rank()
        elif zeus.is_npu_device():
            from hccl.manage.api import get_local_rank_id
            from hccl.manage.api import get_rank_size
            from hccl.manage.api import get_rank_id
            self._world_size = get_rank_size()
            self._rank_id = get_rank_id()
            self._local_rank_id = get_local_rank_id()
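
Note that hvd.size() and hvd.rank() are only meaningful after hvd.init() has been called; a minimal sketch of the GPU path, assuming Horovod is installed:

    # hvd.init() must run before any size()/rank() queries.
    import horovod.tensorflow as hvd

    hvd.init()
    world_size = hvd.size()
    rank_id = hvd.rank()
    local_rank_id = hvd.local_rank()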
Example #9
 def _calc_workers_num(self):
     """Calculate workers numbers."""
     if not General.parallel_search:
         return 1
     if zeus.is_gpu_device():
         import torch
         world_size = General.env.world_size
         devices_per_node = torch.cuda.device_count()
         worker_num = (world_size *
                       devices_per_node) // General.devices_per_trainer
     elif zeus.is_npu_device():
         world_devices = int(os.environ['RANK_SIZE'])
         worker_num = world_devices // General.devices_per_trainer
     return worker_num
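
A worked example with hypothetical numbers: two nodes with eight GPUs each and four devices per trainer yield (2 * 8) // 4 = 4 workers.

 # Hypothetical numbers, not from the source.
 world_size, devices_per_node, devices_per_trainer = 2, 8, 4
 worker_num = (world_size * devices_per_node) // devices_per_trainer  # -> 4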
Example #10
 def _init_session_config(self):
     import tensorflow as tf
     if zeus.is_gpu_device():
         sess_config = tf.compat.v1.ConfigProto()
         sess_config.gpu_options.allow_growth = True
         return sess_config
     elif zeus.is_npu_device():
         from tensorflow.core.protobuf.rewriter_config_pb2 import RewriterConfig
         sess_config = tf.ConfigProto()
         sess_config.graph_options.rewrite_options.remapping = RewriterConfig.OFF
         custom_op = sess_config.graph_options.rewrite_options.custom_optimizers.add()
         custom_op.name = "NpuOptimizer"
         custom_op.parameter_map["use_off_line"].b = True
         return sess_config
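
A minimal usage sketch (not from the source): the returned config is handed to a TF 1.x session; sess_config refers to the object built above.

 # Hypothetical follow-up to the snippet above.
 sess = tf.compat.v1.Session(config=sess_config)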
Example #11
 def adjust_pipeline_config(self, cfg):
     """Adjust pipeline config according."""
     cfg_cp = copy.deepcopy(cfg)
     cfg_tiny = copy.deepcopy(cfg)
     workers_num = self._calc_workers_num()
     General.parallel_search = False
     self._get_time_params(cfg_cp)
     self._simulate_tiny_pipeline(cfg_tiny)
     General.parallel_search = cfg.general.parallel_search
     self._modify_pipeline_config(workers_num, self.epoch_time,
                                  self.params_dict)
     if zeus.is_npu_device():
         os.environ['RANK_TABLE_FILE'] = os.environ[
             'ORIGIN_RANK_TABLE_FILE']
         os.environ['RANK_SIZE'] = os.environ['ORIGIN_RANK_SIZE']
     logging.info('Adjust runtime config successfully.')
Example #12
 def _shutdown_distributed(self):
     if zeus.is_npu_device() and self.distributed:
         self.sess.run(self.npu_shutdown)
         self.sess.close()
Example #13
if zeus.is_tf_backend():
    import tensorflow as tf
    from zeus.metrics.tensorflow.metrics import Metrics

    try:
        import horovod.tensorflow as hvd
    except Exception:
        # logging.warning("horovod not been installed, {}".format(str(e)))
        pass
elif zeus.is_ms_backend():
    from mindspore import context
    from mindspore.train import Model as MsModel
    from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor
    from .callbacks.ms_callbacks import EvalCallBack
    from zeus.metrics.mindspore.metrics import Metrics

if zeus.is_npu_device() and zeus.is_tf_backend():
    from npu_bridge.estimator.npu.npu_config import NPURunConfig
    from npu_bridge.estimator.npu.npu_estimator import NPUEstimator
    from npu_bridge.estimator import npu_ops
    from hccl.manage.api import get_local_rank_id
    from hccl.manage.api import get_rank_size
    from hccl.manage.api import get_rank_id
    from tensorflow.core.protobuf.rewriter_config_pb2 import RewriterConfig

logger = logging.getLogger(__name__)


@ClassFactory.register(ClassType.TRAINER)
class Trainer(DistributedWorker):
    """Trainer class.
Example #14
"""TensorFlow Trainer."""
import logging
import numpy as np
import tensorflow as tf
from tensorflow.python.estimator import estimator as est

from zeus.common.general import General
import zeus
from zeus.metrics.tensorflow.metrics import Metrics
from zeus.trainer_base import TrainerBase
try:
    import horovod.tensorflow as hvd
except Exception:
    pass

if zeus.is_npu_device():
    from npu_bridge.estimator.npu.npu_config import NPURunConfig
    from npu_bridge.estimator.npu.npu_estimator import NPUEstimator
    from npu_bridge.estimator import npu_ops
    from hccl.manage.api import get_local_rank_id
    from hccl.manage.api import get_rank_size
    from hccl.manage.api import get_rank_id
    from tensorflow.core.protobuf.rewriter_config_pb2 import RewriterConfig

from zeus.common import FileOps, init_log
from zeus.trainer.modules.losses import Loss
from zeus.trainer.modules.lr_schedulers import LrScheduler
from zeus.trainer.modules.optimizer import Optimizer
from zeus.tf_utils import TFVariables

Example #15
    def valid(self, valid_loader):
        """Validate one step of mode.

        :param valid_loader: valid data loader
        """
        if zeus.is_torch_backend():
            import torch
            from zeus.metrics.pytorch import Metrics
            metrics = Metrics(self.config.metric)
            self.model.eval()
            data_num = 0
            latency_sum = 0.0
            with torch.no_grad():
                for step, batch in enumerate(valid_loader):
                    if isinstance(batch, (list, tuple)):
                        data = batch[0]
                        target = batch[1]
                    else:
                        raise ValueError("The dataset format must be tuple or list, "
                                         "but got {}.".format(type(batch)))
                    if self.config.cuda:
                        data, target = data.cuda(), target.cuda()
                        self.model = self.model.cuda()
                    time_start = time.time()
                    logits = self.model(data)
                    latency_sum += time.time() - time_start
                    metrics(logits, target)
                    n = data.size(0)
                    data_num += n
                    if step % self.config.report_freq == 0:
                        logging.info("step [{}/{}], valid metric [{}]".format(
                            step + 1, len(valid_loader), str(metrics.results)))
            latency = latency_sum / data_num
        elif zeus.is_tf_backend():
            from zeus.metrics.tensorflow.metrics import Metrics
            metrics = Metrics(self.config.metric)
            estimator = self._init_tf_estimator()
            time_start = time.time()
            eval_metrics = estimator.evaluate(input_fn=valid_loader.input_fn, steps=len(valid_loader))
            latency = (time.time() - time_start) / (len(valid_loader) * valid_loader.args.batch_size)
            metrics.update(eval_metrics)
        elif zeus.is_ms_backend():
            from zeus.metrics.mindspore.metrics import Metrics
            from mindspore.train import Model as MsModel
            from .utils import FakeLoss
            metrics = Metrics(self.config.metric)
            metric_name = self.config.metric().type
            dataset_sink_mode = zeus.is_npu_device()
            # loss_fn is not actually needed for eval, but it can't be None at initialization
            ms_model = MsModel(network=self.model,
                               loss_fn=FakeLoss(),
                               metrics={metric_name: metrics()})
            time_start = time.time()
            eval_metrics = ms_model.eval(valid_dataset=valid_loader,
                                         callbacks=None,
                                         dataset_sink_mode=dataset_sink_mode)
            for batch in valid_loader.create_dict_iterator():
                batch_size = batch["image"].shape[0]
                break
            latency = (time.time() - time_start) / (valid_loader.get_dataset_size() * batch_size)
            metrics.update(eval_metrics)
        pfms = metrics.results
        if self.config.evaluate_latency:
            pfms["latency"] = latency
        logging.info("evaluate performance: {}".format(pfms))
        return pfms