Example #1
    def initialize_cache_files(self, filename):
        length = -1
        with self._filereader.open_cache(filename) as cache:

            # Check variables.
            if self._variables is None:
                self._variables = list(cache.keys())
            else:
                if current_communicator():
                    if not set(self._variables) == set(cache.keys()):
                        logger.log(
                            99, 'Error at worker {} {} {}'.format(
                                current_communicator().rank,
                                set(self._variables), set(cache.keys())))
                        raise ValueError(
                            'Cache file variables do not match across workers.')

            for k, v in cache.items():
                if length < 0:
                    length = len(v)
                else:
                    assert (length == len(v))
            self._cache_files.append((filename, length))
            logger.info('{} {}'.format(filename, length))
            if length > self._max_length:
                self._max_length = length
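
Every example on this page follows the same guard pattern: current_communicator() returns the active multi-process communicator, or None in a single-process run, and rank-dependent work is gated on it. Below is a minimal sketch of that pattern, assuming the helper is exposed as in recent nnabla releases (nnabla.utils.communicator_util).

# Minimal sketch of the guard pattern used throughout these examples.
# Assumes current_communicator() lives in nnabla.utils.communicator_util,
# as in recent nnabla releases.
from nnabla.utils.communicator_util import current_communicator


def log_once(message):
    # Print only on rank 0 under MPI; always print in single-process runs.
    comm = current_communicator()
    if comm is None or comm.rank == 0:
        print(message)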
Example #2
    def _get_data(self, position):

        self._position = position
        if current_communicator():
            try:
                filename, index = self._order[position]
            except IndexError:
                logger.log(99, '_get_data() fails at worker {} retrying.'.format(
                    current_communicator().rank))
                sleep(0.01)
                return self._get_data(position)
        else:
            filename, index = self._order[position]

        if filename != self._current_filename:
            file_names_to_prefetch = None
            if self._cache_type == ".npy" and self._num_of_threads > 0:
                file_names_to_prefetch = [o[0] for o in self._order[position + self._max_length:position + self._max_length *
                                                                    self._num_of_threads:self._max_length]]

            self._current_data = self._get_next_data(
                filename, file_names_to_prefetch)
            self._current_filename = filename

        data = [self._current_data[v][index] for v in self.variables]

        if self._normalize:
            data = [d.astype(numpy.float32) * (1.0 / 255.0)
                    if d.dtype == numpy.uint8 else d for d in data]
        return data
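
The normalization branch above only rescales uint8 arrays into float32 values in [0, 1]; arrays of any other dtype are passed through unchanged. A self-contained sketch of that conversion (plain NumPy, the variable names are illustrative):

# Sketch of the uint8 -> float32 rescaling used in _get_data() above.
import numpy

data = [numpy.arange(6, dtype=numpy.uint8).reshape(2, 3),   # image-like data
        numpy.ones((2, 3), dtype=numpy.float32)]            # already float
normalized = [d.astype(numpy.float32) * (1.0 / 255.0)
              if d.dtype == numpy.uint8 else d for d in data]
# normalized[0] is float32 in [0, 1]; normalized[1] is returned as-is.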
Example #3
def collect_and_shape_result(c_load, g_load):
    # c_load : float e.g. 58.5
    # g_load : [[nvidia_device_id, gpu_load]]

    comm = current_communicator()
    if comm:
        res = [[comm.rank, c_load], *g_load[:1]]
        t_load_ndarray = np.array(res).reshape(-1)

        load_var = nn.Variable([
            len(t_load_ndarray),
        ])
        load_var.d = t_load_ndarray
        load_list_var = [
            nn.Variable([
                len(t_load_ndarray),
            ]) for _ in range(comm.size)
        ]
        comm.all_gather(load_var.data, [a.data for a in load_list_var])
        result_arr = [[*np.round(a.d.astype(float), decimals=1)]
                      for a in load_list_var]
    else:
        res = [[0, c_load], *g_load[:1]]
        t_load_ndarray = np.round(np.array(res).reshape(-1), decimals=1)
        result_arr = [[*t_load_ndarray.astype(float)]]

    result_arr = sorted(result_arr, key=lambda x: x[0])

    return result_arr
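
When no communicator is available the function just flattens and rounds the local measurement; with a communicator, every rank's row is gathered via comm.all_gather and the rows are sorted by rank. A hypothetical single-process call, only to illustrate the shape of the result:

# Hypothetical single-process call: CPU at 58.5%, one GPU (device id 0) at 40%.
result = collect_and_shape_result(58.5, [[0, 40]])
# result == [[0.0, 58.5, 0.0, 40.0]]   # [rank, cpu_load, device_id, gpu_load]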
Example #4
def train(info, config):
    config.global_config = info.global_config
    config.training_config = info.training_config

    class OptConfig:
        pass

    config.optimizers = OrderedDict()
    for name, opt in info.optimizers.items():
        o = OptConfig()
        o.optimizer = opt
        o.data_iterators = []
        config.optimizers[name] = o

    class MonConfig:
        pass

    config.monitors = OrderedDict()
    for name, mon in info.monitors.items():
        m = MonConfig()
        m.monitor = mon
        m.data_iterators = []
        config.monitors[name] = m

    # Training
    comm = current_communicator()
    config.training_config.iter_per_epoch //= comm.size if comm else 1
    rng = np.random.RandomState(comm.rank if comm else 0)
    with ExitStack() as stack:
        # Create data_iterator instance only once for each dataset in optimizers
        optimizer_data_iterators = {}
        for name, o in config.optimizers.items():
            for di in o.optimizer.data_iterators.values():
                if di not in optimizer_data_iterators:
                    di_instance = stack.enter_context(di())
                    if comm and comm.size > 1:
                        di_instance = di_instance.slice(
                            rng, comm.size, comm.rank)
                    optimizer_data_iterators[di] = di_instance
                else:
                    di_instance = optimizer_data_iterators[di]
                o.data_iterators.append(di_instance)

        # Create data_iterator instance only once for each dataset in monitors
        monitor_data_iterators = {}
        for name, m in config.monitors.items():
            for di in m.monitor.data_iterators.values():
                if di not in monitor_data_iterators:
                    di_instance = stack.enter_context(di())
                    if comm and comm.size > 1:
                        di_instance = di_instance.slice(
                            rng, comm.size, comm.rank)
                    monitor_data_iterators[di] = di_instance
                else:
                    di_instance = monitor_data_iterators[di]
                m.data_iterators.append(di_instance)
        monitor_data_iterators.update(optimizer_data_iterators)
        yield from _train(config)
Example #5
def _create_dataset(uri, batch_size, shuffle, no_image_normalization, cache_dir, overwrite_cache, create_cache_explicitly, prepare_data_iterator):
    class Dataset:
        pass
    dataset = Dataset()
    dataset.uri = uri
    dataset.normalize = not no_image_normalization

    comm = current_communicator()

    # use same random state for each process until slice is called
    rng = numpy.random.RandomState(0)
    use_memory_cache = comm.size == 1 if comm else True

    if prepare_data_iterator:
        if cache_dir == '':
            cache_dir = None

        # Disable implicit cache creation when MPI is available.
        if cache_dir and (create_cache_explicitly or comm):
            cache_index = os.path.join(cache_dir, "cache_index.csv")
            if not os.path.exists(cache_index) or overwrite_cache:
                if single_or_rankzero():
                    logger.log(99, 'Creating cache data for "' + uri + '"')

                    try:
                        os.makedirs(cache_dir)
                    except OSError:
                        pass  # python2 does not support exist_ok arg

                    with data_iterator_csv_dataset(uri, batch_size, shuffle, rng=rng, normalize=False, cache_dir=cache_dir, with_memory_cache=False) as di:
                        pass

            rng = numpy.random.RandomState(0)
            dataset.data_iterator = (lambda: data_iterator_cache(
                cache_dir, batch_size, shuffle, rng=rng, normalize=dataset.normalize, with_memory_cache=use_memory_cache))
        elif not cache_dir or overwrite_cache or not os.path.exists(cache_dir) or len(os.listdir(cache_dir)) == 0:
            if comm:
                logger.critical(
                    'Implicit cache creation is not supported with MPI')
                import sys
                sys.exit(-1)
            else:
                if cache_dir:
                    try:
                        os.makedirs(cache_dir)
                    except OSError:
                        pass  # python2 does not support exist_ok arg
                dataset.data_iterator = (lambda: data_iterator_csv_dataset(
                    uri, batch_size, shuffle, rng=rng, normalize=dataset.normalize, cache_dir=cache_dir))
        else:
            dataset.data_iterator = (lambda: data_iterator_cache(
                cache_dir, batch_size, shuffle, rng=rng, normalize=dataset.normalize, with_memory_cache=use_memory_cache))
    else:
        dataset.data_iterator = None
    return dataset
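
Note that dataset.data_iterator stores a zero-argument factory rather than a live iterator, so callers can open it lazily and as a context manager, which is how the other examples on this page consume it. A minimal sketch of that hand-off, assuming dataset was produced by _create_dataset(...) with prepare_data_iterator=True:

# Sketch: open the stored factory lazily and pull one mini-batch.
with dataset.data_iterator() as di:
    batch = di.next()        # tuple of arrays, one per variable
    names = di.variables     # variable names aligned with `batch`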
Example #6
    def _get_next_data(self, filename, file_names_to_prefetch, retry=1):
        if retry > 10:
            logger.log(99, '_get_next_data() retry count exceeded, giving up.')
            raise RuntimeError('_get_next_data() failed after 10 retries.')
        if self._cache_type == '.npy':
            next_data = self._cache_reader_with_prefetch.open_and_prefetch_cache(
                filename, file_names_to_prefetch)
        else:
            # h5 format
            next_data = {}
            with self._filereader.open_cache(filename) as cache:
                for k, v in cache.items():
                    next_data[k] = v[()]

        if current_communicator():
            if set(self._variables) != set(next_data.keys()):
                logger.log(99, '_get_next_data() fails at worker {} retrying count {}/10.'.format(
                    current_communicator().rank, retry))
                sleep(0.01)
                return self._get_next_data(filename, file_names_to_prefetch, retry+1)
        return next_data
Example #7
def _context(proto):
    comm = current_communicator()
    if not proto.backends:
        logger.warn('Old-style context. Updating to new format.')
        # Update from old Context
        backends = [x.strip() for x in proto.backend.split('|')]
        compute_backends = [
            x.strip() for x in proto.compute_backend.split('|')
        ]
        if 'cuda' in backends:
            device_id = str(proto.device_id)
            if comm:
                device_id = str(comm.local_rank)

            if 'cudnn' in compute_backends:
                try:
                    import nnabla_ext.cudnn
                    ctx = nnabla_ext.cudnn.context(device_id=device_id)
                except ImportError:
                    logger.warn('Fallback to CPU context.')
                    import nnabla_ext.cpu
                    ctx = nnabla_ext.cpu.context()
            elif 'default' in compute_backends:
                try:
                    import nnabla_ext.cuda
                    ctx = nnabla_ext.cuda.context(device_id=device_id)
                except ImportError:
                    logger.warn('Fallback to CPU context.')
                    import nnabla_ext.cpu
                    ctx = nnabla_ext.cpu.context()
            else:
                raise ValueError('Invalid compute_backend {}'.format(
                    proto.compute_backend))
        elif 'cpu' in backends:
            import nnabla_ext.cpu
            ctx = nnabla_ext.cpu.context()
        else:
            raise ValueError('Invalid context {}'.format(proto))
        ctx.array_class = str(proto.array_class)
        return ctx
    ctx = nn.Context()
    ctx.backend = proto.backends
    ctx.array_class = str(proto.array_class)

    if comm:
        ctx.device_id = str(comm.local_rank)
    else:
        ctx.device_id = str(proto.device_id)

    return ctx
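
In the new-style branch the proto already carries the backend list, and _context() only fills in array_class and a rank-aware device_id. A hand-built context with the same fields might look like the sketch below; the concrete backend and array-class strings are illustrative, not taken from the example.

# Sketch: building a new-style context by hand (field values are illustrative).
import nnabla as nn

ctx = nn.Context()
ctx.backend = ['cudnn:float', 'cuda:float', 'cpu:float']
ctx.array_class = 'CudaCachedArray'
ctx.device_id = '0'   # would be str(comm.local_rank) under MPI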
Example #8
def measure_cpu_gpu_instant_load():
    # Get current cpu gpu load, as
    # load = [rank, cpu_load, nvidia_device_id, gpu_load]
    # result_arr: [load, load, ...]

    gpu_load = []
    if gpu_load_backend_ok:
        global gpu_a_load
        global gpu_m_count

        gpu_m_count += 1
        try:
            comm = current_communicator()
            if comm:
                index = comm.local_rank
            elif 'cuda' in str(nn.get_current_context().backend):
                index = 0
            else:
                raise Exception
            handler = pynvml.nvmlDeviceGetHandleByIndex(index)
            gpu_load = [[
                index,
                pynvml.nvmlDeviceGetUtilizationRates(handler).gpu
            ]]

            if index in gpu_a_load.keys():
                gpu_a_load[index]['name'] = pynvml.nvmlDeviceGetName(
                    handler).decode("utf-8")
                o_load = gpu_a_load[index]['load']
                n_load = gpu_load[0][1]
                gpu_a_load[index]['load'] = (
                    (gpu_m_count - 1) * o_load + n_load) / gpu_m_count
            else:
                gpu_a_load[index] = {
                    'name': pynvml.nvmlDeviceGetName(handler).decode("utf-8"),
                    'load': gpu_load[0][1]
                }

        except Exception:
            gpu_load = []

    if cpu_load_backend_ok:
        global p_handler
        cpu_load = p_handler.cpu_percent()
        callback.update_status(
            ('cpu_gpu_load', collect_and_shape_result(cpu_load, gpu_load)))
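
The pynvml calls above assume NVML has already been initialized elsewhere (which is what the module-level gpu_load_backend_ok flag usually reflects), and the CPU side relies on a module-level psutil-style p_handler. A standalone sketch of the GPU utilization query, with the init/shutdown steps the example omits:

# Standalone sketch of the NVML utilization query used above.
import pynvml

pynvml.nvmlInit()
try:
    handle = pynvml.nvmlDeviceGetHandleByIndex(0)
    name = pynvml.nvmlDeviceGetName(handle)                       # bytes on older pynvml
    gpu_util = pynvml.nvmlDeviceGetUtilizationRates(handle).gpu   # percent
finally:
    pynvml.nvmlShutdown()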
Example #9
def load(filenames,
         prepare_data_iterator=True,
         batch_size=None,
         exclude_parameter=False,
         parameter_only=False,
         extension=".nntxt"):
    '''load
    Load network information from files.

    Args:
        filenames (list): List of filenames, or a single filename or file-like object.
        extension (str): Used when a file-like object is given; one of ".nntxt", ".prototxt", ".protobuf", ".h5", ".nnp".
    Returns:
        Info: Network information.
    '''
    class Info:
        pass

    info = Info()

    proto = nnabla_pb2.NNablaProtoBuf()

    # optimizer checkpoint
    opti_proto = nnabla_pb2.NNablaProtoBuf()
    OPTI_BUF_EXT = ['.optimizer']
    opti_h5_files = {}
    tmpdir = tempfile.mkdtemp()

    if isinstance(filenames, list) or isinstance(filenames, tuple):
        pass
    elif isinstance(filenames, str) or hasattr(filenames, 'read'):
        filenames = [filenames]

    for filename in filenames:
        if isinstance(filename, str):
            _, ext = os.path.splitext(filename)
        else:
            ext = extension

        # TODO: There are some known problems here.
        #   - Even when a protobuf file includes the network structure,
        #     it will not be loaded.
        #   - Even when a prototxt file includes parameters,
        #     they will not be loaded.

        if ext in ['.nntxt', '.prototxt']:
            if not parameter_only:
                with get_file_handle_load(filename, ext) as f:
                    try:
                        text_format.Merge(f.read(), proto)
                    except:
                        logger.critical('Failed to read {}.'.format(filename))
                        logger.critical(
                            '2 byte characters may be used for file name or folder name.'
                        )
                        raise
            if len(proto.parameter) > 0:
                if not exclude_parameter:
                    nn.load_parameters(filename, extension=ext)
        elif ext in ['.protobuf', '.h5']:
            if not exclude_parameter:
                nn.load_parameters(filename, extension=ext)
            else:
                logger.info('Skip loading parameter.')

        elif ext == '.nnp':
            with get_file_handle_load(filename, ext) as nnp:
                for name in nnp.namelist():
                    _, ext = os.path.splitext(name)
                    if name == 'nnp_version.txt':
                        pass  # TODO currently do nothing with version.
                    elif ext in ['.nntxt', '.prototxt']:
                        if not parameter_only:
                            with nnp.open(name, 'r') as f:
                                text_format.Merge(f.read(), proto)
                        if len(proto.parameter) > 0:
                            if not exclude_parameter:
                                with nnp.open(name, 'r') as f:
                                    nn.load_parameters(f, extension=ext)
                    elif ext in ['.protobuf', '.h5']:
                        if not exclude_parameter:
                            with nnp.open(name, 'r') as f:
                                nn.load_parameters(f, extension=ext)
                        else:
                            logger.info('Skip loading parameter.')
                    elif ext in OPTI_BUF_EXT:
                        buf_type = get_buf_type(name)
                        if buf_type == 'protobuf':
                            with nnp.open(name, 'r') as f:
                                with get_file_handle_load(
                                        f, '.protobuf') as opti_p:
                                    opti_proto.MergeFromString(opti_p.read())
                        elif buf_type == 'h5':
                            nnp.extract(name, tmpdir)
                            opti_h5_files[name] = os.path.join(tmpdir, name)

    default_context = None
    if proto.HasField('global_config'):
        info.global_config = _global_config(proto)
        default_context = info.global_config.default_context
        if 'cuda' in default_context.backend:
            import nnabla_ext.cudnn
        elif 'cuda:float' in default_context.backend:
            try:
                import nnabla_ext.cudnn
            except:
                pass
        try:
            x = nn.Variable()
            y = nn.Variable()
            func = F.ReLU(default_context, inplace=True)
            func.setup([x], [y])
            func.forward([x], [y])
        except:
            logger.warn('Fallback to CPU context.')
            import nnabla_ext.cpu
            default_context = nnabla_ext.cpu.context()
    else:
        import nnabla_ext.cpu
        default_context = nnabla_ext.cpu.context()

    comm = current_communicator()
    if comm:
        default_context.device_id = str(comm.local_rank)
    if proto.HasField('training_config'):
        info.training_config = _training_config(proto)

    info.datasets = _datasets(
        proto, prepare_data_iterator if prepare_data_iterator is not None else
        info.training_config.max_epoch > 0)

    info.networks = _networks(proto, default_context, batch_size)

    info.optimizers = _optimizers(proto, default_context, info.networks,
                                  info.datasets)
    _load_optimizer_checkpoint(opti_proto, opti_h5_files, info)
    shutil.rmtree(tmpdir)

    info.monitors = _monitors(proto, default_context, info.networks,
                              info.datasets)

    info.executors = _executors(proto, info.networks)

    return info
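
A hypothetical call, only to show how the returned Info object is consumed; the filename and batch size are placeholders.

# Hypothetical usage of load(); 'result.nnp' and 64 are placeholders.
info = load(['result.nnp'], batch_size=64)
print(list(info.networks.keys()))    # networks defined in the file
print(list(info.executors.keys()))   # executors built from the proto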
Example #10
def train_command(args):

    if single_or_rankzero():
        configure_progress(os.path.join(args.outdir, 'progress.txt'))

    info = load.load([args.config], exclude_parameter=True)

    # Check dataset uri is empty.
    dataset_error = False
    for dataset in info.datasets.values():
        if dataset.uri.strip() == '':
            dataset_error = True
    if dataset_error:
        logger.log(99, 'Fatal error. Dataset URI is empty.')
        return False

    class TrainConfig:
        pass

    config = TrainConfig()
    config.timelimit = -1
    if args.param:
        load.load([args.param], parameter_only=True)

    config.global_config = info.global_config
    config.training_config = info.training_config

    if single_or_rankzero():
        logger.log(99, 'Train with contexts {}'.format(available_contexts))

    class OptConfig:
        pass

    config.optimizers = OrderedDict()
    for name, opt in info.optimizers.items():
        o = OptConfig()
        o.optimizer = opt
        o.data_iterator = None
        config.optimizers[name] = o

    class MonConfig:
        pass

    config.monitors = OrderedDict()
    for name, mon in info.monitors.items():
        m = MonConfig()
        m.monitor = mon
        m.data_iterator = None
        config.monitors[name] = m

    # Training
    comm = current_communicator()
    config.training_config.iter_per_epoch //= comm.size if comm else 1
    max_iteration = config.training_config.max_epoch * \
        config.training_config.iter_per_epoch

    global _save_parameter_info
    _save_parameter_info = {}
    _, config_ext = os.path.splitext(args.config)
    if config_ext == '.prototxt' or config_ext == '.nntxt':
        _save_parameter_info['config'] = args.config
    elif config_ext == '.nnp':
        with zipfile.ZipFile(args.config, 'r') as nnp:
            for name in nnp.namelist():
                _, ext = os.path.splitext(name)
                if ext == '.nntxt' or ext == '.prototxt':
                    nnp.extract(name, args.outdir)
                    _save_parameter_info['config'] = os.path.join(
                        args.outdir, name)

    result = False
    if max_iteration > 0:
        data_iterators = {'optimizer': {}, 'monitor': {}}
        rng = np.random.RandomState(comm.rank if comm else 0)
        with ExitStack() as stack:
            for name, o in config.optimizers.items():
                o.data_iterator = stack.enter_context(
                    o.optimizer.data_iterator())
                if comm and comm.size > 1:
                    o.data_iterator = o.data_iterator.slice(
                        rng, comm.size, comm.rank)
            for name, m in config.monitors.items():
                m.data_iterator = stack.enter_context(
                    m.monitor.data_iterator())
                if comm and comm.size > 1:
                    m.data_iterator = m.data_iterator.slice(
                        rng, comm.size, comm.rank)
            result = _train(args, config)
    else:
        # save parameters without training (0 epoch learning)
        logger.log(99, '0 epoch learning. (Just save parameter.)')
        if single_or_rankzero():
            _save_parameters(args, 'current', 0, True)
        result = True

    if single_or_rankzero():
        if result:
            logger.log(99, 'Training Completed.')
        else:
            logger.log(99, 'Training Incomplete.')
    if single_or_rankzero():
        progress(None)

    return True
Example #11
def _create_optimizer(ctx, o, networks, datasets):
    class Optimizer:
        pass

    optimizer = Optimizer()

    optimizer.comm = current_communicator()
    comm_size = optimizer.comm.size if optimizer.comm else 1
    optimizer.start_iter = (o.start_iter - 1) // comm_size + \
        1 if o.start_iter > 0 else 0
    optimizer.end_iter = (o.end_iter - 1) // comm_size + \
        1 if o.end_iter > 0 else 0
    optimizer.name = o.name
    optimizer.order = o.order
    optimizer.update_interval = o.update_interval if o.update_interval > 0 else 1
    optimizer.network = networks[o.network_name]
    optimizer.data_iterators = OrderedDict()
    for d in o.dataset_name:
        optimizer.data_iterators[d] = datasets[d].data_iterator

    optimizer.dataset_assign = OrderedDict()
    for d in o.data_variable:
        optimizer.dataset_assign[optimizer.network.variables[
            d.variable_name]] = d.data_name

    optimizer.generator_assign = OrderedDict()
    for g in o.generator_variable:
        optimizer.generator_assign[optimizer.network.variables[
            g.variable_name]] = _get_generator(g)

    optimizer.loss_variables = []
    for l in o.loss_variable:
        optimizer.loss_variables.append(
            optimizer.network.variables[l.variable_name])

    optimizer.parameter_learning_rate_multipliers = OrderedDict()
    for p in o.parameter_variable:
        param_variable_names = _get_matching_variable_names(
            p.variable_name, optimizer.network.variables.keys())
        for v_name in param_variable_names:
            optimizer.parameter_learning_rate_multipliers[
                optimizer.network.
                variables[v_name]] = p.learning_rate_multiplier

    with nn.context_scope(ctx):
        if o.solver.type == 'Adagrad':
            optimizer.solver = S.Adagrad(o.solver.adagrad_param.lr,
                                         o.solver.adagrad_param.eps)
            init_lr = o.solver.adagrad_param.lr
        elif o.solver.type == 'Adadelta':
            optimizer.solver = S.Adadelta(o.solver.adadelta_param.lr,
                                          o.solver.adadelta_param.decay,
                                          o.solver.adadelta_param.eps)
            init_lr = o.solver.adadelta_param.lr
        elif o.solver.type == 'Adam':
            optimizer.solver = S.Adam(o.solver.adam_param.alpha,
                                      o.solver.adam_param.beta1,
                                      o.solver.adam_param.beta2,
                                      o.solver.adam_param.eps)
            init_lr = o.solver.adam_param.alpha
        elif o.solver.type == 'Adamax':
            optimizer.solver = S.Adamax(o.solver.adamax_param.alpha,
                                        o.solver.adamax_param.beta1,
                                        o.solver.adamax_param.beta2,
                                        o.solver.adamax_param.eps)
            init_lr = o.solver.adamax_param.alpha
        elif o.solver.type == 'AdaBound':
            optimizer.solver = S.AdaBound(o.solver.adabound_param.alpha,
                                          o.solver.adabound_param.beta1,
                                          o.solver.adabound_param.beta2,
                                          o.solver.adabound_param.eps,
                                          o.solver.adabound_param.final_lr,
                                          o.solver.adabound_param.gamma)
            init_lr = o.solver.adabound_param.alpha
        elif o.solver.type == 'AMSGRAD':
            optimizer.solver = S.AMSGRAD(o.solver.amsgrad_param.alpha,
                                         o.solver.amsgrad_param.beta1,
                                         o.solver.amsgrad_param.beta2,
                                         o.solver.amsgrad_param.eps)
            init_lr = o.solver.amsgrad_param.alpha
        elif o.solver.type == 'AMSBound':
            optimizer.solver = S.AMSBound(o.solver.amsbound_param.alpha,
                                          o.solver.amsbound_param.beta1,
                                          o.solver.amsbound_param.beta2,
                                          o.solver.amsbound_param.eps,
                                          o.solver.amsbound_param.final_lr,
                                          o.solver.amsbound_param.gamma)
            init_lr = o.solver.amsbound_param.alpha
        elif o.solver.type == 'Eve':
            p = o.solver.eve_param
            optimizer.solver = S.Eve(p.alpha, p.beta1, p.beta2, p.beta3, p.k,
                                     p.k2, p.eps)
            init_lr = p.alpha
        elif o.solver.type == 'Momentum':
            optimizer.solver = S.Momentum(o.solver.momentum_param.lr,
                                          o.solver.momentum_param.momentum)
            init_lr = o.solver.momentum_param.lr
        elif o.solver.type == 'Nesterov':
            optimizer.solver = S.Nesterov(o.solver.nesterov_param.lr,
                                          o.solver.nesterov_param.momentum)
            init_lr = o.solver.nesterov_param.lr
        elif o.solver.type == 'RMSprop':
            optimizer.solver = S.RMSprop(o.solver.rmsprop_param.lr,
                                         o.solver.rmsprop_param.decay,
                                         o.solver.rmsprop_param.eps)
            init_lr = o.solver.rmsprop_param.lr
        elif o.solver.type == 'Sgd' or o.solver.type == 'SGD':
            optimizer.solver = S.Sgd(o.solver.sgd_param.lr)
            init_lr = o.solver.sgd_param.lr
        else:
            raise ValueError('Solver "' + o.solver.type +
                             '" is not supported.')

    parameters = {
        v.name: v.variable_instance
        for v, local_lr in
        optimizer.parameter_learning_rate_multipliers.items() if local_lr > 0.0
    }
    optimizer.solver.set_parameters(parameters)
    optimizer.parameters = OrderedDict(
        sorted(parameters.items(), key=lambda x: x[0]))

    optimizer.weight_decay = o.solver.weight_decay

    # keep following 2 lines for backward compatibility
    optimizer.lr_decay = o.solver.lr_decay if o.solver.lr_decay > 0.0 else 1.0
    optimizer.lr_decay_interval = o.solver.lr_decay_interval if o.solver.lr_decay_interval > 0 else 1
    optimizer.solver.set_states_from_protobuf(o)

    optimizer.comm = current_communicator()
    comm_size = optimizer.comm.size if optimizer.comm else 1
    optimizer.scheduler = ExponentialScheduler(init_lr, 1.0, 1)

    if o.solver.lr_scheduler_type == 'Polynomial':
        if o.solver.polynomial_scheduler_param.power != 0.0:
            optimizer.scheduler = PolynomialScheduler(
                init_lr,
                o.solver.polynomial_scheduler_param.max_iter // comm_size,
                o.solver.polynomial_scheduler_param.power)
    elif o.solver.lr_scheduler_type == 'Cosine':
        optimizer.scheduler = CosineScheduler(
            init_lr, o.solver.cosine_scheduler_param.max_iter // comm_size)
    elif o.solver.lr_scheduler_type == 'Exponential':
        if o.solver.exponential_scheduler_param.gamma != 1.0:
            optimizer.scheduler = ExponentialScheduler(
                init_lr, o.solver.exponential_scheduler_param.gamma,
                o.solver.exponential_scheduler_param.iter_interval //
                comm_size if
                o.solver.exponential_scheduler_param.iter_interval > comm_size
                else 1)
    elif o.solver.lr_scheduler_type == 'Step':
        if o.solver.step_scheduler_param.gamma != 1.0 and len(
                o.solver.step_scheduler_param.iter_steps) > 0:
            optimizer.scheduler = StepScheduler(
                init_lr, o.solver.step_scheduler_param.gamma, [
                    step // comm_size
                    for step in o.solver.step_scheduler_param.iter_steps
                ])
    elif o.solver.lr_scheduler_type == 'Custom':
        # ToDo
        raise NotImplementedError()
    elif o.solver.lr_scheduler_type == '':
        if o.solver.lr_decay_interval != 0 or o.solver.lr_decay != 0.0:
            optimizer.scheduler = ExponentialScheduler(
                init_lr, o.solver.lr_decay if o.solver.lr_decay > 0.0 else 1.0,
                o.solver.lr_decay_interval //
                comm_size if o.solver.lr_decay_interval > comm_size else 1)
    else:
        raise ValueError('Learning Rate Scheduler "' +
                         o.solver.lr_scheduler_type + '" is not supported.')

    if o.solver.lr_warmup_scheduler_type == 'Linear':
        if o.solver.linear_warmup_scheduler_param.warmup_iter >= comm_size:
            optimizer.scheduler = LinearWarmupScheduler(
                optimizer.scheduler,
                o.solver.linear_warmup_scheduler_param.warmup_iter //
                comm_size)

    optimizer.forward_sequence = optimizer.network.get_forward_sequence(
        optimizer.loss_variables)
    optimizer.backward_sequence = optimizer.network.get_backward_sequence(
        optimizer.loss_variables,
        optimizer.parameter_learning_rate_multipliers)

    return optimizer
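
Every scheduler variant constructed above exposes the same get_learning_rate(iter) interface, and the training loop (see Example #16) feeds it straight into the solver. A two-line sketch of that hand-off:

# Sketch: how the schedulers built above are consumed during training.
lr = optimizer.scheduler.get_learning_rate(iteration)   # iteration is a 0-based int
optimizer.solver.set_learning_rate(lr)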
Example #12
def _train(args, config):
    global _save_parameter_info
    comm = current_communicator()
    _CGLOAD_LOG_INTERVAL = 20

    best_epoch = None
    best_error = None
    last_epoch = 0
    if args.resume:
        last_epoch, best_epoch, best_error = _get_current_parameter(args)
        if best_epoch is not None:
            logger.log(
                99, "Best error {} recorded at epoch {} in previous training.".
                format(best_error, best_epoch))
            if best_epoch > last_epoch:
                logger.log(
                    99,
                    "Resumed epoch is {} but this training keep this result.".
                    format(last_epoch))
        logger.log(99, "Resume from epoch {}".format(last_epoch + 1))

    callback.update_status(('epoch.max', config.training_config.max_epoch))
    callback.update_status(
        ('epoch.current',
         last_epoch + 1 if last_epoch < config.training_config.max_epoch else
         config.training_config.max_epoch))

    max_iteration = config.training_config.max_epoch * \
        config.training_config.iter_per_epoch
    if single_or_rankzero():
        logger.log(
            99, 'Training epoch {} of {} begin'.format(
                last_epoch + 1, config.training_config.max_epoch))

    class Cost:
        pass

    cost = Cost()
    cost.sum_epoch = 0.0
    cost.num_iteration = 0
    cost.sum_iteration = 0.0
    cost.variables = None

    class TimeInfo:
        pass

    timeinfo = TimeInfo()
    timeinfo.past_time = 0
    timeinfo.estimate_time = 0
    timeinfo.last_past_time = None

    if max_iteration > 0:
        last_iteration = last_epoch * config.training_config.iter_per_epoch
        if last_iteration < max_iteration:

            timeinfo.start_time = time.time()
            timeinfo.last_epoch_start_time = timeinfo.start_time

            callback.update_status('processing', True, timeinfo.start_time)

            for iteration in range(last_iteration, max_iteration):

                # instant load measurement
                measure_cpu_gpu_instant_load()

                cost = _update(iteration, config, cost)

                if np.isnan(cost.sum_epoch) or np.isinf(cost.sum_epoch):
                    logger.log(99, 'Cost is Nan')
                    return False, False

                timeinfo = _calc_estimate_time(timeinfo, max_iteration,
                                               last_iteration, iteration + 1)
                callback.update_time_train(prediction=timeinfo.estimate_time)

                if 0 < config.timelimit < timeinfo.estimate_time:
                    logger.log(
                        99,
                        'Expected training time ({:.3f}s) will exceed time limit ({}s).'
                        .format(timeinfo.estimate_time, config.timelimit))
                    return False, False

                if (iteration +
                        1) % config.training_config.iter_per_epoch == 0:
                    last_past_time = -1
                    # End of epoch
                    epoch = iteration // config.training_config.iter_per_epoch + 1
                    cost_avg_epoch = cost.sum_epoch / cost.num_iteration if cost.num_iteration else 0
                    cost.sum_epoch = 0.0
                    cost.num_iteration = 0
                    monitoring_report = []

                    # Evaluation
                    error_str = ''
                    if epoch % config.training_config.monitor_interval == 0 or epoch <= 5:
                        best_error, error_str = _evaluate(
                            args, config, monitoring_report, best_error, epoch)

                    # Cpu/Gpu average load
                    cg_load_str = ''
                    cgload_log = ''
                    cg_load = get_cpu_gpu_average_load()
                    if cg_load:
                        cg_load_str = 'epoch {} average_load_matrix: {}'.format(
                            epoch, cg_load)
                        span = _calc_epoch_span(timeinfo)
                        if span > _CGLOAD_LOG_INTERVAL:
                            cgload_log = _format_cgload_log(cg_load)

                    if single_or_rankzero():
                        # Write to monitoring_report.yml
                        f = open(
                            os.path.join(args.outdir, 'monitoring_report.yml'),
                            'a')
                        f.write('{}:\n'.format(epoch - 1))
                        f.write('  cost: {}\n'.format(cost_avg_epoch))
                        for s in monitoring_report:
                            f.write(s)
                        f.close()

                        callback.update_status(
                            (['monitoring_report', epoch,
                              'cost'], cost_avg_epoch))

                        _save_parameters(args, 'current', epoch, config)

                        callback.update_status(('epoch.current', epoch))
                        callback.update_status()

                        logger.log(
                            99,
                            'epoch {} of {} cost={:.6f} {} time=({:.1f}s /{:.1f}s) {}'
                            .format(epoch, config.training_config.max_epoch,
                                    cost_avg_epoch, error_str,
                                    timeinfo.past_time, timeinfo.estimate_time,
                                    cgload_log))

                        if cg_load_str:
                            # cpu_gpu_average_load record at epoch level
                            callback.update_status(
                                (['cpu_gpu_epoch_load', epoch], cg_load))
                            progress(cg_load_str, 1)

                        if not callback.check_training_time(
                                args, config, timeinfo, epoch, last_epoch):
                            _save_parameters(args, 'current', epoch, config,
                                             True)
                            return False, True

            if single_or_rankzero():
                _save_parameters(args, 'current', epoch, config, True)
    return True, False
Example #13
def _create_optimizer(ctx, o, networks, datasets):
    class Optimizer:
        pass

    optimizer = Optimizer()

    optimizer.name = o.name
    optimizer.order = o.order
    optimizer.update_interval = o.update_interval if o.update_interval > 0 else 1
    optimizer.network = networks[o.network_name]
    optimizer.data_iterator = datasets[o.dataset_name].data_iterator

    optimizer.dataset_assign = OrderedDict()
    for d in o.data_variable:
        optimizer.dataset_assign[optimizer.network.variables[
            d.variable_name]] = d.data_name

    optimizer.generator_assign = OrderedDict()
    for g in o.generator_variable:
        optimizer.generator_assign[optimizer.network.variables[
            g.variable_name]] = _get_generator(g)

    optimizer.loss_variables = []
    for l in o.loss_variable:
        optimizer.loss_variables.append(
            optimizer.network.variables[l.variable_name])

    optimizer.parameter_learning_rate_multipliers = OrderedDict()
    for p in o.parameter_variable:
        param_variable_names = _get_matching_variable_names(
            p.variable_name, optimizer.network.variables.keys())
        for v_name in param_variable_names:
            optimizer.parameter_learning_rate_multipliers[
                optimizer.network.
                variables[v_name]] = p.learning_rate_multiplier

    with nn.context_scope(ctx):
        if o.solver.type == 'Adagrad':
            optimizer.solver = S.Adagrad(o.solver.adagrad_param.lr,
                                         o.solver.adagrad_param.eps)
        elif o.solver.type == 'Adadelta':
            optimizer.solver = S.Adadelta(o.solver.adadelta_param.lr,
                                          o.solver.adadelta_param.decay,
                                          o.solver.adadelta_param.eps)
        elif o.solver.type == 'Adam':
            optimizer.solver = S.Adam(o.solver.adam_param.alpha,
                                      o.solver.adam_param.beta1,
                                      o.solver.adam_param.beta2,
                                      o.solver.adam_param.eps)
        elif o.solver.type == 'Adamax':
            optimizer.solver = S.Adamax(o.solver.adamax_param.alpha,
                                        o.solver.adamax_param.beta1,
                                        o.solver.adamax_param.beta2,
                                        o.solver.adamax_param.eps)
        elif o.solver.type == 'Eve':
            p = o.solver.eve_param
            optimizer.solver = S.Eve(p.alpha, p.beta1, p.beta2, p.beta3, p.k,
                                     p.k2, p.eps)
        elif o.solver.type == 'Momentum':
            optimizer.solver = S.Momentum(o.solver.momentum_param.lr,
                                          o.solver.momentum_param.momentum)
        elif o.solver.type == 'Nesterov':
            optimizer.solver = S.Nesterov(o.solver.nesterov_param.lr,
                                          o.solver.nesterov_param.momentum)
        elif o.solver.type == 'RMSprop':
            optimizer.solver = S.RMSprop(o.solver.rmsprop_param.lr,
                                         o.solver.rmsprop_param.decay,
                                         o.solver.rmsprop_param.eps)
        elif o.solver.type == 'Sgd' or o.solver.type == 'SGD':
            optimizer.solver = S.Sgd(o.solver.sgd_param.lr)
        else:
            raise ValueError('Solver "' + o.solver.type +
                             '" is not supported.')

    parameters = {
        v.name: v.variable_instance
        for v, local_lr in
        optimizer.parameter_learning_rate_multipliers.items() if local_lr > 0.0
    }
    optimizer.solver.set_parameters(parameters)
    optimizer.parameters = OrderedDict(
        sorted(parameters.items(), key=lambda x: x[0]))

    optimizer.weight_decay = o.solver.weight_decay
    optimizer.lr_decay = o.solver.lr_decay if o.solver.lr_decay > 0.0 else 1.0
    optimizer.lr_decay_interval = o.solver.lr_decay_interval if o.solver.lr_decay_interval > 0 else 1

    optimizer.comm = current_communicator()
    if optimizer.comm is not None:
        new_interval = optimizer.lr_decay_interval // optimizer.comm.size
        if new_interval == 0:
            new_interval = 1
        logger.log(
            99, 'LR Decay interval divide by {} ({} -> {})'.format(
                optimizer.comm.size, optimizer.lr_decay_interval,
                new_interval))
        optimizer.lr_decay_interval = new_interval

    optimizer.forward_sequence = optimizer.network.get_forward_sequence(
        optimizer.loss_variables)
    optimizer.backward_sequence = optimizer.network.get_backward_sequence(
        optimizer.loss_variables,
        optimizer.parameter_learning_rate_multipliers)

    return optimizer
Example #14
def lms_scheduler(ctx, use_lms, gpu_memory_size=None, window_length=None):
    _check_list = [x.split(":")[0] for x in ctx.backend]
    if "cudnn" not in _check_list and "cuda" not in _check_list:
        logger.warn(
            "ctx passed to scheduler doesn't have cuda/cudnn backend. lms scheduler will not be used."
        )
        use_lms = False

    comm = current_communicator()
    if comm:
        logger.log(99,
                   f'[OoC] Currently OoC is disabled for Multi-GPU training.')
        use_lms = False

    if use_lms:
        gpu_index = 0
        if 'cuda' in str(ctx.backend):
            gpu_index = int(ctx.device_id)
        else:
            logger.log(99, f'[OoC] OoC is only enabled for GPU training.')
            raise Exception('[OoC] OoC is only enabled for GPU training.')

        if gpu_memory_size is None or gpu_memory_size == 0:
            try:
                handle = nvml.nvmlDeviceGetHandleByIndex(gpu_index)
                total_memory = nvml.nvmlDeviceGetMemoryInfo(handle).total
                gpu_memory_size = int(total_memory * 0.7)
            except:
                logger.log(
                    99,
                    '[OoC] Could not get GPU memory size; using default value (6 GB).'
                )
                gpu_memory_size = 6e9  # default 6 GB

        if window_length is None or window_length == 0:
            window_length = int(gpu_memory_size * 1.5)

        logger.log(
            99,
            f'[OoC] gpu_memory_limit: {gpu_memory_size / 1e9}GB, prefetch_window_length: {window_length / 1e9}GB'
        )
        # Change array preference so that lms works well.
        # import nnabla_ext.cuda.init as cuda_init
        # cuda_init.prefer_cpu_pinned_array()
        # cuda_init.prefer_cuda_virtual_array()
        from nnabla.ext_utils import get_extension_context
        be, tc = ctx.backend[0].split(":")
        cpu_ctx = get_extension_context("cpu", device_id="", type_config=tc)
        return SwapInOutScheduler(cpu_ctx, ctx, gpu_memory_size, window_length)
    else:

        class DummyScheduler(object):
            function_pre_hook = None
            function_post_hook = None
            update_pre_hook = None
            update_post_hook = None

            def start_scheduling(self):
                return None

            def end_scheduling(self):
                return None

            def __enter__(self):
                return self

            def __exit__(self, exc_type, exc_val, exc_tb):
                pass

        return DummyScheduler()
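
Both the real SwapInOutScheduler and the DummyScheduler fallback expose the same surface (a context manager plus function/update hooks), so callers can stay agnostic about whether OoC is active. A hedged usage sketch, assuming loss is the root nn.Variable of an already-built graph:

# Sketch: the caller treats both scheduler flavours identically.
scheduler = lms_scheduler(ctx, use_lms=False)   # returns DummyScheduler here
with scheduler:
    loss.forward(function_pre_hook=scheduler.function_pre_hook,
                 function_post_hook=scheduler.function_post_hook)
    loss.backward(function_pre_hook=scheduler.function_pre_hook,
                  function_post_hook=scheduler.function_post_hook)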
Example #15
def load(filenames, prepare_data_iterator=True, batch_size=None, exclude_parameter=False, parameter_only=False, extension=".nntxt", context=None):
    '''load
    Load network information from files.

    Args:
        filenames (list): List of filenames, or a single filename or file-like object.
        extension (str): Used when a file-like object is given; one of ".nntxt", ".prototxt", ".protobuf", ".h5", ".nnp".
    Returns:
        Info: Network information.
    '''
    class Info:
        pass
    info = Info()

    info.prepare_data_iterator = prepare_data_iterator
    info.batch_size = batch_size
    info.exclude_parameter = exclude_parameter
    info.parameter_only = parameter_only
    info.proto = nnabla_pb2.NNablaProtoBuf()

    # first stage file loaders
    file_loaders = get_initial_file_loader()

    # Use the global parameter scope to keep consistency with the legacy
    # implementation. This avoids surprising previous developers, although a
    # stand-alone OrderedDict() instance would be the better choice.
    info.parameter_scope = nn.parameter.get_current_parameter_scope()
    load_files(info, file_loaders, filenames, extension)

    default_context = None
    if context:
        if context == 'cpu':
            import nnabla_ext.cpu
            default_context = nnabla_ext.cpu.context()
        else:
            cs = context.split(':')
            if cs[0] == 'cudnn':
                if len(cs) == 1:
                    devid = 0
                else:
                    devid = int(cs[1])
                import nnabla_ext.cudnn
                default_context = nnabla_ext.cudnn.context(device_id=devid)
        if default_context is None:
            logger.warn('Invalid context [{}]'.format(context))
        elif info.proto.HasField('global_config'):
            info.global_config = _global_config(info.proto)
            info.global_config.default_context = default_context

    if default_context is None:
        if info.proto.HasField('global_config'):
            info.global_config = _global_config(info.proto)
            default_context = info.global_config.default_context
            if 'cuda' in default_context.backend:
                import nnabla_ext.cudnn
            elif 'cuda:float' in default_context.backend:
                try:
                    import nnabla_ext.cudnn
                except:
                    pass
        else:
            import nnabla_ext.cpu
            default_context = nnabla_ext.cpu.context()
            info.global_config = _global_config(
                None, default_context=default_context)

    default_context = _check_context(default_context)
    logger.log(99, 'Using context "{}"'.format(default_context))
    comm = current_communicator()
    if comm:
        default_context.device_id = str(comm.local_rank)
    if info.proto.HasField('training_config'):
        info.training_config = _training_config(info.proto)

    info.default_context = default_context
    info.datasets = _datasets(
        info.proto, prepare_data_iterator if prepare_data_iterator is not None else info.training_config.max_epoch > 0)

    info.renamed_variables = {}
    info.networks = _networks(info, nn.graph_def.ProtoGraph.from_proto(info.proto, param_scope=info.parameter_scope,
                                                                       rng=numpy.random.RandomState(0)))

    info.optimizers = _optimizers(info)
    info.monitors = _monitors(info)
    info.executors = _executors(info)

    return info
Example #16
def _update(iter, config, cost):
    comm = current_communicator()

    loaded_data = {}
    is_first_optimizer = True

    def _sum_cost():
        if comm:
            # logger.log(99, "Calc cost with communicator")
            var = [nn.NdArray()]
            var[0].data = cost.sum_iteration
            _all_reduce(comm, var, division=False, inplace=True)
            cost.sum_epoch += var[0].data
            cost.num_iteration += comm.size
        else:
            cost.sum_epoch += cost.sum_iteration
            cost.num_iteration += 1

    def _get_reserved_variable(shape, reserved_variable_name, iter,
                               iter_per_epoch, max_epoch):
        if reserved_variable_name == "%iter":
            value = iter
        elif reserved_variable_name == "%max_iter":
            value = max_epoch * iter_per_epoch
        elif reserved_variable_name == "%epoch":
            value = iter // iter_per_epoch
        elif reserved_variable_name == "%epochf":
            value = iter * 1.0 / iter_per_epoch
        elif reserved_variable_name == "%max_epoch":
            value = max_epoch
        elif reserved_variable_name == "%progress":
            value = (iter * 1.0 / iter_per_epoch) / max_epoch
        else:
            raise ValueError(
                "Unknown reserved variable {}".format(reserved_variable_name))
        return value

    for opt in config.optimizers.values():
        o = opt.optimizer
        if (o.start_iter == 0
                or iter + 1 >= o.start_iter) and (o.end_iter == 0
                                                  or iter + 1 <= o.end_iter):
            # Load dataset
            data = OrderedDict()
            for di in opt.data_iterators:
                if di not in loaded_data:
                    loaded_data[di] = di.next()
                data.update(zip(di.variables, loaded_data[di]))
            for v, d in o.dataset_assign.items():
                dest_context = config.global_config.default_context if not o.forward_sequence or v not in o.forward_sequence[
                    0].inputs else None
                if d not in data and d[0] == "%":
                    value = _get_reserved_variable(
                        v.variable_instance.shape, d, iter,
                        config.training_config.iter_per_epoch,
                        config.training_config.max_epoch)
                    v.variable_instance.data.fill(value)
                elif d in data:
                    let_data_to_variable(v.variable_instance,
                                         data[d],
                                         ctx=dest_context,
                                         data_name=d,
                                         variable_name=v.name)
                else:
                    raise ValueError(
                        'Variable "{}" is not found in dataset "{}", optimizer "{}"'
                        .format(d, ', '.join(o.data_iterators.keys()), o.name))

            # Generate data
            for v, generator in o.generator_assign.items():
                dest_context = config.global_config.default_context if not o.forward_sequence or v not in o.forward_sequence[
                    0].inputs else None
                let_data_to_variable(v.variable_instance,
                                     data=generator(v.shape),
                                     ctx=dest_context,
                                     variable_name=v.name)

            # Monitor loss before forward to prepare input data while processing on
            # GPU
            if cost.variables:
                for l in cost.variables:
                    cost.sum_iteration += np.mean(l.variable_instance.d)
                    # l.variable_instance.data.zero()
                if is_first_optimizer:
                    is_first_optimizer = False
                    _sum_cost()
                    if single_or_rankzero():
                        progress(
                            "Training : cost={0:0.6f}".format(
                                cost.sum_iteration),
                            (iter % config.training_config.iter_per_epoch) *
                            1.0 / config.training_config.iter_per_epoch)
                    cost.sum_iteration = 0.0

            with nodeTimeCollector.collect_cost_time(comm, iter):
                # Forward
                o.network.forward(o.forward_sequence)

                # Backward
                o.network.backward(o.backward_sequence,
                                   iter % o.update_interval == 0)

            # Update
            if iter % o.update_interval == o.update_interval - 1:
                if o.weight_decay > 0:
                    o.solver.weight_decay(o.weight_decay)

                if o.comm:  # Updated param with communicator
                    params = [x.grad for x in o.parameters.values()]
                    _all_reduce(o.comm, params, division=True, inplace=True)

                if o.scheduler is not None:
                    o.solver.set_learning_rate(
                        o.scheduler.get_learning_rate(iter))
                o.solver.update()
            # Sync w sometimes
            if iter % 10 == 9:  # TODO: change the interval
                if o.comm:
                    params = [x.data for x in o.parameters.values()]
                    _all_reduce(o.comm, params, division=True, inplace=True)

            # Reserve monitor loss
            cost.variables = o.loss_variables

    # Monitor loss at the end of epoch
    if iter % config.training_config.iter_per_epoch == config.training_config.iter_per_epoch - 1 and cost.variables:
        for l in cost.variables:
            cost.sum_iteration += np.mean(l.variable_instance.d)
            # l.variable_instance.data.zero()
        _sum_cost()
        cost.variables = None
        cost.sum_iteration = 0.0

    return cost
Example #17
def _evaluate(args, config, monitoring_report, best_error, epoch):
    comm = current_communicator()
    error_str = ''
    valid_error = 0.0

    def _sum_error(sum, error):
        ret = None
        if comm:
            # logger.log(99, "Calc error with communicator")
            var = [nn.NdArray()]
            var[0].data = error
            _all_reduce(comm, var, division=False, inplace=True)
            ret = sum + var[0].data
        else:
            ret = sum + error
        return ret

    for name, mon in config.monitors.items():
        m = mon.monitor
        error_sum_monitor = 0.0
        error_count = 0
        data_size = max([di.size for di in mon.data_iterators])
        batch_size = max([di.batch_size for di in mon.data_iterators])

        for i in range(data_size // batch_size):
            # Load dataset
            data = OrderedDict()
            for di in mon.data_iterators:
                data.update(zip(di.variables, di.next()))

            # Set data to variable
            for v, d in m.dataset_assign.items():
                dest_context = config.global_config.default_context if not m.forward_sequence or v not in m.forward_sequence[
                    0].inputs else None
                let_data_to_variable(v.variable_instance,
                                     data[d],
                                     ctx=dest_context,
                                     data_name=d,
                                     variable_name=v.name)

            # Generate data
            for v, generator in m.generator_assign.items():
                dest_context = config.global_config.default_context if not m.forward_sequence or v not in m.forward_sequence[
                    0].inputs else None
                let_data_to_variable(v.variable_instance,
                                     data=generator(v.shape),
                                     ctx=dest_context,
                                     variable_name=v.name)

            # Sum error before forward to prepare input data while processing
            # on GPU
            if error_count > 0:
                error_sum = 0.0
                for v in m.monitor_variables:
                    error_sum += np.mean(v.variable_instance.d)
                    # v.variable_instance.data.zero()
                error_sum_monitor = _sum_error(error_sum_monitor, error_sum)
                if single_or_rankzero():
                    progress(
                        'Evaluating "{0}"'.format(name) +
                        ' : error={0:0.6f}'.format(
                            error_sum_monitor / error_count),
                        di.position * 1.0 / di.size)
            error_count += comm.size if comm else 1

            # Forward recursive
            m.network.forward(m.forward_sequence)

        # Sum error at the end of dataset
        error_sum = 0.0
        for v in m.monitor_variables:
            error_sum += np.mean(v.variable_instance.d)
            # v.variable_instance.data.zero()
        error_sum_monitor = _sum_error(error_sum_monitor, error_sum)

        if error_count == 0:
            error = 0
        else:
            error = error_sum_monitor / error_count

        if np.isnan(error) or np.isinf(error):
            logger.log(99, 'Validation error is NaN or Inf.')
            error = 0.0

        monitoring_report.append('  {}: {}\n'.format(name, error))

        callback.update_status((['monitoring_report', epoch, name], error))
        callback.update_status((['last', name], error))  # save last value

        if error_str != '':
            error_str += ', '
        else:
            error_str = ' {'
        error_str += '{}={:.6f}'.format(name, error)
        if name == 'valid_error':
            valid_error = error

    if error_str != '':
        error_str += '}'

    # Save Parameters
    if single_or_rankzero():
        if (not config.training_config.save_best) or \
           (not best_error) or \
           (best_error is not None and valid_error <= best_error):
            best_error = valid_error
            callback.update_status(('best.valid_error', best_error))
            callback.update_status(('best.epoch', epoch))
            _save_parameters(args, 'best', epoch, config, True)

    return best_error, error_str
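The _sum_error closure above hides the distributed reduction behind an _all_reduce helper defined elsewhere in this module. As a hedged illustration only (not the project's own helper), the pattern reduces to wrapping the per-rank scalar in an nn.NdArray, summing it across ranks with the communicator, and dividing by a count that already accounts for comm.size:

import nnabla as nn

def distributed_mean_error(comm, local_error_sum, local_count):
    # Hypothetical helper mirroring the _sum_error pattern: each rank
    # contributes its partial sum, all_reduce adds them together, and the
    # caller divides by the global sample count.
    if comm is None:
        return local_error_sum / max(local_count, 1)
    buf = nn.NdArray()
    buf.data = local_error_sum          # scalar stored in a 0-d NdArray
    comm.all_reduce([buf], division=False, inplace=True)
    return buf.data / max(local_count * comm.size, 1)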
Example #18
def _update(iter, config, cost):
    comm = current_communicator()
    loaded_data = {}
    is_first_optimizer = True

    def _sum_cost():
        if comm:
            # logger.log(99, "Calc cost with communicator")
            var = [nn.NdArray()]
            var[0].data = cost.sum_iteration
            _all_reduce(comm, var, division=False, inplace=True)
            cost.sum_epoch += var[0].data
            cost.num_iteration += comm.size
        else:
            cost.sum_epoch += cost.sum_iteration
            cost.num_iteration += 1

    for opt in config.optimizers.values():
        o = opt.optimizer
        # Load dataset
        di = opt.data_iterator
        if o.data_iterator not in loaded_data:
            loaded_data[o.data_iterator] = di.next()
        data = loaded_data[o.data_iterator]
        for v, d in o.dataset_assign.items():
            dest_context = config.global_config.default_context if not o.forward_sequence or v not in o.forward_sequence[
                0].inputs else None
            let_data_to_variable(v.variable_instance,
                                 data[di.variables.index(d)],
                                 ctx=dest_context,
                                 data_name=d,
                                 variable_name=v.name)

        # Generate data
        for v, generator in o.generator_assign.items():
            dest_context = config.global_config.default_context if not o.forward_sequence or v not in o.forward_sequence[
                0].inputs else None
            let_data_to_variable(v.variable_instance,
                                 data=generator(v.shape),
                                 ctx=dest_context,
                                 variable_name=v.name)

        # Monitor loss before forward to prepare input data while processing on
        # GPU
        if cost.variables:
            for l in cost.variables:
                cost.sum_iteration += np.mean(l.variable_instance.d)
                l.variable_instance.data.zero()
            if is_first_optimizer:
                is_first_optimizer = False
                _sum_cost()
                if single_or_rankzero():
                    progress(
                        "Training : cost={0:0.6f}".format(cost.sum_iteration),
                        (iter % config.training_config.iter_per_epoch) * 1.0 /
                        config.training_config.iter_per_epoch)
                cost.sum_iteration = 0.0

        # Forward
        o.network.forward(o.forward_sequence)

        # Backward
        o.network.backward(o.backward_sequence, iter % o.update_interval == 0)

        # Update
        if iter % o.update_interval == o.update_interval - 1:
            if o.weight_decay > 0:
                o.solver.weight_decay(o.weight_decay)

            if o.comm:  # Updated param with communicator
                params = [x.grad for x in o.parameters.values()]
                _all_reduce(o.comm, params, division=True, inplace=True)

            if o.scheduler is not None:
                o.solver.set_learning_rate(o.scheduler.get_learning_rate(iter))
            o.solver.update()
        # Sync w sometimes
        if iter % 10 == 9:  # TODO: change the interval
            if o.comm:
                params = [x.data for x in o.parameters.values()]
                _all_reduce(o.comm, params, division=True, inplace=True)

        # Reserve monitor loss
        cost.variables = o.loss_variables

    # Monitor loss at the end of epoch
    if iter % config.training_config.iter_per_epoch == config.training_config.iter_per_epoch - 1 and cost.variables:
        for l in cost.variables:
            cost.sum_iteration += np.mean(l.variable_instance.d)
            l.variable_instance.data.zero()
        _sum_cost()
        cost.variables = None
        cost.sum_iteration = 0.0

    return cost
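The update branch in _update is a standard data-parallel step: gradients are averaged across ranks before solver.update(), and the parameter values themselves are re-averaged every few iterations to keep workers from drifting apart. A stripped-down sketch of that pattern (hypothetical solver, params, comm and it arguments; this is not the function above):

def data_parallel_step(solver, params, comm, it, weight_decay=0.0, sync_interval=10):
    # params: dict of name -> nn.Variable holding the trainable parameters.
    if weight_decay > 0:
        solver.weight_decay(weight_decay)
    if comm:
        # Average gradients over all ranks before applying the update.
        comm.all_reduce([p.grad for p in params.values()],
                        division=True, inplace=True)
    solver.update()
    if comm and it % sync_interval == sync_interval - 1:
        # Periodically re-average the weights themselves.
        comm.all_reduce([p.data for p in params.values()],
                        division=True, inplace=True)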
Example #19
def train_command(args):
    callback.update_status(args)

    if single_or_rankzero():
        configure_progress(os.path.join(args.outdir, 'progress.txt'))

    info = load.load([args.config],
                     prepare_data_iterator=None,
                     exclude_parameter=True)

    # Check dataset uri is empty.
    dataset_error = False
    for dataset in info.datasets.values():
        if dataset.uri.strip() == '':
            dataset_error = True
    if dataset_error:
        logger.log(99, 'Fatal error. Dataset URI is empty.')
        return False

    class TrainConfig:
        pass

    config = TrainConfig()
    config.timelimit = -1
    if args.param:
        load.load([args.param], parameter_only=True)

    config.timelimit = callback.get_timelimit(args)

    config.global_config = info.global_config
    config.training_config = info.training_config

    if single_or_rankzero():
        logger.log(99, 'Train with contexts {}'.format(available_contexts))

    class OptConfig:
        pass

    config.optimizers = OrderedDict()
    for name, opt in info.optimizers.items():
        o = OptConfig()
        o.optimizer = opt
        o.data_iterators = []
        config.optimizers[name] = o

    class MonConfig:
        pass

    config.monitors = OrderedDict()
    for name, mon in info.monitors.items():
        m = MonConfig()
        m.monitor = mon
        m.data_iterators = []
        config.monitors[name] = m

    # Training
    comm = current_communicator()
    config.training_config.iter_per_epoch //= comm.size if comm else 1
    max_iteration = config.training_config.max_epoch * \
        config.training_config.iter_per_epoch

    global _save_parameter_info
    _save_parameter_info = {}
    _, config_ext = os.path.splitext(args.config)
    if config_ext == '.prototxt' or config_ext == '.nntxt':
        _save_parameter_info['config'] = args.config
    elif config_ext == '.nnp':
        with zipfile.ZipFile(args.config, 'r') as nnp:
            for name in nnp.namelist():
                _, ext = os.path.splitext(name)
                if ext == '.nntxt' or ext == '.prototxt':
                    nnp.extract(name, args.outdir)
                    _save_parameter_info['config'] = os.path.join(
                        args.outdir, name)

    result = False
    restart = False
    if max_iteration > 0:
        rng = np.random.RandomState(comm.rank if comm else 0)
        with ExitStack() as stack:
            # Create data_iterator instance only once for each dataset in optimizers
            optimizer_data_iterators = {}
            for name, o in config.optimizers.items():
                for di in o.optimizer.data_iterators.values():
                    if di not in optimizer_data_iterators:
                        di_instance = stack.enter_context(di())
                        if comm and comm.size > 1:
                            di_instance = di_instance.slice(
                                rng, comm.size, comm.rank)
                        optimizer_data_iterators[di] = di_instance
                    else:
                        di_instance = optimizer_data_iterators[di]
                    o.data_iterators.append(di_instance)

            # Create data_iterator instance only once for each dataset in monitors
            monitor_data_iterators = {}
            for name, m in config.monitors.items():
                for di in m.monitor.data_iterators.values():
                    if di not in monitor_data_iterators:
                        di_instance = stack.enter_context(di())
                        if comm and comm.size > 1:
                            di_instance = di_instance.slice(
                                rng, comm.size, comm.rank)
                        monitor_data_iterators[di] = di_instance
                    else:
                        di_instance = monitor_data_iterators[di]
                    m.data_iterators.append(di_instance)
            monitor_data_iterators.update(optimizer_data_iterators)

            result, restart = _train(args, config)
    else:
        # save parameters without training (0 epoch learning)
        logger.log(99, '0 epoch learning. (Just save parameter.)')
        if single_or_rankzero():
            _save_parameters(args, None, 0, config, True)
        result = True

    if single_or_rankzero() and not restart:
        if result:
            logger.log(99, 'Training Completed.')
            callback.update_status('finished')
        else:
            logger.log(99, 'Training Incomplete.')
            callback.update_status('failed')
    if single_or_rankzero():
        progress(None)
    return True
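train_command expects the argparse namespace produced by the command-line front end. A hedged sketch of driving it programmatically (the attribute names below are the ones read in the code above; the real CLI may require additional fields):

from types import SimpleNamespace

args = SimpleNamespace(
    config='training.nnp',   # .nntxt, .prototxt or .nnp, as handled above
    outdir='result',         # progress.txt and monitoring_report.yml go here
    param=None,              # optional parameter file loaded before training
    resume=False)            # resume from the last saved epoch when True
train_command(args)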
Example #20
def _train(args, config):
    global _save_parameter_info
    comm = current_communicator()

    last_epoch = 0
    if args.resume:
        last_epoch = _get_current_parameter(args)
        logger.log(99, "Resume from epoch {}".format(last_epoch + 1))

    max_iteration = config.training_config.max_epoch * \
        config.training_config.iter_per_epoch
    if single_or_rankzero():
        logger.log(
            99, 'Training epoch {} of {} begin'.format(
                last_epoch + 1, config.training_config.max_epoch))

    class Cost:
        pass

    cost = Cost()
    cost.sum_epoch = 0.0
    cost.num_iteration = 0
    cost.sum_iteration = 0.0
    cost.variables = None

    best_error = None

    class TimeInfo:
        pass

    timeinfo = TimeInfo()
    timeinfo.last_past_time = None

    if max_iteration > 0:
        last_iteration = last_epoch * config.training_config.iter_per_epoch
        if last_iteration < max_iteration:

            timeinfo.start_time = time.time()

            for iteration in range(last_iteration, max_iteration):

                cost = _update(iteration, config, cost)
                if (iteration - last_iteration) > 0:
                    timeinfo = _calc_estimate_time(timeinfo, max_iteration,
                                                   last_iteration, iteration)
                    if config.timelimit > 0 and timeinfo.estimate_time > config.timelimit:
                        logger.log(
                            99,
                            'Expected training time ({:.3f}s) will exceed time limit ({}s).'
                            .format(timeinfo.estimate_time, config.timelimit))
                        return False

                if (iteration +
                        1) % config.training_config.iter_per_epoch == 0:
                    last_past_time = -1
                    # End of epoch
                    epoch = iteration // config.training_config.iter_per_epoch + 1
                    cost_avg_epoch = cost.sum_epoch / cost.num_iteration
                    cost.sum_epoch = 0.0
                    cost.num_iteration = 0
                    monitoring_report = []

                    # Evaluation
                    error_str = ''
                    if epoch % config.training_config.monitor_interval == 0 or epoch <= 5:
                        best_error, error_str = _evaluate(
                            args, config, monitoring_report, best_error, epoch)

                    if single_or_rankzero():
                        # Write to monitoring_report.yml
                        f = open(
                            os.path.join(args.outdir, 'monitoring_report.yml'),
                            'a')
                        f.write('{}:\n'.format(epoch - 1))
                        f.write('  cost: {}\n'.format(cost_avg_epoch))
                        for s in monitoring_report:
                            f.write(s)
                        f.close()

                        _save_parameters(args, 'current', epoch)

                        logger.log(
                            99,
                            'epoch {} of {} cost={:.6f} {} time=({:.1f}s /{:.1f}s)'
                            .format(epoch, config.training_config.max_epoch,
                                    cost_avg_epoch, error_str,
                                    timeinfo.past_time,
                                    timeinfo.estimate_time))

            if single_or_rankzero():
                _save_parameters(args, 'current', epoch, True)
    return True
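_calc_estimate_time is not shown in this snippet. Based only on the TimeInfo attributes the loop uses (start_time, past_time, estimate_time, last_past_time), a hedged reconstruction of what such a helper has to produce could look like this; the real implementation may smooth or clamp these values differently:

import time

def _calc_estimate_time_sketch(timeinfo, max_iteration, last_iteration, iteration):
    # Hypothetical reconstruction: elapsed time so far, extrapolated linearly
    # over the remaining iterations. The caller guarantees iteration > last_iteration.
    timeinfo.past_time = time.time() - timeinfo.start_time
    done = iteration - last_iteration
    remaining = max_iteration - last_iteration
    timeinfo.estimate_time = timeinfo.past_time * remaining / done
    timeinfo.last_past_time = timeinfo.past_time
    return timeinfo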
Example #21
def load(filenames,
         prepare_data_iterator=True,
         batch_size=None,
         exclude_parameter=False,
         parameter_only=False):
    '''load
    Load network information from files.

    Args:
        filenames (list): List of file names (.nntxt, .prototxt, .protobuf, .h5 or .nnp).
        prepare_data_iterator (bool): Whether to build data iterators for the datasets.
        batch_size (int): Batch size passed on to the network builder.
        exclude_parameter (bool): If True, skip loading parameters.
        parameter_only (bool): If True, load parameters only.
    Returns:
        Info: Object holding the loaded information (global/training config,
        datasets, networks, optimizers, monitors and executors).
    '''
    class Info:
        pass

    info = Info()

    proto = nnabla_pb2.NNablaProtoBuf()
    for filename in filenames:
        _, ext = os.path.splitext(filename)

        # TODO: There are some known problems here.
        #   - Even when a protobuf file includes the network structure,
        #     the structure will not be loaded.
        #   - Even when a prototxt file includes parameters,
        #     the parameters will not be loaded.

        if ext in ['.nntxt', '.prototxt']:
            if not parameter_only:
                with open(filename, 'rt') as f:
                    try:
                        text_format.Merge(f.read(), proto)
                    except:
                        logger.critical('Failed to read {}.'.format(filename))
                        logger.critical(
                            'Multi-byte characters in the file or folder name may be the cause.'
                        )
                        raise
            if len(proto.parameter) > 0:
                if not exclude_parameter:
                    nn.load_parameters(filename)
        elif ext in ['.protobuf', '.h5']:
            if not exclude_parameter:
                nn.load_parameters(filename)
            else:
                logger.info('Skip loading parameter.')

        elif ext == '.nnp':
            try:
                tmpdir = tempfile.mkdtemp()
                with zipfile.ZipFile(filename, 'r') as nnp:
                    for name in nnp.namelist():
                        _, ext = os.path.splitext(name)
                        if name == 'nnp_version.txt':
                            nnp.extract(name, tmpdir)
                            with open(os.path.join(tmpdir, name), 'rt') as f:
                                pass  # TODO: currently does nothing with the version.
                        elif ext in ['.nntxt', '.prototxt']:
                            nnp.extract(name, tmpdir)
                            if not parameter_only:
                                with open(os.path.join(tmpdir, name),
                                          'rt') as f:
                                    text_format.Merge(f.read(), proto)
                            if len(proto.parameter) > 0:
                                if not exclude_parameter:
                                    nn.load_parameters(
                                        os.path.join(tmpdir, name))
                        elif ext in ['.protobuf', '.h5']:
                            nnp.extract(name, tmpdir)
                            if not exclude_parameter:
                                nn.load_parameters(os.path.join(tmpdir, name))
                            else:
                                logger.info('Skip loading parameter.')
            finally:
                shutil.rmtree(tmpdir)

    default_context = None
    if proto.HasField('global_config'):
        info.global_config = _global_config(proto)
        default_context = info.global_config.default_context
        if 'cuda' in default_context.backend:
            import nnabla_ext.cudnn
        elif 'cuda:float' in default_context.backend:
            try:
                import nnabla_ext.cudnn
            except:
                pass
    else:
        import nnabla_ext.cpu
        default_context = nnabla_ext.cpu.context()

    comm = current_communicator()
    if comm:
        default_context.device_id = str(comm.rank)
    if proto.HasField('training_config'):
        info.training_config = _training_config(proto)

    info.datasets = _datasets(
        proto, prepare_data_iterator if prepare_data_iterator is not None else
        info.training_config.max_epoch > 0)

    info.networks = _networks(proto, default_context, batch_size)

    info.optimizers = _optimizers(proto, default_context, info.networks,
                                  info.datasets)

    info.monitors = _monitors(proto, default_context, info.networks,
                              info.datasets)

    info.executors = _executors(proto, info.networks)

    return info
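A hedged usage sketch of load (the attribute names follow the Info object assembled above, and the mappings are assumed to be keyed by name, as the rest of this module suggests):

# Load a network definition plus parameters from an .nnp archive and build
# data iterators for its datasets; illustrative only.
info = load(['model.nnp'], prepare_data_iterator=True, batch_size=64)

print(list(info.networks.keys()))     # networks defined in the file
print(list(info.optimizers.keys()))   # optimizer configurations
print(list(info.monitors.keys()))     # monitor configurations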
Example #22
def lms_scheduler(ctx, use_lms, gpu_memory_size=None, window_length=None):
    _check_list = [x.split(":")[0] for x in ctx.backend]
    if "cudnn" not in _check_list and "cuda" not in _check_list:
        logger.warn(
            "ctx passed to scheduler doesn't have cuda/cudnn backend. lms scheduler will not be used."
        )
        use_lms = False

    comm = current_communicator()
    if comm:
        logger.log(99,
                   f'[OoC] Currently OoC is disabled for Multi-GPU training.')
        use_lms = False

    if use_lms:
        gpu_index = 0
        if 'cuda' in str(ctx.backend):
            gpu_index = int(ctx.device_id)
        else:
            logger.log(99, f'[OoC] OoC is only enabled for GPU training.')
            raise Exception

        # It would be better to use NVML to get GPU information, but due to a
        # problem on Windows we temporarily get it with `nvidia-smi`.
        if gpu_memory_size is None or gpu_memory_size == 0:
            try:
                import subprocess
                gpu_memory_size = int(
                    int(
                        subprocess.check_output(
                            'nvidia-smi --query-gpu=index,memory.total --format=csv'
                        ).decode().splitlines()[1:][gpu_index].split(',')
                        [1].strip().split()[0]) * (1024**2) * 0.7)
            except:
                logger.log(
                    99,
                    '[OoC] Could not get GPU memory size; using the default value (6 GB).'
                )
                gpu_memory_size = 6e9  # default: 6 GB

        if window_length is None or window_length == 0:
            window_length = int(gpu_memory_size * 1.5)

        logger.log(
            99,
            f'[OoC] gpu_memory_limit: {gpu_memory_size / 1e9}GB, prefetch_window_length: {window_length / 1e9}GB'
        )
        # Change array preference so that lms works well.
        # import nnabla_ext.cuda.init as cuda_init
        # cuda_init.prefer_cpu_pinned_array()
        # cuda_init.prefer_cuda_virtual_array()
        from nnabla.ext_utils import get_extension_context
        be, tc = ctx.backend[0].split(":")
        cpu_ctx = get_extension_context("cpu", device_id="", type_config=tc)
        return SwapInOutScheduler(cpu_ctx, ctx, gpu_memory_size, window_length)
    else:

        class DummyScheduler(object):
            function_pre_hook = None
            function_post_hook = None
            update_pre_hook = None
            update_post_hook = None

            def start_scheduling(self):
                return None

            def end_scheduling(self):
                return None

            def __enter__(self):
                return self

            def __exit__(self, exc_type, exc_val, exc_tb):
                pass

        return DummyScheduler()
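DummyScheduler documents the interface the training loop relies on: hook attributes for forward/backward/update and a per-iteration context manager. A hedged usage sketch (loss, solver, ctx and max_iter are assumptions, and the hook-accepting forward/backward/update signatures are those of recent nnabla releases):

scheduler = lms_scheduler(ctx, use_lms=True)

for it in range(max_iter):
    with scheduler:  # brackets one iteration; a no-op for DummyScheduler
        loss.forward(clear_no_need_grad=True,
                     function_pre_hook=scheduler.function_pre_hook,
                     function_post_hook=scheduler.function_post_hook)
        loss.backward(clear_buffer=True,
                      function_pre_hook=scheduler.function_pre_hook,
                      function_post_hook=scheduler.function_post_hook)
        solver.update(update_pre_hook=scheduler.update_pre_hook,
                      update_post_hook=scheduler.update_post_hook)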