def cli_main(parser, args):
    """Dispatch a parsed CLI sub-command and record its result.

    Stores the sub-command's return value in the module-level
    ``return_value`` global. Exits the process with -1 when no
    sub-command was selected or (non-MPI) when the command raises.

    :param parser: argparse parser, used only to print help on error.
    :param args: parsed namespace; must carry a ``func`` callable and
        an ``mpi`` flag.
    """
    global return_value
    return_value = False
    if 'func' not in args:
        # No sub-command selected: show usage and fail.
        parser.print_help(sys.stderr)
        sys.exit(-1)
    if args.mpi:
        from nnabla.utils.communicator_util import create_communicator
        comm = create_communicator()
        try:
            return_value = args.func(args)
        # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
        # are not swallowed and reported as "ABORTED".
        except Exception:
            import traceback
            print(traceback.format_exc())
            logger.log(99, "ABORTED")
            # SIGKILL self so a crashed rank does not hang the MPI job.
            os.kill(os.getpid(), 9)
            # comm.abort()
    else:
        try:
            return_value = args.func(args)
        except Exception:
            import traceback
            print(traceback.format_exc())
            return_value = False
            sys.exit(-1)
def _get_data(self, position):
    """Return the list of variable arrays stored at *position* in the cache order.

    Under MPI an IndexError (order table momentarily inconsistent across
    workers) is retried by recursing after a short sleep.
    """
    self._position = position
    if current_communicator():
        try:
            filename, index = self._order[position]
        except IndexError:
            logger.log(99, '_get_data() fails at worker {} retrying.'.format(
                current_communicator().rank))
            sleep(0.01)
            # NOTE(review): unbounded recursion if the order table never
            # becomes consistent — presumably transient in practice.
            return self._get_data(position)
    else:
        filename, index = self._order[position]
    if filename != self._current_filename:
        # Cache file changed: optionally compute the list of upcoming .npy
        # files so the prefetch threads can load them ahead of time.
        file_names_to_prefetch = None
        if self._cache_type == ".npy" and self._num_of_threads > 0:
            file_names_to_prefetch = [o[0] for o in self._order[
                position + self._max_length:
                position + self._max_length * self._num_of_threads:
                self._max_length]]
        self._current_data = self._get_next_data(
            filename, file_names_to_prefetch)
        self._current_filename = filename
    data = [self._current_data[v][index] for v in self.variables]
    if self._normalize:
        # uint8 image data is scaled to [0, 1]; other dtypes pass through.
        data = [d.astype(numpy.float32) * (1.0 / 255.0)
                if d.dtype == numpy.uint8 else d for d in data]
    return data
def _get_current_parameter(args, config):
    """Resume support: load the newest ``results_current_<N>.nnp`` snapshot.

    Returns ``(last_epoch, best_epoch, best_error)``; ``last_epoch`` is 0
    when no snapshot exists in ``args.outdir``.
    """
    def convert_to_info(config):
        # Minimal stand-in object exposing only .optimizers, as expected
        # by load_train_state().
        class Info:
            pass
        ret = Info()
        ret.optimizers = OrderedDict()
        for name, opt in config.optimizers.items():
            ret.optimizers[name] = opt.optimizer
        return ret

    best_error, best_epoch = callback.get_best_from_status(args)
    globname = os.path.join(args.outdir, 'results_current_*.nnp')
    exists = glob.glob(globname)
    if len(exists) > 0:
        ex_list = {}
        info = convert_to_info(config)
        for ex in exists:
            # Epoch number is encoded in the file name: results_current_<N>.nnp
            n = int(ex.rsplit('_', 1)[1].rsplit('.', 1)[0])
            ex_list[n] = ex
        # Newest snapshot wins.
        last_epoch = sorted(ex_list.keys(), reverse=True)[0]
        last_parameter = ex_list[last_epoch]
        logger.log(
            99, "Load parameter from [{}]".format(
                os.path.basename(last_parameter)))
        #load.load([last_parameter], parameter_only=True)
        load_train_state(last_parameter, info)
        return last_epoch, best_epoch, best_error
    return 0, best_epoch, best_error
def create_communicator(ignore_error=False, extension_module='cudnn',
                        type_config='float'):
    """Create (or reset) the process-wide multi-process communicator.

    Only attempts creation when launched under Open MPI (detected via
    OMPI_COMM_WORLD_SIZE); stores the result in the module-level
    ``_current_communicator`` and returns it (None for single process).

    :param ignore_error: when True, swallow initialization failures and
        return None instead of raising.
    :param extension_module: nnabla extension backend name.
    :param type_config: context type configuration.
    """
    global _current_communicator

    if os.environ.get('OMPI_COMM_WORLD_SIZE') is not None:
        from nnabla.ext_utils import get_extension_context
        context = get_extension_context(extension_module,
                                        type_config=type_config)
        try:
            logger.log(99, 'Create communicator with contexts {}'.format(context))
            _current_communicator = C.MultiProcessCommunicator(context)
            _current_communicator.init()
            # Pin each rank to a device, wrapping around the device count.
            context.device_id = str(
                _current_communicator.rank % _current_communicator.size)
            if _current_communicator.size == 1:
                # Single-process MPI run: behave as if no communicator exists.
                _current_communicator = None
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # still propagate.
        except Exception:
            if not ignore_error:
                raise
            logger.warning("Failed to initialize nnabla.communicators.")
            _current_communicator = None
    else:
        _current_communicator = None

    return _current_communicator
def load_csv(file, shape=None, normalize=False):
    """
    Load CSV file.

    :param file: CSV file.
    :type file: file like object
    :param shape : data array is reshape to this shape.
    :type shape: tuple of int

    :return: numpy array
    """
    # The previous six.PY2 / six.PY34 branching silently produced an empty
    # result on Python 3.0-3.3 and needed the `six` dependency. Detecting
    # bytes directly handles text- and binary-mode files on any version.
    lines = file.readlines()
    if lines and isinstance(lines[0], bytes):
        lines = [l.decode('utf-8') for l in lines]
    value_list = [list(map(float, row))
                  for row in csv.reader(lines) if len(row)]
    try:
        if shape is None:
            return numpy.array(value_list)
        else:
            return numpy.array(value_list).reshape(shape)
    except Exception:
        logger.log(99, 'Failed to load array from "{}".'.format(file.name))
        raise
def read_cache(self, file_name, variables):
    """Read all *variables* arrays from one cache file, retrying on failure.

    After 10 failed attempts the process SIGKILLs itself — a deliberate
    hard stop so a worker with a corrupt/missing cache cannot hang an MPI
    job. Returns a dict mapping variable name to numpy array.
    """
    retry = 1
    while True:
        if retry > 10:
            logger.log(99, 'read_cache() retry count over give up.')
            logger.log(99, 'Cache file {} not found.'.format(file_name))
            logger.log(99, 'Fatal Error! send SIGKILL to myself.')
            os.kill(os.getpid(), 9)
        result = {}
        try:
            with FileReader(file_name).open(textmode=False) as f:
                # Arrays are stored back-to-back; numpy.load advances the
                # file pointer one array per call.
                for v in variables:
                    result[v] = numpy.load(f)
            if set(result.keys()) == set(variables):
                break
            else:
                logger.log(
                    99, 'read_cache() fails retrying count {}/10.'.format(
                        retry))
                retry += 1
        # Narrowed from a bare `except:` so KeyboardInterrupt still works.
        except Exception:
            logger.log(
                99, 'Cache file {} not found, retry count {}.'.format(
                    file_name, retry))
            retry += 1
    return result
def next(self):
    '''next
    It generates tuple of data.

    For example, if :py:meth:`self._variables == ('x', 'y')`
    This method returns :py:meth:` ( [[X] * batch_size], [[Y] * batch_size] )`

    Returns:
        tuple: tuple of data for mini-batch in numpy.ndarray.
    '''
    if self._use_thread:
        # Wait for finish previous thread.
        self._next_thread.join()
        if self._current_data is None:
            # Producer thread yielded nothing; run one more round
            # synchronously and wait for it.
            logger.log(99, 'next() got None retrying.')
            self._next_thread = threading.Thread(target=self._next)
            self._next_thread.start()
            self._next_thread.join()
        self._current_epoch, data = self._current_data
        # Start next thread so the following batch is prepared in the
        # background while the caller consumes this one.
        self._next_thread = threading.Thread(target=self._next)
        self._next_thread.start()
    else:
        # Synchronous path: produce the batch inline.
        self._next()
        self._current_epoch, data = self._current_data

    return data
def initialize_cache_files(self, filename):
    """Register one cache file: validate its variables and record its length.

    Appends ``(filename, length)`` to ``self._cache_files`` and tracks the
    maximum per-file length in ``self._max_length``.
    """
    length = -1
    with self._filereader.open_cache(filename) as cache:
        # Check variables.
        if self._variables is None:
            # First file seen defines the variable set.
            self._variables = list(cache.keys())
        else:
            if current_communicator():
                if not set(self._variables) == set(cache.keys()):
                    logger.log(
                        99, 'Error at worker {} {} {}'.format(
                            current_communicator().rank,
                            set(self._variables),
                            set(cache.keys())))
                    # Was a bare `raise` with no active exception, which
                    # produced an uninformative RuntimeError; keep the type
                    # but attach a real message.
                    raise RuntimeError(
                        'Cache file {} has inconsistent variables.'.format(
                            filename))
        # All variables in one file must have the same number of samples.
        for k, v in cache.items():
            if length < 0:
                length = len(v)
            else:
                assert (length == len(v))
        self._cache_files.append((filename, length))
        logger.info('{} {}'.format(filename, length))
        if length > self._max_length:
            self._max_length = length
def profile(config, name, func, result_dict):
    """Measure the mean wall-clock time of *func* and record it in ms.

    Runs *func* for at least 1 second and at least 100 iterations, then
    stores the mean time under ``result_dict[name]`` and returns the dict.
    An Identity forward pass before/after the loop acts as a CPU/GPU
    synchronization barrier so device work is fully counted.
    """
    # for sync CPU/GPU
    identity = F.Identity(config.global_config.default_context)
    tmp_in = nn.Variable((1,))
    tmp_out = nn.Variable((1,))
    identity.setup([tmp_in], [tmp_out])
    tmp_in.d = [0.]
    identity.forward([tmp_in], [tmp_out])

    # Profile
    start = time.time()
    count = 0
    while time.time() < start + 1.0 or count < 100:
        func()
        count += 1

    # sync CPU/GPU
    identity.forward([tmp_in], [tmp_out])
    # Reading .d forces a device->host transfer, i.e. waits for the GPU to
    # finish; the value itself is intentionally unused.
    data = tmp_out.d
    t = (time.time() - start) * 1000 / count
    logger.log(99, '%s %f(ms)' % (name, t))
    result_dict[name] = t
    return result_dict
def _create_dataset(uri, batch_size, shuffle, no_image_normalization,
                    cache_dir, overwrite_cache, create_cache_explicitly,
                    prepare_data_iterator):
    """Build a Dataset holder with a lazy ``data_iterator`` factory.

    Three strategies: explicit cache creation (build cache now, iterate it
    later), direct CSV iteration (optionally filling a cache as it goes),
    or iteration over an already-populated cache directory.
    """
    class Dataset:
        pass
    dataset = Dataset()
    dataset.uri = uri
    dataset.normalize = not no_image_normalization
    if prepare_data_iterator:
        if cache_dir == '':
            cache_dir = None
        if cache_dir and create_cache_explicitly:
            # Build the cache up-front when it is missing, empty, or an
            # overwrite was requested, then always iterate from the cache.
            if not os.path.exists(cache_dir) or len(os.listdir(cache_dir)) == 0 or overwrite_cache:
                if not os.path.exists(cache_dir):
                    os.mkdir(cache_dir)
                logger.log(99, 'Creating cache data for "' + uri + '"')
                with data_iterator_csv_dataset(uri, batch_size, shuffle, normalize=False, cache_dir=cache_dir) as di:
                    # Drain the iterator once; the iterator itself writes
                    # the cache files as a side effect.
                    index = 0
                    while index < di.size:
                        progress('', (1.0 * di.position) / di.size)
                        di.next()
                        index += batch_size
            dataset.data_iterator = (lambda: data_iterator_cache(
                cache_dir, batch_size, shuffle, normalize=dataset.normalize))
        elif not cache_dir or overwrite_cache or not os.path.exists(cache_dir) or len(os.listdir(cache_dir)) == 0:
            # Iterate the CSV directly; a cache_dir (if given) is filled
            # implicitly by the iterator.
            if cache_dir and not os.path.exists(cache_dir):
                os.mkdir(cache_dir)
            dataset.data_iterator = (lambda: data_iterator_csv_dataset(
                uri, batch_size, shuffle, normalize=dataset.normalize,
                cache_dir=cache_dir))
        else:
            # Cache already exists and is non-empty: iterate from it.
            dataset.data_iterator = (lambda: data_iterator_cache(
                cache_dir, batch_size, shuffle, normalize=dataset.normalize))
    else:
        dataset.data_iterator = None
    return dataset
def train_command(args):
    """Entry point of the `train` sub-command: load config, wire data
    iterators, and run training (or just save parameters for 0 epochs)."""
    configure_progress(os.path.join(args.outdir, 'progress.txt'))
    files = []
    files.append(args.config)
    if args.param:
        files.append(args.param)

    class TrainConfig:
        pass
    config = TrainConfig()
    info = load.load(files)

    logger.log(99, 'Train with contexts {}'.format(available_contexts))

    config.global_config = info.global_config
    config.training_config = info.training_config

    # Wrap each optimizer with a holder carrying its (not yet opened)
    # data iterator.
    class OptConfig:
        pass
    config.optimizers = OrderedDict()
    for name, opt in info.optimizers.items():
        o = OptConfig()
        o.optimizer = opt
        o.data_iterator = None
        config.optimizers[name] = o

    # Same wrapping for monitors.
    class MonConfig:
        pass
    config.monitors = OrderedDict()
    for name, mon in info.monitors.items():
        m = MonConfig()
        m.monitor = mon
        m.data_iterator = None
        config.monitors[name] = m

    # Training
    max_iter = config.training_config.max_epoch * \
        config.training_config.iter_per_epoch
    if max_iter > 0:
        data_iterators = {'optimizer': {}, 'monitor': {}}
        # ExitStack keeps every opened iterator alive for the duration of
        # train() and closes them all on exit.
        with ExitStack() as stack:
            for name, o in config.optimizers.items():
                o.data_iterator = stack.enter_context(
                    o.optimizer.data_iterator())
            for name, m in config.monitors.items():
                m.data_iterator = stack.enter_context(
                    m.monitor.data_iterator())
            train(args, config)
    else:
        # save parameters without training (0 epoch learning)
        save_parameters(os.path.join(args.outdir, 'parameters.h5'))

    logger.log(99, 'Training Completed.')
    progress(None)
def profile_command(args):
    """Entry point of the `profile` sub-command: time each optimizer stage
    and write the results to ``<outdir>/profile.csv``. Returns True."""
    configure_progress(os.path.join(args.outdir, 'progress.txt'))
    files = []
    files.append(args.config)

    class TrainConfig:
        pass
    config = TrainConfig()
    info = load.load(files)

    config.global_config = info.global_config
    config.training_config = info.training_config

    class OptConfig:
        pass
    config.optimizers = OrderedDict()
    for name, opt in info.optimizers.items():
        o = OptConfig()
        o.optimizer = opt
        o.data_iterator = None
        config.optimizers[name] = o

    class MonConfig:
        pass
    config.monitors = OrderedDict()
    for name, mon in info.monitors.items():
        m = MonConfig()
        m.monitor = mon
        m.data_iterator = None
        config.monitors[name] = m

    # Device synchronization callback for accurate GPU timing; backend name
    # is the part of e.g. "cudnn:float" before the colon.
    ext_module = import_extension_module(
        config.global_config.default_context.backend[0].split(':')[0])

    def synchronize():
        return ext_module.synchronize(
            device_id=config.global_config.default_context.device_id)

    result_array = [['time in ms']]

    # Profile Optimizer
    with ExitStack() as stack:
        for name, o in config.optimizers.items():
            o.data_iterator = stack.enter_context(o.optimizer.data_iterator())
        result_array = profile_optimizer(config, result_array, synchronize)

    # Write profiling result
    import csv
    with open(args.outdir + os.sep + 'profile.csv', 'w') as f:
        writer = csv.writer(f, lineterminator='\n')
        writer.writerows(result_array)

    logger.log(99, 'Profile Completed.')
    progress(None)
    return True
def _wait():
    """Poll until the module-level `_finish` flag is set.

    If roughly 100 seconds pass (10000 polls at 10 ms) without the flag
    being set, the rank is considered stalled and SIGKILLs itself so the
    MPI job does not hang forever.
    """
    import time
    ticks = 0
    while True:
        if _finish:
            break
        if ticks > 10000:
            logger.log(99, "STALLED MPI RANK {}".format(comm.rank))
            os.kill(os.getpid(), 9)
        time.sleep(0.01)
        ticks += 1
def train_command(args):
    """Entry point of the `train` sub-command: load config, open data
    iterators, and run training (or just save parameters for 0 epochs)."""
    logger.log(99, 'Train with contexts {}'.format(available_contexts))
    configure_progress(os.path.join(args.outdir, 'progress.txt'))
    files = []
    files.append(args.config)
    if args.param:
        files.append(args.param)

    class TrainConfig:
        pass
    config = TrainConfig()
    info = load.load(files)

    config.global_config = info.global_config
    config.training_config = info.training_config

    # Holder objects pair each optimizer/monitor with its (lazily opened)
    # data iterator.
    class OptConfig:
        pass
    config.optimizers = OrderedDict()
    for name, opt in info.optimizers.items():
        o = OptConfig()
        o.optimizer = opt
        o.data_iterator = None
        config.optimizers[name] = o

    class MonConfig:
        pass
    config.monitors = OrderedDict()
    for name, mon in info.monitors.items():
        m = MonConfig()
        m.monitor = mon
        m.data_iterator = None
        config.monitors[name] = m

    # Training
    max_iter = config.training_config.max_epoch * \
        config.training_config.iter_per_epoch
    if max_iter > 0:
        data_iterators = {'optimizer': {}, 'monitor': {}}
        # ExitStack closes every opened iterator when training finishes.
        with ExitStack() as stack:
            for name, o in config.optimizers.items():
                o.data_iterator = stack.enter_context(
                    o.optimizer.data_iterator())
            for name, m in config.monitors.items():
                m.data_iterator = stack.enter_context(
                    m.monitor.data_iterator())
            train(args, config)
    else:
        # save parameters without training (0 epoch learning)
        save_parameters(os.path.join(
            args.outdir, 'parameters.h5'))

    logger.log(99, 'Training Completed.')
    progress(None)
def _wait():
    """Poll until the module-level `_finish` flag becomes true.

    After roughly 100 seconds (10000 polls at 10 ms each) without the flag
    being set, the rank is treated as stalled and the process exits with
    status -1.
    """
    import time
    import sys
    polls = 0
    while True:
        if _finish:
            break
        if polls > 10000:
            logger.log(99, "STALLED MPI RANK {}".format(comm.rank))
            sys.exit(-1)
        time.sleep(0.01)
        polls += 1
def _create_dataset(uri, batch_size, shuffle, no_image_normalization,
                    cache_dir, overwrite_cache, create_cache_explicitly,
                    prepare_data_iterator):
    """Build a Dataset holder with a lazy ``data_iterator`` factory
    (MPI-aware variant).

    Under MPI, cache creation must be explicit and only rank 0 builds the
    cache; all processes seed an identical RNG so shuffling stays in sync
    until the iterator is sliced per rank.
    """
    class Dataset:
        pass
    dataset = Dataset()
    dataset.uri = uri
    dataset.normalize = not no_image_normalization
    comm = current_communicator()

    # use same random state for each process until slice is called
    rng = numpy.random.RandomState(0)
    # In-memory caching is only safe/useful in single-process runs.
    use_memory_cache = comm.size == 1 if comm else True

    if prepare_data_iterator:
        if cache_dir == '':
            cache_dir = None

        # Disable implicit cache creation when MPI is available.
        if cache_dir and (create_cache_explicitly or comm):
            cache_index = os.path.join(cache_dir, "cache_index.csv")
            if not os.path.exists(cache_index) or overwrite_cache:
                if single_or_rankzero():
                    logger.log(99, 'Creating cache data for "' + uri + '"')

                    try:
                        os.makedirs(cache_dir)
                    except OSError:
                        pass  # python2 does not support exists_ok arg

                    # Opening (and immediately closing) the iterator writes
                    # the cache files as a side effect.
                    with data_iterator_csv_dataset(uri, batch_size, shuffle, rng=rng, normalize=False, cache_dir=cache_dir, with_memory_cache=False) as di:
                        pass

            rng = numpy.random.RandomState(0)
            dataset.data_iterator = (lambda: data_iterator_cache(
                cache_dir, batch_size, shuffle, rng=rng,
                normalize=dataset.normalize,
                with_memory_cache=use_memory_cache))
        elif not cache_dir or overwrite_cache or not os.path.exists(cache_dir) or len(os.listdir(cache_dir)) == 0:
            if comm:
                logger.critical(
                    'Implicit cache creation does not support with MPI')
                import sys
                sys.exit(-1)
            else:
                if cache_dir:
                    try:
                        os.makedirs(cache_dir)
                    except OSError:
                        pass  # python2 does not support exists_ok arg
                dataset.data_iterator = (lambda: data_iterator_csv_dataset(
                    uri, batch_size, shuffle, rng=rng,
                    normalize=dataset.normalize, cache_dir=cache_dir))
        else:
            # Cache already populated: iterate from it.
            dataset.data_iterator = (lambda: data_iterator_cache(
                cache_dir, batch_size, shuffle, rng=rng,
                normalize=dataset.normalize,
                with_memory_cache=use_memory_cache))
    else:
        dataset.data_iterator = None
    return dataset
def create_data_csv(seed):
    """Download STL10 and emit CSV index files for training/validation/test.

    Produces stl10_training.csv, a seeded train/val split
    (stl10_training_<seed>.csv / stl10_validation_<seed>.csv) and
    stl10_test.csv under ``<this dir>/stl10``.
    """
    path = os.path.abspath(os.path.dirname(__file__))
    base_dir = os.path.join(path, 'stl10')
    ensure_dir(base_dir)

    # Create original training set
    logger.log(99, 'Downloading STL10 dataset...')
    output_dir = os.path.join(path, 'download')
    train_di = data_iterator_stl10(5000, True, None, False,
                                   output_dir=output_dir)
    logger.log(99, 'Creating "stl10_training.csv"... ')
    train_csv = data_iterator_to_csv(base_dir, 'stl10_training.csv',
                                     'training', train_di)
    # Deterministic split driven by `seed`, 1000 samples held out for
    # validation.
    train_csv, val_csv = split_data_into_train_val(train_csv, val_size=1000,
                                                   seed=seed)
    save_list_to_csv(train_csv, base_dir,
                     'stl10_training' + '_' + str(seed) + '.csv')
    save_list_to_csv(val_csv, base_dir,
                     'stl10_validation' + '_' + str(seed) + '.csv')

    # Validation
    validation_di = data_iterator_stl10(8000, False, None, False,
                                        output_dir=output_dir)
    logger.log(99, 'Creating "stl10_test.csv"... ')
    _ = data_iterator_to_csv(base_dir, 'stl10_test.csv', 'validation',
                             validation_di)

    logger.log(99, 'Dataset creation completed successfully.')
def profile_command(args):
    """Entry point of the `profile` sub-command (no device-sync variant):
    time each optimizer stage and write ``<outdir>/profile.csv``."""
    configure_progress(os.path.join(args.outdir, 'progress.txt'))
    files = []
    files.append(args.config)

    class TrainConfig:
        pass
    config = TrainConfig()
    info = load.load(files)

    config.global_config = info.global_config
    config.training_config = info.training_config

    class OptConfig:
        pass
    config.optimizers = OrderedDict()
    for name, opt in info.optimizers.items():
        o = OptConfig()
        o.optimizer = opt
        o.data_iterator = None
        config.optimizers[name] = o

    class MonConfig:
        pass
    config.monitors = OrderedDict()
    for name, mon in info.monitors.items():
        m = MonConfig()
        m.monitor = mon
        m.data_iterator = None
        config.monitors[name] = m

    result_array = [['time in ms']]

    # Profile Optimizer
    with ExitStack() as stack:
        # ExitStack closes every opened data iterator after profiling.
        for name, o in config.optimizers.items():
            o.data_iterator = stack.enter_context(
                o.optimizer.data_iterator())
        result_array = profile_optimizer(config, result_array)

    # Write profiling result
    import csv
    with open(args.outdir + os.sep + 'profile.csv', 'w') as f:
        writer = csv.writer(f, lineterminator='\n')
        writer.writerows(result_array)

    logger.log(99, 'Profile Completed.')
    progress(None)
def read_s3_object(self, key):
    """Fetch an S3 object's body with up to 10 retries.

    :param key: object key within ``self._s3_bucket``.
    :return: the object content as bytes.
    :raises RuntimeError: when all 10 attempts fail.
    """
    retry = 1
    result = ''
    while True:
        if retry > 10:
            logger.log(99, 'read_s3_object() retry count over give up.')
            # Was a bare `raise` outside any except block, which raised an
            # uninformative "No active exception to re-raise" RuntimeError;
            # keep the exception type but carry a real message.
            raise RuntimeError(
                'read_s3_object() retry count exceeded for key {}.'.format(
                    key))
        try:
            result = self._s3_bucket.Object(key).get()['Body'].read()
            break
        # Narrowed from a bare `except:` so KeyboardInterrupt propagates.
        except Exception:
            logger.log(
                99, 'read_s3_object() fails retrying count {}/10.'.format(retry))
            retry += 1
    return result
def train(args, config):
    """Run the training loop for ``max_epoch * iter_per_epoch`` iterations.

    Per epoch: evaluates (first 5 epochs, then every 10th), appends a YAML
    entry to ``<outdir>/monitoring_report.yml`` and logs the average cost.
    """
    max_iter = config.training_config.max_epoch * \
        config.training_config.iter_per_epoch
    logger.log(
        99, 'Training epoch 1 of {} begin'.format(
            config.training_config.max_epoch))

    class Cost:
        pass
    cost = Cost()
    cost.sum_epoch = 0.0
    cost.sum_iter = 0.0
    cost.variables = None
    best_error = None
    # `iteration` instead of `iter` — don't shadow the builtin.
    for iteration in range(max_iter):
        cost = _update(iteration, config, cost)

        if (iteration + 1) % config.training_config.iter_per_epoch == 0:
            # End of epoch
            # Floor division: the original `/` produced a float epoch on
            # Python 3 (e.g. "1.0"), corrupting report keys.
            epoch = iteration // config.training_config.iter_per_epoch + 1
            cost_avg_epoch = cost.sum_epoch / config.training_config.iter_per_epoch
            monitoring_report = []

            # Evaluation
            error_str = ''
            if epoch % 10 == 0 or epoch <= 5:
                best_error, error_str = _evaluate(args, config,
                                                  monitoring_report,
                                                  best_error)

            # Write to monitoring_report.yml (with-statement guarantees the
            # file is closed even if a write fails).
            with open(os.path.join(args.outdir, 'monitoring_report.yml'), 'a') as f:
                f.write('{}:\n'.format(epoch - 1))
                f.write(' cost: {}\n'.format(cost_avg_epoch))
                for line in monitoring_report:
                    f.write(line)

            cost.sum_epoch = 0
            logger.log(
                99, 'epoch {} of {} cost={:.6f} {}'.format(
                    epoch, config.training_config.max_epoch,
                    cost_avg_epoch, error_str))
def get_data(args):
    """Worker body: fetch the sample at ``args[0]`` from the enclosing
    data source and put ``(pos, data)`` on the queue ``args[1]``.

    Retries up to 10 times when the source returns None.
    :raises RuntimeError: when all 10 attempts fail.
    """
    pos = args[0]
    q = args[1]
    retry = 1
    while True:
        if retry > 10:
            logger.log(
                99, '_get_current_data() retry count over give up.')
            # Was a bare `raise` with no active exception (an
            # uninformative RuntimeError); keep the type, add a message.
            raise RuntimeError(
                '_get_current_data() retry count exceeded at position'
                ' {}.'.format(pos))
        d = self._data_source._get_data(pos)
        if d is not None:
            break
        logger.log(99, '_get_data() fails. retrying count {}/10.'.format(
            retry))
        retry += 1
    q.put((pos, d))
def _get_current_parameter(args): globname = os.path.join(args.outdir, 'results_current_*.nnp') exists = glob.glob(globname) if len(exists) > 0: ex_list = {} for ex in exists: n = int(ex.rsplit('_', 1)[1].rsplit('.', 1)[0]) ex_list[n] = ex last_epoch = sorted(ex_list.keys())[0] last_parameter = ex_list[last_epoch] logger.log(99, "Load parameter from [{}]".format( os.path.basename(last_parameter))) load.load([last_parameter], parameter_only=True) return last_epoch return 0
def train(args, config):
    """Run the training loop for ``max_epoch * iter_per_epoch`` iterations.

    Per epoch: evaluates (first 5 epochs, then every 10th), appends a YAML
    entry to ``<outdir>/monitoring_report.yml`` and logs the average cost.
    """
    max_iter = config.training_config.max_epoch * \
        config.training_config.iter_per_epoch
    logger.log(99, 'Training epoch 1 of {} begin'.format(
        config.training_config.max_epoch))

    class Cost:
        pass
    cost = Cost()
    cost.sum_epoch = 0.0
    cost.sum_iter = 0.0
    cost.variables = None
    best_error = None
    # `iteration` instead of `iter` — don't shadow the builtin.
    for iteration in range(max_iter):
        cost = _update(iteration, config, cost)

        if (iteration + 1) % config.training_config.iter_per_epoch == 0:
            # End of epoch
            # Floor division: `/` yields a float epoch on Python 3
            # (e.g. "1.0"), corrupting the report keys.
            epoch = iteration // config.training_config.iter_per_epoch + 1
            cost_avg_epoch = cost.sum_epoch / config.training_config.iter_per_epoch
            monitoring_report = []

            # Evaluation
            error_str = ''
            if epoch % 10 == 0 or epoch <= 5:
                best_error, error_str = _evaluate(
                    args, config, monitoring_report, best_error)

            # Write to monitoring_report.yml; `with` closes the file even
            # when a write raises, and `line` no longer shadows str().
            with open(os.path.join(args.outdir, 'monitoring_report.yml'), 'a') as f:
                f.write('{}:\n'.format(epoch - 1))
                f.write(' cost: {}\n'.format(cost_avg_epoch))
                for line in monitoring_report:
                    f.write(line)

            cost.sum_epoch = 0
            logger.log(99, 'epoch {} of {} cost={:.6f} {}'.format(
                epoch, config.training_config.max_epoch,
                cost_avg_epoch, error_str))
def create_communicator(ignore_error=False):
    """Create the process-wide multi-process communicator on a cudnn context.

    Stores the result in the module-level ``_current_communicator`` (None
    when running as a single process) and returns it.

    :param ignore_error: when True, swallow initialization failures and
        return None instead of raising.
    """
    global _current_communicator

    from nnabla.ext_utils import get_extension_context
    extension_module = "cudnn"
    context = get_extension_context(extension_module)
    try:
        logger.log(99, 'Create communicator with contexts {}'.format(context))
        _current_communicator = C.MultiProcessCommunicator(context)
        _current_communicator.init()
        # Pin each rank to a device, wrapping around the device count.
        context.device_id = str(
            _current_communicator.rank % _current_communicator.size)
        if _current_communicator.size == 1:
            # Single-process run: behave as if no communicator exists.
            _current_communicator = None
    # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
    # still propagate.
    except Exception:
        if not ignore_error:
            raise
        logger.warning("Failed to initialize nnabla.communicators.")
        _current_communicator = None

    return _current_communicator
def _get_next_data(self, filename, file_names_to_prefetch, retry=1):
    """Load one cache file (.npy via the prefetching reader, otherwise h5).

    Under MPI, an inconsistent variable set is retried recursively up to 10
    times with a short sleep.
    :raises RuntimeError: when the retry budget is exhausted.
    """
    if retry > 10:
        logger.log(99, '_get_next_data() retry count over give up.')
        # Was a bare `raise` with no active exception (an uninformative
        # RuntimeError); keep the type but attach a real message.
        raise RuntimeError(
            '_get_next_data() retry count exceeded for {}.'.format(filename))

    if self._cache_type == '.npy':
        # Pull this file and kick off background prefetch of upcoming ones.
        next_data = self._cache_reader_with_prefetch.open_and_prefetch_cache(
            filename, file_names_to_prefetch)
    else:
        # h5 format
        next_data = {}
        with self._filereader.open_cache(filename) as cache:
            for k, v in cache.items():
                next_data[k] = v[()]

    if current_communicator():
        if set(self._variables) != set(next_data.keys()):
            logger.log(99, '_get_next_data() fails at worker {} retrying count {}/10.'.format(
                current_communicator().rank, retry))
            sleep(0.01)
            return self._get_next_data(filename, file_names_to_prefetch, retry+1)

    return next_data
def profile(config, name, func, result_dict, synchromize):
    """Record the mean wall-clock time of one call to *func* (ms).

    Runs for at least one second and at least 100 iterations after a
    warm-up, calling the synchronization callback after every invocation
    so device work is fully counted. The measured mean is stored under
    ``result_dict[name]`` and the dict is returned.

    NOTE: the parameter name 'synchromize' is a historical typo kept for
    interface compatibility with existing callers.
    """
    # Warm-up
    func()
    synchromize()

    # Profile
    begin = time.time()
    elapsed_total = 0
    runs = 0
    while time.time() < begin + 1.0 or runs < 100:
        t0 = time.time()
        func()
        synchromize()
        elapsed_total += time.time() - t0
        runs += 1

    mean_ms = elapsed_total * 1000 / runs
    logger.log(99, '%s %f(ms)' % (name, mean_ms))
    result_dict[name] = mean_ms
    return result_dict
def next(self):
    '''next
    It generates tuple of data.

    For example, if :py:meth:`self._variables == ('x', 'y')`
    This method returns :py:meth:` ( [[X] * batch_size], [[Y] * batch_size] )`

    Returns:
        tuple: tuple of data for mini-batch in numpy.ndarray.
    '''
    if not self._use_thread:
        # Synchronous path: produce one batch inline before reading the
        # queue.
        self._next()
    data, n_reset = self._queue.get()
    self._queue.task_done()
    if self._use_thread:
        self._next_thread.join()
    if data is None:
        # Producer yielded nothing; either the iterator is exhausted or a
        # transient failure occurred and we retry once.
        if self._stop_exhausted and self._data_source.position + self._batch_size >= self._size:
            raise StopIteration
        if self._use_thread:
            logger.log(99, 'next() got None retrying...')
            self._next_thread = threading.Thread(target=self._next)
            self._next_thread.start()
            data, n_reset = self._queue.get()
            self._queue.task_done()
            self._next_thread.join()
    if self._use_thread:
        # Kick off production of the following batch in the background.
        self._next_thread = threading.Thread(target=self._next)
        self._next_thread.start()
    # Fire epoch-boundary callbacks once per reset reported by the producer.
    for _ in range(n_reset):
        if self._current_epoch >= 0:
            self._callback_epoch_end()
        self._current_epoch += 1
        self._callback_epoch_begin()
    return data
def compare_optimizer(config, parameters, config_cpu, parameters_cpu,
                      result_array):
    """Run one optimizer step on the default context and on CPU in lockstep
    and report per-function norm differences.

    For every optimizer: copies parameters to the CPU side, feeds both with
    identical data, then compares forward outputs, backward gradients and
    updated parameters via calc_norm_diff(), appending the results to
    ``result_array`` (returned).
    """
    loaded_datas = {}
    for opt, opt_cpu in zip(config.optimizers.values(),
                            config_cpu.optimizers.values()):
        o = opt.optimizer
        # BUGFIX: was `o_cpu = opt.optimizer`, which compared the default-
        # context optimizer against itself — every diff was trivially zero.
        o_cpu = opt_cpu.optimizer
        opts = [o, o_cpu]
        result_name = "optimizer '%s' with network '%s'" % (
            o.name, o.network.name)
        result_dict = OrderedDict()

        logger.log(99, 'Comparing ' + result_name + ' ...')
        logger.log(
            99, 'process(func, variable), norm_diff, current_context_std, cpu_std, diff_std')

        # Start comparison with same parameters
        for p, p_cpu in zip(parameters.values(), parameters_cpu.values()):
            p_cpu.d = p.d

        # Load dataset (each iterator is consumed once and the batch reused
        # for both sides).
        di = opt.data_iterator
        if di not in loaded_datas:
            loaded_datas[di] = di.next()
        datas = loaded_datas[di]

        for v, d in o.dataset_assign.items():
            let_data_to_variable(v.variable_instance,
                                 datas[di.variables.index(d)])
        for v, d in o_cpu.dataset_assign.items():
            let_data_to_variable(v.variable_instance,
                                 datas[di.variables.index(d)])

        # Generate data once and feed the identical arrays to both sides.
        generated = {}
        for v, generator in o.generator_assign.items():
            generated[v.name] = generator(v.shape)
            dest_context = config.global_config.default_context if not o.forward_sequence or v not in o.forward_sequence[
                0].inputs else None
            let_data_to_variable(v.variable_instance,
                                 data=generated[v.name], ctx=dest_context)
        for v, generator in o_cpu.generator_assign.items():
            dest_context = config.global_config.default_context if not o.forward_sequence or v not in o.forward_sequence[
                0].inputs else None
            let_data_to_variable(v.variable_instance,
                                 data=generated[v.name], ctx=dest_context)

        last_max_diff = 1e-5

        # Forward
        for func, func_cpu in zip(o.forward_sequence, o_cpu.forward_sequence):
            o.network.forward_function(func)
            o_cpu.network.forward_function(func_cpu)
            large_diff = False
            for v, v_cpu in zip(func.outputs, func_cpu.outputs):
                name = 'forward_function (%s, %s)' % (func.name, v.name)
                if v.variable_instance.d.shape != v_cpu.variable_instance.d.shape:
                    logger.log(99, 'Variable shape is different in %s (current_context=%s, cpu=%s)' % (
                        v.name, str(v.variable_instance.d.shape),
                        str(v_cpu.variable_instance.d.shape)))
                norm_diff, std1, std2, diff_std = calc_norm_diff(
                    v.variable_instance.d, v_cpu.variable_instance.d)
                logger.log(99, '%s, %f, %f, %f, %f' %
                           (name, norm_diff, std1, std2, diff_std))
                result_dict[name] = norm_diff
                if norm_diff > last_max_diff:
                    if norm_diff > last_max_diff * 10:
                        logger.log(99, ' current_context(data)=' +
                                   str(v.variable_instance.d.flatten()))
                        logger.log(99, ' cpu(data)=' +
                                   str(v_cpu.variable_instance.d.flatten()))
                        large_diff = True
                    last_max_diff = norm_diff
            if large_diff:
                logger.log(99, ' x_data:')
                for v, v_cpu in zip(func.inputs, func_cpu.inputs):
                    logger.log(99, ' current_context(%s.d)=%s' %
                               (v.name, str(v.variable_instance.d.flatten())))
                    logger.log(99, ' cpu(%s.d)=%s' % (
                        v_cpu.name, str(v_cpu.variable_instance.d.flatten())))

        # Backward
        o.network.prepare_backward(o.backward_sequence)
        o_cpu.network.prepare_backward(o_cpu.backward_sequence)
        for seq, seq_cpu in zip(o.backward_sequence.sequence,
                                o_cpu.backward_sequence.sequence):
            o.network.backward_function(seq)
            o_cpu.network.backward_function(seq_cpu)
            large_diff = False
            for v, v_cpu in zip(seq.func.inputs, seq_cpu.func.inputs):
                if v.variable_instance.need_grad:
                    name = 'backward_function (%s, %s)' % (
                        seq.func.name, v.name)
                    norm_diff, std1, std2, diff_std = calc_norm_diff(
                        v.variable_instance.g, v_cpu.variable_instance.g)
                    logger.log(99, '%s, %f, %f, %f, %f' %
                               (name, norm_diff, std1, std2, diff_std))
                    result_dict[name] = norm_diff
                    if norm_diff > last_max_diff:
                        if norm_diff > last_max_diff * 10:
                            logger.log(99, ' current_context(diff)=' + str(
                                v.variable_instance) + str(v.variable_instance.g.flatten()))
                            logger.log(99, ' cpu(diff)=' + str(v_cpu.variable_instance) +
                                       str(v_cpu.variable_instance.g.flatten()))
                            large_diff = True
                        last_max_diff = norm_diff
            if large_diff:
                logger.log(99, ' x_data:')
                for v, v_cpu in zip(seq.func.inputs, seq_cpu.func.inputs):
                    logger.log(99, ' current_context(%s.d)=%s' %
                               (v.name, str(v.variable_instance.d.flatten())))
                    logger.log(99, ' cpu(%s.d)=%s' % (
                        v_cpu.name, str(v_cpu.variable_instance.d.flatten())))
                logger.log(99, ' y_diff:')
                for v, v_cpu in zip(seq.func.outputs, seq_cpu.func.outputs):
                    logger.log(99, ' current_context(%s.g)=%s' %
                               (v.name, str(v.variable_instance.g.flatten())))
                    logger.log(99, ' cpu(%s.g)=%s' % (
                        v_cpu.name, str(v_cpu.variable_instance.g.flatten())))

        # Update (weight decay)
        if o.weight_decay > 0:
            o.solver.weight_decay(o.weight_decay)
            o_cpu.solver.weight_decay(o_cpu.weight_decay)

        # Update
        o.solver.update()
        o_cpu.solver.update()

        # BUGFIX: dict views are not subscriptable on Python 3 —
        # `.items()[i]` raised TypeError; materialize the items once.
        cpu_multiplier_items = list(
            o_cpu.parameter_learning_rate_multipliers.items())
        for i, (v, lr) in enumerate(
                o.parameter_learning_rate_multipliers.items()):
            v_cpu = cpu_multiplier_items[i][0]
            if lr > 0:
                name = 'update (%s, %s)' % (o.solver.name, v.name)
                norm_diff, std1, std2, diff_std = calc_norm_diff(
                    v.variable_instance.d, v_cpu.variable_instance.d)
                logger.log(99, '%s, %f, %f, %f, %f' %
                           (name, norm_diff, std1, std2, diff_std))
                result_dict[name] = norm_diff

        result_array = add_result(result_name, result_dict, result_array)

    return result_array
def train_command(args):
    """Entry point of the `train` sub-command (MPI-aware variant).

    Loads configuration, opens one data-iterator instance per distinct
    dataset (sliced per MPI rank), runs _train(), and reports final status
    via the callback module. Always returns True.
    """
    callback.update_status(args)

    if single_or_rankzero():
        configure_progress(os.path.join(args.outdir, 'progress.txt'))

    info = load.load([args.config], prepare_data_iterator=None,
                     exclude_parameter=True)

    # Check dataset uri is empty.
    dataset_error = False
    for dataset in info.datasets.values():
        if dataset.uri.strip() == '':
            dataset_error = True
    if dataset_error:
        logger.log(99, 'Fatal error. Dataset URI is empty.')
        return False

    class TrainConfig:
        pass
    config = TrainConfig()
    config.timelimit = -1
    if args.param:
        # Parameters are loaded separately from the network definition.
        load.load([args.param], parameter_only=True)

    config.timelimit = callback.get_timelimit(args)

    config.global_config = info.global_config
    config.training_config = info.training_config

    if single_or_rankzero():
        logger.log(99, 'Train with contexts {}'.format(available_contexts))

    # Holder objects pair each optimizer/monitor with its opened iterators.
    class OptConfig:
        pass
    config.optimizers = OrderedDict()
    for name, opt in info.optimizers.items():
        o = OptConfig()
        o.optimizer = opt
        o.data_iterators = []
        config.optimizers[name] = o

    class MonConfig:
        pass
    config.monitors = OrderedDict()
    for name, mon in info.monitors.items():
        m = MonConfig()
        m.monitor = mon
        m.data_iterators = []
        config.monitors[name] = m

    # Training
    comm = current_communicator()
    # Iterations are split across ranks, so each rank runs a fraction of
    # the configured epoch length.
    config.training_config.iter_per_epoch //= comm.size if comm else 1
    max_iteration = config.training_config.max_epoch * \
        config.training_config.iter_per_epoch

    # Remember where the network definition lives so saved snapshots can
    # bundle it.
    global _save_parameter_info
    _save_parameter_info = {}
    _, config_ext = os.path.splitext(args.config)
    if config_ext == '.prototxt' or config_ext == '.nntxt':
        _save_parameter_info['config'] = args.config
    elif config_ext == '.nnp':
        with zipfile.ZipFile(args.config, 'r') as nnp:
            for name in nnp.namelist():
                _, ext = os.path.splitext(name)
                if ext == '.nntxt' or ext == '.prototxt':
                    nnp.extract(name, args.outdir)
                    _save_parameter_info['config'] = os.path.join(
                        args.outdir, name)

    result = False
    restart = False
    if max_iteration > 0:
        # Same seed per rank so shuffling is reproducible; slice() below
        # partitions the data across ranks.
        rng = np.random.RandomState(comm.rank if comm else 0)
        with ExitStack() as stack:
            # Create data_iterator instance only once for each dataset in optimizers
            optimizer_data_iterators = {}
            for name, o in config.optimizers.items():
                for di in o.optimizer.data_iterators.values():
                    if di not in optimizer_data_iterators:
                        di_instance = stack.enter_context(di())
                        if comm and comm.size > 1:
                            di_instance = di_instance.slice(
                                rng, comm.size, comm.rank)
                        optimizer_data_iterators[di] = di_instance
                    else:
                        di_instance = optimizer_data_iterators[di]
                    o.data_iterators.append(di_instance)

            # Create data_iterator instance only once for each dataset in monitors
            monitor_data_iterators = {}
            for name, m in config.monitors.items():
                for di in m.monitor.data_iterators.values():
                    if di not in monitor_data_iterators:
                        di_instance = stack.enter_context(di())
                        if comm and comm.size > 1:
                            di_instance = di_instance.slice(
                                rng, comm.size, comm.rank)
                        monitor_data_iterators[di] = di_instance
                    else:
                        di_instance = monitor_data_iterators[di]
                    m.data_iterators.append(di_instance)

            monitor_data_iterators.update(optimizer_data_iterators)

            result, restart = _train(args, config)
    else:
        # save parameters without training (0 epoch learning)
        logger.log(99, '0 epoch learning. (Just save parameter.)')
        if single_or_rankzero():
            _save_parameters(args, None, 0, config, True)
        result = True

    if single_or_rankzero() and not restart:
        if result:
            logger.log(99, 'Training Completed.')
            callback.update_status('finished')
        else:
            logger.log(99, 'Training Incompleted.')
            callback.update_status('failed')
    if single_or_rankzero():
        progress(None)
    return True
def _train(args, config):
    """Run the main training loop.

    Iterates from the resume point (if ``args.resume``) to
    ``max_epoch * iter_per_epoch``, calling ``_update`` per iteration and,
    at each epoch boundary, evaluating monitors, appending to
    monitoring_report.yml and saving 'current' parameters.

    Returns:
        (result, restart): ``(True, False)`` on normal completion,
        ``(False, False)`` on NaN cost or exceeded time estimate,
        ``(False, True)`` when ``callback.check_training_time`` requests a
        suspend/restart.
    """
    global _save_parameter_info
    comm = current_communicator()
    _CGLOAD_LOG_INTERVAL = 20  # seconds between CPU/GPU load log lines

    best_epoch = None
    best_error = None
    last_epoch = 0
    if args.resume:
        last_epoch, best_epoch, best_error = _get_current_parameter(args)
        if best_epoch is not None:
            logger.log(
                99, "Best error {} recorded at epoch {} in previous training.".format(
                    best_error, best_epoch))
            if best_epoch > last_epoch:
                logger.log(
                    99, "Resumed epoch is {} but this training keep this result.".format(
                        last_epoch))
        logger.log(99, "Resume from epoch {}".format(last_epoch + 1))

    callback.update_status(('epoch.max', config.training_config.max_epoch))
    callback.update_status(('epoch.current', last_epoch + 1
                            if last_epoch < config.training_config.max_epoch
                            else config.training_config.max_epoch))

    max_iteration = config.training_config.max_epoch * \
        config.training_config.iter_per_epoch
    if single_or_rankzero():
        logger.log(99, 'Training epoch {} of {} begin'.format(
            last_epoch + 1, config.training_config.max_epoch))

    # Plain attribute bags used as mutable records.
    class Cost:
        pass
    cost = Cost()
    cost.sum_epoch = 0.0
    cost.num_iteration = 0
    cost.sum_iteration = 0.0
    cost.variables = None

    class TimeInfo:
        pass
    timeinfo = TimeInfo()
    timeinfo.past_time = 0
    timeinfo.estimate_time = 0
    timeinfo.last_past_time = None

    if max_iteration > 0:
        last_iteration = last_epoch * config.training_config.iter_per_epoch
        if last_iteration < max_iteration:

            timeinfo.start_time = time.time()
            timeinfo.last_epoch_start_time = timeinfo.start_time

            callback.update_status('processing', True, timeinfo.start_time)

            for iteration in range(last_iteration, max_iteration):

                # instant load measurement
                measure_cpu_gpu_instant_load()

                cost = _update(iteration, config, cost)

                if np.isnan(cost.sum_epoch) or np.isinf(cost.sum_epoch):
                    logger.log(99, 'Cost is Nan')
                    return False, False

                timeinfo = _calc_estimate_time(
                    timeinfo, max_iteration, last_iteration, iteration + 1)
                callback.update_time_train(prediction=timeinfo.estimate_time)

                # Abort early when the projected total time exceeds the limit
                # (timelimit <= 0 means "no limit").
                if 0 < config.timelimit < timeinfo.estimate_time:
                    logger.log(99, 'Expected training time ({:.3f}s) will exceed time limit ({}s).'.format(
                        timeinfo.estimate_time, config.timelimit))
                    return False, False

                if (iteration + 1) % config.training_config.iter_per_epoch == 0:
                    # NOTE(review): assigns a local that is never read —
                    # possibly intended timeinfo.last_past_time; confirm.
                    last_past_time = -1
                    # End of epoch
                    epoch = iteration // config.training_config.iter_per_epoch + 1
                    cost_avg_epoch = cost.sum_epoch / cost.num_iteration if cost.num_iteration else 0
                    cost.sum_epoch = 0.0
                    cost.num_iteration = 0
                    monitoring_report = []

                    # Evaluation — every monitor_interval epochs, plus the
                    # first five epochs unconditionally.
                    error_str = ''
                    if epoch % config.training_config.monitor_interval == 0 or epoch <= 5:
                        best_error, error_str = _evaluate(
                            args, config, monitoring_report, best_error, epoch)

                    # Cpu/Gpu average load
                    cg_load_str = ''
                    cgload_log = ''
                    cg_load = get_cpu_gpu_average_load()
                    if cg_load:
                        cg_load_str = 'epoch {} average_load_matrix: {}'.format(
                            epoch, cg_load)
                        span = _calc_epoch_span(timeinfo)
                        if span > _CGLOAD_LOG_INTERVAL:
                            cgload_log = _format_cgload_log(cg_load)

                    if single_or_rankzero():
                        # Write to monitoring_report.yml
                        f = open(os.path.join(
                            args.outdir, 'monitoring_report.yml'), 'a')
                        f.write('{}:\n'.format(epoch - 1))
                        f.write(' cost: {}\n'.format(cost_avg_epoch))
                        for s in monitoring_report:
                            f.write(s)
                        f.close()

                        callback.update_status(
                            (['monitoring_report', epoch, 'cost'],
                             cost_avg_epoch))

                        _save_parameters(args, 'current', epoch, config)

                        callback.update_status(('epoch.current', epoch))
                        callback.update_status()

                        logger.log(99, 'epoch {} of {} cost={:.6f} {} time=({:.1f}s /{:.1f}s) {}'.format(
                            epoch, config.training_config.max_epoch,
                            cost_avg_epoch, error_str, timeinfo.past_time,
                            timeinfo.estimate_time, cgload_log))

                        if cg_load_str:
                            # cpu_gpu_average_load record at epoch level
                            callback.update_status(
                                (['cpu_gpu_epoch_load', epoch], cg_load))
                            progress(cg_load_str, 1)

                        # Suspend/restart request from the callback layer:
                        # save a checkpoint and signal restart=True.
                        if not callback.check_training_time(
                                args, config, timeinfo, epoch, last_epoch):
                            _save_parameters(
                                args, 'current', epoch, config, True)
                            return False, True

            if single_or_rankzero():
                _save_parameters(args, 'current', epoch, config, True)
    return True, False
def _evaluate(args, config, monitoring_report, best_error, epoch):
    """Evaluate every monitor in ``config.monitors`` over its dataset.

    Appends one ``'name: error'`` YAML line per monitor to
    ``monitoring_report`` (mutated in place), updates status callbacks, and
    saves 'best' parameters on rank zero when the 'valid_error' monitor
    improves (or when save_best is disabled).

    Returns:
        (best_error, error_str): the possibly-updated best validation error
        and a formatted ``' {name=value, ...}'`` summary string for logging.
    """
    comm = current_communicator()
    error_str = ''
    valid_error = 0.0

    def _sum_error(sum, error):
        # Accumulate the per-batch error; with a communicator, all-reduce
        # across ranks first so every rank sees the global sum.
        ret = None
        if comm:
            # logger.log(99, "Calc error with communicator")
            var = [nn.NdArray()]
            var[0].data = error
            _all_reduce(comm, var, division=False, inplace=True)
            ret = sum + var[0].data
        else:
            ret = sum + error
        return ret

    for name, mon in config.monitors.items():
        m = mon.monitor
        error_sum_monitor = 0.0
        error_count = 0
        data_size = max([di.size for di in mon.data_iterators])
        batch_size = max([di.batch_size for di in mon.data_iterators])

        for i in range(data_size // batch_size):
            # Load dataset
            data = OrderedDict()
            for di in mon.data_iterators:
                data.update(zip(di.variables, di.next()))

            # Set data to variable. Variables consumed by the first forward
            # function stay on the default context; others get ctx=None.
            for v, d in m.dataset_assign.items():
                dest_context = config.global_config.default_context if not m.forward_sequence or v not in m.forward_sequence[
                    0].inputs else None
                let_data_to_variable(v.variable_instance, data[d],
                                     ctx=dest_context, data_name=d,
                                     variable_name=v.name)

            # Generate data
            for v, generator in m.generator_assign.items():
                dest_context = config.global_config.default_context if not m.forward_sequence or v not in m.forward_sequence[
                    0].inputs else None
                let_data_to_variable(v.variable_instance,
                                     data=generator(v.shape),
                                     ctx=dest_context,
                                     variable_name=v.name)

            # Sum error before forward to prepare input data while processing
            # on GPU
            if error_count > 0:
                error_sum = 0.0
                for v in m.monitor_variables:
                    error_sum += np.mean(v.variable_instance.d)
                    # v.variable_instance.data.zero()
                error_sum_monitor = _sum_error(error_sum_monitor, error_sum)
                if single_or_rankzero():
                    progress('Evaluating "{0}"'.format(
                        name) + ' : error={0:0.6f}'.format(
                        error_sum_monitor / error_count),
                        di.position * 1.0 / di.size)
            # Each rank evaluated one batch, so the global count grows by
            # comm.size per step.
            error_count += comm.size if comm else 1

            # Forward recursive
            m.network.forward(m.forward_sequence)

        # Sum error at the end of dataset (the last forward's result was not
        # yet accumulated inside the loop).
        error_sum = 0.0
        for v in m.monitor_variables:
            error_sum += np.mean(v.variable_instance.d)
            # v.variable_instance.data.zero()
        error_sum_monitor = _sum_error(error_sum_monitor, error_sum)

        if error_count == 0:
            error = 0
        else:
            error = error_sum_monitor / error_count

        if np.isnan(error) or np.isinf(error):
            logger.log(99, 'Validation error is Nan')
            error = 0.0

        monitoring_report.append(' {}: {}\n'.format(name, error))

        callback.update_status((['monitoring_report', epoch, name], error))
        callback.update_status((['last', name], error))  # save last value

        if error_str != '':
            error_str += ', '
        else:
            error_str = ' {'
        error_str += '{}={:.6f}'.format(name, error)
        if name == 'valid_error':
            valid_error = error

    if error_str != '':
        error_str += '}'

    # Save Parameters
    if single_or_rankzero():
        if (not config.training_config.save_best) or \
           (not best_error) or \
           (best_error is not None and valid_error <= best_error):
            best_error = valid_error
            callback.update_status(('best.valid_error', best_error))
            callback.update_status(('best.epoch', epoch))
            _save_parameters(args, 'best', epoch, config, True)

    return best_error, error_str
def profile_optimizer(config, result_array):
    """Profile every stage of each optimizer's training step.

    For each optimizer: data loading, data binding, per-function setup,
    forward, backward (per sequence and whole-network), weight decay,
    solver update, and loss monitoring are each timed via ``profile`` and
    collected into ``result_dict``, which is appended to ``result_array``.

    Returns the extended ``result_array``.
    """
    # Profile Training
    for opt in config.optimizers.values():
        o = opt.optimizer
        result_name = "optimizer '%s' with network '%s'" % (
            o.name, o.network.name)
        result_dict = OrderedDict()

        logger.log(99, 'Profiling ' + result_name + ' ...')

        # Load dataset
        def load_dataset():
            loaded_datas = {}
            di = opt.data_iterator
            loaded_datas[di] = di.next()
            return loaded_datas
        profile(config, 'load_dataset', load_dataset, result_dict)

        # Let data
        loaded_datas = load_dataset()
        for v, d in o.dataset_assign.items():
            # NOTE: closure over loop vars v/d is safe here because let_data
            # is invoked by profile() before the next iteration.
            def let_data():
                try:
                    data = loaded_datas[opt.data_iterator][
                        opt.data_iterator.variables.index(d)]
                except:
                    print(opt.data_iterator.variables)
                    raise ValueError(
                        'Data "' + d + '" is not found in dataset.')
                let_data_to_variable(v.variable_instance, data=data)
            profile(config, 'let_data (%s to %s)' % (d, v.name),
                    let_data, result_dict)

        # Generate data
        for v, generator in o.generator_assign.items():
            def generate_data():
                let_data_to_variable(v.variable_instance,
                                     data=generator(v.shape))
            profile(config, 'generate_data (%s)' % v.name,
                    generate_data, result_dict)

        # Setup (detail)
        for func in o.forward_sequence:
            def setup():
                o.network.setup_function(func)
            profile(config, 'setup_function (%s : %s)' % (
                func.name, func.function_instance.name), setup, result_dict)

        # Forward (detail)
        for func in o.forward_sequence:
            def forward():
                o.network.forward_function(func)
            in_place_str = ' : in_place' if func.function_instance.inplace_data(
                0) > 0 else ''
            profile(config, 'forward_function (%s : %s%s)' % (
                func.name, func.function_instance.name, in_place_str),
                forward, result_dict)

        # Backward (detail)
        def prepare_backward():
            o.network.prepare_backward(o.backward_sequence)
        profile(config, 'prepare_backward', prepare_backward, result_dict)
        for seq in o.backward_sequence.sequence:
            # Re-prepare before each timed step so gradients start clean.
            o.network.prepare_backward(o.backward_sequence)

            def backward():
                o.network.backward_function(seq)
            in_place_str = ' : in_place' if seq.func.function_instance.inplace_grad(
                0) > 0 else ''
            profile(config, 'backward_function (%s : %s%s)' % (
                seq.func.name, seq.func.function_instance.name, in_place_str),
                backward, result_dict)

        # Forward (all)
        def forward_all():
            o.network.forward(o.forward_sequence)
        profile(config, 'forward_all', forward_all, result_dict)

        # Backward (all)
        def backward_all():
            o.network.backward(o.backward_sequence)
        profile(config, 'backward_all', backward_all, result_dict)

        # Backward (all, without zeroing parameter gradients first)
        def backward_all_wo_zero_grad():
            o.network.backward(o.backward_sequence, parameter_zero_grad=False)
        profile(config, 'backward_all(wo param zero_grad)',
                backward_all_wo_zero_grad, result_dict)

        # Update (weight decay)
        if o.weight_decay > 0:
            def weight_decay():
                o.solver.weight_decay(o.weight_decay)
            profile(config, 'weight_decay (%s)' %
                    o.solver.name, weight_decay, result_dict)

        # Update
        def update():
            o.solver.update()
        profile(config, 'update (%s)' % o.solver.name, update, result_dict)

        # Monitor loss
        def monitor_loss():
            for l in o.loss_variables:
                np.mean(l.variable_instance.d)
        profile(config, 'monitor_loss', monitor_loss, result_dict)

        result_array = add_result(result_name, result_dict, result_array)

    return result_array
def forward_command(args):
    """Entry point for the `forward` (inference) CLI subcommand, CSV-only
    variant.

    Runs the loaded executors over ``args.dataset`` (a CSV file), appends the
    network outputs as extra columns, and writes the merged table to
    ``output_result.csv`` in ``args.outdir``.

    NOTE(review): this definition is shadowed by a later ``forward_command``
    in this file; confirm which one is actually exported.
    """
    configure_progress(os.path.join(args.outdir, 'progress.txt'))
    files = []
    files.append(args.config)
    if args.param:
        files.append(args.param)

    class ForwardConfig:
        pass
    # The bare class object (not an instance) is used as an attribute bag.
    config = ForwardConfig
    info = load.load(files, prepare_data_iterator=False)
    config.global_config = info.global_config

    config.executors = info.executors.values()

    config.networks = []
    for e in config.executors:
        if e.network.name in info.networks.keys():
            config.networks.append(info.networks[e.network.name])
        else:
            # NOTE(review): ``config.executor`` is never assigned — this error
            # path would raise AttributeError; probably meant e.network.name.
            logger.critical('Network {} does not found.'.format(
                config.executor.network.name))
            return

    # Inherit the normalize setting from the matching dataset definition.
    normalize = True
    for d in info.datasets.values():
        if d.uri == args.dataset:
            normalize = d.normalize
    data_iterator = (lambda: data_iterator_csv_dataset(
        args.dataset, config.networks[0].batch_size, False,
        normalize=normalize))

    # load dataset as csv
    with open(args.dataset, 'rt') as f:
        rows = [row for row in csv.reader(f)]
    row0 = rows.pop(0)  # header row
    root_path = os.path.dirname(args.dataset)
    root_path = os.path.abspath(root_path.replace('/|\\', os.path.sep))
    # Expand relative file paths in cells; numeric cells pass through.
    rows = list(map(lambda row: list(map(lambda x: x if is_float(
        x) else compute_full_path(root_path, x), row)), rows))

    with data_iterator() as di:
        index = 0
        while index < di.size:
            data = di.next()
            result, outputs = forward(args, index, config, data, di.variables)
            if index == 0:
                # Extend the header with one column per output dimension.
                for name, dim in zip(result.names, result.dims):
                    if dim == 1:
                        row0.append(name)
                    else:
                        for d in range(dim):
                            row0.append(name + '__' + str(d))
            for i, output in enumerate(outputs):
                if index + i < len(rows):
                    rows[index + i].extend(output)
            index += len(outputs)
            logger.log(
                99, 'data {} / {}'.format(min([index, len(rows)]), len(rows)))

    with open(os.path.join(args.outdir, 'output_result.csv'), 'w') as f:
        writer = csv.writer(f, lineterminator='\n')
        writer.writerow(row0)
        writer.writerows(rows)

    logger.log(99, 'Forward Completed.')
    progress(None)
def forward_command(args):
    """Entry point for the `forward` (inference) CLI subcommand.

    Supports two dataset forms: a ``.csv`` file or a ``.cache`` directory
    (with ``original.csv`` / ``order.csv`` describing the pre-shuffle order).
    Streams inference results row-by-row into ``args.outfile`` under
    ``args.outdir`` and reports progress via status callbacks.

    Returns True on success.
    """
    callback.update_status(args)

    configure_progress(os.path.join(args.outdir, 'progress.txt'))
    files = []
    files.append(args.config)
    if args.param:
        files.append(args.param)
    batch_size = args.batch_size
    if batch_size < 1:
        # Non-positive means "use the batch size from the network config".
        batch_size = None

    class ForwardConfig:
        pass
    # The bare class object (not an instance) is used as an attribute bag.
    config = ForwardConfig
    info = load.load(files, prepare_data_iterator=False,
                     batch_size=batch_size)
    config.global_config = info.global_config

    config.executors = info.executors.values()

    config.networks = []
    for e in config.executors:
        if e.network.name in info.networks.keys():
            config.networks.append(info.networks[e.network.name])
        else:
            # NOTE(review): ``config.executor`` is never assigned — this error
            # path would raise AttributeError; probably meant e.network.name.
            logger.critical('Network {} is not found.'.format(
                config.executor.network.name))
            return False

    # Normalization applies only when the dataset requests it and no executor
    # disables image normalization.
    normalize = True
    for d in info.datasets.values():
        if d.uri == args.dataset or d.cache_dir == args.dataset:
            normalize = d.normalize
    for e in config.executors:
        normalize = normalize and not e.no_image_normalization

    # Maps original row index -> shuffled position within the cache.
    orders = {}
    # With CSV
    if os.path.splitext(args.dataset)[1] == '.csv':
        data_iterator = (lambda: data_iterator_csv_dataset(
            uri=args.dataset,
            batch_size=config.networks[0].batch_size,
            shuffle=False,
            normalize=normalize,
            with_memory_cache=False,
            with_file_cache=False))

        # load dataset as csv
        filereader = FileReader(args.dataset)
        with filereader.open(textmode=True, encoding='utf-8-sig') as f:
            rows = [row for row in csv.reader(f)]
        row0 = rows.pop(0)  # header row
        if args.replace_path:
            root_path = os.path.dirname(args.dataset)
            root_path = os.path.abspath(
                root_path.replace('/|\\', os.path.sep))
        else:
            root_path = '.'
        rows = [row for row in rows if len(row)]
        # Expand relative file paths; skip numeric cells and columns whose
        # header starts with '#' (comment columns).
        rows = list(map(lambda row: list(map(lambda i, x: x if row0[i][
            0] == '#' or is_float(x) else compute_full_path(root_path, x),
            range(len(row)), row)), rows))
        for i in range(len(rows)):
            orders[i] = i
    # With Cache
    elif os.path.splitext(args.dataset)[1] == '.cache':
        data_iterator = (lambda: data_iterator_cache(
            uri=args.dataset,
            batch_size=config.networks[0].batch_size,
            shuffle=False,
            normalize=normalize))

        # Get original CSV
        original_csv = os.path.join(args.dataset, 'original.csv')
        try:
            # load dataset as csv
            filereader = FileReader(original_csv)
            with filereader.open(textmode=True, encoding='utf-8-sig') as f:
                rows = [row for row in csv.reader(f)]
            row0 = rows.pop(0)
            root_path = '.'
            rows = list(map(lambda row: list(map(lambda x: x if is_float(
                x) else compute_full_path(root_path, x), row)), rows))
        except:
            # Best-effort: missing original.csv leaves ``rows`` undefined and
            # will fail later; kept as-is.
            print('Cannot open', original_csv)
            pass

        # Get original Data order.
        order_csv = os.path.join(args.dataset, 'order.csv')
        try:
            filereader = FileReader(order_csv)
            with filereader.open(textmode=True) as f:
                for original, shuffled in [
                        [int(x) for x in row] for row in csv.reader(f)]:
                    orders[original] = shuffled
        except:
            # No order file: assume identity ordering.
            print('Cannot open', order_csv)
            for i in range(len(rows)):
                orders[i] = i
    else:
        print('Unsupported extension "{}" in "{}".'.format(
            os.path.splitext(args.dataset)[1], args.dataset))

    callback.update_status(('data.max', len(rows)))
    callback.update_status(('data.current', 0))
    callback.update_status('processing', True)

    result_csv_filename = os.path.join(args.outdir, args.outfile)
    with open(result_csv_filename, 'w', encoding='utf-8') as f:
        writer = csv.writer(f, lineterminator='\n')
        with data_iterator() as di:
            index = 0
            while index < di.size:
                data = di.next()
                result, outputs = _forward(
                    args, index, config, data, di.variables)
                if index == 0:
                    # Extend the header with one column per output dimension.
                    # NOTE(review): ``e`` here is the loop variable leaked
                    # from the executor loops above (last executor) — confirm
                    # that is the intended executor for repeat_evaluation_type.
                    for name, dim in zip(result.names, result.dims):
                        if dim == 1:
                            if e.repeat_evaluation_type == "std":
                                name = "Uncertainty(Std)"
                            row0.append(name)
                        else:
                            for d in range(dim):
                                row0.append(name + '__' + str(d))
                    writer.writerow(row0)
                for i, output in enumerate(outputs):
                    if index + i < len(rows):
                        import copy
                        # Copy the original row (in original order) and append
                        # the network outputs.
                        row = copy.deepcopy(rows[orders[index + i]])
                        row.extend(output)
                        writer.writerow(row)
                index += len(outputs)

                callback.update_status(('data.current',
                                        min([index, len(rows)])))
                callback.update_forward_time()
                callback.update_status()

                logger.log(
                    99, 'data {} / {}'.format(min([index, len(rows)]),
                                              len(rows)))

    callback.process_evaluation_result(args.outdir, result_csv_filename)

    logger.log(99, 'Forward Completed.')
    progress(None)

    callback.update_status(('output_result.csv_header', ','.join(row0)))
    callback.update_status(('output_result.column_num', len(row0)))
    callback.update_status(('output_result.data_num', len(rows)))
    callback.update_status('finished')

    return True
def profile_optimizer(config, result_array, synchronize):
    """Profile every stage of each optimizer's training step.

    Variant that takes a ``synchronize`` callable (forwarded to ``profile``,
    presumably a device-synchronization hook for accurate GPU timing — TODO
    confirm) and performs a warm-up forward/backward pass before timing.

    Returns the extended ``result_array``.
    """
    # Profile Training
    for opt in config.optimizers.values():
        o = opt.optimizer
        result_name = "optimizer '%s' with network '%s'" % (o.name,
                                                            o.network.name)
        result_dict = OrderedDict()

        logger.log(99, 'Profiling ' + result_name + ' ...')
        # Clear weight (zero out W/b so timings don't depend on loaded values)
        for name, p in o.parameters.items():
            if name[-2:] in ('/W', '/b'):
                p.data.zero()

        # Load dataset
        def load_dataset():
            loaded_data = {}
            di = opt.data_iterator
            loaded_data[di] = di.next()
            return loaded_data
        profile(config, 'load_dataset', load_dataset, result_dict, synchronize)

        # Let data
        loaded_data = load_dataset()
        for v, d in o.dataset_assign.items():
            # NOTE: closure over loop vars v/d is safe because let_data is
            # invoked by profile() before the next iteration.
            def let_data():
                try:
                    data = loaded_data[opt.data_iterator][
                        opt.data_iterator.variables.index(d)]
                except:
                    print(opt.data_iterator.variables)
                    raise ValueError('Data "' + d +
                                     '" is not found in dataset.')
                let_data_to_variable(v.variable_instance, data=data,
                                     data_name=d, variable_name=v.name)
            profile(config, 'let_data (%s to %s)' % (d, v.name), let_data,
                    result_dict, synchronize)

        # Generate data
        for v, generator in o.generator_assign.items():
            def generate_data():
                let_data_to_variable(v.variable_instance,
                                     data=generator(v.shape),
                                     variable_name=v.name)
            profile(config, 'generate_data (%s)' % v.name, generate_data,
                    result_dict, synchronize)

        '''
        # Setup (detail)
        for func in o.forward_sequence:
            def setup():
                o.network.setup_function(func)
            profile(config, 'setup_function (%s : %s)' % (
                func.name, func.function_instance.name), setup, result_dict, synchronize)
        '''
        # Warm-up
        o.network.forward(o.forward_sequence)
        o.network.prepare_backward(o.backward_sequence)
        o.network.backward(o.backward_sequence)

        # Forward (detail)
        for func in o.forward_sequence:
            def forward():
                o.network.forward_function(func)
            in_place_str = ' : in_place' if func.function_instance.inplace_data(
                0) > 0 else ''
            profile(config, 'forward_function (%s : %s%s)' %
                    (func.name, func.function_instance.name, in_place_str),
                    forward, result_dict, synchronize)

        # Backward (detail)
        def prepare_backward():
            o.network.prepare_backward(o.backward_sequence)
        profile(config, 'prepare_backward', prepare_backward, result_dict,
                synchronize)
        for seq in o.backward_sequence.sequence:
            # Re-prepare before each timed step so gradients start clean.
            o.network.prepare_backward(o.backward_sequence)

            def backward():
                o.network.backward_function(seq)
            in_place_str = ' : in_place' if seq.func.function_instance.inplace_grad(
                0) > 0 else ''
            profile(config, 'backward_function (%s : %s%s)' %
                    (seq.func.name, seq.func.function_instance.name,
                     in_place_str), backward, result_dict, synchronize)

        # Forward (all)
        def forward_all():
            o.network.forward(o.forward_sequence)
        profile(config, 'forward_all', forward_all, result_dict, synchronize)

        # Backward (all)
        def backward_all():
            o.network.backward(o.backward_sequence)
        profile(config, 'backward_all', backward_all, result_dict, synchronize)

        # Backward (all, without zeroing parameter gradients first)
        def backward_all_wo_zero_grad():
            o.network.backward(o.backward_sequence, parameter_zero_grad=False)
        profile(config, 'backward_all(wo param zero_grad)',
                backward_all_wo_zero_grad, result_dict, synchronize)

        # Update (weight decay)
        if o.weight_decay > 0:
            def weight_decay():
                o.solver.weight_decay(o.weight_decay)
            profile(config, 'weight_decay (%s)' % o.solver.name, weight_decay,
                    result_dict, synchronize)

        # Update
        def update():
            o.solver.update()
        profile(config, 'update (%s)' % o.solver.name, update, result_dict,
                synchronize)

        # Monitor loss
        def monitor_loss():
            for l in o.loss_variables:
                np.mean(l.variable_instance.d)
        profile(config, 'monitor_loss', monitor_loss, result_dict, synchronize)

        result_array = add_result(result_name, result_dict, result_array)

    return result_array
def train_command(args):
    """Entry point for the `train` CLI subcommand (older, single-iterator
    variant).

    Each optimizer/monitor holds exactly one ``data_iterator`` (contrast the
    newer variant which supports a list per optimizer).

    NOTE(review): appears superseded by the other ``train_command`` in this
    file; confirm which definition is exported.

    Returns True unconditionally on normal exit.
    """
    if single_or_rankzero():
        configure_progress(os.path.join(args.outdir, 'progress.txt'))

    info = load.load([args.config], exclude_parameter=True)

    # Check dataset uri is empty.
    dataset_error = False
    for dataset in info.datasets.values():
        if dataset.uri.strip() == '':
            dataset_error = True
    if dataset_error:
        logger.log(99, 'Fatal error. Dataset URI is empty.')
        return False

    class TrainConfig:
        pass
    config = TrainConfig()
    config.timelimit = -1
    if args.param:
        load.load([args.param], parameter_only=True)

    config.global_config = info.global_config
    config.training_config = info.training_config

    if single_or_rankzero():
        logger.log(99, 'Train with contexts {}'.format(available_contexts))

    class OptConfig:
        pass
    config.optimizers = OrderedDict()
    for name, opt in info.optimizers.items():
        o = OptConfig()
        o.optimizer = opt
        o.data_iterator = None  # created lazily inside the ExitStack below
        config.optimizers[name] = o

    class MonConfig:
        pass
    config.monitors = OrderedDict()
    for name, mon in info.monitors.items():
        m = MonConfig()
        m.monitor = mon
        m.data_iterator = None
        config.monitors[name] = m

    # Training
    comm = current_communicator()
    # Each rank processes 1/size of an epoch, so scale iterations per epoch.
    config.training_config.iter_per_epoch //= comm.size if comm else 1
    max_iteration = config.training_config.max_epoch * \
        config.training_config.iter_per_epoch

    # Remember where the config file lives so saved .nnp archives can embed it.
    global _save_parameter_info
    _save_parameter_info = {}
    _, config_ext = os.path.splitext(args.config)
    if config_ext == '.prototxt' or config_ext == '.nntxt':
        _save_parameter_info['config'] = args.config
    elif config_ext == '.nnp':
        with zipfile.ZipFile(args.config, 'r') as nnp:
            for name in nnp.namelist():
                _, ext = os.path.splitext(name)
                if ext == '.nntxt' or ext == '.prototxt':
                    nnp.extract(name, args.outdir)
                    _save_parameter_info['config'] = os.path.join(
                        args.outdir, name)

    result = False
    if max_iteration > 0:
        # NOTE(review): ``data_iterators`` dict below is built but never
        # read — looks vestigial.
        data_iterators = {'optimizer': {}, 'monitor': {}}
        # Same RNG seed per rank so dataset slicing stays consistent.
        rng = np.random.RandomState(comm.rank if comm else 0)
        with ExitStack() as stack:
            for name, o in config.optimizers.items():
                o.data_iterator = stack.enter_context(
                    o.optimizer.data_iterator())
                if comm and comm.size > 1:
                    # Each rank sees a disjoint slice of the dataset.
                    o.data_iterator = o.data_iterator.slice(
                        rng, comm.size, comm.rank)
            for name, m in config.monitors.items():
                m.data_iterator = stack.enter_context(
                    m.monitor.data_iterator())
                if comm and comm.size > 1:
                    m.data_iterator = m.data_iterator.slice(
                        rng, comm.size, comm.rank)
            result = _train(args, config)
    else:
        # save parameters without training (0 epoch learning)
        logger.log(99, '0 epoch learning. (Just save parameter.)')
        if single_or_rankzero():
            _save_parameters(args, 'current', 0, True)
        result = True

    if single_or_rankzero():
        if result:
            logger.log(99, 'Training Completed.')
        else:
            logger.log(99, 'Training Incompleted.')
    if single_or_rankzero():
        progress(None)
    return True
def compare_with_cpu_command(args):
    """Entry point for the `compare_with_cpu` CLI subcommand.

    Loads the same training configuration twice — once under the current
    (typically GPU) context into parameter scope 'current', once under the
    CPU context into scope 'cpu' — then runs ``compare_optimizer`` to
    measure per-variable correlation (reported as 1-Correl) between the two,
    writing the table to ``compare_with_cpu.csv`` in ``args.outdir``.
    """
    configure_progress(os.path.join(args.outdir, 'progress.txt'))

    class TrainConfig:
        pass

    class OptConfig:
        pass

    class MonConfig:
        pass

    # Load config with current context
    files = []
    files.append(args.config)

    with nn.parameter_scope('current'):
        info = load.load(files)
        parameters = get_parameters(grad_only=False)

    config = TrainConfig()
    config.global_config = info.global_config
    config.training_config = info.training_config

    config.optimizers = OrderedDict()
    for name, opt in info.optimizers.items():
        o = OptConfig()
        o.optimizer = opt
        o.data_iterator = None  # created inside the ExitStack below
        config.optimizers[name] = o

    config.monitors = OrderedDict()
    for name, mon in info.monitors.items():
        m = MonConfig()
        m.monitor = mon
        m.data_iterator = None
        config.monitors[name] = m

    # Load config with cpu context (a second, independent config file)
    files = []
    files.append(args.config2)

    with nn.parameter_scope('cpu'):
        info_cpu = load.load(files)
        cpu_parameters = get_parameters(grad_only=False)

    config_cpu = TrainConfig()
    config_cpu.global_config = info_cpu.global_config
    config_cpu.training_config = info_cpu.training_config

    config_cpu.optimizers = OrderedDict()
    for name, opt in info_cpu.optimizers.items():
        o = OptConfig()
        o.optimizer = opt
        o.data_iterator = None
        config_cpu.optimizers[name] = o

    config_cpu.monitors = OrderedDict()
    for name, mon in info_cpu.monitors.items():
        m = MonConfig()
        m.monitor = mon
        m.data_iterator = None
        config_cpu.monitors[name] = m

    result_array = [['1-Correl']]

    # Profile Optimizer
    with ExitStack() as stack:
        for name, o in config.optimizers.items():
            o.data_iterator = stack.enter_context(
                o.optimizer.data_iterator())
        for name, o in config_cpu.optimizers.items():
            o.data_iterator = stack.enter_context(
                o.optimizer.data_iterator())
        result_array = compare_optimizer(
            config, parameters, config_cpu, cpu_parameters, result_array)

    # Write profiling result
    import csv
    with open(args.outdir + os.sep + 'compare_with_cpu.csv', 'w') as f:
        writer = csv.writer(f, lineterminator='\n')
        writer.writerows(result_array)

    logger.log(99, 'Compare with CPU Completed.')
    progress(None)