# Presumably a session-run-hook `before_run` callback (inferred from the
# signature; confirm against the enclosing class).
#
# When `self._need_sync` is truthy it synchronizes the sample offset via
# `self._do_sync_offset(run_context.session)`, runs `self._sync_state_op` on
# the session (logging an event before and after), and resets
# `self._need_sync` to False. It also calls `self._profiler.end()`.
#
# NOTE(review): this definition is flattened onto a single line, so the
# original nesting cannot be recovered from the text alone. In particular it is
# unclear whether `self._profiler.end()` runs on every call or only inside the
# `if self._need_sync:` branch -- confirm against the original file before
# re-indenting.
def before_run(self, run_context): if self._need_sync: self._do_sync_offset(run_context.session) _log_event('BEFORE _sync_state_op') run_context.session.run(self._sync_state_op) _log_event('AFTER _sync_state_op') self._need_sync = False self._profiler.end()
def _do_sync_offset(self, sess):
    """Replace the local trained-sample offset with the synchronized one.

    The current ``self._trained_samples`` is fed to ``self._sync_offset_op``
    through ``self._trained_samples_place``; the value returned by the
    session becomes the new ``self._trained_samples``. The transition is
    printed together with ``self._step``.

    Args:
        sess: Object whose ``run(op, feed_dict=...)`` evaluates the sync op.
    """
    local_offset = self._trained_samples
    _log_event('BEFORE _sync_offset_op(%s)' % (local_offset))

    feeds = {self._trained_samples_place: local_offset}
    synced_offset = sess.run(self._sync_offset_op, feed_dict=feeds)

    print('sync offset %d -> %d on step %d' %
          (local_offset, synced_offset, self._step))
    self._trained_samples = synced_offset
def main():
    """Parse the arguments and run the training driver they select.

    ``args.tf_method`` must be one of ``'simple'``, ``'monitored'`` or
    ``'estimator'``; any other value raises ``KeyError`` from the lookup.
    Events are logged at the start and at the end of the function.
    """
    _log_event('BEGIN :: main')
    args = parse_args()

    # Map each supported method name to the function that drives it.
    runners = {
        'simple': run_simple_session,
        'monitored': run_with_session_and_hooks,
        'estimator': run_with_estimator,
    }
    run = runners[args.tf_method]
    run(args)

    _log_event('END :: main')
def before_run(self, run_context):
    """Synchronize the step counter and run the sync op when a sync is pending.

    If ``self._need_sync`` is truthy, ``self._step`` is replaced by the result
    of running ``self._sync_step_op`` (fed with the current step through
    ``self._step_place``), ``self._sync_op`` is run, and ``self._need_sync``
    is reset to False. When the sync happens at step 0, events are logged
    around each of the two session runs.

    Args:
        run_context: Object exposing the ``session`` to run the ops on.
    """
    if self._step >= self._max_step:
        # Shouldn't happen. Only reported for now: run_context.request_stop()
        # is deliberately not called here.
        # FIXME: force quit
        print('request_stop before kungfu_step: %d' % (self._step))

    if not self._need_sync:
        return

    session = run_context.session
    first_sync = self._step == 0

    if first_sync:
        _log_event('BEFORE first _sync_step_op')
    self._step = session.run(self._sync_step_op,
                             feed_dict={self._step_place: self._step})

    if first_sync:
        _log_event('BEFORE first _sync_op')
    session.run(self._sync_op)
    if first_sync:
        _log_event('AFTER first _sync_op')

    self._need_sync = False
def run_with_estimator(args):
    """Build the estimator and train it with the configured debug hooks.

    A ``LogStepHook`` is always attached; a ``LogPerfHook`` is added when
    ``args.show_training_throughput`` is set. With ``args.elastic`` the
    estimator is trained with an ``ElasticHook`` and a ``ProfileResizeHook``
    and no ``max_steps``; otherwise training is bounded by
    ``args.train_steps`` and a ``SyncStepHook`` is attached only when
    ``args.sync_step`` is set.

    Args:
        args: Parsed options; reads ``batch_size``, ``show_training_throughput``,
            ``elastic``, ``epochs``, ``epoch_size``, ``resize_schedule``,
            ``train_steps`` and ``sync_step``.
    """
    _log_event('BEGIN :: run_with_estimator')

    _log_event('BEGIN :: build_estimator')
    estimator = build_estimator(args)
    _log_event('END :: build_estimator')

    hooks = [debug_hooks.LogStepHook()]
    if args.show_training_throughput:
        hooks.append(debug_hooks.LogPerfHook(args.batch_size))

    if not args.elastic:
        train_input_fn = build_input_fn(args.batch_size, args.train_steps)
        # The hook is always constructed but only registered on request.
        sync_step_hook = debug_hooks.SyncStepHook()
        if args.sync_step:
            hooks.append(sync_step_hook)
        _log_event('BEGIN :: classifier.train')
        estimator.train(train_input_fn,
                        hooks=hooks,
                        max_steps=args.train_steps)
        _log_event('END :: classifier.train')
    else:
        # Imported lazily so the elastic hook is only required in this mode.
        from kungfu.tensorflow.experimental.hook import ElasticHook
        hooks.append(
            ElasticHook(args.batch_size, args.epochs, args.epoch_size))
        resize_schedule = parse_scheule(args.resize_schedule)
        hooks.append(debug_hooks.ProfileResizeHook(resize_schedule))
        train_input_fn = build_input_fn(args.batch_size)
        estimator.train(train_input_fn, hooks=hooks)

    _log_event('END :: run_with_estimator')
def before_run(self, run_context):
    """Print the current step count, logging an event on step 0.

    Args:
        run_context: Unused.
    """
    step = self._step
    if step == 0:
        _log_event('before_run_step_0')
    print('%s::%s %d steps' % ('LogStepHook', 'before_run', step))
def end(self, run_context):
    """Log that the hook's ``end`` callback was reached.

    Args:
        run_context: Unused.
    """
    _log_event('SyncStepHook::end')
def after_create_session(self, sess, coord):
    """Run the step-sync and state-sync ops once the session exists.

    ``self._sync_step_op`` is run first and its result printed;
    ``self._sync_state_op`` is run before the print, and an event is logged
    last.

    Args:
        sess: Object whose ``run`` evaluates the two ops.
        coord: Unused.
    """
    synced_step = sess.run(self._sync_step_op)
    sess.run(self._sync_state_op)
    print('_sync_step_op result %d' % (synced_step))
    _log_event('AFTER _sync_step_op')