def test(action_set, level_names): """Test.""" Agent = agent_factory(FLAGS.agent_name) level_returns = {level_name: [] for level_name in level_names} with tf.Graph().as_default(): agent = Agent(len(action_set)) outputs = {} for level_name in level_names: env = create_environment(level_name, seed=1, is_test=True) outputs[level_name] = build_actor(agent, env, level_name, action_set) with tf.train.SingularMonitoredSession(checkpoint_dir=FLAGS.logdir, hooks=[ py_process.PyProcessHook() ]) as session: for level_name in level_names: tf.logging.info('Testing level: %s', level_name) while True: done_v, infos_v = session.run( (outputs[level_name].env_outputs.done, outputs[level_name].env_outputs.info)) returns = level_returns[level_name] returns.extend(infos_v.episode_return[1:][done_v[1:]]) if len(returns) >= FLAGS.test_num_episodes: tf.logging.info('Mean episode return: %f', np.mean(returns)) break
def test(action_set, level_names): """Test.""" level_returns = {level_name: [] for level_name in level_names} with tf.Graph().as_default(): agent = Agent(len(action_set)) outputs = {} for level_name in level_names: env = create_environment(level_name, seed=1, is_test=True) outputs[level_name] = build_actor(agent, env, level_name, action_set) with tf.train.SingularMonitoredSession( checkpoint_dir=FLAGS.logdir, hooks=[py_process.PyProcessHook()]) as session: for level_name in level_names: tf.logging.info('Testing level: %s', level_name) while True: done_v, infos_v = session.run(( outputs[level_name].env_outputs.done, outputs[level_name].env_outputs.info )) returns = level_returns[level_name] returns.extend(infos_v.episode_return[1:][done_v[1:]]) if len(returns) >= FLAGS.test_num_episodes: tf.logging.info('Mean episode return: %f', np.mean(returns)) break if FLAGS.level_name == 'dmlab30': no_cap = dmlab30.compute_human_normalized_score(level_returns, per_level_cap=None) cap_100 = dmlab30.compute_human_normalized_score(level_returns, per_level_cap=100) tf.logging.info('No cap.: %f Cap 100: %f', no_cap, cap_100)
def test_small(self): class Example(object): def __init__(self, a): self._a = a def inc(self): self._a += 1 def compute(self, b): return np.array(self._a + b, dtype=np.int32) @staticmethod def _tensor_specs(method_name, unused_args, unused_constructor_kwargs): if method_name == 'compute': return tf.contrib.framework.TensorSpec([], tf.int32) elif method_name == 'inc': return () with tf.Graph().as_default(): p = py_process.PyProcess(Example, 1) inc = p.proxy.inc() compute = p.proxy.compute(2) with tf.train.SingularMonitoredSession( hooks=[py_process.PyProcessHook()]) as session: self.assertTrue(isinstance(inc, tf.Operation)) session.run(inc) self.assertEqual([], compute.shape) self.assertEqual(4, session.run(compute))
def test_args(self): class Example(object): def __init__(self, dim0): self._dim0 = dim0 def compute(self, dim1): return np.zeros([self._dim0, dim1], dtype=np.int32) @staticmethod def _tensor_specs(method_name, kwargs, constructor_kwargs): dim0 = constructor_kwargs['dim0'] dim1 = kwargs['dim1'] if method_name == 'compute': return tf.contrib.framework.TensorSpec([dim0, dim1], tf.int32) with tf.Graph().as_default(): p = py_process.PyProcess(Example, 1) result = p.proxy.compute(2) with tf.train.SingularMonitoredSession( hooks=[py_process.PyProcessHook()]) as session: self.assertEqual([1, 2], result.shape) self.assertAllEqual([[0, 0]], session.run(result))
def test_close_on_error(self): with tempfile.NamedTemporaryFile() as tmp: class Example(object): def __init__(self, filename): self._filename = filename def something(self): raise ValueError('foo') def close(self): with tf.gfile.Open(self._filename, 'w') as f: f.write('was_closed') @staticmethod def _tensor_specs(method_name, unused_kwargs, unused_constructor_kwargs): if method_name == 'something': return () with tf.Graph().as_default(): p = py_process.PyProcess(Example, tmp.name) result = p.proxy.something() with tf.train.SingularMonitoredSession( hooks=[py_process.PyProcessHook()]) as session: with self.assertRaisesRegexp(Exception, 'foo'): session.run(result) self.assertEqual('was_closed', tmp.read())
def test(): """Test.""" with tf.Graph().as_default(): agent = Agent((6, 8, 8)) env = create_environment({'adversarial': False}, is_test=True) outputs = build_actor(agent, env)[0] returns = [] with tf.train.SingularMonitoredSession(checkpoint_dir=FLAGS.logdir, hooks=[ py_process.PyProcessHook() ]) as session: tf.logging.info('Testing:') while True: done_v, infos_v = session.run( (outputs.env_outputs.done, outputs.env_outputs.info)) returns.extend(infos_v.episode_return[1:][done_v[1:]]) if len(returns) >= FLAGS.test_num_episodes: tf.logging.info('Mean episode return: %f', np.mean(returns)) break
def test_threading(self): class Example(object): def __init__(self): pass def wait(self): time.sleep(.2) return None @staticmethod def _tensor_specs(method_name, unused_args, unused_constructor_kwargs): if method_name == 'wait': return tf.contrib.framework.TensorSpec([], tf.int32) with tf.Graph().as_default(): p = py_process.PyProcess(Example) wait = p.proxy.wait() hook = py_process.PyProcessHook() with tf.train.SingularMonitoredSession(hooks=[hook]) as session: def run(): with self.assertRaises(tf.errors.OutOfRangeError): session.run(wait) t = self.checkedThread(target=run) t.start() time.sleep(.1) t.join()
def test_close(self): with tempfile.NamedTemporaryFile() as tmp: class Example(object): def __init__(self, filename): self._filename = filename def close(self): with tf.gfile.Open(self._filename, 'w') as f: f.write('was_closed') @staticmethod def _tensor_specs(method_name, unused_kwargs, unused_constructor_kwargs): if method_name == 'something': return () with tf.Graph().as_default(): py_process.PyProcess(Example, tmp.name) with tf.train.SingularMonitoredSession( hooks=[py_process.PyProcessHook()]): pass self.assertEqual('was_closed', tmp.read())
def benchmark_one(self): with tf.Graph().as_default(): p = py_process.PyProcess(PyProcessBenchmarks.Example) compute = p.proxy.compute(2) with tf.train.SingularMonitoredSession( hooks=[py_process.PyProcessHook()]) as session: self.run_op_benchmark(name='process_one', sess=session, op_or_tensor=compute, burn_iters=10, min_iters=5000)
def benchmark_many(self): with tf.Graph().as_default(): ps = [ py_process.PyProcess(PyProcessBenchmarks.Example) for _ in range(200) ] compute_ops = [p.proxy.compute(2) for p in ps] compute = tf.group(*compute_ops) with tf.train.SingularMonitoredSession( hooks=[py_process.PyProcessHook()]) as session: self.run_op_benchmark(name='process_many', sess=session, op_or_tensor=compute, burn_iters=10, min_iters=500)
def test_close(self): with tempfile.NamedTemporaryFile() as tmp: class Example(object): def __init__(self, filename): self._filename = filename def close(self): with tf.gfile.Open(self._filename, 'w') as f: f.write('was_closed') with tf.Graph().as_default(): py_process.PyProcess(Example, tmp.name) with tf.train.SingularMonitoredSession( hooks=[py_process.PyProcessHook()]): pass self.assertEqual('was_closed', tmp.read())
def test_error_handling_constructor(self): class Example(object): def __init__(self): raise ValueError('foo') def something(self): pass @staticmethod def _tensor_specs(method_name, unused_kwargs, unused_constructor_kwargs): if method_name == 'something': return () with tf.Graph().as_default(): py_process.PyProcess(Example, 1) with self.assertRaisesRegexp(Exception, 'foo'): with tf.train.SingularMonitoredSession( hooks=[py_process.PyProcessHook()]): pass
def test(game_name): all_returns = {game_name: []} action_size = 4 with tf.Graph().as_default(): agent = Agent(action_size) env = create_environment(game_name) output = build_actor(agent, env, game_name, action_size) with tf.train.SingularMonitoredSession(checkpoint_dir=FLAGS.logdir, hooks=[ py_process.PyProcessHook() ]) as session: while True: done_v, infos_v = session.run( (output.env_outputs.done, output.env_outputs.info)) returns = all_returns[game_name] returns.extend(infos_v.episode_return[1:][done_v[1:]]) if len(returns) >= FLAGS.test_num_episodes: tf.logging.info('Mean episode return: %f', np.mean(returns)) break
def train(action_set, level_names): """Train.""" if is_single_machine(): local_job_device = '' shared_job_device = '' is_actor_fn = lambda i: True is_learner = True global_variable_device = '/gpu' server = tf.train.Server.create_local_server() filters = [] else: local_job_device = '/job:%s/task:%d' % (FLAGS.job_name, FLAGS.task) shared_job_device = '/job:learner/task:0' is_actor_fn = lambda i: FLAGS.job_name == 'actor' and i == FLAGS.task is_learner = FLAGS.job_name == 'learner' # Placing the variable on CPU, makes it cheaper to send it to all the # actors. Continual copying the variables from the GPU is slow. global_variable_device = shared_job_device + '/cpu' cluster = tf.train.ClusterSpec({ 'actor': ['localhost:%d' % (8001 + i) for i in range(FLAGS.num_actors)], 'learner': ['localhost:8000'] }) server = tf.train.Server(cluster, job_name=FLAGS.job_name, task_index=FLAGS.task) filters = [shared_job_device, local_job_device] # Only used to find the actor output structure. with tf.Graph().as_default(): agent = Agent(len(action_set)) env = create_environment(level_names[0], seed=1) structure = build_actor(agent, env, level_names[0], action_set) flattened_structure = nest.flatten(structure) dtypes = [t.dtype for t in flattened_structure] shapes = [t.shape.as_list() for t in flattened_structure] with tf.Graph().as_default(), \ tf.device(local_job_device + '/cpu'), \ pin_global_variables(global_variable_device): tf.set_random_seed(FLAGS.seed) # Makes initialization deterministic. # Create Queue and Agent on the learner. with tf.device(shared_job_device): queue = tf.FIFOQueue(1, dtypes, shapes, shared_name='buffer') agent = Agent(len(action_set)) if is_single_machine() and 'dynamic_batching' in sys.modules: # For single machine training, we use dynamic batching for improved GPU # utilization. The semantics of single machine training are slightly # different from the distributed setting because within a single unroll # of an environment, the actions may be computed using different weights # if an update happens within the unroll. old_build = agent._build @dynamic_batching.batch_fn def build(*args): with tf.device('/gpu'): return old_build(*args) tf.logging.info('Using dynamic batching.') agent._build = build # Build actors and ops to enqueue their output. enqueue_ops = [] for i in range(FLAGS.num_actors): if is_actor_fn(i): level_name = level_names[i % len(level_names)] tf.logging.info('Creating actor %d with level %s', i, level_name) env = create_environment(level_name, seed=i + 1) actor_output = build_actor(agent, env, level_name, action_set) with tf.device(shared_job_device): enqueue_ops.append(queue.enqueue(nest.flatten(actor_output))) # If running in a single machine setup, run actors with QueueRunners # (separate threads). if is_learner and enqueue_ops: tf.train.add_queue_runner(tf.train.QueueRunner(queue, enqueue_ops)) # Build learner. if is_learner: # Create global step, which is the number of environment frames processed. tf.get_variable( 'num_environment_frames', initializer=tf.zeros_initializer(), shape=[], dtype=tf.int64, trainable=False, collections=[tf.GraphKeys.GLOBAL_STEP, tf.GraphKeys.GLOBAL_VARIABLES]) # Create batch (time major) and recreate structure. 
dequeued = queue.dequeue_many(FLAGS.batch_size) dequeued = nest.pack_sequence_as(structure, dequeued) def make_time_major(s): return nest.map_structure( lambda t: tf.transpose(t, [1, 0] + list(range(t.shape.ndims))[2:]), s) dequeued = dequeued._replace( env_outputs=make_time_major(dequeued.env_outputs), agent_outputs=make_time_major(dequeued.agent_outputs)) with tf.device('/gpu'): # Using StagingArea allows us to prepare the next batch and send it to # the GPU while we're performing a training step. This adds up to 1 step # policy lag. flattened_output = nest.flatten(dequeued) area = tf.contrib.staging.StagingArea( [t.dtype for t in flattened_output], [t.shape for t in flattened_output]) stage_op = area.put(flattened_output) data_from_actors = nest.pack_sequence_as(structure, area.get()) # Unroll agent on sequence, create losses and update ops. output = build_learner(agent, data_from_actors.agent_state, data_from_actors.env_outputs, data_from_actors.agent_outputs) # Create MonitoredSession (to run the graph, checkpoint and log). tf.logging.info('Creating MonitoredSession, is_chief %s', is_learner) config = tf.ConfigProto(allow_soft_placement=True, device_filters=filters) with tf.train.MonitoredTrainingSession( server.target, is_chief=is_learner, checkpoint_dir=FLAGS.logdir, save_checkpoint_secs=600, save_summaries_secs=30, log_step_count_steps=50000, config=config, hooks=[py_process.PyProcessHook()]) as session: if is_learner: # Logging. level_returns = {level_name: [] for level_name in level_names} summary_writer = tf.summary.FileWriterCache.get(FLAGS.logdir) # Prepare data for first run. session.run_step_fn( lambda step_context: step_context.session.run(stage_op)) # Execute learning and track performance. num_env_frames_v = 0 while num_env_frames_v < FLAGS.total_environment_frames: level_names_v, done_v, infos_v, num_env_frames_v, _ = session.run( (data_from_actors.level_name,) + output + (stage_op,)) level_names_v = np.repeat([level_names_v], done_v.shape[0], 0) for level_name, episode_return, episode_step in zip( level_names_v[done_v], infos_v.episode_return[done_v], infos_v.episode_step[done_v]): episode_frames = episode_step * FLAGS.num_action_repeats tf.logging.info('Level: %s Episode return: %f', level_name, episode_return) summary = tf.summary.Summary() summary.value.add(tag=level_name + '/episode_return', simple_value=episode_return) summary.value.add(tag=level_name + '/episode_frames', simple_value=episode_frames) summary_writer.add_summary(summary, num_env_frames_v) if FLAGS.level_name == 'dmlab30': level_returns[level_name].append(episode_return) print("(experiment.py) level_returns: ", level_returns) if (FLAGS.level_name == 'dmlab30' and min(map(len, level_returns.values())) >= 1): no_cap = dmlab30.compute_human_normalized_score(level_returns, per_level_cap=None) # print("(experiment) No cap: ", no_cap) cap_100 = dmlab30.compute_human_normalized_score(level_returns, per_level_cap=100) with open("normalized_scores.txt", "a+") as f: f.write("num env frames: %d\n" % num_env_frames_v) f.write("no cap: %f\n" % no_cap) f.write("cap 100: %f\n" % cap_100) summary = tf.summary.Summary() summary.value.add( tag='dmlab30/training_no_cap', simple_value=no_cap) summary.value.add( tag='dmlab30/training_cap_100', simple_value=cap_100) summary_writer.add_summary(summary, num_env_frames_v) # Clear level scores. level_returns = {level_name: [] for level_name in level_names} else: # Execute actors (they just need to enqueue their output). while True: session.run(enqueue_ops)
def train(action_set, level_names): """Train.""" local_job_device = '/job:%s/task:%d' % (FLAGS.job_name, FLAGS.task) shared_job_device = '/job:learner/task:0' is_actor_fn = lambda i: FLAGS.job_name == 'actor' and i == FLAGS.task is_learner = FLAGS.job_name == 'learner' actor_hosts = FLAGS.actor_hosts.split(',') num_actors = len(actor_hosts) learner_host = FLAGS.learner_host.split(',') assert (len(learner_host) == 1) if is_learner: assert (FLAGS.task == 0) assert (has_horovod == True) hvd.init() # Placing the variable on CPU, makes it cheaper to send it to all the # actors. Continual copying the variables from the GPU is slow. global_variable_device = '/job:learner/task:0' + '/cpu' filters = [shared_job_device, local_job_device] cluster = tf.train.ClusterSpec({ 'actor': actor_hosts, 'learner': learner_host }) config = tf.ConfigProto(allow_soft_placement=True, device_filters=filters) if is_learner: config.gpu_options.allow_growth = True config.gpu_options.visible_device_list = str(hvd.local_rank()) server = tf.train.Server(cluster, job_name=FLAGS.job_name, task_index=FLAGS.task, config=config) # Only used to find the actor output structure. Agent = agent_factory(FLAGS.agent_name) with tf.Graph().as_default(): agent = Agent(len(action_set)) env = create_environment(level_names[0], seed=1) structure = build_actor(agent, env, level_names[0], action_set) flattened_structure = nest.flatten(structure) dtypes = [t.dtype for t in flattened_structure] shapes = [t.shape.as_list() for t in flattened_structure] # build graph for actor or learner with tf.Graph().as_default(), \ tf.device(local_job_device + '/cpu'), \ pin_global_variables(global_variable_device): tf.set_random_seed(FLAGS.seed) # Makes initialization deterministic. # Create Queue and Agent on the learner. with tf.device(shared_job_device): queue = tf.FIFOQueue(1, dtypes, shapes, shared_name='buffer') agent = Agent(len(action_set)) # Build actors and ops to enqueue their output. enqueue_ops = [] for i in range(num_actors): if is_actor_fn(i): level_name = level_names[i % len(level_names)] tf.logging.info('Creating actor %d with level %s', i, level_name) env = create_environment(level_name, seed=i + 1) actor_output = build_actor(agent, env, level_name, action_set) with tf.device(shared_job_device): enqueue_ops.append( queue.enqueue(nest.flatten(actor_output))) # Build learner. if is_learner: # Create global step, which is the number of environment frames # processed. g_step = tf.get_variable('num_environment_frames', initializer=tf.zeros_initializer(), shape=[], dtype=tf.int64, trainable=False, collections=[ tf.GraphKeys.GLOBAL_STEP, tf.GraphKeys.GLOBAL_VARIABLES ]) # Create batch (time major) and recreate structure. dequeued = queue.dequeue_many(FLAGS.batch_size) dequeued = nest.pack_sequence_as(structure, dequeued) def make_time_major(s): return nest.map_structure( lambda t: tf.transpose(t, [1, 0] + list( range(t.shape.ndims))[2:]), s) dequeued = dequeued._replace( env_outputs=make_time_major(dequeued.env_outputs), agent_outputs=make_time_major(dequeued.agent_outputs)) with tf.device("/gpu"): # Using StagingArea allows us to prepare the next batch and send it to # the GPU while we're performing a training step. This adds up to 1 # step policy lag. 
flattened_output = nest.flatten(dequeued) area = tf.contrib.staging.StagingArea( [t.dtype for t in flattened_output], [t.shape for t in flattened_output]) stage_op = area.put(flattened_output) data_from_actors = nest.pack_sequence_as(structure, area.get()) # Unroll agent on sequence, create losses and update ops. if hasattr(data_from_actors, 'agent_state'): agent_state = data_from_actors.agent_state else: agent_state = agent.initial_state(1) output, optimizer = build_learner( agent, agent_state=agent_state, env_outputs=data_from_actors.env_outputs, agent_outputs=data_from_actors.agent_outputs, g_step=g_step) # Create MonitoredSession (to run the graph, checkpoint and log). is_chief = is_learner # MonitoredTrainingSession inits all global variables hooks = [py_process.PyProcessHook()] if is_learner: # for variable initialization across learners hooks.append(hvd.BroadcastGlobalVariablesHook(0)) tf.logging.info('Creating MonitoredSession, is_chief %s', is_chief) if is_learner: tf.logging.info('At rank %d', hvd.rank()) # rank 0 takes care of ckpt saving checkpoint_dir = FLAGS.logdir if is_learner and hvd.rank( ) == 0 else None with tf.train.MonitoredTrainingSession(server.target, is_chief=is_chief, checkpoint_dir=checkpoint_dir, save_checkpoint_secs=600, save_summaries_secs=30, log_step_count_steps=50000, config=config, hooks=hooks) as session: if is_learner: # tb Logging summary_writer = (tf.summary.FileWriterCache.get(FLAGS.logdir) if hvd.rank() == 0 else None) # Prepare data for first run. session.run_step_fn( lambda step_context: step_context.session.run(stage_op)) # Execute learning and track performance. num_env_frames_v = 0 while num_env_frames_v < FLAGS.total_environment_frames: level_names_v, done_v, infos_v, num_env_frames_v, _ = session.run( (data_from_actors.level_name, ) + output + (stage_op, )) level_names_v = np.repeat([level_names_v], done_v.shape[0], 0) for level_name, episode_return, episode_step in zip( level_names_v[done_v], infos_v.episode_return[done_v], infos_v.episode_step[done_v]): episode_frames = episode_step tf.logging.info( 'learner rank: %d, Env: %s Episode return: %f', hvd.rank(), level_name, episode_return) if hvd.rank() == 0: # tb Logging summary = tf.summary.Summary() summary.value.add(tag=level_name + '/episode_return', simple_value=episode_return) summary.value.add(tag=level_name + '/episode_frames', simple_value=episode_frames) summary_writer.add_summary(summary, num_env_frames_v) else: # Execute actors (they just need to enqueue their output). while True: session.run(enqueue_ops)
def train(game_name): action_size, state_size = find_size(game_name) """Train.""" if is_single_machine(): local_job_device = '' shared_job_device = '' is_actor_fn = lambda i: True is_learner = True global_variable_device = '/gpu' server = tf.train.Server.create_local_server() filters = [] else: pass # Only used to find the actor output structure. with tf.Graph().as_default(): agent = Agent(action_size) env = create_environment(game_name, state_size) structure = build_actor(agent, env, game_name, action_size) flattened_structure = nest.flatten(structure) dtypes = [t.dtype for t in flattened_structure] shapes = [t.shape.as_list() for t in flattened_structure] with tf.Graph().as_default(), \ tf.device(local_job_device + '/cpu'), \ pin_global_variables(global_variable_device): tf.set_random_seed(FLAGS.seed) # Makes initialization deterministic. with tf.device(shared_job_device): agent = Agent(action_size) tf.logging.info('Creating actor with game %s', game_name) env = create_environment(game_name, state_size) actor_output = build_actor(agent, env, game_name, action_size) # Create global step, which is the number of environment frames processed. tf.get_variable('num_environment_frames', initializer=tf.zeros_initializer(), shape=[], dtype=tf.int64, trainable=False, collections=[ tf.GraphKeys.GLOBAL_STEP, tf.GraphKeys.GLOBAL_VARIABLES ]) actor_output = nest.map_structure(lambda t: tf.expand_dims(t, 0), actor_output) def make_time_major(s): return nest.map_structure( lambda t: tf.transpose(t, [1, 0] + list(range(t.shape.ndims))[ 2:]), s) actor_output = actor_output._replace( env_outputs=make_time_major(actor_output.env_outputs), agent_outputs=make_time_major(actor_output.agent_outputs)) with tf.device('/gpu'): # Using StagingArea allows us to prepare the next batch and send it to # the GPU while we're performing a training step. This adds up to 1 step # policy lag. flattened_output = nest.flatten(actor_output) area = tf.contrib.staging.StagingArea( [t.dtype for t in flattened_output], [t.shape for t in flattened_output]) stage_op = area.put(flattened_output) data_from_actors = nest.pack_sequence_as(structure, area.get()) output = build_learner(agent, data_from_actors.env_outputs, data_from_actors.agent_outputs) # Create MonitoredSession (to run the graph, checkpoint and log). tf.logging.info('Creating MonitoredSession, is_chief %s', is_learner) config = tf.ConfigProto(allow_soft_placement=True, device_filters=filters) with tf.train.MonitoredTrainingSession( server.target, is_chief=is_learner, checkpoint_dir=FLAGS.logdir, save_checkpoint_secs=600, save_summaries_secs=30, log_step_count_steps=50000, config=config, hooks=[py_process.PyProcessHook()]) as session: # Logging. level_returns = {game_name: []} summary_writer = tf.summary.FileWriterCache.get(FLAGS.logdir) # Prepare data for first run. session.run_step_fn( lambda step_context: step_context.session.run(stage_op)) # Execute learning and track performance. 
num_env_frames_v = 0 while num_env_frames_v < FLAGS.total_environment_frames: level_names_v, done_v, infos_v, num_env_frames_v, _ = session.run( (actor_output.level_name, ) + output + (stage_op, )) level_names_v = np.repeat([level_names_v], done_v.shape[0], 0) for level_name, episode_return, episode_step in zip( level_names_v[done_v], infos_v.episode_return[done_v], infos_v.episode_step[done_v]): level_name = level_name.decode() episode_frames = episode_step tf.logging.info('Level: %s Episode return: %f', level_name, episode_return) #tf.logging.info('Level: %s Episode frames: %f', # level_name, episode_frames) summary = tf.summary.Summary() summary.value.add(tag=level_name + '/episode_return', simple_value=episode_return) summary.value.add(tag=level_name + '/episode_frames', simple_value=episode_frames) summary_writer.add_summary(summary, num_env_frames_v) level_returns[level_name].append(episode_return)
def train(act_space): local_job_device = '/job:%s/task:%d' % (FLAGS.job_name, FLAGS.task) shared_job_device = '/job:learner/task:0' is_worker_fn = lambda i: FLAGS.job_name == 'worker' and i == FLAGS.task is_learner = FLAGS.job_name == 'learner' # Placing the variable on CPU, makes it cheaper to send it to all the # workers. Continual copying the variables from the GPU is slow. global_variable_device = shared_job_device + '/cpu' cluster = tf.train.ClusterSpec({ 'worker': ['localhost:%d' % (8001 + i) for i in range(FLAGS.num_workers)], 'learner': ['localhost:8000'] }) server = tf.train.Server(cluster, job_name=FLAGS.job_name, task_index=FLAGS.task) filters = [shared_job_device, local_job_device] # Only used to find the worker output structure. with tf.Graph().as_default(): structure = build_worker(FLAGS.datadir, "*.seg") flattened_structure = nest.flatten(structure) dtypes = [t.dtype for t in flattened_structure] shapes = [t.shape.as_list() for t in flattened_structure] with tf.Graph().as_default(), \ tf.device(local_job_device + '/cpu'), \ pin_global_variables(global_variable_device): tf.set_random_seed(FLAGS.seed) # Makes initialization deterministic. # Create Queue and Agent on the learner. with tf.device(shared_job_device): queue = tf.FIFOQueue(2 * FLAGS.batch_size, dtypes, shapes, shared_name='buffer') model = Model(act_space, FLAGS.frames, FLAGS.vf_clip) # Build workers and ops to enqueue their output. enqueue_ops = [] for i in range(FLAGS.num_workers): if is_worker_fn(i): tf.logging.info('Creating worker %d', i) pattern = "*_%s_*.seg" % ((4 - len(str(i))) * "0" + str(i)) worker_output = build_worker(FLAGS.datadir, pattern) with tf.device(shared_job_device): enqueue_ops.append( queue.enqueue(nest.flatten(worker_output))) # Build learner. if is_learner: # Create global step, which is the number of environment frames processed. num_frames = tf.get_variable('num_environment_frames', initializer=tf.zeros_initializer(), shape=[], dtype=tf.int64, trainable=False, collections=[ tf.GraphKeys.GLOBAL_STEP, tf.GraphKeys.GLOBAL_VARIABLES ]) # Create batch (time major) and recreate structure. dequeued = queue.dequeue_many(FLAGS.batch_size) dequeued = nest.pack_sequence_as(structure, dequeued) with tf.device('/gpu'): # Using StagingArea allows us to prepare the next batch and send it to # the GPU while we're performing a training step. This adds up to 1 step # policy lag. flattened_output = nest.flatten(dequeued) area = tf.contrib.staging.StagingArea( [t.dtype for t in flattened_output], [t.shape for t in flattened_output]) stage_op = area.put(flattened_output) data_from_workers = nest.pack_sequence_as( structure, area.get()) # Unroll agent on sequence, create losses and update ops. output = build_learner(data_from_workers, act_space, num_frames) # Create MonitoredSession (to run the graph, checkpoint and log). tf.logging.info('Creating MonitoredSession, is_chief %s', is_learner) config = tf.ConfigProto(allow_soft_placement=True, device_filters=filters) with tf.train.MonitoredTrainingSession( server.target, is_chief=is_learner, checkpoint_dir=FLAGS.logdir, save_checkpoint_secs=600, save_summaries_secs=30, log_step_count_steps=50000, config=config, hooks=[py_process.PyProcessHook()]) as session: if is_learner: # Logging. # level_returns = {level_name: [] for level_name in level_names} # summary_writer = tf.summary.FileWriterCache.get(FLAGS.logdir) # Prepare data for first run. session.run_step_fn( lambda step_context: step_context.session.run(stage_op)) # Execute learning and track performance. 
num_env_frames_v = 0 while num_env_frames_v < FLAGS.total_environment_frames: num_env_frames_v, _ = session.run([output, stage_op]) # level_names_v = np.repeat([level_names_v], done_v.shape[0], 0) # # for level_name, episode_return, episode_step in zip( # level_names_v[done_v], # infos_v.episode_return[done_v], # infos_v.episode_step[done_v]): # episode_frames = episode_step * FLAGS.num_action_repeats # # tf.logging.info('Level: %s Episode return: %f', # level_name, episode_return) # # summary = tf.summary.Summary() # summary.value.add(tag=level_name + '/episode_return', # simple_value=episode_return) # summary.value.add(tag=level_name + '/episode_frames', # simple_value=episode_frames) # summary_writer.add_summary(summary, num_env_frames_v) # # if FLAGS.level_name == 'dmlab30': # level_returns[level_name].append(episode_return) # # if (FLAGS.level_name == 'dmlab30' and # min(map(len, level_returns.values())) >= 1): # no_cap = dmlab30.compute_human_normalized_score(level_returns, # per_level_cap=None) # cap_100 = dmlab30.compute_human_normalized_score(level_returns, # per_level_cap=100) # summary = tf.summary.Summary() # summary.value.add( # tag='dmlab30/training_no_cap', simple_value=no_cap) # summary.value.add( # tag='dmlab30/training_cap_100', simple_value=cap_100) # summary_writer.add_summary(summary, num_env_frames_v) # # # Clear level scores. # level_returns = {level_name: [] for level_name in level_names} else: # Execute workers (they just need to enqueue their output). while True: session.run(enqueue_ops)
def train(): """Train.""" if is_single_machine(): tf.logging.info("Running on single machine") local_job_device = '' shared_job_device = '' is_actor_fn = lambda i: True is_learner = True global_variable_device = '/gpu' server = tf.train.Server.create_local_server() filters = [] else: local_job_device = '/job:%s/task:%d' % (FLAGS.job_name, FLAGS.task) shared_job_device = '/job:learner/task:0' is_actor_fn = lambda i: FLAGS.job_name == 'actor' and i == FLAGS.task is_learner = FLAGS.job_name == 'learner' # Placing the variable on CPU, makes it cheaper to send it to all the # actors. Continual copying the variables from the GPU is slow. global_variable_device = shared_job_device + '/cpu' # Represents a cluster as a set of # "tasks", organized into "jobs". # A tf.train.ClusterSpec represents the set of # processes that participate in a distributed # TensorFlow computation. Every tf.train.Server # is constructed in a particular cluster. cluster = tf.train.ClusterSpec({ 'actor': ['localhost:%d' % (8001 + i) for i in range(FLAGS.num_actors)], 'learner': ['localhost:8000'] }) server = tf.train.Server(cluster, job_name=FLAGS.job_name, task_index=FLAGS.task) filters = [shared_job_device, local_job_device] # Only used to find the actor output structure. with tf.Graph().as_default(): agent = Agent(feature_num=FLAGS.feature_num, asset_num=FLAGS.asset_num, window_size=FLAGS.window_size, commission=FLAGS.commission) env = create_environment(DEFAULT_CONFIG) structure = build_actor(agent=agent, env=env, FLAGS=FLAGS) flattened_structure = nest.flatten(structure) dtypes = [t.dtype for t in flattened_structure] shapes = [t.shape.as_list() for t in flattened_structure] with tf.Graph().as_default(), \ tf.device(local_job_device + '/cpu'), \ pin_global_variables(global_variable_device): tf.set_random_seed(FLAGS.seed) # Makes initialization deterministic. # BUILD QUEUE # ===========================================================================> with tf.device(shared_job_device): # A queue implementation that dequeues # elements in first-in first-out order. # Creates a queue that dequeues elements # in a first-in first-out order. # A FIFOQueue has bounded capacity; supports # multiple concurrent producers and consumers; # and provides exactly-once delivery. # A FIFOQueue holds a list of up to capacity # elements. Each element is a fixed-length # tuple of tensors whose dtypes are described # by dtypes, and whose shapes are optionally # described by the shapes argument. # If the shapes argument is specified, each # component of a queue element must have the # respective fixed shape. If it is unspecified, # different queue elements may have different # shapes, but the use of dequeue_many is disallowed. queue = tf.FIFOQueue(capacity=100, dtypes=dtypes, shapes=shapes, shared_name='buffer') agent = Agent(feature_num=FLAGS.feature_num, asset_num=FLAGS.asset_num, window_size=FLAGS.window_size, commission=FLAGS.commission) if is_single_machine() and 'dynamic_batching' in sys.modules: # For single machine training, we use dynamic batching for improved GPU # utilization. The semantics of single machine training are slightly # different from the distributed setting because within a single unroll # of an environment, the actions may be computed using different weights # if an update happens within the unroll. 
old_build = agent._build @dynamic_batching.batch_fn def build(*args): with tf.device('/gpu'): return old_build(*args) tf.logging.info('Using dynamic batching.') agent._build = build # BUILD ACTORS # ===========================================================================> # Todo make better for real time environment # Build actors and ops to enqueue their output. enqueue_ops = [] for i in range(FLAGS.num_actors): # TODO change to env configurations if is_actor_fn(i): tf.logging.info('Creating actor with config') env = create_environment(DEFAULT_CONFIG) actor_output = build_actor(agent=agent, env=env, FLAGS=FLAGS) # Append the actor outputs to the # FIFOQueue above in order to pass # the environment outputs and action # outputs processed later with tf.device(shared_job_device): enqueue_ops.append( queue.enqueue(nest.flatten(actor_output))) # ADD QUEUE RUNNER # ===========================================================================> # If running in a single machine setup, run actors with QueueRunners # (separate threads). if is_learner and enqueue_ops: # Holds a list of enqueue operations for a queue, each to be run in a thread. # Queues are a convenient TensorFlow mechanism to compute tensors asynchronously # using multiple threads. For example in the canonical 'Input Reader' setup one # set of threads generates filenames in a queue; a second set of threads read # records from the files, processes them, and enqueues tensors on a second queue; # a third set of threads dequeues these input records to construct batches and # runs them through training operations. # There are several delicate issues when running multiple threads that way: # closing the queues in sequence as the input is exhausted, correctly catching # and reporting exceptions, etc. # The QueueRunner, combined with the Coordinator, helps handle these issues. tf.train.add_queue_runner(tf.train.QueueRunner(queue, enqueue_ops)) # BUILD LEARNER # ===========================================================================> if is_learner: # Create global step, which is the number # of environment frames processed. tf.get_variable('num_environment_frames', initializer=tf.zeros_initializer(), shape=[], dtype=tf.int64, trainable=False, collections=[ tf.GraphKeys.GLOBAL_STEP, tf.GraphKeys.GLOBAL_VARIABLES ]) # Create batch (time major) and recreate structure. dequeued = queue.dequeue_many(FLAGS.batch_size) dequeued = nest.pack_sequence_as(structure, dequeued) def make_time_major(s): return nest.map_structure( lambda t: tf.transpose(t, [1, 0] + list( range(t.shape.ndims))[2:]), s) # Make dequeued time major dequeued = dequeued._replace( env_outputs=make_time_major(dequeued.env_outputs), agent_outputs=make_time_major(dequeued.agent_outputs)) with tf.device('/gpu'): flattened_output = nest.flatten(dequeued) # Using StagingArea allows us to prepare the next batch and send it to # the GPU while we're performing a training step. This adds up to 1 step # policy lag. # Class for staging inputs. No ordering guarantees. # A StagingArea is a TensorFlow data structure that # stores tensors across multiple steps, and exposes # operations that can put and get tensors. # Each StagingArea element is a tuple of one or more # tensors, where each tuple component has a static # dtype, and may have a static shape. # The capacity of a StagingArea may be bounded or # unbounded. It supports multiple concurrent producers # and consumers; and provides exactly-once delivery. 
# Each element of a StagingArea is a fixed-length tuple # of tensors whose dtypes are described by dtypes, and # whose shapes are optionally described by the shapes # argument. # If the shapes argument is specified, each component # of a staging area element must have the respective # fixed shape. If it is unspecified, different elements # may have different shapes, # It can be configured with a capacity in which case # put(values) will block until space becomes available. area = tf.contrib.staging.StagingArea( [t.dtype for t in flattened_output], [t.shape for t in flattened_output]) # Operation to add flattened output from # dequeued env outputs with their respective # agent outputs stage_op = area.put(flattened_output) # In this instance structure refers to # the output from build actor above data_from_actors = nest.pack_sequence_as(structure, area.get()) # Unroll agent on sequence, # create losses and update ops. output = build_learner( agent=agent, env_outputs=data_from_actors.env_outputs, agent_outputs=data_from_actors.agent_outputs, FLAGS=FLAGS) # Create MonitoredSession (to run the graph, checkpoint and log). tf.logging.info('Creating MonitoredSession, is_chief %s', is_learner) config = tf.ConfigProto(allow_soft_placement=True, device_filters=filters) # RUN GRAPH # ===========================================================================> #Creates a MonitoredSession for training. # For a chief, this utility sets proper session # initializer/restorer. It also creates hooks # related to checkpoint and summary saving. # For workers, this utility sets proper session # creator which waits for the chief to # initialize/restore. Please check # tf.train.MonitoredSession for more information. with tf.train.MonitoredTrainingSession( server.target, is_chief=is_learner, checkpoint_dir=FLAGS.logdir, save_checkpoint_secs=600, save_summaries_secs=30, log_step_count_steps=50000, config=config, hooks=[py_process.PyProcessHook()]) as session: tf.logging.info('Commencing training run') # If the agent is a learner if is_learner: # Logging. summary_writer = tf.summary.FileWriterCache.get(FLAGS.logdir) # Prepare data for first run. tf.logging.info('Preparing data for first run') session.run_step_fn(lambda step_context: step_context.session. run(actor_output)) # Execute learning and track performance. num_env_frames_v = 0 # # =================================================================> while num_env_frames_v < FLAGS.total_environment_frames: done_v, infos_v, num_env_frames_v, _ = session.run( output + (stage_op, )) # TODO add logging and metric storage else: # Execute actors (they just need to enqueue their output). tf.logging.info('Running enqueue ops') while True: session.run(enqueue_ops)
def train(action_set, level_names): """Train.""" if is_single_machine(): local_job_device = '' shared_job_device = '' is_actor_fn = lambda i: True is_learner = True global_variable_device = '/gpu' server = tf.train.Server.create_local_server() filters = [] else: local_job_device = '/job:%s/task:%d' % (FLAGS.job_name, FLAGS.task) shared_job_device = '/job:learner/task:0' is_actor_fn = lambda i: FLAGS.job_name == 'actor' and i == FLAGS.task is_learner = FLAGS.job_name == 'learner' # Placing the variable on CPU, makes it cheaper to send it to all the # actors. Continual copying the variables from the GPU is slow. global_variable_device = shared_job_device + '/cpu' cluster = tf.train.ClusterSpec({ 'actor': ['localhost:%d' % (8001 + i) for i in range(FLAGS.num_actors)], 'learner': ['localhost:8000'] }) server = tf.train.Server(cluster, job_name=FLAGS.job_name, task_index=FLAGS.task) filters = [shared_job_device, local_job_device] # Only used to find the actor output structure. Agent = agent_factory(FLAGS.agent_name) with tf.Graph().as_default(): specific_atari_game = level_names[0] env = create_atari_environment(specific_atari_game, seed=1) agent = Agent(len(action_set)) structure = build_actor(agent, env, specific_atari_game, action_set) flattened_structure = nest.flatten(structure) dtypes = [t.dtype for t in flattened_structure] shapes = [t.shape.as_list() for t in flattened_structure] with tf.Graph().as_default(), \ tf.device(local_job_device + '/cpu'), \ pin_global_variables(global_variable_device): tf.set_random_seed(FLAGS.seed) # Makes initialization deterministic. # Create Queue and Agent on the learner. with tf.device(shared_job_device): queue = tf.FIFOQueue(1, dtypes, shapes, shared_name='buffer') agent = Agent(len(action_set)) if is_single_machine() and 'dynamic_batching' in sys.modules: # For single machine training, we use dynamic batching for improved GPU # utilization. The semantics of single machine training are slightly # different from the distributed setting because within a single unroll # of an environment, the actions may be computed using different weights # if an update happens within the unroll. old_build = agent._build @dynamic_batching.batch_fn def build(*args): with tf.device('/gpu'): return old_build(*args) tf.logging.info('Using dynamic batching.') agent._build = build # Build actors and ops to enqueue their output. enqueue_ops = [] for i in range(FLAGS.num_actors): if is_actor_fn(i): level_name = level_names[i % len(level_names)] tf.logging.info('Creating actor %d with level %s', i, level_name) env = create_atari_environment(level_name, seed=i + 1) actor_output = build_actor(agent, env, level_name, action_set) with tf.device(shared_job_device): enqueue_ops.append( queue.enqueue(nest.flatten(actor_output))) # If running in a single machine setup, run actors with QueueRunners # (separate threads). if is_learner and enqueue_ops: tf.train.add_queue_runner(tf.train.QueueRunner(queue, enqueue_ops)) # Build learner. if is_learner: # Create global step, which is the number of environment frames processed. global_step = tf.get_variable('num_environment_frames', initializer=tf.zeros_initializer(), shape=[], dtype=tf.int64, trainable=False, collections=[ tf.GraphKeys.GLOBAL_STEP, tf.GraphKeys.GLOBAL_VARIABLES ]) # Create batch (time major) and recreate structure. 
dequeued = queue.dequeue_many(FLAGS.batch_size) dequeued = nest.pack_sequence_as(structure, dequeued) def make_time_major(s): return nest.map_structure( lambda t: tf.transpose(t, [1, 0] + list( range(t.shape.ndims))[2:]), s) dequeued = dequeued._replace( env_outputs=make_time_major(dequeued.env_outputs), agent_outputs=make_time_major(dequeued.agent_outputs)) with tf.device('/gpu'): # Using StagingArea allows us to prepare the next batch and send it to # the GPU while we're performing a training step. This adds up to 1 step # policy lag. flattened_output = nest.flatten(dequeued) area = tf.contrib.staging.StagingArea( [t.dtype for t in flattened_output], [t.shape for t in flattened_output]) stage_op = area.put(flattened_output) # Returns an ActorOutput tuple -> (level name, agent_state, env_outputs, agent_output) data_from_actors = nest.pack_sequence_as(structure, area.get()) # levels_index = tf.map_fn(lambda y: tf.py_function(lambda x: game_id[x.numpy()], [y], Tout=tf.int32), data_from_actors.level_name, dtype=tf.int32, parallel_iterations=56) # levels_index = tf.reshape(levels_index, [FLAGS.batch_size]) levels_index = data_from_actors.level_id # Unroll agent on sequence, create losses and update ops. output = build_learner(agent, data_from_actors.env_outputs, data_from_actors.agent_outputs, global_step=global_step, levels_index=levels_index) # Create MonitoredSession (to run the graph, checkpoint and log). tf.logging.info('Creating MonitoredSession, is_chief %s', is_learner) gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.3) config = tf.ConfigProto(allow_soft_placement=True, device_filters=filters, gpu_options=gpu_options) # config.gpu_options.allow_growth = True # config.gpu_options.per_process_gpu_memory_fraction = 0.8 logdir = FLAGS.logdir with tf.train.MonitoredTrainingSession( server.target, is_chief=is_learner, checkpoint_dir=logdir, save_checkpoint_secs=600, save_summaries_secs=30, log_step_count_steps=50000, config=config, hooks=[py_process.PyProcessHook()]) as session: if is_learner: # Logging. level_returns = {level_name: [] for level_name in level_names} summary_dir = os.path.join(FLAGS.logdir, "logging") summary_writer = tf.summary.FileWriterCache.get(summary_dir) # Prepare data for first run. session.run_step_fn( lambda step_context: step_context.session.run(stage_op)) # Execute learning and track performance. num_env_frames_v = 0 # Uncomment these lines to print the number of parameters. 
# print("total params:", np.sum([np.prod(v.get_shape().as_list()) for v in tf.trainable_variables()])) # vas = tf.trainable_variables() # for elem in vas: # print(elem) # print("Params: ", [v.get_shape().as_list() for v in tf.trainable_variables()]) while num_env_frames_v < FLAGS.total_environment_frames: level_names_v, done_v, infos_v, num_env_frames_v, _ = session.run( (data_from_actors.level_name, ) + output + (stage_op, )) level_names_v = np.repeat([level_names_v], done_v.shape[0], 0) for level_name, episode_return, episode_step, acc_episode_reward, acc_episode_step in zip( level_names_v[done_v], infos_v.episode_return[done_v], infos_v.episode_step[done_v], infos_v.acc_episode_reward[done_v], infos_v.acc_episode_step[done_v]): episode_frames = episode_step * FLAGS.num_action_repeats tf.logging.info( 'Level: %s Episode return: %f Acc return %f after %d frames', level_name, episode_return, acc_episode_reward, num_env_frames_v) summary = tf.summary.Summary() summary.value.add(tag=level_name + '/episode_return', simple_value=episode_return) summary.value.add(tag=level_name + '/episode_frames', simple_value=episode_frames) summary.value.add(tag=level_name + '/acc_episode_return', simple_value=acc_episode_reward) summary.value.add(tag=level_name + '/acc_episode_frames', simple_value=acc_episode_step) summary_writer.add_summary(summary, num_env_frames_v) level_returns[level_name].append(episode_return) current_episode_return_list = min( map(len, level_returns.values())) if FLAGS.multi_task == 1 and current_episode_return_list >= 1: def sum_none(list_): if list_: return sum(list_) else: return None level_returns = { level_name: sum_none(level_returns[level_name]) for level_name in level_names } no_cap = atari_utils.compute_human_normalized_score( level_returns, per_level_cap=None) cap_100 = atari_utils.compute_human_normalized_score( level_returns, per_level_cap=100) summary = tf.summary.Summary() summary.value.add(tag=(level_name + '/training_no_cap'), simple_value=no_cap) summary.value.add(tag=(level_name + '/training_cap_100'), simple_value=cap_100) level_returns = { level_name: [] for level_name in level_names } else: # Execute actors (they just need to enqueue their output). while True: session.run(enqueue_ops)
def test(action_set): """Test.""" with tf.Graph().as_default(): # Get EnvironmentFactory env_sampler = env_factory.EnvironmentFactory( FLAGS.recipes_path, FLAGS.hints_path, max_steps=FLAGS.max_steps, reuse_environments=FLAGS.reuse_environments, seed=1, visualise=True) dummy_env = env_sampler.sample_environment() obs_spec = dummy_env.obs_specs() task_names = sorted(env_sampler.task_names) dummy_env.render_matplotlib() agent = Agent(len(action_set), obs_spec) outputs = {} task_returns = collections.defaultdict(list) # Test on all environments one after another for task_name in task_names: env = create_environment( env_sampler, initial_task_name=task_name, seed=1) outputs[task_name] = build_actor(agent, env, task_name, action_set) with tf.train.SingularMonitoredSession( checkpoint_dir=FLAGS.logdir, hooks=[py_process.PyProcessHook()]) as session: for task_name in task_names: tf.logging.info('Testing task: %s', task_name) returns = task_returns[task_name] while len(returns) < FLAGS.test_num_episodes: rewards_v, done_v, observations_v = session.run( (outputs[task_name].env_outputs.reward, outputs[task_name].env_outputs.done, outputs[task_name].env_outputs.observation)) # Repack the environment outputs rewards_v = rewards_v[1:] done_v = done_v[1:] observations_dict = { obs_name: observations_v[obs_i][1:] for obs_i, obs_name in enumerate(obs_spec.keys()) } # Check the performance episode_returns = rewards_v[done_v] returns.extend(episode_returns) # Visualise render num_episodes_seen = 0 for frame_i, frame in enumerate(observations_dict['image'][:30]): if rewards_v[frame_i]: rewarding_frame = observations_dict['image'][frame_i - 1].copy() rewarding_frame[:40] *= np.array([0, 1, 0]) dummy_env.render_matplotlib( frame=rewarding_frame, delta_time=0.7) else: if frame_i == 0: dummy_env.render_matplotlib(frame=frame, delta_time=1.5) else: dummy_env.render_matplotlib(frame=frame, delta_time=0.3) if done_v[frame_i]: num_episodes_seen += 1 if num_episodes_seen >= FLAGS.test_num_episodes: break returns_avg = np.mean(returns) # Logging tf.logging.info('Evaluating task %s -> episode return: %f', task_name, returns_avg)
def train(action_set): """Train.""" if is_single_machine(): local_job_device = '' shared_job_device = '' def is_actor_fn(i): return True is_learner = True global_variable_device = '/gpu' server = tf.train.Server.create_local_server() filters = [] else: local_job_device = '/job:%s/task:%d' % (FLAGS.job_name, FLAGS.task) shared_job_device = '/job:learner/task:0' def is_actor_fn(i): return FLAGS.job_name == 'actor' and i == FLAGS.task is_learner = FLAGS.job_name == 'learner' # Placing the variable on CPU, makes it cheaper to send it to all the # actors. Continual copying the variables from the GPU is slow. global_variable_device = shared_job_device + '/cpu' cluster = tf.train.ClusterSpec({ 'actor': ['localhost:%d' % (8001 + i) for i in range(FLAGS.num_actors)], 'learner': ['localhost:8000'] }) server = tf.train.Server(cluster, job_name=FLAGS.job_name, task_index=FLAGS.task) filters = [shared_job_device, local_job_device] # Only used to find the actor output structure. with tf.Graph().as_default(): # here the meta learning algorithm should propose the task env_sampler = env_factory.EnvironmentFactory( FLAGS.recipes_path, FLAGS.hints_path, max_steps=FLAGS.max_steps, reuse_environments=FLAGS.reuse_environments, seed=1) dummy_env = env_sampler.sample_environment() obs_spec = dummy_env.obs_specs() env = create_environment(env_sampler, seed=1) teacher = Teacher(env_sampler.task_names, gamma=FLAGS.gamma) agent = Agent(len(action_set), obs_spec) structure = build_actor(agent, env, '', action_set) flattened_structure = nest.flatten(structure) dtypes = [t.dtype for t in flattened_structure] shapes = [t.shape.as_list() for t in flattened_structure] with tf.Graph().as_default(), \ tf.device(local_job_device + '/cpu'), \ pin_global_variables(global_variable_device): tf.set_random_seed(FLAGS.seed) # Makes initialization deterministic. # Create Queue and Agent on the learner. with tf.device(shared_job_device): queue = tf.FIFOQueue(1, dtypes, shapes, shared_name='buffer') agent = Agent(len(action_set), obs_spec) # Setup the task names variables and assignment logic teacher_task_ph = tf.placeholder( dtype=tf.string, shape=(), name='teacher_task_name') task_names = env_sampler.task_names actor_task_name_params = collections.defaultdict(list) for actor_i in range(FLAGS.num_actors): if FLAGS.actors_same_task: # Initialise all actors to the same task initial_task_name = task_names[0] else: # Assign initial task name by round-robin initial_task_name = task_names[actor_i % len(task_names)] assert FLAGS.progress_signal == 'random', ( "Using different tasks per actors with a Teacher hasn't been " "tested. 
Use progress_signal=random.") # Setup variables and assignment logic actor_task_name_var = tf.get_variable( "task_name_actor_{}".format(actor_i), shape=(), dtype=tf.string, initializer=tf.constant_initializer( initial_task_name, dtype=tf.string), trainable=False, collections=[tf.GraphKeys.GLOBAL_VARIABLES] ) actor_task_name_ph = tf.placeholder( dtype=tf.string, shape=(), name='actor_{}_new_task_name'.format(actor_i)) assign_actor_task_name = tf.assign( actor_task_name_var, actor_task_name_ph, name='update_task_name_actor_{}'.format(actor_i)) actor_task_name_params['task_name'].append(initial_task_name) actor_task_name_params['var'].append(actor_task_name_var) actor_task_name_params['ph'].append(actor_task_name_ph) actor_task_name_params['update'].append(assign_actor_task_name) if is_single_machine() and 'dynamic_batching' in sys.modules: # For single machine training, we use dynamic batching for improved GPU # utilization. The semantics of single machine training are slightly # different from the distributed setting because within a single unroll # of an environment, the actions may be computed using different weights # if an update happens within the unroll. old_build = agent._build @dynamic_batching.batch_fn def build(*args): with tf.device('/gpu'): return old_build(*args) tf.logging.info('Using dynamic batching.') agent._build = build # Build actors and ops to enqueue their output. enqueue_ops = [] for actor_i in range(FLAGS.num_actors): if is_actor_fn(actor_i): env = create_environment(env_sampler, seed=actor_i+1) tf.logging.info('Creating actor %d with level %s', actor_i, actor_task_name_params['task_name'][actor_i]) actor_output = build_actor( agent, env, actor_task_name_params['var'][actor_i].read_value(), action_set) with tf.device(shared_job_device): enqueue_ops.append(queue.enqueue(nest.flatten(actor_output))) # Build evaluation ops for every task, which will keep computing returns # on all tasks. evaluation_output = {} if is_learner: with tf.name_scope("evaluation"): for task_name in task_names: env = create_environment( env_sampler, initial_task_name=task_name, seed=1) evaluation_output[task_name] = build_actor( agent, env, task_name, action_set) # If running in a single machine setup, run actors with QueueRunners # (separate threads). if is_learner and enqueue_ops: tf.train.add_queue_runner(tf.train.QueueRunner(queue, enqueue_ops)) # Build learner. if is_learner: # Create global step, which is the number of environment frames processed. tf.get_variable( 'num_environment_frames', initializer=tf.zeros_initializer(), shape=[], dtype=tf.int64, trainable=False, collections=[tf.GraphKeys.GLOBAL_STEP, tf.GraphKeys.GLOBAL_VARIABLES]) # Create batch (time major) and recreate structure. dequeued = queue.dequeue_many(FLAGS.batch_size) dequeued = nest.pack_sequence_as(structure, dequeued) def make_time_major(s): return nest.map_structure( lambda t: tf.transpose(t, [1, 0] + list(range(t.shape.ndims))[2:]), s) dequeued = dequeued._replace( env_outputs=make_time_major(dequeued.env_outputs), agent_outputs=make_time_major(dequeued.agent_outputs)) with tf.device('/gpu'): # Using StagingArea allows us to prepare the next batch and send it to # the GPU while we're performing a training step. This adds up to 1 step # policy lag. 
flattened_output = nest.flatten(dequeued) area = tf.contrib.staging.StagingArea( [t.dtype for t in flattened_output], [t.shape for t in flattened_output]) stage_op = area.put(flattened_output) data_from_actors = nest.pack_sequence_as(structure, area.get()) # Unroll agent on sequence, create losses and update ops. done, infos, num_env_frames_and_train, progress_signal = ( build_learner(agent, data_from_actors.agent_state, data_from_actors.env_outputs, data_from_actors.agent_outputs, teacher_task_ph)) # Create MonitoredSession (to run the graph, checkpoint and log). tf.logging.info('Creating MonitoredSession, is_chief %s', is_learner) config = tf.ConfigProto(allow_soft_placement=True, device_filters=filters) with tf.train.MonitoredTrainingSession( server.target, is_chief=is_learner, checkpoint_dir=FLAGS.logdir, save_checkpoint_secs=600, save_summaries_secs=30, log_step_count_steps=50000, config=config, hooks=[py_process.PyProcessHook()]) as session: if is_learner: summary_writer = tf.summary.FileWriterCache.get(FLAGS.logdir) # Prepare data for first run. session.run_step_fn( lambda step_context: step_context.session.run(stage_op)) # Execute learning and track performance. num_env_frames_v = 0 num_teacher_update = 0 next_task_switch_at = FLAGS.switch_tasks_every_k_frames last_return_tasks = collections.defaultdict(float) task_average_returns = collections.defaultdict(float) advantage_previous_returns = collections.defaultdict(float) progress_since_switch = [] returns_task_since_switch = collections.defaultdict(list) teacher_history = collections.defaultdict(dict) evaluation_task_returns = collections.defaultdict(float) next_evaluation_at = FLAGS.evaluate_every_k_frames teacher_selected_task_name = actor_task_name_params['task_name'][0] while num_env_frames_v < FLAGS.total_environment_frames: # Perform one training step, on a minibatch. (done_v, infos_v, num_env_frames_v, progress_signal_v, _) = session.run( (done, infos, num_env_frames_and_train, progress_signal, stage_op), feed_dict={ teacher_task_ph: teacher_selected_task_name }) # Per task, let's average metrics in the current minibatch. for task_name in task_names: # Only keep part of the minibatch for the current task. done_task = done_v & (infos_v.task_name == task_name) if np.any(done_task): # This task was present in this minibatch task_episode_return = np.mean(infos_v.episode_return[done_task]) task_episode_frames = np.mean( infos_v.episode_step[done_task] * FLAGS.num_action_repeats) if task_name == teacher_selected_task_name: # Keep the progress_signal across training batches. # Only do so if the task corresponds to what the Teacher asked. # This will discard progress_signal_v for minibatches that have # old tasks. progress_since_switch.append(progress_signal_v) # For every task, keep the last returns. last_return_tasks[task_name] = task_episode_return # One summary per task in this minibatch. 
              summary = tf.summary.Summary()
              summary.value.add(
                  tag=task_name + '/episode_return',
                  simple_value=task_episode_return)
              summary.value.add(
                  tag=task_name + '/episode_frames',
                  simple_value=task_episode_frames)
              summary.value.add(
                  tag=task_name + '/progress',
                  simple_value=progress_signal_v)
              summary.value.add(
                  tag='Teacher/progress_signal_' + FLAGS.progress_signal,
                  simple_value=progress_signal_v)
              summary.value.add(
                  tag='Teacher/task_selected',
                  simple_value=task_names.index(task_name))
              summary_writer.add_summary(summary, num_env_frames_v)

            # Keep track of returns for all tasks through time (defaulting to
            # 0 if the task has never been selected). This keeps the last
            # score even when the task is not currently trained on, which is
            # what TensorBoard displays anyway.
            returns_task_since_switch[task_name].append(
                last_return_tasks[task_name])

          # Perform a full evaluation on all tasks.
          if num_env_frames_v >= next_evaluation_at:
            summary_evaluator = tf.summary.Summary()
            for task_name in task_names:
              returns = []
              while len(returns) < FLAGS.test_num_episodes:
                rewards_v, done_v = session._tf_sess().run(
                    (evaluation_output[task_name].env_outputs.reward,
                     evaluation_output[task_name].env_outputs.done))
                # Repack the environment outputs.
                rewards_v = rewards_v[1:]
                done_v = done_v[1:]
                # Check the performance.
                episode_returns = rewards_v[done_v]
                returns.extend(episode_returns)

              # Store mean returns per task.
              returns_avg = np.mean(returns)
              evaluation_task_returns[task_name] = returns_avg

              # Logging/TensorBoard.
              tf.logging.info('[%d] Evaluating task %s -> episode return: %f',
                              num_env_frames_v, task_name, returns_avg)
              summary_evaluator.value.add(
                  tag='Evaluation/' + task_name + '/episode_return',
                  simple_value=returns_avg)

              # Also use these evaluation values to bootstrap the Advantage
              # "previous" rewards.
              advantage_previous_returns[task_name] = (
                  0.8 * advantage_previous_returns[task_name] +
                  0.2 * returns_avg)

            summary_writer.add_summary(summary_evaluator, num_env_frames_v)
            next_evaluation_at += FLAGS.evaluate_every_k_frames

          # Now ask the Teacher for new tasks to train on.
          if num_env_frames_v >= next_task_switch_at:
            print("Let's update the tasks for all actors now!")

            # Compute the average return for ~all tasks since the last switch.
            task_average_returns = {
                task_name: np.mean(returns_task_since_switch[task_name])
                for task_name in task_names
            }

            # Compute the progress signal for the Teacher.
            if FLAGS.progress_signal == 'advantage':
              # For the Advantage signal (reward[T] - reward[T-K]), we compare
              # against "previous" reward values, which are either
              # evaluation_task_returns or task_average_returns, whichever is
              # fresher.
              rewards_post_switch = np.mean(progress_since_switch or 0)
              progress_for_teacher = np.abs(
                  rewards_post_switch -
                  advantage_previous_returns[teacher_selected_task_name])
              # Update the last returns.
              advantage_previous_returns[teacher_selected_task_name] = (
                  0.9 * advantage_previous_returns[teacher_selected_task_name] +
                  0.1 * rewards_post_switch)
            else:
              # The other signals can be used directly.
              progress_for_teacher = np.mean(progress_since_switch or 0)

            # Update the Teacher according to the progress signal we got.
            if FLAGS.progress_signal != 'random':
              teacher.update(teacher_selected_task_name, progress_for_teacher)

            # Log / TensorBoard.
            tf.logging.info(
                "[%d][%d] Task: %s, Episode return mean: %.1f, "
                "\n\tTeacher progress signal %s: %.3f",
                num_teacher_update, num_env_frames_v,
                teacher_selected_task_name,
                task_average_returns[teacher_selected_task_name],
                FLAGS.progress_signal, progress_for_teacher)
            summary_teacher = tf.summary.Summary()
            summary_teacher.value.add(
                tag='Teacher/at_update_task_returns',
                simple_value=task_average_returns[teacher_selected_task_name])
            summary_teacher.value.add(
                tag='Teacher/at_update_progress_signal',
                simple_value=progress_for_teacher)
            summary_writer.add_summary(summary_teacher, num_env_frames_v)

            # Keep track of the teacher state.
            teacher_history['progress_signal'][num_teacher_update] = (
                progress_for_teacher)
            teacher_history['weights'][num_teacher_update] = (
                teacher._log_weights.copy())
            teacher_history['arm_probs'][num_teacher_update] = (
                teacher.task_probabilities.copy())
            teacher_history['teacher_selected_task_name'][num_teacher_update] = (
                teacher_selected_task_name)
            teacher_history['num_env_frames'][num_teacher_update] = (
                num_env_frames_v)
            teacher_history['task_returns'][num_teacher_update] = (
                task_average_returns)
            teacher_history['evaluation_task_returns'][num_teacher_update] = (
                evaluation_task_returns.copy())
            teacher_history['task_names'] = task_names

            # Store the teacher history for analysis.
            if (num_teacher_update + 1) % FLAGS.save_every_k_teacher_updates == 0:
              np.save(
                  os.path.join(FLAGS.logdir, "teaching_output_{}.npy".format(
                      num_teacher_update)),
                  dict(teacher_history))
              # Reset the teacher history, to be safe.
              teacher_history = collections.defaultdict(dict)

            # Get a new task from the Teacher and update the actors.
            if FLAGS.actors_same_task:
              teacher_selected_task_name = teacher.get_task()
              actor_task_assignments = [teacher_selected_task_name]
              update_all_actors_tasks(
                  actor_task_assignments, actor_task_name_params,
                  session._tf_sess(), single_task=True)
            else:
              actor_task_assignments = np.random.choice(
                  task_names, FLAGS.num_actors,
                  replace=FLAGS.num_actors > len(task_names))
              update_all_actors_tasks(
                  actor_task_assignments, actor_task_name_params,
                  session._tf_sess(), single_task=False)

            # ... and finish this switch.
            progress_since_switch = []
            returns_task_since_switch = collections.defaultdict(list)
            num_teacher_update += 1
            next_task_switch_at += FLAGS.switch_tasks_every_k_frames
            print("Switching to task {}! Next update at {}".format(
                actor_task_assignments, next_task_switch_at))
      else:
        # Execute actors (they just need to enqueue their output).
        while True:
          session.run(enqueue_ops)
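# The following is a minimal, self-contained sketch (illustrative names and
# level names; not part of the training code above) of the actor retasking
# mechanism used by the teacher-driven train() loop: each actor reads its task
# from a non-trainable tf.string variable, and the learner switches tasks
# mid-training by feeding a placeholder into an assign op, so the graph never
# needs to be rebuilt.
def retask_sketch():
  import tensorflow as tf  # Local import only to keep this sketch standalone.

  with tf.Graph().as_default():
    task_var = tf.get_variable(
        'task_name_actor_0', shape=(), dtype=tf.string,
        initializer=tf.constant_initializer('rooms_watermaze', dtype=tf.string),
        trainable=False)
    new_task_ph = tf.placeholder(
        tf.string, shape=(), name='actor_0_new_task_name')
    assign_task = tf.assign(task_var, new_task_ph)

    with tf.Session() as session:
      session.run(tf.global_variables_initializer())
      print(session.run(task_var.read_value()))  # -> rooms_watermaze
      session.run(assign_task,
                  feed_dict={new_task_ph: 'lasertag_one_opponent_small'})
      print(session.run(task_var.read_value()))  # -> lasertag_one_opponent_small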
def train():
  """Train."""
  if is_single_machine():
    local_job_device = ''
    shared_job_device = ''
    is_actor_fn = lambda i, j: True
    is_learner = True
    global_variable_device = '/gpu'
    server = tf.train.Server.create_local_server()
    filters = []
  else:
    if False:
      from mpi4py import MPI
      comm = MPI.COMM_WORLD
      rank = comm.Get_rank()
      # rank = FLAGS.rank
      if rank == 0:
        job_name = 'learner'
        task = 0
      else:
        job_name = 'actor'
        task = rank - 1
    else:
      job_name = FLAGS.job_name
      task = FLAGS.task

    agent1 = task // FLAGS.num_agents
    agent2 = task % FLAGS.num_agents

    local_job_device = '/job:%s/task:%d' % (job_name, task)
    shared_job_device = '/job:learner/task:0'
    is_actor_fn = (
        lambda i, j: job_name == 'actor' and i == agent1 and j == agent2)
    is_learner = job_name == 'learner'

    # Placing the variables on the CPU makes it cheaper to send them to all
    # the actors. Continually copying the variables from the GPU is slow.
    global_variable_device = shared_job_device + '/cpu'

    # cluster = tf.train.ClusterSpec({
    #     'actor': ['localhost:%d' % (8001 + i)
    #               for i in range(FLAGS.num_agents ** 2)],
    #     'learner': ['localhost:8000']
    # })
    # cluster = tf.train.ClusterSpec({
    #     'actor': ['10.1.2.25:8000', '10.1.2.24:8000', '10.1.2.15:8000'],
    #     'learner': ['10.1.2.22:8000']
    # })
    # cluster = tf.train.ClusterSpec({
    #     'actor': ['10.1.2.25:%d' % (8001 + i)
    #               for i in range(FLAGS.num_agents ** 2)] +
    #              ['10.1.2.24:%d' % (8001 + i)
    #               for i in range(FLAGS.num_agents ** 2)] +
    #              ['10.1.2.15:%d' % (8001 + i)
    #               for i in range(FLAGS.num_agents ** 2)],
    #     'learner': ['10.1.2.22:8000']
    # })
    # cluster = tf.train.ClusterSpec({
    #     'actor': ['10.1.2.25:%d' % (8000 + i)
    #               for i in range(FLAGS.num_agents ** 2)] +
    #              ['10.1.2.24:%d' % (8000 + i)
    #               for i in range(FLAGS.num_agents ** 2)],
    #     'learner': ['10.1.2.22:8000']
    # })

    nodefile = FLAGS.logdir + '/nodeslist.txt'
    with open(nodefile, 'r') as f:
      nodes = f.readlines()
    nodes = [x.strip().split('.')[0] for x in nodes]
    # nodes = comm.allgather(MPI.Get_processor_name())

    counts = defaultdict(int)
    if False:
      processes = []
      for i, node in enumerate(nodes):
        processes.append(node + ':' + str(14000 + counts[node]))
        counts[node] += 1
    else:
      processes = []
      for i, node in enumerate(nodes):
        if i == 0:
          processes.append(node + ':14000')
        else:
          for j in range(FLAGS.processes):
            processes.append(node + ':' + str(14000 + j))

    cluster = tf.train.ClusterSpec({
        'actor': processes[1:],
        'learner': [processes[0]]
    })

    import socket
    print(job_name, task, socket.gethostname())
    print({'actor': processes[1:], 'learner': [processes[0]]})
    sys.stdout.flush()

    server = tf.train.Server(cluster, job_name=job_name, task_index=task)
    print('created server')
    sys.stdout.flush()

    filters = [shared_job_device, local_job_device]

  # Only used to find the actor output structure.
  with tf.Graph().as_default():
    agent = Agent((6, 8, 8))
    env = create_environment({'adversarial': False})
    structure = build_actor(agent, agent, env)
    structure = [structure[0], structure[2]]
    flattened_structure = nest.flatten(structure)
    dtypes = [t.dtype for t in flattened_structure]
    shapes = [t.shape.as_list() for t in flattened_structure]

  with tf.Graph().as_default(), \
       tf.device(local_job_device + '/cpu'), \
       pin_global_variables(global_variable_device):
    tf.set_random_seed(FLAGS.seed)  # Makes initialization deterministic.

    # Create Queue and Agent on the learner.
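    # Note (descriptive, not original): one Agent and one FIFOQueue are built
    # per agent index below; every (i, j) self-play pairing enqueues only the
    # first agent's unrolls into queue i (the second agent's enqueue is
    # currently commented out).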
    with tf.device(shared_job_device):
      agents = []
      queues = []
      for i in range(FLAGS.num_agents):
        agent = Agent((6, 8, 8))
        queue = tf.FIFOQueue(1, dtypes, shapes, shared_name='buffer_' + str(i))

        if is_single_machine() and 'dynamic_batching' in sys.modules:
          # For single machine training, we use dynamic batching for improved
          # GPU utilization. The semantics of single machine training are
          # slightly different from the distributed setting because within a
          # single unroll of an environment, the actions may be computed using
          # different weights if an update happens within the unroll.
          old_build = agent._build

          @dynamic_batching.batch_fn
          def build(*args):
            with tf.device('/gpu'):
              return old_build(*args)

          tf.logging.info('Using dynamic batching.')
          agent._build = build

        agents.append(agent)
        queues.append(queue)

    # Build actors and ops to enqueue their output.
    enqueue_ops = [[] for i in range(FLAGS.num_agents)]
    for i in range(FLAGS.num_agents):
      for j in range(0, FLAGS.num_agents):
        if is_actor_fn(i, j):
          tf.logging.info('Creating actor %d %d', i, j)
          config = {'adversarial': False}
          # if i >= FLAGS.num_agents - FLAGS.num_adversarial_agents:
          #   config['adversarial'] = True
          env = create_environment(config)
          actor_output = build_actor(agents[i], agents[j], env)
          actor1_output = [actor_output[0], actor_output[2]]
          # actor2_output = [actor_output[1], actor_output[3]]
          with tf.device(shared_job_device):
            enqueue_ops[i].append(queues[i].enqueue(
                nest.flatten(actor1_output)))
            # enqueue_ops[j].append(
            #     queues[j].enqueue(nest.flatten(actor2_output)))

    # If running in a single machine setup, run actors with QueueRunners
    # (separate threads).
    if is_single_machine():
      if is_learner and enqueue_ops:
        for i in range(FLAGS.num_agents):
          tf.train.add_queue_runner(
              tf.train.QueueRunner(queues[i], enqueue_ops[i]))

    # Build learner.
    if is_learner:
      # Create global step, which is the number of environment frames processed.
      tf.get_variable(
          'num_environment_frames',
          initializer=tf.zeros_initializer(),
          shape=[],
          dtype=tf.int64,
          trainable=False,
          collections=[tf.GraphKeys.GLOBAL_STEP, tf.GraphKeys.GLOBAL_VARIABLES])

      # Create batch (time major) and recreate structure.
      def make_time_major(s):
        return nest.map_structure(
            lambda t: tf.transpose(t, [1, 0] + list(range(t.shape.ndims))[2:]),
            s)

      dequeued = []
      for i in range(FLAGS.num_agents):
        dequeue = queues[i].dequeue_many(FLAGS.batch_size)
        dequeue = nest.pack_sequence_as(structure, dequeue)
        dequeue = [
            d._replace(env_outputs=make_time_major(d.env_outputs),
                       agent_outputs=make_time_major(d.agent_outputs))
            for d in dequeue
        ]
        dequeued.append(dequeue)

      with tf.device('/gpu'):
        # Using StagingArea allows us to prepare the next batch and send it to
        # the GPU while we're performing a training step. This adds up to 1
        # step policy lag.
        num_env_frames = tf.train.get_global_step()
        learning_rate = tf.train.polynomial_decay(
            FLAGS.learning_rate, num_env_frames,
            FLAGS.total_environment_frames, 0)
        optimizer = tf.train.RMSPropOptimizer(learning_rate, FLAGS.decay,
                                              FLAGS.momentum, FLAGS.epsilon)

        stage_ops = []
        outputss = []
        for i in range(FLAGS.num_agents):
          flattened_output = nest.flatten(dequeued[i])
          area = tf.contrib.staging.StagingArea(
              [t.dtype for t in flattened_output],
              [t.shape for t in flattened_output])
          stage_op = area.put(flattened_output)
          stage_ops.append(stage_op)

          data_from_actorss = nest.pack_sequence_as(structure, area.get())

          outputs = []
          for data_from_actors in data_from_actorss:
            # Unroll agent on sequence, create losses and update ops.
            print('building learner', i)
            outputs.append(
                build_learner(agents[i],
                              data_from_actors.agent_state,
                              data_from_actors.env_outputs,
                              data_from_actors.agent_outputs,
                              optimizer,
                              num_env_frames,
                              learning_rate,
                              index=i))
          outputss.append(outputs)

    # Create MonitoredSession (to run the graph, checkpoint and log).
    tf.logging.info('Creating MonitoredSession, is_chief %s', is_learner)
    config = tf.ConfigProto(
        allow_soft_placement=True,
        device_filters=filters
    )  # , gpu_options=tf.GPUOptions(per_process_gpu_memory_fraction=0.2))
    with tf.train.MonitoredTrainingSession(
        server.target,
        is_chief=is_learner,
        checkpoint_dir=FLAGS.logdir,
        save_checkpoint_secs=120,
        save_summaries_secs=30,
        log_step_count_steps=50000,
        config=config,
        hooks=[py_process.PyProcessHook()]) as session:
      # session = tf_debug.LocalCLIDebugWrapperSession(session)

      if is_learner:
        # Logging.
        summary_writer = tf.summary.FileWriterCache.get(FLAGS.logdir)

        # Prepare data for first run.
        session.run_step_fn(
            lambda step_context: step_context.session.run(stage_ops))

        # Execute learning and track performance.
        num_env_frames_v = 0
        while num_env_frames_v < FLAGS.total_environment_frames:
          summary = tf.summary.Summary()
          # print('outputss, stage_ops')
          output_valuess, _ = session.run((outputss, stage_ops))
          for i in range(len(output_valuess)):
            for j, output in enumerate(output_valuess[i]):
              done_v, infos_v, num_env_frames_v = output
              for episode_return, episode_step in zip(
                  infos_v.episode_return[done_v],
                  infos_v.episode_step[done_v]):
                episode_frames = episode_step * FLAGS.num_action_repeats
                if j == 0:
                  tf.logging.info('Episode return: %f', episode_return)
                summary.value.add(
                    tag='/episode_return_' + str(i) + '_' + str(j),
                    simple_value=episode_return)
                summary.value.add(
                    tag='/episode_frames_' + str(i) + '_' + str(j),
                    simple_value=episode_frames)
          summary_writer.add_summary(summary, num_env_frames_v)
      else:
        # Execute actors (they just need to enqueue their output).
        while True:
          # print('.', end='')
          session.run(enqueue_ops)
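# A minimal, self-contained sketch (shapes are illustrative; not part of the
# code above) of the make_time_major() transform both learners rely on:
# batches dequeued from the FIFO queues arrive as [batch, time, ...] and are
# transposed to [time, batch, ...] so the unroll dimension comes first, which
# is the layout the learner consumes.
def make_time_major_sketch():
  import tensorflow as tf  # Local import only to keep this sketch standalone.

  with tf.Graph().as_default():
    # Pretend this is one dequeued field: a batch of 4 unrolls, each 20 steps
    # long, with an 84x84x3 observation at every step.
    batch_major = tf.zeros([4, 20, 84, 84, 3])
    perm = [1, 0] + list(range(batch_major.shape.ndims))[2:]  # [1, 0, 2, 3, 4]
    time_major = tf.transpose(batch_major, perm)
    print(time_major.shape)  # (20, 4, 84, 84, 3)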