Example #1
def test(action_set, level_names):
    """Test."""

    Agent = agent_factory(FLAGS.agent_name)
    level_returns = {level_name: [] for level_name in level_names}
    with tf.Graph().as_default():
        agent = Agent(len(action_set))
        outputs = {}
        for level_name in level_names:
            env = create_environment(level_name, seed=1, is_test=True)
            outputs[level_name] = build_actor(agent, env, level_name,
                                              action_set)

        with tf.train.SingularMonitoredSession(checkpoint_dir=FLAGS.logdir,
                                               hooks=[
                                                   py_process.PyProcessHook()
                                               ]) as session:
            for level_name in level_names:
                tf.logging.info('Testing level: %s', level_name)
                while True:
                    done_v, infos_v = session.run(
                        (outputs[level_name].env_outputs.done,
                         outputs[level_name].env_outputs.info))
                    returns = level_returns[level_name]
                    returns.extend(infos_v.episode_return[1:][done_v[1:]])

                    if len(returns) >= FLAGS.test_num_episodes:
                        tf.logging.info('Mean episode return: %f',
                                        np.mean(returns))
                        break
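
Note: the indexing pattern infos_v.episode_return[1:][done_v[1:]] used above drops the first step of the unroll and then keeps only the returns at steps where an episode finished. A tiny NumPy sketch of the same boolean-mask selection (the values are made up for illustration):

import numpy as np

# Hypothetical unroll of length 5: per-step done flags and episode returns.
done_v = np.array([False, False, True, False, True])
episode_return = np.array([0.0, 1.0, 7.5, 0.0, 3.0])

# Skip step 0, then select the returns at the steps where an episode ended.
finished = episode_return[1:][done_v[1:]]
print(finished)  # -> [7.5 3.]
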
Example #2
def test(action_set, level_names):
  """Test."""

  level_returns = {level_name: [] for level_name in level_names}
  with tf.Graph().as_default():
    agent = Agent(len(action_set))
    outputs = {}
    for level_name in level_names:
      env = create_environment(level_name, seed=1, is_test=True)
      outputs[level_name] = build_actor(agent, env, level_name, action_set)

    with tf.train.SingularMonitoredSession(
        checkpoint_dir=FLAGS.logdir,
        hooks=[py_process.PyProcessHook()]) as session:
      for level_name in level_names:
        tf.logging.info('Testing level: %s', level_name)
        while True:
          done_v, infos_v = session.run((
              outputs[level_name].env_outputs.done,
              outputs[level_name].env_outputs.info
          ))
          returns = level_returns[level_name]
          returns.extend(infos_v.episode_return[1:][done_v[1:]])

          if len(returns) >= FLAGS.test_num_episodes:
            tf.logging.info('Mean episode return: %f', np.mean(returns))
            break

  if FLAGS.level_name == 'dmlab30':
    no_cap = dmlab30.compute_human_normalized_score(level_returns,
                                                    per_level_cap=None)
    cap_100 = dmlab30.compute_human_normalized_score(level_returns,
                                                     per_level_cap=100)
    tf.logging.info('No cap.: %f Cap 100: %f', no_cap, cap_100)
Example #3
    def test_small(self):
        class Example(object):
            def __init__(self, a):
                self._a = a

            def inc(self):
                self._a += 1

            def compute(self, b):
                return np.array(self._a + b, dtype=np.int32)

            @staticmethod
            def _tensor_specs(method_name, unused_args,
                              unused_constructor_kwargs):
                if method_name == 'compute':
                    return tf.contrib.framework.TensorSpec([], tf.int32)
                elif method_name == 'inc':
                    return ()

        with tf.Graph().as_default():
            p = py_process.PyProcess(Example, 1)
            inc = p.proxy.inc()
            compute = p.proxy.compute(2)

            with tf.train.SingularMonitoredSession(
                    hooks=[py_process.PyProcessHook()]) as session:
                self.assertTrue(isinstance(inc, tf.Operation))
                session.run(inc)

                self.assertEqual([], compute.shape)
                self.assertEqual(4, session.run(compute))
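
Note: the tests above all exercise the same PyProcess contract: the wrapped class runs in a separate process, its methods are exposed as TensorFlow ops/tensors through p.proxy.<method>(), and the optional _tensor_specs(method_name, kwargs, constructor_kwargs) staticmethod tells PyProcess the dtype and shape of each method's result. A minimal self-contained sketch of that contract (the Counter class and its values are illustrative, not from the repository):

import numpy as np
import tensorflow as tf

import py_process


class Counter(object):
  """Illustrative PyProcess-compatible class; it runs in a child process."""

  def __init__(self, start):
    self._value = start

  def add(self, amount):
    self._value += amount
    return np.array(self._value, dtype=np.int32)

  @staticmethod
  def _tensor_specs(method_name, unused_kwargs, unused_constructor_kwargs):
    if method_name == 'add':
      return tf.contrib.framework.TensorSpec([], tf.int32)


with tf.Graph().as_default():
  p = py_process.PyProcess(Counter, 10)
  result = p.proxy.add(5)  # A scalar int32 tensor.

  # PyProcessHook starts the child processes before the first run and
  # closes them when the session exits.
  with tf.train.SingularMonitoredSession(
      hooks=[py_process.PyProcessHook()]) as session:
    print(session.run(result))  # -> 15
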
Example #4
  def test_args(self):

    class Example(object):

      def __init__(self, dim0):
        self._dim0 = dim0

      def compute(self, dim1):
        return np.zeros([self._dim0, dim1], dtype=np.int32)

      @staticmethod
      def _tensor_specs(method_name, kwargs, constructor_kwargs):
        dim0 = constructor_kwargs['dim0']
        dim1 = kwargs['dim1']
        if method_name == 'compute':
          return tf.contrib.framework.TensorSpec([dim0, dim1], tf.int32)

    with tf.Graph().as_default():
      p = py_process.PyProcess(Example, 1)
      result = p.proxy.compute(2)

      with tf.train.SingularMonitoredSession(
          hooks=[py_process.PyProcessHook()]) as session:
        self.assertEqual([1, 2], result.shape)
        self.assertAllEqual([[0, 0]], session.run(result))
Example #5
    def test_close_on_error(self):
        with tempfile.NamedTemporaryFile() as tmp:

            class Example(object):
                def __init__(self, filename):
                    self._filename = filename

                def something(self):
                    raise ValueError('foo')

                def close(self):
                    with tf.gfile.Open(self._filename, 'w') as f:
                        f.write('was_closed')

                @staticmethod
                def _tensor_specs(method_name, unused_kwargs,
                                  unused_constructor_kwargs):
                    if method_name == 'something':
                        return ()

            with tf.Graph().as_default():
                p = py_process.PyProcess(Example, tmp.name)
                result = p.proxy.something()

                with tf.train.SingularMonitoredSession(
                        hooks=[py_process.PyProcessHook()]) as session:
                    with self.assertRaisesRegexp(Exception, 'foo'):
                        session.run(result)

            self.assertEqual('was_closed', tmp.read())
Example #6
def test():
    """Test."""

    with tf.Graph().as_default():
        agent = Agent((6, 8, 8))

        env = create_environment({'adversarial': False}, is_test=True)
        outputs = build_actor(agent, env)[0]

        returns = []

        with tf.train.SingularMonitoredSession(checkpoint_dir=FLAGS.logdir,
                                               hooks=[
                                                   py_process.PyProcessHook()
                                               ]) as session:
            tf.logging.info('Testing:')
            while True:
                done_v, infos_v = session.run(
                    (outputs.env_outputs.done, outputs.env_outputs.info))
                returns.extend(infos_v.episode_return[1:][done_v[1:]])

                if len(returns) >= FLAGS.test_num_episodes:
                    tf.logging.info('Mean episode return: %f',
                                    np.mean(returns))
                    break
Example #7
    def test_threading(self):
        class Example(object):
            def __init__(self):
                pass

            def wait(self):
                time.sleep(.2)
                return None

            @staticmethod
            def _tensor_specs(method_name, unused_args,
                              unused_constructor_kwargs):
                if method_name == 'wait':
                    return tf.contrib.framework.TensorSpec([], tf.int32)

        with tf.Graph().as_default():
            p = py_process.PyProcess(Example)
            wait = p.proxy.wait()

            hook = py_process.PyProcessHook()
            with tf.train.SingularMonitoredSession(hooks=[hook]) as session:

                def run():
                    with self.assertRaises(tf.errors.OutOfRangeError):
                        session.run(wait)

                t = self.checkedThread(target=run)
                t.start()
                time.sleep(.1)
            t.join()
Example #8
  def test_close(self):
    with tempfile.NamedTemporaryFile() as tmp:
      class Example(object):

        def __init__(self, filename):
          self._filename = filename

        def close(self):
          with tf.gfile.Open(self._filename, 'w') as f:
            f.write('was_closed')

        @staticmethod
        def _tensor_specs(method_name, unused_kwargs,
                          unused_constructor_kwargs):
          if method_name == 'something':
            return ()

      with tf.Graph().as_default():
        py_process.PyProcess(Example, tmp.name)

        with tf.train.SingularMonitoredSession(
            hooks=[py_process.PyProcessHook()]):
          pass

      self.assertEqual('was_closed', tmp.read())
Example #9
    def benchmark_one(self):
        with tf.Graph().as_default():
            p = py_process.PyProcess(PyProcessBenchmarks.Example)
            compute = p.proxy.compute(2)

            with tf.train.SingularMonitoredSession(
                    hooks=[py_process.PyProcessHook()]) as session:

                self.run_op_benchmark(name='process_one',
                                      sess=session,
                                      op_or_tensor=compute,
                                      burn_iters=10,
                                      min_iters=5000)
Example #10
    def benchmark_many(self):
        with tf.Graph().as_default():
            ps = [
                py_process.PyProcess(PyProcessBenchmarks.Example)
                for _ in range(200)
            ]
            compute_ops = [p.proxy.compute(2) for p in ps]
            compute = tf.group(*compute_ops)

            with tf.train.SingularMonitoredSession(
                    hooks=[py_process.PyProcessHook()]) as session:

                self.run_op_benchmark(name='process_many',
                                      sess=session,
                                      op_or_tensor=compute,
                                      burn_iters=10,
                                      min_iters=500)
Example #11
    def test_close(self):
        with tempfile.NamedTemporaryFile() as tmp:

            class Example(object):
                def __init__(self, filename):
                    self._filename = filename

                def close(self):
                    with tf.gfile.Open(self._filename, 'w') as f:
                        f.write('was_closed')

            with tf.Graph().as_default():
                py_process.PyProcess(Example, tmp.name)

                with tf.train.SingularMonitoredSession(
                        hooks=[py_process.PyProcessHook()]):
                    pass

            self.assertEqual('was_closed', tmp.read())
Example #12
    def test_error_handling_constructor(self):
        class Example(object):
            def __init__(self):
                raise ValueError('foo')

            def something(self):
                pass

            @staticmethod
            def _tensor_specs(method_name, unused_kwargs,
                              unused_constructor_kwargs):
                if method_name == 'something':
                    return ()

        with tf.Graph().as_default():
            py_process.PyProcess(Example, 1)

            with self.assertRaisesRegexp(Exception, 'foo'):
                with tf.train.SingularMonitoredSession(
                        hooks=[py_process.PyProcessHook()]):
                    pass
Example #13
def test(game_name):
    all_returns = {game_name: []}
    action_size = 4
    with tf.Graph().as_default():
        agent = Agent(action_size)
        env = create_environment(game_name)
        output = build_actor(agent, env, game_name, action_size)

        with tf.train.SingularMonitoredSession(checkpoint_dir=FLAGS.logdir,
                                               hooks=[
                                                   py_process.PyProcessHook()
                                               ]) as session:
            while True:
                done_v, infos_v = session.run(
                    (output.env_outputs.done, output.env_outputs.info))
                returns = all_returns[game_name]
                returns.extend(infos_v.episode_return[1:][done_v[1:]])

                if len(returns) >= FLAGS.test_num_episodes:
                    tf.logging.info('Mean episode return: %f',
                                    np.mean(returns))
                    break
Example #14
def train(action_set, level_names):
  """Train."""

  if is_single_machine():
    local_job_device = ''
    shared_job_device = ''
    is_actor_fn = lambda i: True
    is_learner = True
    global_variable_device = '/gpu'
    server = tf.train.Server.create_local_server()
    filters = []
  else:
    local_job_device = '/job:%s/task:%d' % (FLAGS.job_name, FLAGS.task)
    shared_job_device = '/job:learner/task:0'
    is_actor_fn = lambda i: FLAGS.job_name == 'actor' and i == FLAGS.task
    is_learner = FLAGS.job_name == 'learner'

    # Placing the variables on the CPU makes them cheaper to send to all the
    # actors. Continually copying the variables from the GPU is slow.
    global_variable_device = shared_job_device + '/cpu'
    cluster = tf.train.ClusterSpec({
        'actor': ['localhost:%d' % (8001 + i) for i in range(FLAGS.num_actors)],
        'learner': ['localhost:8000']
    })
    server = tf.train.Server(cluster, job_name=FLAGS.job_name,
                             task_index=FLAGS.task)
    filters = [shared_job_device, local_job_device]

  # Only used to find the actor output structure.
  with tf.Graph().as_default():
    agent = Agent(len(action_set))
    env = create_environment(level_names[0], seed=1)
    structure = build_actor(agent, env, level_names[0], action_set)
    flattened_structure = nest.flatten(structure)
    dtypes = [t.dtype for t in flattened_structure]
    shapes = [t.shape.as_list() for t in flattened_structure]

  with tf.Graph().as_default(), \
       tf.device(local_job_device + '/cpu'), \
       pin_global_variables(global_variable_device):
    tf.set_random_seed(FLAGS.seed)  # Makes initialization deterministic.

    # Create Queue and Agent on the learner.
    with tf.device(shared_job_device):
      queue = tf.FIFOQueue(1, dtypes, shapes, shared_name='buffer')
      agent = Agent(len(action_set))

      if is_single_machine() and 'dynamic_batching' in sys.modules:
        # For single machine training, we use dynamic batching for improved GPU
        # utilization. The semantics of single machine training are slightly
        # different from the distributed setting because within a single unroll
        # of an environment, the actions may be computed using different weights
        # if an update happens within the unroll.
        old_build = agent._build
        @dynamic_batching.batch_fn
        def build(*args):
          with tf.device('/gpu'):
            return old_build(*args)
        tf.logging.info('Using dynamic batching.')
        agent._build = build

    # Build actors and ops to enqueue their output.
    enqueue_ops = []
    for i in range(FLAGS.num_actors):
      if is_actor_fn(i):
        level_name = level_names[i % len(level_names)]
        tf.logging.info('Creating actor %d with level %s', i, level_name)
        env = create_environment(level_name, seed=i + 1)
        actor_output = build_actor(agent, env, level_name, action_set)
        with tf.device(shared_job_device):
          enqueue_ops.append(queue.enqueue(nest.flatten(actor_output)))

    # If running in a single machine setup, run actors with QueueRunners
    # (separate threads).
    if is_learner and enqueue_ops:
      tf.train.add_queue_runner(tf.train.QueueRunner(queue, enqueue_ops))

    # Build learner.
    if is_learner:
      # Create global step, which is the number of environment frames processed.
      tf.get_variable(
          'num_environment_frames',
          initializer=tf.zeros_initializer(),
          shape=[],
          dtype=tf.int64,
          trainable=False,
          collections=[tf.GraphKeys.GLOBAL_STEP, tf.GraphKeys.GLOBAL_VARIABLES])

      # Create batch (time major) and recreate structure.
      dequeued = queue.dequeue_many(FLAGS.batch_size)
      dequeued = nest.pack_sequence_as(structure, dequeued)

      def make_time_major(s):
        return nest.map_structure(
            lambda t: tf.transpose(t, [1, 0] + list(range(t.shape.ndims))[2:]), s)

      dequeued = dequeued._replace(
          env_outputs=make_time_major(dequeued.env_outputs),
          agent_outputs=make_time_major(dequeued.agent_outputs))

      with tf.device('/gpu'):
        # Using StagingArea allows us to prepare the next batch and send it to
        # the GPU while we're performing a training step. This adds up to 1 step
        # policy lag.
        flattened_output = nest.flatten(dequeued)
        area = tf.contrib.staging.StagingArea(
            [t.dtype for t in flattened_output],
            [t.shape for t in flattened_output])
        stage_op = area.put(flattened_output)

        data_from_actors = nest.pack_sequence_as(structure, area.get())

        # Unroll agent on sequence, create losses and update ops.
        output = build_learner(agent, data_from_actors.agent_state,
                               data_from_actors.env_outputs,
                               data_from_actors.agent_outputs)

    # Create MonitoredSession (to run the graph, checkpoint and log).
    tf.logging.info('Creating MonitoredSession, is_chief %s', is_learner)
    config = tf.ConfigProto(allow_soft_placement=True, device_filters=filters)
    with tf.train.MonitoredTrainingSession(
        server.target,
        is_chief=is_learner,
        checkpoint_dir=FLAGS.logdir,
        save_checkpoint_secs=600,
        save_summaries_secs=30,
        log_step_count_steps=50000,
        config=config,
        hooks=[py_process.PyProcessHook()]) as session:

      if is_learner:
        # Logging.
        level_returns = {level_name: [] for level_name in level_names}
        summary_writer = tf.summary.FileWriterCache.get(FLAGS.logdir)

        # Prepare data for first run.
        session.run_step_fn(
            lambda step_context: step_context.session.run(stage_op))

        # Execute learning and track performance.
        num_env_frames_v = 0
        while num_env_frames_v < FLAGS.total_environment_frames:
          level_names_v, done_v, infos_v, num_env_frames_v, _ = session.run(
              (data_from_actors.level_name,) + output + (stage_op,))
          level_names_v = np.repeat([level_names_v], done_v.shape[0], 0)

          for level_name, episode_return, episode_step in zip(
              level_names_v[done_v],
              infos_v.episode_return[done_v],
              infos_v.episode_step[done_v]):
            episode_frames = episode_step * FLAGS.num_action_repeats

            tf.logging.info('Level: %s Episode return: %f',
                            level_name, episode_return)

            summary = tf.summary.Summary()
            summary.value.add(tag=level_name + '/episode_return',
                              simple_value=episode_return)
            summary.value.add(tag=level_name + '/episode_frames',
                              simple_value=episode_frames)
            summary_writer.add_summary(summary, num_env_frames_v)

            if FLAGS.level_name == 'dmlab30':
              level_returns[level_name].append(episode_return)
              print("(experiment.py) level_returns: ", level_returns)

          if (FLAGS.level_name == 'dmlab30' and
              min(map(len, level_returns.values())) >= 1):
            no_cap = dmlab30.compute_human_normalized_score(level_returns,
                                                            per_level_cap=None)
            # print("(experiment) No cap: ", no_cap)
            cap_100 = dmlab30.compute_human_normalized_score(level_returns,
                                                             per_level_cap=100)
            with open("normalized_scores.txt", "a+") as f:
              f.write("num env frames: %d\n" % num_env_frames_v)
              f.write("no cap: %f\n" % no_cap)
              f.write("cap 100: %f\n" % cap_100)

            summary = tf.summary.Summary()
            summary.value.add(
                tag='dmlab30/training_no_cap', simple_value=no_cap)
            summary.value.add(
                tag='dmlab30/training_cap_100', simple_value=cap_100)
            summary_writer.add_summary(summary, num_env_frames_v)

            # Clear level scores.
            level_returns = {level_name: [] for level_name in level_names}

      else:
        # Execute actors (they just need to enqueue their output).
        while True:
          session.run(enqueue_ops)
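
Note: the learner above overlaps data transfer with computation through tf.contrib.staging.StagingArea: stage_op copies the next dequeued batch to the GPU while the current training step runs on the previously staged batch, which is where the "up to 1 step policy lag" mentioned in the comment comes from. A stripped-down sketch of that double-buffering pattern, with random tensors standing in for the real dequeued unrolls and a dummy op standing in for the real update:

import tensorflow as tf

with tf.Graph().as_default():
  # Stand-ins for the flattened tensors from queue.dequeue_many(batch_size).
  features = tf.random_uniform([64, 128])
  labels = tf.random_uniform([64], maxval=10, dtype=tf.int32)

  with tf.device('/gpu'):
    area = tf.contrib.staging.StagingArea(
        [t.dtype for t in (features, labels)],
        [t.shape for t in (features, labels)])
    stage_op = area.put([features, labels])      # Stage the *next* batch.
    staged_features, staged_labels = area.get()  # Batch staged on the previous step.
    train_op = tf.reduce_mean(staged_features)   # Stand-in for the real update.

  config = tf.ConfigProto(allow_soft_placement=True)
  with tf.train.SingularMonitoredSession(config=config) as session:
    session.run(stage_op)  # Prime the area once, as session.run_step_fn does above.
    for _ in range(3):
      # Each step trains on the staged batch and stages the following one.
      session.run([train_op, stage_op])
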
Example #15
def train(action_set, level_names):
    """Train."""

    local_job_device = '/job:%s/task:%d' % (FLAGS.job_name, FLAGS.task)
    shared_job_device = '/job:learner/task:0'
    is_actor_fn = lambda i: FLAGS.job_name == 'actor' and i == FLAGS.task
    is_learner = FLAGS.job_name == 'learner'
    actor_hosts = FLAGS.actor_hosts.split(',')
    num_actors = len(actor_hosts)
    learner_host = FLAGS.learner_host.split(',')
    assert len(learner_host) == 1
    if is_learner:
        assert FLAGS.task == 0
        assert has_horovod
        hvd.init()

    # Placing the variables on the CPU makes them cheaper to send to all the
    # actors. Continually copying the variables from the GPU is slow.
    global_variable_device = '/job:learner/task:0' + '/cpu'
    filters = [shared_job_device, local_job_device]
    cluster = tf.train.ClusterSpec({
        'actor': actor_hosts,
        'learner': learner_host
    })
    config = tf.ConfigProto(allow_soft_placement=True, device_filters=filters)
    if is_learner:
        config.gpu_options.allow_growth = True
        config.gpu_options.visible_device_list = str(hvd.local_rank())
    server = tf.train.Server(cluster,
                             job_name=FLAGS.job_name,
                             task_index=FLAGS.task,
                             config=config)

    # Only used to find the actor output structure.
    Agent = agent_factory(FLAGS.agent_name)
    with tf.Graph().as_default():
        agent = Agent(len(action_set))
        env = create_environment(level_names[0], seed=1)
        structure = build_actor(agent, env, level_names[0], action_set)
        flattened_structure = nest.flatten(structure)
        dtypes = [t.dtype for t in flattened_structure]
        shapes = [t.shape.as_list() for t in flattened_structure]

    # build graph for actor or learner
    with tf.Graph().as_default(), \
         tf.device(local_job_device + '/cpu'), \
         pin_global_variables(global_variable_device):
        tf.set_random_seed(FLAGS.seed)  # Makes initialization deterministic.

        # Create Queue and Agent on the learner.
        with tf.device(shared_job_device):
            queue = tf.FIFOQueue(1, dtypes, shapes, shared_name='buffer')
            agent = Agent(len(action_set))

        # Build actors and ops to enqueue their output.
        enqueue_ops = []
        for i in range(num_actors):
            if is_actor_fn(i):
                level_name = level_names[i % len(level_names)]
                tf.logging.info('Creating actor %d with level %s', i,
                                level_name)
                env = create_environment(level_name, seed=i + 1)
                actor_output = build_actor(agent, env, level_name, action_set)
                with tf.device(shared_job_device):
                    enqueue_ops.append(
                        queue.enqueue(nest.flatten(actor_output)))

        # Build learner.
        if is_learner:
            # Create global step, which is the number of environment frames
            # processed.
            g_step = tf.get_variable('num_environment_frames',
                                     initializer=tf.zeros_initializer(),
                                     shape=[],
                                     dtype=tf.int64,
                                     trainable=False,
                                     collections=[
                                         tf.GraphKeys.GLOBAL_STEP,
                                         tf.GraphKeys.GLOBAL_VARIABLES
                                     ])
            # Create batch (time major) and recreate structure.
            dequeued = queue.dequeue_many(FLAGS.batch_size)
            dequeued = nest.pack_sequence_as(structure, dequeued)

            def make_time_major(s):
                return nest.map_structure(
                    lambda t: tf.transpose(t, [1, 0] + list(
                        range(t.shape.ndims))[2:]), s)

            dequeued = dequeued._replace(
                env_outputs=make_time_major(dequeued.env_outputs),
                agent_outputs=make_time_major(dequeued.agent_outputs))

            with tf.device("/gpu"):
                # Using StagingArea allows us to prepare the next batch and send it to
                # the GPU while we're performing a training step. This adds up to 1
                # step policy lag.
                flattened_output = nest.flatten(dequeued)
                area = tf.contrib.staging.StagingArea(
                    [t.dtype for t in flattened_output],
                    [t.shape for t in flattened_output])
                stage_op = area.put(flattened_output)
                data_from_actors = nest.pack_sequence_as(structure, area.get())
                # Unroll agent on sequence, create losses and update ops.
                if hasattr(data_from_actors, 'agent_state'):
                    agent_state = data_from_actors.agent_state
                else:
                    agent_state = agent.initial_state(1)
                output, optimizer = build_learner(
                    agent,
                    agent_state=agent_state,
                    env_outputs=data_from_actors.env_outputs,
                    agent_outputs=data_from_actors.agent_outputs,
                    g_step=g_step)

        # Create MonitoredSession (to run the graph, checkpoint and log).
        is_chief = is_learner  # MonitoredTrainingSession inits all global variables
        hooks = [py_process.PyProcessHook()]
        if is_learner:
            # for variable initialization across learners
            hooks.append(hvd.BroadcastGlobalVariablesHook(0))
        tf.logging.info('Creating MonitoredSession, is_chief %s', is_chief)
        if is_learner:
            tf.logging.info('At rank %d', hvd.rank())
        # Rank 0 takes care of ckpt saving.
        checkpoint_dir = (FLAGS.logdir
                          if is_learner and hvd.rank() == 0 else None)
        with tf.train.MonitoredTrainingSession(server.target,
                                               is_chief=is_chief,
                                               checkpoint_dir=checkpoint_dir,
                                               save_checkpoint_secs=600,
                                               save_summaries_secs=30,
                                               log_step_count_steps=50000,
                                               config=config,
                                               hooks=hooks) as session:

            if is_learner:
                # tb Logging
                summary_writer = (tf.summary.FileWriterCache.get(FLAGS.logdir)
                                  if hvd.rank() == 0 else None)

                # Prepare data for first run.
                session.run_step_fn(
                    lambda step_context: step_context.session.run(stage_op))

                # Execute learning and track performance.
                num_env_frames_v = 0
                while num_env_frames_v < FLAGS.total_environment_frames:
                    level_names_v, done_v, infos_v, num_env_frames_v, _ = session.run(
                        (data_from_actors.level_name, ) + output +
                        (stage_op, ))
                    level_names_v = np.repeat([level_names_v], done_v.shape[0],
                                              0)

                    for level_name, episode_return, episode_step in zip(
                            level_names_v[done_v],
                            infos_v.episode_return[done_v],
                            infos_v.episode_step[done_v]):
                        episode_frames = episode_step

                        tf.logging.info(
                            'learner rank: %d, Env: %s Episode return: %f',
                            hvd.rank(), level_name, episode_return)

                        if hvd.rank() == 0:  # tb Logging
                            summary = tf.summary.Summary()
                            summary.value.add(tag=level_name +
                                              '/episode_return',
                                              simple_value=episode_return)
                            summary.value.add(tag=level_name +
                                              '/episode_frames',
                                              simple_value=episode_frames)
                            summary_writer.add_summary(summary,
                                                       num_env_frames_v)
            else:
                # Execute actors (they just need to enqueue their output).
                while True:
                    session.run(enqueue_ops)
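
Note: this variant shards the learner across processes with Horovod: every learner calls hvd.init(), pins itself to one GPU via hvd.local_rank(), broadcasts the initial variables from rank 0 with BroadcastGlobalVariablesHook, and only rank 0 writes checkpoints and summaries. A minimal sketch of that wiring for a generic TF 1.x training loop (the model, data and checkpoint path are placeholders; wrapping the optimizer in hvd.DistributedOptimizer is the usual Horovod gradient-averaging step, which this example's build_learner is presumed to handle internally):

import horovod.tensorflow as hvd
import tensorflow as tf

hvd.init()

# Each learner process sees exactly one GPU.
config = tf.ConfigProto(allow_soft_placement=True)
config.gpu_options.allow_growth = True
config.gpu_options.visible_device_list = str(hvd.local_rank())

with tf.Graph().as_default():
  inputs = tf.random_uniform([32, 16])  # Placeholder for real training data.
  loss = tf.reduce_mean(tf.layers.dense(inputs, 1))
  global_step = tf.train.get_or_create_global_step()

  # Average gradients across all learner ranks on every step.
  optimizer = hvd.DistributedOptimizer(tf.train.AdamOptimizer(1e-4))
  train_op = optimizer.minimize(loss, global_step=global_step)

  hooks = [hvd.BroadcastGlobalVariablesHook(0)]  # Sync initial variables.
  # Only rank 0 writes checkpoints, mirroring the code above.
  checkpoint_dir = '/tmp/ckpt' if hvd.rank() == 0 else None

  with tf.train.MonitoredTrainingSession(checkpoint_dir=checkpoint_dir,
                                         hooks=hooks,
                                         config=config) as session:
    for _ in range(10):
      session.run(train_op)
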
Example #16
def train(game_name):
    """Train."""
    action_size, state_size = find_size(game_name)
    if is_single_machine():
        local_job_device = ''
        shared_job_device = ''
        is_actor_fn = lambda i: True
        is_learner = True
        global_variable_device = '/gpu'
        server = tf.train.Server.create_local_server()
        filters = []
    else:
        pass

    # Only used to find the actor output structure.
    with tf.Graph().as_default():
        agent = Agent(action_size)
        env = create_environment(game_name, state_size)
        structure = build_actor(agent, env, game_name, action_size)
        flattened_structure = nest.flatten(structure)
        dtypes = [t.dtype for t in flattened_structure]
        shapes = [t.shape.as_list() for t in flattened_structure]

    with tf.Graph().as_default(), \
         tf.device(local_job_device + '/cpu'), \
         pin_global_variables(global_variable_device):
        tf.set_random_seed(FLAGS.seed)  # Makes initialization deterministic.

        with tf.device(shared_job_device):
            agent = Agent(action_size)

        tf.logging.info('Creating actor with game %s', game_name)
        env = create_environment(game_name, state_size)
        actor_output = build_actor(agent, env, game_name, action_size)
        # Create global step, which is the number of environment frames processed.
        tf.get_variable('num_environment_frames',
                        initializer=tf.zeros_initializer(),
                        shape=[],
                        dtype=tf.int64,
                        trainable=False,
                        collections=[
                            tf.GraphKeys.GLOBAL_STEP,
                            tf.GraphKeys.GLOBAL_VARIABLES
                        ])

        actor_output = nest.map_structure(lambda t: tf.expand_dims(t, 0),
                                          actor_output)

        def make_time_major(s):
            return nest.map_structure(
                lambda t: tf.transpose(t, [1, 0] + list(range(t.shape.ndims))[
                    2:]), s)

        actor_output = actor_output._replace(
            env_outputs=make_time_major(actor_output.env_outputs),
            agent_outputs=make_time_major(actor_output.agent_outputs))

        with tf.device('/gpu'):
            # Using StagingArea allows us to prepare the next batch and send it to
            # the GPU while we're performing a training step. This adds up to 1 step
            # policy lag.
            flattened_output = nest.flatten(actor_output)
            area = tf.contrib.staging.StagingArea(
                [t.dtype for t in flattened_output],
                [t.shape for t in flattened_output])
            stage_op = area.put(flattened_output)

            data_from_actors = nest.pack_sequence_as(structure, area.get())

        output = build_learner(agent, data_from_actors.env_outputs,
                               data_from_actors.agent_outputs)

        # Create MonitoredSession (to run the graph, checkpoint and log).
        tf.logging.info('Creating MonitoredSession, is_chief %s', is_learner)
        config = tf.ConfigProto(allow_soft_placement=True,
                                device_filters=filters)
        with tf.train.MonitoredTrainingSession(
                server.target,
                is_chief=is_learner,
                checkpoint_dir=FLAGS.logdir,
                save_checkpoint_secs=600,
                save_summaries_secs=30,
                log_step_count_steps=50000,
                config=config,
                hooks=[py_process.PyProcessHook()]) as session:

            # Logging.
            level_returns = {game_name: []}
            summary_writer = tf.summary.FileWriterCache.get(FLAGS.logdir)

            # Prepare data for first run.
            session.run_step_fn(
                lambda step_context: step_context.session.run(stage_op))

            # Execute learning and track performance.
            num_env_frames_v = 0
            while num_env_frames_v < FLAGS.total_environment_frames:
                level_names_v, done_v, infos_v, num_env_frames_v, _ = session.run(
                    (actor_output.level_name, ) + output + (stage_op, ))
                level_names_v = np.repeat([level_names_v], done_v.shape[0], 0)
                for level_name, episode_return, episode_step in zip(
                        level_names_v[done_v], infos_v.episode_return[done_v],
                        infos_v.episode_step[done_v]):
                    level_name = level_name.decode()
                    episode_frames = episode_step

                    tf.logging.info('Level: %s Episode return: %f', level_name,
                                    episode_return)
                    #tf.logging.info('Level: %s Episode frames: %f',
                    #                level_name, episode_frames)

                    summary = tf.summary.Summary()
                    summary.value.add(tag=level_name + '/episode_return',
                                      simple_value=episode_return)
                    summary.value.add(tag=level_name + '/episode_frames',
                                      simple_value=episode_frames)
                    summary_writer.add_summary(summary, num_env_frames_v)

                    level_returns[level_name].append(episode_return)
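
Note: make_time_major, shared by several of these train functions, swaps the first two axes of every tensor in a structure so that batch-major data [batch, time, ...] becomes time-major [time, batch, ...] while leaving the remaining axes untouched. A small sketch of what the transpose permutation evaluates to, with an illustrative 4-D tensor:

import tensorflow as tf

with tf.Graph().as_default():
  # Batch-major: [batch=4, time=10, height=8, width=8].
  t = tf.zeros([4, 10, 8, 8])

  # [1, 0] + list(range(t.shape.ndims))[2:] keeps the trailing axes in place.
  perm = [1, 0] + list(range(t.shape.ndims))[2:]
  time_major = tf.transpose(t, perm)

  print(perm)              # [1, 0, 2, 3]
  print(time_major.shape)  # (10, 4, 8, 8)
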
Example #17
def train(act_space):
    local_job_device = '/job:%s/task:%d' % (FLAGS.job_name, FLAGS.task)
    shared_job_device = '/job:learner/task:0'
    is_worker_fn = lambda i: FLAGS.job_name == 'worker' and i == FLAGS.task
    is_learner = FLAGS.job_name == 'learner'

    # Placing the variables on the CPU makes them cheaper to send to all the
    # workers. Continually copying the variables from the GPU is slow.
    global_variable_device = shared_job_device + '/cpu'
    cluster = tf.train.ClusterSpec({
        'worker':
        ['localhost:%d' % (8001 + i) for i in range(FLAGS.num_workers)],
        'learner': ['localhost:8000']
    })
    server = tf.train.Server(cluster,
                             job_name=FLAGS.job_name,
                             task_index=FLAGS.task)
    filters = [shared_job_device, local_job_device]

    # Only used to find the worker output structure.
    with tf.Graph().as_default():
        structure = build_worker(FLAGS.datadir, "*.seg")
        flattened_structure = nest.flatten(structure)
        dtypes = [t.dtype for t in flattened_structure]
        shapes = [t.shape.as_list() for t in flattened_structure]

    with tf.Graph().as_default(), \
         tf.device(local_job_device + '/cpu'), \
         pin_global_variables(global_variable_device):
        tf.set_random_seed(FLAGS.seed)  # Makes initialization deterministic.

        # Create Queue and Agent on the learner.
        with tf.device(shared_job_device):
            queue = tf.FIFOQueue(2 * FLAGS.batch_size,
                                 dtypes,
                                 shapes,
                                 shared_name='buffer')
            model = Model(act_space, FLAGS.frames, FLAGS.vf_clip)

        # Build workers and ops to enqueue their output.
        enqueue_ops = []
        for i in range(FLAGS.num_workers):
            if is_worker_fn(i):
                tf.logging.info('Creating worker %d', i)
                # Zero-pad the worker index to four digits, e.g. 7 -> "0007".
                pattern = "*_%s_*.seg" % ((4 - len(str(i))) * "0" + str(i))
                worker_output = build_worker(FLAGS.datadir, pattern)
                with tf.device(shared_job_device):
                    enqueue_ops.append(
                        queue.enqueue(nest.flatten(worker_output)))

        # Build learner.
        if is_learner:
            # Create global step, which is the number of environment frames processed.
            num_frames = tf.get_variable('num_environment_frames',
                                         initializer=tf.zeros_initializer(),
                                         shape=[],
                                         dtype=tf.int64,
                                         trainable=False,
                                         collections=[
                                             tf.GraphKeys.GLOBAL_STEP,
                                             tf.GraphKeys.GLOBAL_VARIABLES
                                         ])

            # Create batch (time major) and recreate structure.
            dequeued = queue.dequeue_many(FLAGS.batch_size)
            dequeued = nest.pack_sequence_as(structure, dequeued)

            with tf.device('/gpu'):
                # Using StagingArea allows us to prepare the next batch and send it to
                # the GPU while we're performing a training step. This adds up to 1 step
                # policy lag.
                flattened_output = nest.flatten(dequeued)
                area = tf.contrib.staging.StagingArea(
                    [t.dtype for t in flattened_output],
                    [t.shape for t in flattened_output])
                stage_op = area.put(flattened_output)

                data_from_workers = nest.pack_sequence_as(
                    structure, area.get())

                # Unroll agent on sequence, create losses and update ops.
                output = build_learner(data_from_workers, act_space,
                                       num_frames)

        # Create MonitoredSession (to run the graph, checkpoint and log).
        tf.logging.info('Creating MonitoredSession, is_chief %s', is_learner)
        config = tf.ConfigProto(allow_soft_placement=True,
                                device_filters=filters)
        with tf.train.MonitoredTrainingSession(
                server.target,
                is_chief=is_learner,
                checkpoint_dir=FLAGS.logdir,
                save_checkpoint_secs=600,
                save_summaries_secs=30,
                log_step_count_steps=50000,
                config=config,
                hooks=[py_process.PyProcessHook()]) as session:

            if is_learner:
                # Logging.
                # level_returns = {level_name: [] for level_name in level_names}
                # summary_writer = tf.summary.FileWriterCache.get(FLAGS.logdir)

                # Prepare data for first run.
                session.run_step_fn(
                    lambda step_context: step_context.session.run(stage_op))

                # Execute learning and track performance.
                num_env_frames_v = 0
                while num_env_frames_v < FLAGS.total_environment_frames:
                    num_env_frames_v, _ = session.run([output, stage_op])
                    # level_names_v = np.repeat([level_names_v], done_v.shape[0], 0)
                    #
                    # for level_name, episode_return, episode_step in zip(
                    #         level_names_v[done_v],
                    #         infos_v.episode_return[done_v],
                    #         infos_v.episode_step[done_v]):
                    #     episode_frames = episode_step * FLAGS.num_action_repeats
                    #
                    #     tf.logging.info('Level: %s Episode return: %f',
                    #                     level_name, episode_return)
                    #
                    #     summary = tf.summary.Summary()
                    #     summary.value.add(tag=level_name + '/episode_return',
                    #                       simple_value=episode_return)
                    #     summary.value.add(tag=level_name + '/episode_frames',
                    #                       simple_value=episode_frames)
                    #     summary_writer.add_summary(summary, num_env_frames_v)
                    #
                    #     if FLAGS.level_name == 'dmlab30':
                    #         level_returns[level_name].append(episode_return)
                    #
                    # if (FLAGS.level_name == 'dmlab30' and
                    #         min(map(len, level_returns.values())) >= 1):
                    #     no_cap = dmlab30.compute_human_normalized_score(level_returns,
                    #                                                     per_level_cap=None)
                    #     cap_100 = dmlab30.compute_human_normalized_score(level_returns,
                    #                                                      per_level_cap=100)
                    #     summary = tf.summary.Summary()
                    #     summary.value.add(
                    #         tag='dmlab30/training_no_cap', simple_value=no_cap)
                    #     summary.value.add(
                    #         tag='dmlab30/training_cap_100', simple_value=cap_100)
                    #     summary_writer.add_summary(summary, num_env_frames_v)
                    #
                    #     # Clear level scores.
                    #     level_returns = {level_name: [] for level_name in level_names}
            else:
                # Execute workers (they just need to enqueue their output).
                while True:
                    session.run(enqueue_ops)
Example #18
def train():
    """Train."""

    if is_single_machine():
        tf.logging.info("Running on single machine")
        local_job_device = ''
        shared_job_device = ''
        is_actor_fn = lambda i: True
        is_learner = True
        global_variable_device = '/gpu'
        server = tf.train.Server.create_local_server()
        filters = []
    else:
        local_job_device = '/job:%s/task:%d' % (FLAGS.job_name, FLAGS.task)
        shared_job_device = '/job:learner/task:0'
        is_actor_fn = lambda i: FLAGS.job_name == 'actor' and i == FLAGS.task
        is_learner = FLAGS.job_name == 'learner'

        # Placing the variables on the CPU makes them cheaper to send to all the
        # actors. Continually copying the variables from the GPU is slow.
        global_variable_device = shared_job_device + '/cpu'

        # Represents a cluster as a set of
        # "tasks", organized into "jobs".
        # A tf.train.ClusterSpec represents the set of
        # processes that participate in a distributed
        # TensorFlow computation. Every tf.train.Server
        # is constructed in a particular cluster.
        cluster = tf.train.ClusterSpec({
            'actor':
            ['localhost:%d' % (8001 + i) for i in range(FLAGS.num_actors)],
            'learner': ['localhost:8000']
        })

        server = tf.train.Server(cluster,
                                 job_name=FLAGS.job_name,
                                 task_index=FLAGS.task)

        filters = [shared_job_device, local_job_device]

    # Only used to find the actor output structure.
    with tf.Graph().as_default():

        agent = Agent(feature_num=FLAGS.feature_num,
                      asset_num=FLAGS.asset_num,
                      window_size=FLAGS.window_size,
                      commission=FLAGS.commission)

        env = create_environment(DEFAULT_CONFIG)

        structure = build_actor(agent=agent, env=env, FLAGS=FLAGS)

        flattened_structure = nest.flatten(structure)
        dtypes = [t.dtype for t in flattened_structure]
        shapes = [t.shape.as_list() for t in flattened_structure]

    with tf.Graph().as_default(), \
         tf.device(local_job_device + '/cpu'), \
         pin_global_variables(global_variable_device):

        tf.set_random_seed(FLAGS.seed)  # Makes initialization deterministic.

        # BUILD QUEUE
        # ===========================================================================>
        with tf.device(shared_job_device):
            # A queue implementation that dequeues
            # elements in first-in first-out order.
            # Creates a queue that dequeues elements
            # in a first-in first-out order.
            # A FIFOQueue has bounded capacity; supports
            # multiple concurrent producers and consumers;
            # and provides exactly-once delivery.
            # A FIFOQueue holds a list of up to capacity
            # elements. Each element is a fixed-length
            # tuple of tensors whose dtypes are described
            # by dtypes, and whose shapes are optionally
            # described by the shapes argument.
            # If the shapes argument is specified, each
            # component of a queue element must have the
            # respective fixed shape. If it is unspecified,
            # different queue elements may have different
            # shapes, but the use of dequeue_many is disallowed.
            queue = tf.FIFOQueue(capacity=100,
                                 dtypes=dtypes,
                                 shapes=shapes,
                                 shared_name='buffer')

            agent = Agent(feature_num=FLAGS.feature_num,
                          asset_num=FLAGS.asset_num,
                          window_size=FLAGS.window_size,
                          commission=FLAGS.commission)

            if is_single_machine() and 'dynamic_batching' in sys.modules:
                # For single machine training, we use dynamic batching for improved GPU
                # utilization. The semantics of single machine training are slightly
                # different from the distributed setting because within a single unroll
                # of an environment, the actions may be computed using different weights
                # if an update happens within the unroll.
                old_build = agent._build

                @dynamic_batching.batch_fn
                def build(*args):
                    with tf.device('/gpu'):
                        return old_build(*args)

                tf.logging.info('Using dynamic batching.')
                agent._build = build

        # BUILD ACTORS
        # ===========================================================================>

        # Todo make better for real time environment
        # Build actors and ops to enqueue their output.
        enqueue_ops = []
        for i in range(FLAGS.num_actors):  # TODO change to env configurations
            if is_actor_fn(i):

                tf.logging.info('Creating actor with config')

                env = create_environment(DEFAULT_CONFIG)

                actor_output = build_actor(agent=agent, env=env, FLAGS=FLAGS)

                # Append the actor outputs to the
                # FIFOQueue above in order to pass
                # the environment outputs and action
                # outputs processed later
                with tf.device(shared_job_device):
                    enqueue_ops.append(
                        queue.enqueue(nest.flatten(actor_output)))

        # ADD QUEUE RUNNER
        # ===========================================================================>

        # If running in a single machine setup, run actors with QueueRunners
        # (separate threads).
        if is_learner and enqueue_ops:
            # Holds a list of enqueue operations for a queue, each to be run in a thread.
            # Queues are a convenient TensorFlow mechanism to compute tensors asynchronously
            # using multiple threads. For example in the canonical 'Input Reader' setup one
            # set of threads generates filenames in a queue; a second set of threads read
            # records from the files, processes them, and enqueues tensors on a second queue;
            # a third set of threads dequeues these input records to construct batches and
            # runs them through training operations.
            # There are several delicate issues when running multiple threads that way:
            # closing the queues in sequence as the input is exhausted, correctly catching
            # and reporting exceptions, etc.
            # The QueueRunner, combined with the Coordinator, helps handle these issues.
            tf.train.add_queue_runner(tf.train.QueueRunner(queue, enqueue_ops))

        # BUILD LEARNER
        # ===========================================================================>
        if is_learner:
            # Create global step, which is the number
            # of environment frames processed.
            tf.get_variable('num_environment_frames',
                            initializer=tf.zeros_initializer(),
                            shape=[],
                            dtype=tf.int64,
                            trainable=False,
                            collections=[
                                tf.GraphKeys.GLOBAL_STEP,
                                tf.GraphKeys.GLOBAL_VARIABLES
                            ])

            # Create batch (time major) and recreate structure.
            dequeued = queue.dequeue_many(FLAGS.batch_size)
            dequeued = nest.pack_sequence_as(structure, dequeued)

            def make_time_major(s):
                return nest.map_structure(
                    lambda t: tf.transpose(t, [1, 0] + list(
                        range(t.shape.ndims))[2:]), s)

            # Make dequeued time major
            dequeued = dequeued._replace(
                env_outputs=make_time_major(dequeued.env_outputs),
                agent_outputs=make_time_major(dequeued.agent_outputs))

            with tf.device('/gpu'):
                flattened_output = nest.flatten(dequeued)
                # Using StagingArea allows us to prepare the next batch and send it to
                # the GPU while we're performing a training step. This adds up to 1 step
                # policy lag.
                # Class for staging inputs. No ordering guarantees.
                # A StagingArea is a TensorFlow data structure that
                # stores tensors across multiple steps, and exposes
                # operations that can put and get tensors.
                # Each StagingArea element is a tuple of one or more
                # tensors, where each tuple component has a static
                # dtype, and may have a static shape.
                # The capacity of a StagingArea may be bounded or
                # unbounded. It supports multiple concurrent producers
                # and consumers; and provides exactly-once delivery.
                # Each element of a StagingArea is a fixed-length tuple
                # of tensors whose dtypes are described by dtypes, and
                # whose shapes are optionally described by the shapes
                # argument.
                # If the shapes argument is specified, each component
                # of a staging area element must have the respective
                # fixed shape. If it is unspecified, different elements
                # may have different shapes.
                # It can be configured with a capacity in which case
                # put(values) will block until space becomes available.
                area = tf.contrib.staging.StagingArea(
                    [t.dtype for t in flattened_output],
                    [t.shape for t in flattened_output])

                # Operation to add flattened output from
                # dequeued env outputs with their respective
                # agent outputs
                stage_op = area.put(flattened_output)

                # In this instance structure refers to
                # the output from build actor above
                data_from_actors = nest.pack_sequence_as(structure, area.get())

                # Unroll agent on sequence,
                # create losses and update ops.
                output = build_learner(
                    agent=agent,
                    env_outputs=data_from_actors.env_outputs,
                    agent_outputs=data_from_actors.agent_outputs,
                    FLAGS=FLAGS)

        # Create MonitoredSession (to run the graph, checkpoint and log).
        tf.logging.info('Creating MonitoredSession, is_chief %s', is_learner)

        config = tf.ConfigProto(allow_soft_placement=True,
                                device_filters=filters)

        # RUN GRAPH
        # ===========================================================================>

        # Creates a MonitoredSession for training.
        # For a chief, this utility sets proper session
        # initializer/restorer. It also creates hooks
        # related to checkpoint and summary saving.
        # For workers, this utility sets proper session
        # creator which waits for the chief to
        # initialize/restore. Please check
        # tf.train.MonitoredSession for more information.
        with tf.train.MonitoredTrainingSession(
                server.target,
                is_chief=is_learner,
                checkpoint_dir=FLAGS.logdir,
                save_checkpoint_secs=600,
                save_summaries_secs=30,
                log_step_count_steps=50000,
                config=config,
                hooks=[py_process.PyProcessHook()]) as session:

            tf.logging.info('Commencing training run')

            # If the agent is a learner
            if is_learner:
                # Logging.
                summary_writer = tf.summary.FileWriterCache.get(FLAGS.logdir)

                # Prepare data for first run.
                tf.logging.info('Preparing data for first run')

                session.run_step_fn(
                    lambda step_context: step_context.session.run(stage_op))

                # Execute learning and track performance.
                num_env_frames_v = 0

                #
                # =================================================================>
                while num_env_frames_v < FLAGS.total_environment_frames:
                    done_v, infos_v, num_env_frames_v, _ = session.run(
                        output + (stage_op, ))

                    # TODO add logging and metric storage

            else:
                # Execute actors (they just need to enqueue their output).
                tf.logging.info('Running enqueue ops')
                while True:
                    session.run(enqueue_ops)
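
Note: in the single-machine branch used by several of these train functions, the actors are not separate jobs; their enqueue ops run on background threads via a QueueRunner that the MonitoredTrainingSession starts automatically, while the learner dequeues batches of unrolls from the same FIFOQueue. A stripped-down sketch of that producer/consumer pattern, with a random tensor standing in for build_actor's output:

import tensorflow as tf

with tf.Graph().as_default():
  # One slot, as in the training code: producers block until the learner dequeues.
  queue = tf.FIFOQueue(1, [tf.float32], [[16]])

  # Stand-in for an actor unroll; the real code enqueues nest.flatten(actor_output).
  actor_output = tf.random_uniform([16])
  enqueue_ops = [queue.enqueue([actor_output]) for _ in range(4)]

  # Each enqueue op gets its own background thread.
  tf.train.add_queue_runner(tf.train.QueueRunner(queue, enqueue_ops))

  # The learner consumes batches of unrolls.
  batch = queue.dequeue_many(8)  # shape [8, 16]

  with tf.train.MonitoredTrainingSession() as session:
    print(session.run(batch).shape)  # (8, 16)
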
Example #19
def train(action_set, level_names):
    """Train."""
    if is_single_machine():
        local_job_device = ''
        shared_job_device = ''
        is_actor_fn = lambda i: True
        is_learner = True
        global_variable_device = '/gpu'
        server = tf.train.Server.create_local_server()
        filters = []
    else:
        local_job_device = '/job:%s/task:%d' % (FLAGS.job_name, FLAGS.task)
        shared_job_device = '/job:learner/task:0'
        is_actor_fn = lambda i: FLAGS.job_name == 'actor' and i == FLAGS.task
        is_learner = FLAGS.job_name == 'learner'

        # Placing the variables on the CPU makes them cheaper to send to all the
        # actors. Continually copying the variables from the GPU is slow.
        global_variable_device = shared_job_device + '/cpu'
        cluster = tf.train.ClusterSpec({
            'actor':
            ['localhost:%d' % (8001 + i) for i in range(FLAGS.num_actors)],
            'learner': ['localhost:8000']
        })
        server = tf.train.Server(cluster,
                                 job_name=FLAGS.job_name,
                                 task_index=FLAGS.task)
        filters = [shared_job_device, local_job_device]

    # Only used to find the actor output structure.
    Agent = agent_factory(FLAGS.agent_name)
    with tf.Graph().as_default():
        specific_atari_game = level_names[0]
        env = create_atari_environment(specific_atari_game, seed=1)
        agent = Agent(len(action_set))
        structure = build_actor(agent, env, specific_atari_game, action_set)
        flattened_structure = nest.flatten(structure)
        dtypes = [t.dtype for t in flattened_structure]
        shapes = [t.shape.as_list() for t in flattened_structure]

    with tf.Graph().as_default(), \
         tf.device(local_job_device + '/cpu'), \
         pin_global_variables(global_variable_device):
        tf.set_random_seed(FLAGS.seed)  # Makes initialization deterministic.

        # Create Queue and Agent on the learner.
        with tf.device(shared_job_device):
            queue = tf.FIFOQueue(1, dtypes, shapes, shared_name='buffer')
            agent = Agent(len(action_set))

            if is_single_machine() and 'dynamic_batching' in sys.modules:
                # For single machine training, we use dynamic batching for improved GPU
                # utilization. The semantics of single machine training are slightly
                # different from the distributed setting because within a single unroll
                # of an environment, the actions may be computed using different weights
                # if an update happens within the unroll.
                old_build = agent._build

                @dynamic_batching.batch_fn
                def build(*args):

                    with tf.device('/gpu'):
                        return old_build(*args)

                tf.logging.info('Using dynamic batching.')
                agent._build = build

        # Build actors and ops to enqueue their output.
        enqueue_ops = []
        for i in range(FLAGS.num_actors):
            if is_actor_fn(i):
                level_name = level_names[i % len(level_names)]
                tf.logging.info('Creating actor %d with level %s', i,
                                level_name)
                env = create_atari_environment(level_name, seed=i + 1)
                actor_output = build_actor(agent, env, level_name, action_set)
                with tf.device(shared_job_device):
                    enqueue_ops.append(
                        queue.enqueue(nest.flatten(actor_output)))

        # If running in a single machine setup, run actors with QueueRunners
        # (separate threads).
        if is_learner and enqueue_ops:

            tf.train.add_queue_runner(tf.train.QueueRunner(queue, enqueue_ops))

        # Build learner.
        if is_learner:
            # Create global step, which is the number of environment frames processed.
            global_step = tf.get_variable('num_environment_frames',
                                          initializer=tf.zeros_initializer(),
                                          shape=[],
                                          dtype=tf.int64,
                                          trainable=False,
                                          collections=[
                                              tf.GraphKeys.GLOBAL_STEP,
                                              tf.GraphKeys.GLOBAL_VARIABLES
                                          ])

            # Create batch (time major) and recreate structure.
            dequeued = queue.dequeue_many(FLAGS.batch_size)
            dequeued = nest.pack_sequence_as(structure, dequeued)

            def make_time_major(s):
                return nest.map_structure(
                    lambda t: tf.transpose(t, [1, 0] + list(range(t.shape.ndims))[2:]),
                    s)

            dequeued = dequeued._replace(
                env_outputs=make_time_major(dequeued.env_outputs),
                agent_outputs=make_time_major(dequeued.agent_outputs))

            with tf.device('/gpu'):
                # Using StagingArea allows us to prepare the next batch and send it to
                # the GPU while we're performing a training step. This adds up to 1 step
                # policy lag.
                flattened_output = nest.flatten(dequeued)
                area = tf.contrib.staging.StagingArea(
                    [t.dtype for t in flattened_output],
                    [t.shape for t in flattened_output])
                stage_op = area.put(flattened_output)

                # Returns an ActorOutput tuple -> (level_name, agent_state, env_outputs, agent_outputs)
                data_from_actors = nest.pack_sequence_as(structure, area.get())

                # levels_index = tf.map_fn(lambda y: tf.py_function(lambda x: game_id[x.numpy()], [y], Tout=tf.int32), data_from_actors.level_name, dtype=tf.int32, parallel_iterations=56)
                # levels_index = tf.reshape(levels_index, [FLAGS.batch_size])
                levels_index = data_from_actors.level_id
                # Unroll agent on sequence, create losses and update ops.
                output = build_learner(agent,
                                       data_from_actors.env_outputs,
                                       data_from_actors.agent_outputs,
                                       global_step=global_step,
                                       levels_index=levels_index)

        # Create MonitoredSession (to run the graph, checkpoint and log).
        tf.logging.info('Creating MonitoredSession, is_chief %s', is_learner)
        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.3)
        config = tf.ConfigProto(allow_soft_placement=True,
                                device_filters=filters,
                                gpu_options=gpu_options)
        # config.gpu_options.allow_growth = True
        # config.gpu_options.per_process_gpu_memory_fraction = 0.8
        logdir = FLAGS.logdir
        with tf.train.MonitoredTrainingSession(
                server.target,
                is_chief=is_learner,
                checkpoint_dir=logdir,
                save_checkpoint_secs=600,
                save_summaries_secs=30,
                log_step_count_steps=50000,
                config=config,
                hooks=[py_process.PyProcessHook()]) as session:

            if is_learner:
                # Logging.
                level_returns = {level_name: [] for level_name in level_names}
                summary_dir = os.path.join(FLAGS.logdir, "logging")
                summary_writer = tf.summary.FileWriterCache.get(summary_dir)
                # Prepare data for first run.
                session.run_step_fn(
                    lambda step_context: step_context.session.run(stage_op))
                # Execute learning and track performance.
                num_env_frames_v = 0

                # Uncomment these lines to print the number of parameters.
                # print("total params:", np.sum([np.prod(v.get_shape().as_list()) for v in tf.trainable_variables()]))
                # vas = tf.trainable_variables()
                # for elem in vas:
                #   print(elem)
                # print("Params: ", [v.get_shape().as_list() for v in tf.trainable_variables()])

                while num_env_frames_v < FLAGS.total_environment_frames:
                    level_names_v, done_v, infos_v, num_env_frames_v, _ = session.run(
                        (data_from_actors.level_name, ) + output +
                        (stage_op, ))

                    level_names_v = np.repeat([level_names_v], done_v.shape[0],
                                              0)
                    for level_name, episode_return, episode_step, acc_episode_reward, acc_episode_step in zip(
                            level_names_v[done_v],
                            infos_v.episode_return[done_v],
                            infos_v.episode_step[done_v],
                            infos_v.acc_episode_reward[done_v],
                            infos_v.acc_episode_step[done_v]):

                        episode_frames = episode_step * FLAGS.num_action_repeats
                        tf.logging.info(
                            'Level: %s Episode return: %f Acc return %f after %d frames',
                            level_name, episode_return, acc_episode_reward,
                            num_env_frames_v)

                        summary = tf.summary.Summary()
                        summary.value.add(tag=level_name + '/episode_return',
                                          simple_value=episode_return)
                        summary.value.add(tag=level_name + '/episode_frames',
                                          simple_value=episode_frames)
                        summary.value.add(tag=level_name +
                                          '/acc_episode_return',
                                          simple_value=acc_episode_reward)
                        summary.value.add(tag=level_name +
                                          '/acc_episode_frames',
                                          simple_value=acc_episode_step)
                        summary_writer.add_summary(summary, num_env_frames_v)

                        level_returns[level_name].append(episode_return)

                    min_episodes_per_level = min(
                        map(len, level_returns.values()))
                    if FLAGS.multi_task == 1 and min_episodes_per_level >= 1:

                        def sum_none(list_):
                            if list_:
                                return sum(list_)
                            else:
                                return None

                        level_returns = {
                            level_name: sum_none(level_returns[level_name])
                            for level_name in level_names
                        }

                        no_cap = atari_utils.compute_human_normalized_score(
                            level_returns, per_level_cap=None)
                        cap_100 = atari_utils.compute_human_normalized_score(
                            level_returns, per_level_cap=100)

                        summary = tf.summary.Summary()
                        summary.value.add(tag='multi_task/training_no_cap',
                                          simple_value=no_cap)
                        summary.value.add(tag='multi_task/training_cap_100',
                                          simple_value=cap_100)
                        summary_writer.add_summary(summary, num_env_frames_v)

                        level_returns = {
                            level_name: []
                            for level_name in level_names
                        }
            else:
                # Execute actors (they just need to enqueue their output).
                while True:
                    session.run(enqueue_ops)
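The multi-task branch above reports an aggregate score via atari_utils.compute_human_normalized_score, whose implementation is not shown here. The following is a minimal sketch of what such a function plausibly does, assuming per-level human and random reference scores; the table below is purely illustrative, and the real values and averaging live in atari_utils/dmlab30 and may differ.

import numpy as np

# Illustrative reference scores only; the real tables are in atari_utils.
HUMAN_SCORES = {'breakout': 30.5, 'pong': 14.6}
RANDOM_SCORES = {'breakout': 1.7, 'pong': -20.7}

def compute_human_normalized_score(level_returns, per_level_cap=None):
    """Average per-level scores, normalized so random play is 0 and human
    play is 100, optionally capping each level's contribution."""
    scores = []
    for level_name, agent_return in level_returns.items():
        if agent_return is None:  # level has not produced a finished episode yet
            continue
        human_score = HUMAN_SCORES[level_name]
        random_score = RANDOM_SCORES[level_name]
        score = 100.0 * (agent_return - random_score) / (human_score - random_score)
        if per_level_cap is not None:
            score = min(score, per_level_cap)
        scores.append(score)
    return np.mean(scores) if scores else 0.0

Capping each level at 100 keeps a single over-performing game from dominating the aggregate, which is why the loop logs both the capped and uncapped values.
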
Example #20
0
def test(action_set):
  """Test."""
  with tf.Graph().as_default():
    # Get EnvironmentFactory
    env_sampler = env_factory.EnvironmentFactory(
        FLAGS.recipes_path, FLAGS.hints_path, max_steps=FLAGS.max_steps,
        reuse_environments=FLAGS.reuse_environments, seed=1, visualise=True)
    dummy_env = env_sampler.sample_environment()
    obs_spec = dummy_env.obs_specs()
    task_names = sorted(env_sampler.task_names)
    dummy_env.render_matplotlib()

    agent = Agent(len(action_set), obs_spec)
    outputs = {}
    task_returns = collections.defaultdict(list)

    # Test on all environments one after another
    for task_name in task_names:
      env = create_environment(
          env_sampler, initial_task_name=task_name, seed=1)
      outputs[task_name] = build_actor(agent, env, task_name, action_set)

    with tf.train.SingularMonitoredSession(
            checkpoint_dir=FLAGS.logdir,
            hooks=[py_process.PyProcessHook()]) as session:
      for task_name in task_names:
        tf.logging.info('Testing task: %s', task_name)
        returns = task_returns[task_name]
        while len(returns) < FLAGS.test_num_episodes:
          rewards_v, done_v, observations_v = session.run(
              (outputs[task_name].env_outputs.reward,
               outputs[task_name].env_outputs.done,
               outputs[task_name].env_outputs.observation))

          # Repack the environment outputs
          rewards_v = rewards_v[1:]
          done_v = done_v[1:]
          observations_dict = {
            obs_name: observations_v[obs_i][1:]
            for obs_i, obs_name in enumerate(obs_spec.keys())
          }

          # Check the performance
          episode_returns = rewards_v[done_v]
          returns.extend(episode_returns)

          # Visualise the frames, tinting the frame preceding each reward green.
          num_episodes_seen = 0
          for frame_i, frame in enumerate(observations_dict['image'][:30]):
            if rewards_v[frame_i]:
              rewarding_frame = observations_dict['image'][frame_i - 1].copy()
              rewarding_frame[:40] *= np.array([0, 1, 0])
              dummy_env.render_matplotlib(
                  frame=rewarding_frame, delta_time=0.7)
            else:
              if frame_i == 0:
                dummy_env.render_matplotlib(frame=frame, delta_time=1.5)
              else:
                dummy_env.render_matplotlib(frame=frame, delta_time=0.3)
            if done_v[frame_i]:
              num_episodes_seen += 1

            if num_episodes_seen >= FLAGS.test_num_episodes:
              break

        returns_avg = np.mean(returns)
        # Logging
        tf.logging.info('Evaluating task %s -> episode return: %f',
                        task_name, returns_avg)
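Both test loops read episode results straight out of time-major environment outputs: the first timestep is dropped because it carries over from the previous run call, and the done mask selects the steps where an episode actually finished. A small numpy illustration of that slicing (the values are made up):

import numpy as np

episode_return = np.array([0.0, 1.5, 3.0, 0.0, 2.0])  # per-step return so far
done = np.array([False, False, True, False, True])     # True where an episode ended

# Drop the carried-over first step, then keep only the finished episodes.
finished_returns = episode_return[1:][done[1:]]
print(finished_returns)  # [3. 2.]
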
Example #21
0
def train(action_set):
  """Train."""

  if is_single_machine():
    local_job_device = ''
    shared_job_device = ''

    def is_actor_fn(i): return True
    is_learner = True
    global_variable_device = '/gpu'
    server = tf.train.Server.create_local_server()
    filters = []
  else:
    local_job_device = '/job:%s/task:%d' % (FLAGS.job_name, FLAGS.task)
    shared_job_device = '/job:learner/task:0'

    def is_actor_fn(i): return FLAGS.job_name == 'actor' and i == FLAGS.task
    is_learner = FLAGS.job_name == 'learner'

    # Placing the variables on the CPU makes it cheaper to send them to all
    # the actors. Continually copying the variables from the GPU is slow.
    global_variable_device = shared_job_device + '/cpu'
    cluster = tf.train.ClusterSpec({
        'actor': ['localhost:%d' % (8001 + i) for i in range(FLAGS.num_actors)],
        'learner': ['localhost:8000']
    })
    server = tf.train.Server(cluster, job_name=FLAGS.job_name,
                             task_index=FLAGS.task)
    filters = [shared_job_device, local_job_device]

  # Only used to find the actor output structure.
  with tf.Graph().as_default():
    # Here the meta-learning algorithm should propose the task to train on.

    env_sampler = env_factory.EnvironmentFactory(
        FLAGS.recipes_path, FLAGS.hints_path, max_steps=FLAGS.max_steps,
        reuse_environments=FLAGS.reuse_environments, seed=1)
    dummy_env = env_sampler.sample_environment()
    obs_spec = dummy_env.obs_specs()
    env = create_environment(env_sampler, seed=1)

    teacher = Teacher(env_sampler.task_names, gamma=FLAGS.gamma)

    agent = Agent(len(action_set), obs_spec)
    structure = build_actor(agent, env, '', action_set)
    flattened_structure = nest.flatten(structure)
    dtypes = [t.dtype for t in flattened_structure]
    shapes = [t.shape.as_list() for t in flattened_structure]

  with tf.Graph().as_default(), \
          tf.device(local_job_device + '/cpu'), \
          pin_global_variables(global_variable_device):
    tf.set_random_seed(FLAGS.seed)  # Makes initialization deterministic.

    # Create Queue and Agent on the learner.
    with tf.device(shared_job_device):
      queue = tf.FIFOQueue(1, dtypes, shapes, shared_name='buffer')
      agent = Agent(len(action_set), obs_spec)

      # Set up the task-name variables and their assignment ops.
      teacher_task_ph = tf.placeholder(
            dtype=tf.string, shape=(), name='teacher_task_name')
      task_names = env_sampler.task_names
      actor_task_name_params = collections.defaultdict(list)
      for actor_i in range(FLAGS.num_actors):
        if FLAGS.actors_same_task:
          # Initialise all actors to the same task
          initial_task_name = task_names[0]
        else:
          # Assign initial task name by round-robin
          initial_task_name = task_names[actor_i % len(task_names)]
          assert FLAGS.progress_signal == 'random', (
              "Using different tasks per actors with a Teacher hasn't been "
              "tested. Use progress_signal=random.")

        # Set up the variable and assignment op for this actor.
        actor_task_name_var = tf.get_variable(
            "task_name_actor_{}".format(actor_i),
            shape=(),
            dtype=tf.string,
            initializer=tf.constant_initializer(
                initial_task_name, dtype=tf.string),
            trainable=False,
            collections=[tf.GraphKeys.GLOBAL_VARIABLES]
        )
        actor_task_name_ph = tf.placeholder(
            dtype=tf.string, shape=(), name='actor_{}_new_task_name'.format(actor_i))
        assign_actor_task_name = tf.assign(
            actor_task_name_var, actor_task_name_ph,
            name='update_task_name_actor_{}'.format(actor_i))
        actor_task_name_params['task_name'].append(initial_task_name)
        actor_task_name_params['var'].append(actor_task_name_var)
        actor_task_name_params['ph'].append(actor_task_name_ph)
        actor_task_name_params['update'].append(assign_actor_task_name)

      if is_single_machine() and 'dynamic_batching' in sys.modules:
        # For single machine training, we use dynamic batching for improved GPU
        # utilization. The semantics of single machine training are slightly
        # different from the distributed setting because within a single unroll
        # of an environment, the actions may be computed using different weights
        # if an update happens within the unroll.
        old_build = agent._build

        @dynamic_batching.batch_fn
        def build(*args):
          with tf.device('/gpu'):
            return old_build(*args)
        tf.logging.info('Using dynamic batching.')
        agent._build = build

    # Build actors and ops to enqueue their output.
    enqueue_ops = []
    for actor_i in range(FLAGS.num_actors):
      if is_actor_fn(actor_i):
        env = create_environment(env_sampler, seed=actor_i+1)
        tf.logging.info('Creating actor %d with level %s',
                        actor_i, actor_task_name_params['task_name'][actor_i])
        actor_output = build_actor(
            agent, env, actor_task_name_params['var'][actor_i].read_value(), action_set)
        with tf.device(shared_job_device):
          enqueue_ops.append(queue.enqueue(nest.flatten(actor_output)))

    # Build evaluation ops for every task, which will keep computing returns
    # on all tasks.
    evaluation_output = {}
    if is_learner:
      with tf.name_scope("evaluation"):
        for task_name in task_names:
          env = create_environment(
              env_sampler, initial_task_name=task_name, seed=1)
          evaluation_output[task_name] = build_actor(
              agent, env, task_name, action_set)

    # If running in a single machine setup, run actors with QueueRunners
    # (separate threads).
    if is_learner and enqueue_ops:
      tf.train.add_queue_runner(tf.train.QueueRunner(queue, enqueue_ops))

    # Build learner.
    if is_learner:
      # Create global step, which is the number of environment frames processed.
      tf.get_variable(
          'num_environment_frames',
          initializer=tf.zeros_initializer(),
          shape=[],
          dtype=tf.int64,
          trainable=False,
          collections=[tf.GraphKeys.GLOBAL_STEP, tf.GraphKeys.GLOBAL_VARIABLES])

      # Create batch (time major) and recreate structure.
      dequeued = queue.dequeue_many(FLAGS.batch_size)
      dequeued = nest.pack_sequence_as(structure, dequeued)

      def make_time_major(s):
        return nest.map_structure(
            lambda t: tf.transpose(t, [1, 0] + list(range(t.shape.ndims))[2:]), s)

      dequeued = dequeued._replace(
          env_outputs=make_time_major(dequeued.env_outputs),
          agent_outputs=make_time_major(dequeued.agent_outputs))

      with tf.device('/gpu'):
        # Using StagingArea allows us to prepare the next batch and send it to
        # the GPU while we're performing a training step. This adds up to 1 step
        # policy lag.
        flattened_output = nest.flatten(dequeued)
        area = tf.contrib.staging.StagingArea(
            [t.dtype for t in flattened_output],
            [t.shape for t in flattened_output])
        stage_op = area.put(flattened_output)

        data_from_actors = nest.pack_sequence_as(structure, area.get())

        # Unroll agent on sequence, create losses and update ops.
        done, infos, num_env_frames_and_train, progress_signal = (
            build_learner(agent, data_from_actors.agent_state,
                          data_from_actors.env_outputs,
                          data_from_actors.agent_outputs, teacher_task_ph))

    # Create MonitoredSession (to run the graph, checkpoint and log).
    tf.logging.info('Creating MonitoredSession, is_chief %s', is_learner)
    config = tf.ConfigProto(allow_soft_placement=True, device_filters=filters)
    with tf.train.MonitoredTrainingSession(
            server.target,
            is_chief=is_learner,
            checkpoint_dir=FLAGS.logdir,
            save_checkpoint_secs=600,
            save_summaries_secs=30,
            log_step_count_steps=50000,
            config=config,
            hooks=[py_process.PyProcessHook()]) as session:

      if is_learner:
        summary_writer = tf.summary.FileWriterCache.get(FLAGS.logdir)

        # Prepare data for first run.
        session.run_step_fn(
            lambda step_context: step_context.session.run(stage_op))

        # Execute learning and track performance.
        num_env_frames_v = 0
        num_teacher_update = 0
        next_task_switch_at = FLAGS.switch_tasks_every_k_frames
        last_return_tasks = collections.defaultdict(float)
        task_average_returns = collections.defaultdict(float)
        advantage_previous_returns = collections.defaultdict(float)
        progress_since_switch = []
        returns_task_since_switch = collections.defaultdict(list)
        teacher_history = collections.defaultdict(dict)
        evaluation_task_returns = collections.defaultdict(float)
        next_evaluation_at = FLAGS.evaluate_every_k_frames

        teacher_selected_task_name = actor_task_name_params['task_name'][0]

        while num_env_frames_v < FLAGS.total_environment_frames:
          # Perform one training step, on a minibatch.
          (done_v, infos_v, num_env_frames_v, progress_signal_v,
           _) = session.run(
               (done, infos,
                num_env_frames_and_train, progress_signal, stage_op),
               feed_dict={
                   teacher_task_ph: teacher_selected_task_name
               })

          # Per task, let's average metrics in the current minibatch.
          for task_name in task_names:
            # Only keep part of the minibatch for the current task.
            done_task = done_v & (infos_v.task_name == task_name)
            if np.any(done_task):
              # This task was present in this minibatch
              task_episode_return = np.mean(infos_v.episode_return[done_task])
              task_episode_frames = np.mean(
                  infos_v.episode_step[done_task] * FLAGS.num_action_repeats)

              if task_name == teacher_selected_task_name:
                # Keep the progress_signal across training batches.
                # Only do so if the task corresponds to what the Teacher asked.
                # This will discard progress_signal_v for minibatches that have
                # old tasks.
                progress_since_switch.append(progress_signal_v)

              # For every task, keep the last returns.
              last_return_tasks[task_name] = task_episode_return

              # One summary per task in this minibatch.
              summary = tf.summary.Summary()
              summary.value.add(
                  tag=task_name + '/episode_return',
                  simple_value=task_episode_return)
              summary.value.add(
                  tag=task_name + '/episode_frames',
                  simple_value=task_episode_frames)
              summary.value.add(
                  tag=task_name + '/progress',
                  simple_value=progress_signal_v)
              summary.value.add(
                  tag='Teacher/progress_signal_' + FLAGS.progress_signal,
                  simple_value=progress_signal_v)
              summary.value.add(
                  tag='Teacher/task_selected',
                  simple_value=task_names.index(task_name))
              summary_writer.add_summary(summary, num_env_frames_v)

            # Keep track of returns for all tasks through time
            # (defaulting to 0 if the task has never been selected yet).
            # This carries the last score forward even when the task is not being
            # retrained on, which matches what TensorBoard displays, so it's fine.
            returns_task_since_switch[task_name].append(
                last_return_tasks[task_name])

          # Perform a full evaluation on all tasks
          if num_env_frames_v >= next_evaluation_at:
            summary_evaluator = tf.summary.Summary()

            for task_name in task_names:
              returns = []
              while len(returns) < FLAGS.test_num_episodes:
                rewards_v, done_v = session._tf_sess().run(
                    (evaluation_output[task_name].env_outputs.reward,
                     evaluation_output[task_name].env_outputs.done))

                # Repack the environment outputs
                rewards_v = rewards_v[1:]
                done_v = done_v[1:]

                # Check the performance
                episode_returns = rewards_v[done_v]
                returns.extend(episode_returns)

              # Store mean returns per task
              returns_avg = np.mean(returns)
              evaluation_task_returns[task_name] = returns_avg

              # Logging/Tensorboard
              tf.logging.info('[%d] Evaluating task %s -> episode return: %f',
                              num_env_frames_v, task_name, returns_avg)
              summary_evaluator.value.add(
                  tag='Evaluation/' + task_name + '/episode_return',
                  simple_value=returns_avg)

              # Also use these evaluation values to bootstrap the "previous
              # returns" that the advantage progress signal compares against.
              advantage_previous_returns[task_name] = (
                  0.8 * advantage_previous_returns[task_name]
                  + 0.2 * returns_avg)

            summary_writer.add_summary(summary_evaluator, num_env_frames_v)
            next_evaluation_at += FLAGS.evaluate_every_k_frames

          # Now ask the Teacher for new tasks to train on!
          if num_env_frames_v >= next_task_switch_at:
            print("Let's update the tasks for all actors now!")

            # Compute the average return for every task since the last switch.
            task_average_returns = {
                task_name: np.mean(returns_task_since_switch[task_name])
                for task_name in task_names
            }

            # Compute the progress signal for the Teacher
            if FLAGS.progress_signal == 'advantage':
              # For the Advantage (reward[T] - reward[T-K]), we need to compare
              # to "previous" reward values.
              # Previous rewards are either evaluation_task_returns or
              # task_average_returns, whichever is "fresher"
              rewards_post_switch = np.mean(progress_since_switch or 0)
              progress_for_teacher = np.abs(
                  rewards_post_switch -
                  advantage_previous_returns[teacher_selected_task_name])

              # Update last returns
              advantage_previous_returns[teacher_selected_task_name] = (
                  0.9 * advantage_previous_returns[teacher_selected_task_name]
                  + 0.1 * rewards_post_switch)
            else:
              # For the other signals, we can use them directly.
              progress_for_teacher = np.mean(progress_since_switch or 0)

            # Update Teacher according to the progress signal we got!
            if FLAGS.progress_signal != 'random':
              teacher.update(teacher_selected_task_name, progress_for_teacher)

            # Log / Tensorboard
            tf.logging.info("[%d][%d] Task: %s, Episode return mean: %.1f, "
                            "\n\tTeacher progress signal %s: %.3f",
                            num_teacher_update, num_env_frames_v,
                            teacher_selected_task_name,
                            task_average_returns[teacher_selected_task_name],
                            FLAGS.progress_signal,
                            progress_for_teacher)
            summary_teacher = tf.summary.Summary()
            summary_teacher.value.add(
                tag='Teacher/at_update_task_returns',
                simple_value=task_average_returns[teacher_selected_task_name])
            summary_teacher.value.add(
                tag='Teacher/at_update_progress_signal',
                simple_value=progress_for_teacher)
            summary_writer.add_summary(summary_teacher, num_env_frames_v)

            # Keep track of teacher state
            teacher_history['progress_signal'][num_teacher_update] = (
                progress_for_teacher)
            teacher_history['weights'][num_teacher_update] = (
                teacher._log_weights.copy())
            teacher_history['arm_probs'][num_teacher_update] = (
                teacher.task_probabilities.copy())
            teacher_history['teacher_selected_task_name'][num_teacher_update] = (
                teacher_selected_task_name)
            teacher_history['num_env_frames'][num_teacher_update] = (
                num_env_frames_v)
            teacher_history['task_returns'][num_teacher_update] = (
                task_average_returns)
            teacher_history['evaluation_task_returns'][num_teacher_update] = (
                evaluation_task_returns.copy())
            teacher_history['task_names'] = task_names

            # Store teacher history for analysis
            if ((num_teacher_update + 1) %
                FLAGS.save_every_k_teacher_updates == 0):
              np.save(
                  os.path.join(FLAGS.logdir, "teaching_output_{}.npy".format(
                      num_teacher_update)),
                  dict(teacher_history))
              # Reset the teacher history after saving it to disk.
              teacher_history = collections.defaultdict(dict)

            # Get new task from the Teacher and update Actors
            if FLAGS.actors_same_task:
              teacher_selected_task_name = teacher.get_task()
              actor_task_assignments = [teacher_selected_task_name]
              update_all_actors_tasks(
                  actor_task_assignments,
                  actor_task_name_params,
                  session._tf_sess(),
                  single_task=True)
            else:
              actor_task_assignments = np.random.choice(
                  task_names,
                  FLAGS.num_actors,
                  replace=FLAGS.num_actors > len(task_names))
              update_all_actors_tasks(
                  actor_task_assignments,
                  actor_task_name_params,
                  session._tf_sess(),
                  single_task=False)

            # Finish this switch: reset accumulators and schedule the next one.
            progress_since_switch = []
            returns_task_since_switch = collections.defaultdict(list)
            num_teacher_update += 1
            next_task_switch_at += FLAGS.switch_tasks_every_k_frames
            print("Switching to task {}! Next update at {}".format(
                actor_task_assignments, next_task_switch_at))


      else:
        # Execute actors (they just need to enqueue their output).
        while True:
          session.run(enqueue_ops)
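The Teacher used above (get_task, update, _log_weights, task_probabilities) is a multi-armed bandit over tasks driven by the progress signal, but its implementation is not shown. Here is a minimal Exp3-style sketch consistent with that interface; the real class may differ, for example in how it scales the progress signal before the weight update.

import numpy as np

class Exp3Teacher:
    """Exp3-style bandit over task names, updated with a progress signal."""

    def __init__(self, task_names, gamma=0.3):
        self._task_names = list(task_names)
        self._gamma = gamma  # exploration mixing coefficient
        self._log_weights = np.zeros(len(self._task_names))

    @property
    def task_probabilities(self):
        weights = np.exp(self._log_weights - np.max(self._log_weights))
        probs = weights / np.sum(weights)
        # Mix with the uniform distribution so every task keeps being explored.
        return (1 - self._gamma) * probs + self._gamma / len(self._task_names)

    def get_task(self):
        return np.random.choice(self._task_names, p=self.task_probabilities)

    def update(self, task_name, progress):
        # Importance-weighted update for the task that was actually trained on.
        i = self._task_names.index(task_name)
        prob_i = self.task_probabilities[i]
        self._log_weights[i] += self._gamma * progress / (prob_i * len(self._task_names))
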
Example #22
0
def train():
    """Train."""

    if is_single_machine():
        local_job_device = ''
        shared_job_device = ''
        is_actor_fn = lambda i, j: True
        is_learner = True
        global_variable_device = '/gpu'
        server = tf.train.Server.create_local_server()
        filters = []
    else:
        # Toggle to True to derive job_name/task from the MPI rank instead of FLAGS.
        if False:
            from mpi4py import MPI
            comm = MPI.COMM_WORLD
            rank = comm.Get_rank()

            #rank = FLAGS.rank

            if rank == 0:
                job_name = 'learner'
                task = 0
            else:
                job_name = 'actor'
                task = rank - 1
        else:
            job_name = FLAGS.job_name
            task = FLAGS.task

        # Map the flat task index to the (agent_i, agent_j) actor pairing.
        agent1 = task // FLAGS.num_agents
        agent2 = task % FLAGS.num_agents

        local_job_device = '/job:%s/task:%d' % (job_name, task)
        shared_job_device = '/job:learner/task:0'
        is_actor_fn = lambda i, j: job_name == 'actor' and i == agent1 and j == agent2
        is_learner = job_name == 'learner'

        # Placing the variables on the CPU makes it cheaper to send them to all
        # the actors. Continually copying the variables from the GPU is slow.
        global_variable_device = shared_job_device + '/cpu'

        # cluster = tf.train.ClusterSpec({
        #    'actor': ['localhost:%d' % (8001 + i) for i in range(FLAGS.num_agents ** 2)],
        #    'learner': ['localhost:8000']
        # })

        # cluster = tf.train.ClusterSpec({
        #    'actor': ['10.1.2.25:8000', '10.1.2.24:8000', '10.1.2.15:8000'],
        #    'learner': ['10.1.2.22:8000']
        # })

        #cluster = tf.train.ClusterSpec({
        #    'actor': ['10.1.2.25:%d' % (8001 + i) for i in range(FLAGS.num_agents ** 2)] + [
        #        '10.1.2.24:%d' % (8001 + i) for i in range(FLAGS.num_agents ** 2)] + [
        #                 '10.1.2.15:%d' % (8001 + i) for i in range(FLAGS.num_agents ** 2)],
        #    'learner': ['10.1.2.22:8000']
        #})
        '''
        cluster = tf.train.ClusterSpec({
            'actor': ['10.1.2.25:%d' % (8000 + i) for i in range(FLAGS.num_agents ** 2)] +
                     ['10.1.2.24:%d' % (8000 + i) for i in range(FLAGS.num_agents ** 2)],
            'learner': ['10.1.2.22:8000']
        }) '''

        nodefile = FLAGS.logdir + '/nodeslist.txt'
        with open(nodefile, 'r') as f:
            nodes = f.readlines()
        nodes = [x.strip().split('.')[0] for x in nodes]

        #nodes = comm.allgather(MPI.Get_processor_name())
        counts = defaultdict(int)

        # The disabled branch assigns one port per node entry in order; the active
        # branch gives the first node the single learner port and every other node
        # FLAGS.processes actor ports.
        if False:
            processes = []
            for i, node in enumerate(nodes):
                processes.append(node + ':' + str(14000 + counts[node]))
                counts[node] += 1
        else:
            processes = []
            for i, node in enumerate(nodes):
                if i == 0:
                    processes.append(node + ':14000')
                else:
                    for j in range(FLAGS.processes):
                        processes.append(node + ':' + str(14000 + j))

        cluster = tf.train.ClusterSpec({
            'actor': processes[1:],
            'learner': [processes[0]]
        })

        import socket

        print(job_name, task, socket.gethostname())
        print({'actor': processes[1:], 'learner': [processes[0]]})

        sys.stdout.flush()

        server = tf.train.Server(cluster, job_name=job_name, task_index=task)

        print('created server')

        sys.stdout.flush()

        filters = [shared_job_device, local_job_device]

    # Only used to find the actor output structure.
    with tf.Graph().as_default():
        agent = Agent((6, 8, 8))
        env = create_environment({'adversarial': False})
        structure = build_actor(agent, agent, env)
        # Keep only the first agent's part of the actor output (indices 0 and 2).
        structure = [structure[0], structure[2]]
        flattened_structure = nest.flatten(structure)
        dtypes = [t.dtype for t in flattened_structure]
        shapes = [t.shape.as_list() for t in flattened_structure]

    with tf.Graph().as_default(), \
         tf.device(local_job_device + '/cpu'), \
         pin_global_variables(global_variable_device):
        tf.set_random_seed(FLAGS.seed)  # Makes initialization deterministic.

        # Create Queue and Agent on the learner.
        with tf.device(shared_job_device):
            agents = []
            queues = []
            for i in range(FLAGS.num_agents):
                agent = Agent((6, 8, 8))
                queue = tf.FIFOQueue(1,
                                     dtypes,
                                     shapes,
                                     shared_name='buffer_' + str(i))

                if is_single_machine() and 'dynamic_batching' in sys.modules:
                    # For single machine training, we use dynamic batching for improved GPU
                    # utilization. The semantics of single machine training are slightly
                    # different from the distributed setting because within a single unroll
                    # of an environment, the actions may be computed using different weights
                    # if an update happens within the unroll.
                    old_build = agent._build

                    @dynamic_batching.batch_fn
                    def build(*args):
                        with tf.device('/gpu'):
                            return old_build(*args)

                    tf.logging.info('Using dynamic batching.')
                    agent._build = build

                agents.append(agent)
                queues.append(queue)

        # Build actors and ops to enqueue their output.
        enqueue_ops = [[] for i in range(FLAGS.num_agents)]
        for i in range(FLAGS.num_agents):
            for j in range(0, FLAGS.num_agents):
                if is_actor_fn(i, j):
                    tf.logging.info('Creating actor %d %d', i, j)
                    config = {'adversarial': False}
                    #if i >= FLAGS.num_agents - FLAGS.num_adversarial_agents:
                    #    config['adversarial'] = True
                    env = create_environment(config)
                    actor_output = build_actor(agents[i], agents[j], env)
                    actor1_output = [actor_output[0], actor_output[2]]
                    # actor2_output = [actor_output[1], actor_output[3]]

                    with tf.device(shared_job_device):
                        enqueue_ops[i].append(queues[i].enqueue(
                            nest.flatten(actor1_output)))
                        # enqueue_ops[j].append(queues[j].enqueue(nest.flatten(actor2_output)))

        # If running in a single machine setup, run actors with QueueRunners
        # (separate threads).
        if is_single_machine():
            if is_learner and enqueue_ops:
                for i in range(FLAGS.num_agents):
                    tf.train.add_queue_runner(
                        tf.train.QueueRunner(queues[i], enqueue_ops[i]))

        # Build learner.
        if is_learner:
            # Create global step, which is the number of environment frames processed.
            tf.get_variable('num_environment_frames',
                            initializer=tf.zeros_initializer(),
                            shape=[],
                            dtype=tf.int64,
                            trainable=False,
                            collections=[
                                tf.GraphKeys.GLOBAL_STEP,
                                tf.GraphKeys.GLOBAL_VARIABLES
                            ])

            # Create batch (time major) and recreate structure.

            def make_time_major(s):
                return nest.map_structure(
                    lambda t: tf.transpose(t, [1, 0] + list(range(t.shape.ndims))[2:]),
                    s)

            dequeued = []
            for i in range(FLAGS.num_agents):
                dequeue = queues[i].dequeue_many(FLAGS.batch_size)
                dequeue = nest.pack_sequence_as(structure, dequeue)
                dequeue = [
                    d._replace(env_outputs=make_time_major(d.env_outputs),
                               agent_outputs=make_time_major(d.agent_outputs))
                    for d in dequeue
                ]

                dequeued.append(dequeue)

            with tf.device('/gpu'):
                # Using StagingArea allows us to prepare the next batch and send it to
                # the GPU while we're performing a training step. This adds up to 1 step
                # policy lag.

                num_env_frames = tf.train.get_global_step()
                learning_rate = tf.train.polynomial_decay(
                    FLAGS.learning_rate, num_env_frames,
                    FLAGS.total_environment_frames, 0)
                optimizer = tf.train.RMSPropOptimizer(learning_rate,
                                                      FLAGS.decay,
                                                      FLAGS.momentum,
                                                      FLAGS.epsilon)

                stage_ops = []
                outputss = []
                for i in range(FLAGS.num_agents):
                    flattened_output = nest.flatten(dequeued[i])
                    area = tf.contrib.staging.StagingArea(
                        [t.dtype for t in flattened_output],
                        [t.shape for t in flattened_output])
                    stage_op = area.put(flattened_output)

                    stage_ops.append(stage_op)

                    data_from_actorss = nest.pack_sequence_as(
                        structure, area.get())

                    outputs = []
                    for data_from_actors in data_from_actorss:
                        # Unroll agent on sequence, create losses and update ops.
                        print('building learner', i)
                        outputs.append(
                            build_learner(agents[i],
                                          data_from_actors.agent_state,
                                          data_from_actors.env_outputs,
                                          data_from_actors.agent_outputs,
                                          optimizer,
                                          num_env_frames,
                                          learning_rate,
                                          index=i))

                    outputss.append(outputs)

        # Create MonitoredSession (to run the graph, checkpoint and log).
        tf.logging.info('Creating MonitoredSession, is_chief %s', is_learner)
        config = tf.ConfigProto(
            allow_soft_placement=True, device_filters=filters
        )  #, gpu_options=tf.GPUOptions(per_process_gpu_memory_fraction=0.2))

        with tf.train.MonitoredTrainingSession(
                server.target,
                is_chief=is_learner,
                checkpoint_dir=FLAGS.logdir,
                save_checkpoint_secs=120,
                save_summaries_secs=30,
                log_step_count_steps=50000,
                config=config,
                hooks=[py_process.PyProcessHook()]) as session:

            # session = tf_debug.LocalCLIDebugWrapperSession(session)

            if is_learner:
                # Logging.
                summary_writer = tf.summary.FileWriterCache.get(FLAGS.logdir)

                # Prepare data for first run.
                session.run_step_fn(
                    lambda step_context: step_context.session.run(stage_ops))

                # Execute learning and track performance.
                num_env_frames_v = 0
                while num_env_frames_v < FLAGS.total_environment_frames:
                    summary = tf.summary.Summary()
                    # print('outputss, stage_ops')
                    output_valuess, _ = session.run((outputss, stage_ops))
                    for i in range(len(output_valuess)):
                        for j, output in enumerate(output_valuess[i]):
                            done_v, infos_v, num_env_frames_v = output

                            for episode_return, episode_step in zip(
                                    infos_v.episode_return[done_v],
                                    infos_v.episode_step[done_v]):
                                episode_frames = episode_step * FLAGS.num_action_repeats

                                if j == 0:
                                    tf.logging.info('Episode return: %f',
                                                    episode_return)

                                    summary.value.add(
                                        tag='/episode_return_' + str(i) + '_' +
                                        str(j),
                                        simple_value=episode_return)
                                    summary.value.add(
                                        tag='/episode_frames_' + str(i) + '_' +
                                        str(j),
                                        simple_value=episode_frames)

                    summary_writer.add_summary(summary, num_env_frames_v)
            else:
                # Execute actors (they just need to enqueue their output).

                while True:
                    #print('.', end='')
                    session.run(enqueue_ops)
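All of these training setups move structured actor output through a tf.FIFOQueue, which only transports a flat list of tensors, so the structure is flattened with nest.flatten on the actor side and rebuilt with nest.pack_sequence_as on the learner side. Below is a framework-free sketch of that round trip; the ActorOutput fields are illustrative, not the real ones, and the real nest.flatten also recurses into nested structures, which this sketch does not.

import collections
import queue

ActorOutput = collections.namedtuple(
    'ActorOutput', 'level_name agent_state env_outputs agent_outputs')

buffer_queue = queue.Queue(maxsize=1)

def actor_enqueue(output):
    # Analogue of queue.enqueue(nest.flatten(actor_output)) on the actor side.
    buffer_queue.put(list(output))

def learner_dequeue():
    # Analogue of nest.pack_sequence_as(structure, queue.dequeue_many(...)).
    flat = buffer_queue.get()
    return ActorOutput._make(flat)

actor_enqueue(ActorOutput('pong', None, {'reward': 1.0}, {'action': 3}))
print(learner_dequeue().env_outputs)  # {'reward': 1.0}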