Example #1
def main(_):
  while True:
    try:
      # Client to communicate with the learner.
      client = grpc.Client(FLAGS.server_address)

      env = config.create_environment(FLAGS.task)

      # Unique ID to identify a specific run of an actor.
      run_id = np.random.randint(np.iinfo(np.int64).max)
      observation = env.reset()
      reward = 0.0
      raw_reward = 0.0
      done = False

      while True:
        env_output = utils.EnvOutput(reward, done, np.array(observation))
        action = client.inference((FLAGS.task, run_id, env_output, raw_reward))
        observation, reward, done, info = env.step(action.numpy())
        raw_reward = float(info.get('score_reward', reward))

        if done:
          observation = env.reset()
    except (tf.errors.UnavailableError, tf.errors.CancelledError) as e:
      logging.exception(e)
      env.close()
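
The actor loops in these examples build a utils.EnvOutput value and hand it to client.inference. A minimal sketch of what that container is assumed to look like here, based only on how Example #1 constructs it (the real definition lives in the framework's utils module):

import collections

# Assumption for illustration only: EnvOutput bundles one transition exactly as
# Example #1 builds it -- utils.EnvOutput(reward, done, np.array(observation)).
EnvOutput = collections.namedtuple('EnvOutput', ['reward', 'done', 'observation'])

# e.g. the first call of an episode sends zero reward and done=False:
env_output = EnvOutput(reward=0.0, done=False, observation=[0.0, 0.0, 0.0])
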
Example #2
    def testRunLearner(self):
        FLAGS.unroll_length = 6
        FLAGS.batch_size = 2
        logdir = FLAGS.test_tmpdir
        mock_problem = testing_utils.MockProblem(
            unroll_length=FLAGS.unroll_length)
        actor_output_spec = mock_problem.get_actor_output_spec()
        utils.write_specs(logdir, actor_output_spec)

        # Create dummy tensors with the right structure.
        zero_actor_output = tf.nest.map_structure(
            lambda sp: tf.zeros(shape=sp.shape, dtype=sp.dtype),
            actor_output_spec)

        server_address = 'unix:/tmp/learner_test_grpc'
        hparams = {}
        hparams['logdir'] = logdir
        hparams['final_iteration'] = 5
        hparams['iter_frame_ratio'] = FLAGS.batch_size * FLAGS.unroll_length

        # Create a learner in a background thread. (Otherwise this call would
        # block.)
        thread = threading.Thread(target=learner.run_with_address,
                                  args=(mock_problem, server_address, hparams))
        thread.start()

        # Creating a client blocks until the learner responds.
        client = grpc.Client(server_address)

        # Send a number of enqueue RPCs to the learner.
        for _ in range(FLAGS.batch_size * hparams['final_iteration']):
            client.enqueue(tf.nest.flatten(zero_actor_output))  # pytype: disable=attribute-error

        # The learner should terminate after a fixed number of iterations.
        thread.join()
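
The zero_actor_output construction above is a general trick: given a nested structure of TensorSpecs, build matching dummy tensors. A standalone illustration with a made-up spec (not the real actor output spec):

import tensorflow as tf

# Map every TensorSpec in a nest to a zero tensor of the same shape and dtype.
example_spec = {
    'reward': tf.TensorSpec(shape=[6], dtype=tf.float32),
    'action': tf.TensorSpec(shape=[6], dtype=tf.int32),
}
zeros = tf.nest.map_structure(
    lambda sp: tf.zeros(shape=sp.shape, dtype=sp.dtype), example_spec)
# zeros['reward'] is a float32 tensor of shape [6]; zeros['action'] is int32.
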
Example #3
def actor_loop(create_env_fn):
  """Main actor loop.

  Args:
    create_env_fn: Callable (taking the task ID as argument) that must return a
      newly created environment.
  """
  logging.info('Starting actor loop')
  if are_summaries_enabled():
    summary_writer = tf.summary.create_file_writer(
        os.path.join(FLAGS.logdir, 'actor_{}'.format(FLAGS.task)),
        flush_millis=20000, max_queue=1000)
    timer_cls = profiling.ExportingTimer
  else:
    summary_writer = tf.summary.create_noop_writer()
    timer_cls = utils.nullcontext

  actor_step = 0
  with summary_writer.as_default():
    while True:
      try:
        # Client to communicate with the learner.
        client = grpc.Client(FLAGS.server_address)

        env = create_env_fn(FLAGS.task)

        # Unique ID to identify a specific run of an actor.
        run_id = np.random.randint(np.iinfo(np.int64).max)
        observation = env.reset()
        reward = 0.0
        raw_reward = 0.0
        done = False

        while True:
          tf.summary.experimental.set_step(actor_step)
          env_output = utils.EnvOutput(reward, done, observation)
          with timer_cls('actor/elapsed_inference_s', 1000):
            action = client.inference(
                (FLAGS.task, run_id, env_output, raw_reward))
          with timer_cls('actor/elapsed_env_step_s', 1000):
            observation, reward, done, info = env.step(action.numpy())
          raw_reward = float(info.get('score_reward', reward))
          if done:
            with timer_cls('actor/elapsed_env_reset_s', 10):
              observation = env.reset()
          actor_step += 1
      except (tf.errors.UnavailableError, tf.errors.CancelledError) as e:
        logging.exception(e)
        env.close()
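
When summaries are disabled, the loop above swaps the exporting timer for utils.nullcontext. A sketch of the no-op fallback that is assumed here: it must accept the same (name, log_frequency) arguments as profiling.ExportingTimer and then time nothing. The NullTimer name below is a hypothetical stand-in:

class NullTimer(object):
  """No-op context manager with the same call signature as the exporting timer."""

  def __init__(self, *unused_args, **unused_kwargs):
    pass

  def __enter__(self):
    return self

  def __exit__(self, *unused_exc_info):
    return False

# Usage mirrors the instrumented branch:
with NullTimer('actor/elapsed_env_step_s', 1000):
  pass  # the timed call would go here
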
Example #4
    def test_run_eval_aggregator_many_times(self):
        server_address = 'unix:/tmp/eval_aggregator_test_grpc'
        hparams = {}
        hparams['logdir'] = os.path.join(FLAGS.test_tmpdir, 'mode')
        hparams['num_samples'] = 10

        # Create an eval aggregator in a background thread. (Otherwise this call
        # would block.)
        thread = threading.Thread(target=eval_aggregator.run_with_address,
                                  args=(server_address, hparams))
        thread.start()

        # Creating a client blocks until the aggregator responds.
        client = grpc.Client(server_address)

        # Send a number of eval_enqueue RPCs to the aggregator.
        for i in range(hparams['num_samples'] + 1):
            msg = pickle.dumps({common.STEP: i / 2, 'eval/a_number': 1})
            client.eval_enqueue(msg)  # pytype: disable=attribute-error

        # The aggregator should terminate after num_samples RPCs. Wait for it to
        # exit.
        thread.join()
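
The eval_enqueue payload in this test is just a pickled dict of metrics keyed by step. A standalone round-trip, using a literal 'step' key as a hypothetical stand-in for the framework's common.STEP constant:

import pickle

STEP = 'step'  # stand-in key for illustration; the framework defines the real one.

msg = pickle.dumps({STEP: 1.5, 'eval/a_number': 1})
assert pickle.loads(msg) == {STEP: 1.5, 'eval/a_number': 1}
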
Example #5
def run_with_learner(problem_type: framework_problem_type.ProblemType,
                     learner_address: Text, hparams: Dict[Text, Any]):
    """Runs actor with the given learner address and problem type.

  Args:
    problem_type: An instance of `framework_problem_type.ProblemType`.
    learner_address: The network address of a learner exposing two methods:
      `variable_values`: which returns latest value of trainable variables.
      `enqueue`: which accepts nested tensors of type `ActorOutput` tuple.
    hparams: A dict containing hyperparameter settings.
  """
    env = problem_type.get_environment()
    agent = problem_type.get_agent()
    env_output = env.reset()
    initial_agent_state = agent.get_initial_state(utils.add_batch_dim(
        env_output.observation),
                                                  batch_size=1)
    # Agent always expects time,batch dimensions. First add and then remove.
    env_output = utils.add_time_batch_dim(env_output)
    agent_output, _ = agent(env_output, initial_agent_state)
    env_output, agent_output = utils.remove_time_batch_dim(
        env_output, agent_output)
    actor_action = common.ActorAction(
        chosen_action_idx=tf.zeros([], dtype=tf.int32),
        oracle_next_action_idx=tf.zeros([], dtype=tf.int32))
    # Remove batch_dim from returned agent's initial state.
    initial_agent_state = tf.nest.map_structure(lambda t: tf.squeeze(t, 0),
                                                initial_agent_state)

    # Write TensorSpecs the learner can use for initialization.
    logging.info('My task id is %d', FLAGS.task)
    if FLAGS.task == 0:
        _write_tensor_specs(initial_agent_state, env_output, agent_output,
                            actor_action)

    # gRPC Client creation blocks until the server responds to an RPC. Since the
    # server blocks at startup looking for TensorSpecs, and will not respond to
    # gRPC calls until these TensorSpecs are written, client creation must happen
    # after the actor writes TensorSpecs in order to prevent a deadlock.
    logging.info('Connecting to learner: %s', learner_address)
    client = grpc.Client(learner_address)

    iter_steps = 0
    num_steps = 0
    sum_reward = 0.
    # add batch_dim
    agent_state = tf.nest.map_structure(lambda t: tf.expand_dims(t, 0),
                                        initial_agent_state)

    iterations = 0
    while iter_steps < hparams['max_iter'] or hparams['max_iter'] == -1:
        logging.info('Iteration %d of %d', iter_steps + 1, hparams['max_iter'])
        # Get fresh parameters from the trainer.
        var_dtypes = [v.dtype for v in agent.trainable_variables]
        # trainer also adds `iterations` to the list of variables -- which is a
        # counter tracking number of iterations done so far.
        var_dtypes.append(tf.int64)
        new_values = []
        if iter_steps % hparams['sync_agent_every_n_steps'] == 0:
            new_values = client.variable_values()  # pytype: disable=attribute-error
        if new_values:
            logging.debug('Fetched variables from learner.')
            iterations = new_values[-1].numpy()
            updated_agent_vars = new_values[:-1]
            assert len(updated_agent_vars) == len(agent.trainable_variables)
            for x, y in zip(agent.trainable_variables, updated_agent_vars):
                x.assign(y)

        infos = []
        # Unroll agent.
        # Every episode sent by actor includes previous episode's final agent
        # state and output as well as final environment output.
        initial_agent_state = tf.nest.map_structure(lambda t: tf.squeeze(t, 0),
                                                    agent_state)
        env_outputs = [env_output]
        agent_outputs = [agent_output]
        actor_actions = [actor_action]
        loss_type = problem_type.get_episode_loss_type(iterations)

        for i in range(FLAGS.unroll_length):
            logging.debug('Unroll step %d of %d', i + 1, FLAGS.unroll_length)
            # Agent expects time,batch dimensions in `env_output` and batch
            # dimension in `agent_state`. `agent_state` already has batch_dim.
            env_output = utils.add_time_batch_dim(env_output)
            agent_output, agent_state = agent(env_output, agent_state)

            env_output, agent_output = utils.remove_time_batch_dim(
                env_output, agent_output)

            actor_action, action_val = problem_type.select_actor_action(
                env_output, agent_output)

            env_output = env.step(action_val)

            env_outputs.append(env_output)
            agent_outputs.append(agent_output)
            actor_actions.append(actor_action)
            num_steps += 1
            sum_reward += env_output.reward

            if env_output.done:
                infos.append(
                    problem_type.get_actor_info(env_output, sum_reward,
                                                num_steps))
                num_steps = 0
                sum_reward = 0.

        processed_env_output = problem_type.postprocessing(
            utils.stack_nested_tensors(env_outputs))

        actor_output = common.ActorOutput(
            initial_agent_state=initial_agent_state,
            env_output=processed_env_output,
            agent_output=utils.stack_nested_tensors(agent_outputs),
            actor_action=utils.stack_nested_tensors(actor_actions),
            loss_type=tf.convert_to_tensor(loss_type, tf.int32),
            info=pickle.dumps(infos))
        flattened = tf.nest.flatten(actor_output)
        client.enqueue(flattened)  # pytype: disable=attribute-error
        iter_steps += 1
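
Example #5 leans on utils.add_time_batch_dim and utils.remove_time_batch_dim. A minimal sketch of what those helpers are assumed to do here: prepend two size-1 axes for [time, batch] before calling the agent, and strip them again afterwards.

import tensorflow as tf

def add_time_batch_dim(*nests):
  # Prepend [time=1, batch=1] axes to every tensor in each nest.
  expanded = [
      tf.nest.map_structure(
          lambda t: tf.expand_dims(tf.expand_dims(t, 0), 0), n) for n in nests
  ]
  return expanded[0] if len(expanded) == 1 else tuple(expanded)

def remove_time_batch_dim(*nests):
  # Drop the two leading size-1 axes again.
  squeezed = [
      tf.nest.map_structure(lambda t: tf.squeeze(t, axis=[0, 1]), n)
      for n in nests
  ]
  return squeezed[0] if len(squeezed) == 1 else tuple(squeezed)
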
Example #6
def actor_loop(create_env_fn):
    """Main actor loop.

  Args:
    create_env_fn: Callable (taking the task ID as argument) that must return a
      newly created environment.
  """
    logging.info('Starting actor eval loop')

    summary_writer = tf.summary.create_file_writer(os.path.join(
        FLAGS.logdir, 'actor_{}'.format(FLAGS.task)),
                                                   flush_millis=20000,
                                                   max_queue=1000)
    timer_cls = profiling.ExportingTimer

    actor_step = 0
    with summary_writer.as_default():
        while True:
            try:
                # Client to communicate with the learner.
                client = grpc.Client(FLAGS.server_address)

                env = create_env_fn(FLAGS.task, color='black')
                env1 = create_env_fn(FLAGS.task, color='white')

                # Unique ID to identify a specific run of an actor.
                run_id = np.random.randint(np.iinfo(np.int64).max)
                observation = env.reset()
                reward = 0.0
                raw_reward = 0.0
                done = False

                episode_step = 0
                episode_return = 0
                episode_raw_return = 0

                eval_times = 0
                eval_state = 'black'
                print("starting eval: ", eval_state)

                while True:
                    tf.summary.experimental.set_step(actor_step)
                    env_output = utils.EnvOutput(
                        tf.cast(reward, tf.float32), done,
                        tf.cast(observation, tf.float32))
                    with timer_cls('actor/elapsed_inference_s', 1000):
                        action = client.inference_eval(FLAGS.task, run_id,
                                                       env_output, raw_reward)

                    if eval_state == 'black':
                        with timer_cls('actor/elapsed_env_step_s', 1000):
                            observation, reward, done, info = env.step(
                                action.numpy())
                    else:
                        with timer_cls('actor/elapsed_env_step_s', 1000):
                            observation, reward, done, info = env1.step(
                                action.numpy())

                    if is_rendering_enabled():
                        env.render()
                    episode_step += 1
                    episode_return += reward
                    raw_reward = float((info
                                        or {}).get('score_reward', reward))
                    episode_raw_return += raw_reward

                    if done:
                        eval_times += 1
                        if eval_times >= 50:
                            tf.summary.scalar(
                                'actor/eval_return_' + eval_state,
                                episode_return)
                            logging.info(
                                '%s win/all: %d/%d Raw return: %f Steps: %i',
                                eval_state, (episode_return + eval_times) / 2,
                                eval_times, episode_raw_return, episode_step)
                            episode_step = 0
                            episode_return = 0
                            episode_raw_return = 0

                            time.sleep(300)
                            eval_times = 0
                            eval_state = 'white' if eval_state == 'black' else 'black'
                            print("starting eval: ", eval_state)

                        if eval_state == 'black':
                            with timer_cls('actor/elapsed_env_reset_s', 10):
                                observation = env.reset()
                        else:
                            with timer_cls('actor/elapsed_env_reset_s', 10):
                                observation = env1.reset()

                    actor_step += 1
            except (tf.errors.UnavailableError, tf.errors.CancelledError) as e:
                logging.exception(e)
                env.close()
Example #7
def actor_loop(create_env_fn,
               mzconfig,
               share_of_supervised_episodes_fn=lambda _: 0.):
    """Main actor loop.

  Args:
    create_env_fn: Callable (taking the task ID as argument) that must return a
      newly created environment.
    mzconfig: MuZeroConfig instance.
    share_of_supervised_episodes_fn: Function that specifies the share of
      episodes that should be supervised based on the learner iteration.
  """

    logging.info('Starting actor loop')

    actor_log_dir = os.path.join(FLAGS.logdir, 'actor_{}'.format(TASK.value))
    if are_summaries_enabled():
        summary_writer = tf.summary.create_file_writer(actor_log_dir,
                                                       flush_millis=20000,
                                                       max_queue=1000)
        timer_cls = profiling.ExportingTimer
        if FLAG_FILE.value:
            mzutils.write_flags(FLAGS.__flags, FLAG_FILE.value)  # pylint: disable=protected-access
    else:
        summary_writer = tf.summary.create_noop_writer()
        timer_cls = utils.nullcontext

    batch_queue = collections.deque()

    actor_step = tf.Variable(0, dtype=tf.int64)
    num_episodes = tf.Variable(0, dtype=tf.int64)

    # We use the checkpoint to keep track of the actor_step and num_episodes.
    actor_checkpoint = tf.train.Checkpoint(actor_step=actor_step,
                                           num_episodes=num_episodes)
    ckpt_manager = tf.train.CheckpointManager(checkpoint=actor_checkpoint,
                                              directory=actor_log_dir,
                                              max_to_keep=1)
    if ckpt_manager.latest_checkpoint:
        logging.info('Restoring actor checkpoint: %s',
                     ckpt_manager.latest_checkpoint)
        actor_checkpoint.restore(
            ckpt_manager.latest_checkpoint).assert_consumed()

    reward_agg, length_agg = profiling.Aggregator(), profiling.Aggregator()
    with summary_writer.as_default():
        tf.summary.experimental.set_step(actor_step)
        while True:
            try:
                # Client to communicate with the learner.
                client = grpc.Client(FLAGS.server_address)

                def _create_training_samples(episode, start_idx=0):
                    start_idx += random.choice(range(ACTOR_SKIP.value + 1))
                    for i in range(start_idx, len(episode.history),
                                   ACTOR_SKIP.value + 1):
                        target = episode.make_target(
                            state_index=i,
                            num_unroll_steps=mzconfig.num_unroll_steps,
                            td_steps=mzconfig.td_steps,
                            rewards=episode.rewards,
                            policy_distributions=episode.child_visits,
                            discount=episode.discount,
                            value_approximations=episode.root_values)
                        priority = np.float32(
                            1e-2)  # preventing all zero priorities
                        if len(episode) > 0:  # pylint: disable=g-explicit-length-test
                            last_value_idx = min(
                                len(episode) - 1 - i,
                                len(target.value) - 1)
                            priority = np.maximum(
                                priority,
                                np.float32(
                                    np.abs(episode.root_values[
                                        i + last_value_idx] -
                                           target.value[last_value_idx])))

                        # This will be batched and given to add_to_replay_buffer on the
                        # learner.
                        sample = (
                            priority,
                            episode.make_image(i),
                            tf.stack(
                                episode.history_range(
                                    i, i + mzconfig.num_unroll_steps)),
                        ) + tuple(
                            map(lambda x: tf.cast(tf.stack(x), tf.float32),
                                target))
                        batch_queue.append(sample)
                    if ENABLE_ACTOR_LOGGING.value:
                        logging.info(
                            'Added %d samples to the batch_queue. Size: %d of needed %d',
                            len(episode.history) - start_idx, len(batch_queue),
                            mzconfig.train_batch_size)

                def _add_queue_to_replay_buffer():
                    with timer_cls('actor/elapsed_add_to_buffer_s',
                                   10 * ACTOR_LOG_FREQUENCY.value):
                        while len(batch_queue) >= mzconfig.train_batch_size:
                            batch = [
                                batch_queue.popleft()
                                for _ in range(mzconfig.train_batch_size)
                            ]
                            flat_batch = [tf.nest.flatten(b) for b in batch]
                            stacked_batch = list(
                                map(tf.stack, zip(*flat_batch)))
                            structured_batch = tf.nest.pack_sequence_as(
                                batch[0], stacked_batch)
                            client.add_to_replay_buffer(*structured_batch)
                            if ENABLE_ACTOR_LOGGING.value:
                                logging.info(
                                    'Added batch of size %d into replay_buffer.',
                                    len(batch))

                env = create_env_fn(TASK.value, training=is_training_actor())

                def recurrent_inference_fn(*args, **kwargs):
                    with timer_cls('actor/elapsed_recurrent_inference_s',
                                   100 * ACTOR_LOG_FREQUENCY.value):
                        output = client.recurrent_inference(*args, **kwargs)
                        output = tf.nest.map_structure(lambda t: t.numpy(),
                                                       output)
                    return output

                def get_legal_actions_fn(episode):
                    def legal_actions_fn(*args, **kwargs):
                        with timer_cls('actor/elapsed_get_legal_actions_s',
                                       100 * ACTOR_LOG_FREQUENCY.value):
                            output = episode.legal_actions(*args, **kwargs)
                        return output

                    return legal_actions_fn

                while True:
                    episode = mzconfig.new_episode(env)
                    is_supervised_episode = is_training_actor() and \
                        random.random() < share_of_supervised_episodes_fn(
                            client.learning_iteration().numpy())

                    if is_supervised_episode:
                        if ENABLE_ACTOR_LOGGING.value:
                            logging.info('Supervised Episode.')
                        try:
                            with timer_cls(
                                    'actor/elapsed_load_supervised_episode_s',
                                    ACTOR_LOG_FREQUENCY.value):
                                episode_example = env.load_supervised_episode()
                            with timer_cls(
                                    'actor/elapsed_run_supervised_episode_s',
                                    ACTOR_LOG_FREQUENCY.value):
                                targets, samples = env.run_supervised_episode(
                                    episode_example)
                            episode.rewards = samples['reward']
                            episode.history = samples['to_predict']
                            for target in targets:
                                batch_queue.append(target)
                        except core.RLEnvironmentError as e:
                            logging.warning('Environment not ready %s', str(e))
                            # restart episode
                            continue
                        except core.BadSupervisedEpisodeError as e:
                            logging.warning('Abort supervised episode: %s',
                                            str(e))
                            # restart episode
                            continue
                    else:
                        if ENABLE_ACTOR_LOGGING.value:
                            logging.info('RL Episode.')
                        try:
                            last_enqueued_idx = 0
                            legal_actions_fn = get_legal_actions_fn(episode)
                        except core.RLEnvironmentError as e:
                            logging.warning('Environment not ready: %s',
                                            str(e))
                            # restart episode
                            continue
                        except core.SkipEpisode as e:
                            logging.warning('Episode is skipped due to: %s',
                                            str(e))
                            # restart episode
                            continue
                        while (not episode.terminal()
                               and len(episode.history) < mzconfig.max_moves):
                            # This loop is the agent playing the episode.
                            current_observation = episode.make_image(-1)

                            # Map the observation to hidden space.
                            with timer_cls('actor/elapsed_initial_inference_s',
                                           10 * ACTOR_LOG_FREQUENCY.value):
                                initial_inference_output = client.initial_inference(
                                    current_observation)
                                initial_inference_output = tf.nest.map_structure(
                                    lambda t: t.numpy(),
                                    initial_inference_output)

                            # Run MCTS using recurrent_inference_fn.
                            with timer_cls('actor/elapsed_mcts_s',
                                           10 * ACTOR_LOG_FREQUENCY.value):
                                legal_actions = legal_actions_fn()
                                root = core.prepare_root_node(
                                    mzconfig, legal_actions,
                                    initial_inference_output)
                                with timer_cls('actor/elapsed_run_mcts_s',
                                               10 * ACTOR_LOG_FREQUENCY.value):
                                    core.run_mcts(mzconfig, root,
                                                  episode.action_history(),
                                                  legal_actions_fn,
                                                  recurrent_inference_fn,
                                                  episode.visualize_mcts)
                                action = core.select_action(
                                    mzconfig,
                                    len(episode.history),
                                    root,
                                    train_step=actor_step.numpy(),
                                    use_softmax=mzconfig.
                                    use_softmax_for_action_selection,
                                    is_training=is_training_actor())

                            try:
                                # Perform chosen action.
                                with timer_cls('actor/elapsed_env_step_s',
                                               10 * ACTOR_LOG_FREQUENCY.value):
                                    training_steps = client.learning_iteration(
                                    ).numpy()
                                    episode.apply(
                                        action=action,
                                        training_steps=training_steps)
                            except core.RLEnvironmentError as env_error:
                                logging.warning('Environment failed: %s',
                                                str(env_error))
                                episode.failed = True
                                # terminate episode
                                break

                            episode.store_search_statistics(
                                root,
                                use_softmax=(
                                    USE_SOFTMAX_FOR_TARGET.value == 1))
                            actor_step.assign_add(delta=1)
                            if is_training_actor(
                            ) and ACTOR_ENQUEUE_EVERY.value > 0 and (
                                    len(episode.history) - last_enqueued_idx
                            ) >= ACTOR_ENQUEUE_EVERY.value:
                                _create_training_samples(
                                    episode, start_idx=last_enqueued_idx)
                                last_enqueued_idx = len(episode.history)
                                _add_queue_to_replay_buffer()

                        if episode.failed:
                            # restart episode
                            logging.warning('Episode failed, restarting.')
                            continue
                        # Post-episode stats
                        num_episodes.assign_add(delta=1)
                        reward_agg.add(episode.total_reward())
                        length_agg.add(len(episode))
                        if ENABLE_ACTOR_LOGGING.value:
                            logging.info(
                                'Episode done. Length: %d, '
                                'Total Reward: %d, Min Reward: %d, Max Reward: %d',
                                len(episode), episode.total_reward(),
                                min(episode.rewards), max(episode.rewards))
                        if reward_agg.count % ACTOR_LOG_FREQUENCY.value == 0:
                            tf.summary.experimental.set_step(actor_step)
                            tf.summary.scalar('actor/total_reward',
                                              reward_agg.average())
                            tf.summary.scalar('actor/episode_length',
                                              length_agg.average())
                            tf.summary.scalar('actor/num_episodes',
                                              num_episodes)
                            tf.summary.scalar('actor/step', actor_step)
                            tf.summary.scalar(
                                'actor/share_of_supervised_episodes',
                                share_of_supervised_episodes_fn(
                                    client.learning_iteration().numpy()))
                            if episode.mcts_visualizations:
                                tf.summary.text(
                                    'actor/mcts_vis',
                                    '\n\n'.join(episode.mcts_visualizations))
                                if are_summaries_enabled(
                                ) and MCTS_VIS_FILE.value is not None:
                                    # write it also into a txt file
                                    with tf.io.gfile.GFile(
                                            MCTS_VIS_FILE.value, 'a') as f:
                                        f.write('Step {}\n{}\n\n\n\n'.format(
                                            actor_step, '\n\n'.join(
                                                episode.mcts_visualizations)))

                            special_stats = episode.special_statistics()
                            for stat_name, stat_value in special_stats.items():
                                if isinstance(stat_value, float) or isinstance(
                                        stat_value, int):
                                    tf.summary.scalar(
                                        'actor/{}'.format(stat_name),
                                        stat_value)
                                elif isinstance(stat_value, str):
                                    tf.summary.text(
                                        'actor/{}'.format(stat_name),
                                        stat_value)
                                else:
                                    logging.warning(
                                        'Special statistic %s could not be tracked. '
                                        'Type %s is not supported.', stat_name,
                                        type(stat_value))

                            ckpt_manager.save()
                            reward_agg.reset()
                            length_agg.reset()

                        if is_training_actor():
                            # Create samples for training.
                            _create_training_samples(
                                episode, start_idx=last_enqueued_idx)

                    # Send training samples to the learner after the episode is finished
                    if is_training_actor():
                        _add_queue_to_replay_buffer()

                    summary_name = 'train' if is_training_actor() else 'test'
                    if is_supervised_episode:
                        summary_name += ' (supervised)'
                    with timer_cls('actor/elapsed_add_to_reward_s',
                                   10 * ACTOR_LOG_FREQUENCY.value):
                        # This is just for statistics.
                        client.add_to_reward_queue(
                            summary_name, np.float32(episode.total_reward()),
                            np.int64(len(episode)),
                            *episode.special_statistics_learner())
                    del episode

            except (tf.errors.UnavailableError, tf.errors.CancelledError) as e:
                logging.exception(e)
                env.close()
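
The _add_queue_to_replay_buffer helper above uses a common nest-batching trick: flatten each nested sample, stack component-wise across the batch, then restore the original nesting. A standalone illustration with dummy samples (the real ones come from episodes):

import tensorflow as tf

batch = [(tf.constant(1.0), (tf.constant([1, 2]), tf.constant(3.0))),
         (tf.constant(2.0), (tf.constant([4, 5]), tf.constant(6.0)))]
flat_batch = [tf.nest.flatten(b) for b in batch]          # each sample -> flat list
stacked_batch = list(map(tf.stack, zip(*flat_batch)))     # stack leaf-wise
structured_batch = tf.nest.pack_sequence_as(batch[0], stacked_batch)
# structured_batch has the same nesting as one sample, with a leading batch axis of 2.
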
Example #8
def run_with_aggregator(problem_type, aggregator_address: Text, hparams):
    """Run evaluation actor with given problem_type, aggregator and hparams.

  Args:
    problem_type: An instance of `framework_problem_type.ProblemType`.
    aggregator_address: The aggregator address to which we will send data for
      batching.
    hparams: A dict containing hyperparameter settings.
  """
    assert isinstance(problem_type, framework_problem_type.ProblemType)
    env = problem_type.get_environment()
    agent = problem_type.get_agent()
    env_output = env.reset()

    agent_state = agent.get_initial_state(utils.add_batch_dim(
        env_output.observation),
                                          batch_size=1)
    # Agent always expects time,batch dimensions.
    _, _ = agent(utils.add_time_batch_dim(env_output), agent_state)

    logging.info('Connecting to aggregator %s', aggregator_address)
    aggregator = grpc.Client(aggregator_address)

    iter_steps = 0
    latest_checkpoint_path = ''
    while hparams['max_iter'] == -1 or iter_steps < hparams['max_iter']:
        logging.info('Iteration %d of %d', iter_steps + 1, hparams['max_iter'])
        checkpoint_directory = os.path.join(hparams['logdir'], 'model.ckpt')
        checkpoint_path = tf.train.latest_checkpoint(checkpoint_directory)
        if checkpoint_path == latest_checkpoint_path or not checkpoint_path:
            logging.info(
                'Waiting for next checkpoint. Previously evaluated checkpoint %s',
                latest_checkpoint_path)
            time.sleep(30)
            continue

        ckpt = tf.train.Checkpoint(agent=agent)
        ckpt.restore(checkpoint_path)
        latest_checkpoint_path = checkpoint_path
        logging.info('Evaluating latest checkpoint - %s',
                     latest_checkpoint_path)

        step = int(latest_checkpoint_path.split('-')[-1])
        logging.debug('Step %d', step)

        for i in range(hparams['num_episodes_per_iter']):
            logging.debug('Episode number %d of %d', i + 1,
                          hparams['num_episodes_per_iter'])
            action_list = []
            env_output_list = [env_output]
            while True:
                env_output = utils.add_time_batch_dim(env_output)
                agent_output, agent_state = agent(env_output, agent_state)
                env_output, agent_output = utils.remove_time_batch_dim(
                    env_output, agent_output)

                _, action_val = problem_type.select_actor_action(
                    env_output, agent_output)
                env_output = env.step(action_val)

                action_list.append(action_val)
                env_output_list.append(env_output)

                if env_output.done:
                    eval_result = problem_type.eval(action_list,
                                                    env_output_list)
                    # eval_result is a dict.
                    eval_result[common.STEP] = step
                    aggregator.eval_enqueue(pickle.dumps(eval_result))  # pytype: disable=attribute-error
                    break
            iter_steps += 1
Example #9
def actor_loop(create_env_fn, config=None, log_period=1):
  """Main actor loop.

  Args:
    create_env_fn: Callable (taking the task ID as argument) that must return a
      newly created environment.
    config: Configuration of the training.
    log_period: How often to log in seconds.
  """
  if not config:
    config = FLAGS
  env_batch_size = FLAGS.env_batch_size
  logging.info('Starting actor loop. Task: %r. Environment batch size: %r',
               FLAGS.task, env_batch_size)
  is_rendering_enabled = FLAGS.render and FLAGS.task == 0
  if are_summaries_enabled():
    summary_writer = tf.summary.create_file_writer(
        os.path.join(FLAGS.logdir, 'actor_{}'.format(FLAGS.task)),
        flush_millis=20000, max_queue=1000)
    timer_cls = profiling.ExportingTimer
  else:
    summary_writer = tf.summary.create_noop_writer()
    timer_cls = utils.nullcontext

  actor_step = 0
  with summary_writer.as_default():
    while True:
      try:
        # Client to communicate with the learner.
        client = grpc.Client(FLAGS.server_address)
        utils.update_config(config, client)
        batched_env = env_wrappers.BatchedEnvironment(
            create_env_fn, env_batch_size, FLAGS.task * env_batch_size, config)

        env_id = batched_env.env_ids
        run_id = np.random.randint(
            low=0,
            high=np.iinfo(np.int64).max,
            size=env_batch_size,
            dtype=np.int64)
        observation = batched_env.reset()
        reward = np.zeros(env_batch_size, np.float32)
        raw_reward = np.zeros(env_batch_size, np.float32)
        # Use the builtin bool; the np.bool alias was removed in newer NumPy.
        done = np.zeros(env_batch_size, bool)
        abandoned = np.zeros(env_batch_size, bool)

        global_step = 0
        episode_step = np.zeros(env_batch_size, np.int32)
        episode_return = np.zeros(env_batch_size, np.float32)
        episode_raw_return = np.zeros(env_batch_size, np.float32)
        episode_step_sum = 0
        episode_return_sum = 0
        episode_raw_return_sum = 0
        episodes_in_report = 0

        elapsed_inference_s_timer = timer_cls('actor/elapsed_inference_s', 1000)
        last_log_time = timeit.default_timer()
        last_global_step = 0
        while True:
          tf.summary.experimental.set_step(actor_step)
          env_output = utils.EnvOutput(reward, done, observation,
                                       abandoned, episode_step)
          with elapsed_inference_s_timer:
            action = client.inference(env_id, run_id, env_output, raw_reward)
          with timer_cls('actor/elapsed_env_step_s', 1000):
            observation, reward, done, info = batched_env.step(action.numpy())
          if is_rendering_enabled:
            batched_env.render()
          for i in range(env_batch_size):
            episode_step[i] += 1
            episode_return[i] += reward[i]
            raw_reward[i] = float((info[i] or {}).get('score_reward',
                                                      reward[i]))
            episode_raw_return[i] += raw_reward[i]
            # If the info dict contains an entry abandoned=True and the
            # episode was ended (done=True), then we need to specially handle
            # the final transition as per the explanations below.
            abandoned[i] = (info[i] or {}).get('abandoned', False)
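            # The conditional expression asserts the implication: an abandoned
            # step must also be a done step.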
            assert done[i] if abandoned[i] else True
            if done[i]:
              # If the episode was abandoned, we need to report the final
              # transition including the final observation as if the episode has
              # not terminated yet. This way, learning algorithms can use the
              # transition for learning.
              if abandoned[i]:
                # We do not signal yet that the episode was abandoned. This will
                # happen for the transition from the terminal state to the
                # resetted state.
                assert env_batch_size == 1 and i == 0, (
                    'Mixing of batched and non-batched inference calls is not '
                    'yet supported')
                env_output = utils.EnvOutput(reward,
                                             np.array([False]), observation,
                                             np.array([False]), episode_step)
                with elapsed_inference_s_timer:
                  # action is ignored
                  client.inference(env_id, run_id, env_output, raw_reward)
                reward[i] = 0.0
                raw_reward[i] = 0.0

              # Periodically log statistics.
              current_time = timeit.default_timer()
              episode_step_sum += episode_step[i]
              episode_return_sum += episode_return[i]
              episode_raw_return_sum += episode_raw_return[i]
              global_step += episode_step[i]
              episodes_in_report += 1
              if current_time - last_log_time >= log_period:
                logging.info(
                    'Actor steps: %i, Return: %f Raw return: %f '
                    'Episode steps: %f, Speed: %f steps/s', global_step,
                    episode_return_sum / episodes_in_report,
                    episode_raw_return_sum / episodes_in_report,
                    episode_step_sum / episodes_in_report,
                    (global_step - last_global_step) /
                    (current_time - last_log_time))
                last_global_step = global_step
                episode_return_sum = 0
                episode_raw_return_sum = 0
                episode_step_sum = 0
                episodes_in_report = 0
                last_log_time = current_time

              episode_step[i] = 0
              episode_return[i] = 0
              episode_raw_return[i] = 0

          # Finally, we reset the episode which will report the transition
          # from the terminal state to the resetted state in the next loop
          # iteration (with zero rewards).
          with timer_cls('actor/elapsed_env_reset_s', 10):
            observation = batched_env.reset_if_done(done)

          if is_rendering_enabled and done[0]:
            batched_env.render()

          actor_step += 1
      except (tf.errors.UnavailableError, tf.errors.CancelledError):
        logging.info('Inference call failed. This is normal at the end of '
                     'training.')
        batched_env.close()
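
The batched actors (Examples #9 to #11) pass two more fields to utils.EnvOutput than the sketch after Example #1: whether the episode was abandoned and how many steps it has taken so far. Assumed shape of that extended container, for illustration only:

import collections

EnvOutput = collections.namedtuple(
    'EnvOutput', ['reward', 'done', 'observation', 'abandoned', 'episode_step'])
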
Example #10
def actor_loop(create_env_fn):
    """Main actor loop.

  Args:
    create_env_fn: Callable (taking the task ID as argument) that must return a
      newly created environment.
  """

    project = neptune.init('pmtest/marl-vtrace')
    experiment = DummyExperiment()

    if FLAGS.task == 0 and not FLAGS.is_local:
        # First actor logs winning rate.
        while True:
            time.sleep(5)
            experiments = project.get_experiments(tag=FLAGS.nonce)
            if len(experiments) == 0:
                logging.info('Experiment not found, retry...')
            else:
                experiment = experiments[-1]
                break

    log_period = 5
    log_period_growth = 1.05
    log_period_max = 600

    last_replay_time = timeit.default_timer()
    replay_period = 600
    replay_period_growth = 1.2
    replay_period_max = 3600

    env_batch_size = FLAGS.env_batch_size
    logging.info('Starting actor loop. Task: %r. Environment batch size: %r',
                 FLAGS.task, env_batch_size)
    is_rendering_enabled = FLAGS.render and FLAGS.task == 0
    if are_summaries_enabled():
        summary_writer = tf.summary.create_file_writer(os.path.join(
            FLAGS.logdir, 'actor_{}'.format(FLAGS.task)),
                                                       flush_millis=20000,
                                                       max_queue=1000)
        timer_cls = profiling.ExportingTimer
    else:
        summary_writer = tf.summary.create_noop_writer()
        timer_cls = utils.nullcontext

    actor_step = 0
    with summary_writer.as_default():
        while True:
            try:
                # Client to communicate with the learner.
                client = grpc.Client(FLAGS.server_address)

                batched_env = env_wrappers.BatchedEnvironment(
                    create_env_fn, env_batch_size, FLAGS.task * env_batch_size)

                env_id = batched_env.env_ids
                run_id = np.random.randint(low=0,
                                           high=np.iinfo(np.int64).max,
                                           size=env_batch_size,
                                           dtype=np.int64)
                observation = batched_env.reset()
                reward = np.zeros(env_batch_size, np.float32)
                raw_reward = np.zeros(env_batch_size, np.float32)
                done = np.zeros(env_batch_size, bool)
                abandoned = np.zeros(env_batch_size, bool)

                global_step = 0
                episode_step = np.zeros(env_batch_size, np.int32)
                episode_return = np.zeros(env_batch_size, np.float32)
                episode_raw_return = np.zeros(env_batch_size, np.float32)
                episode_step_sum = 0
                episode_return_sum = 0
                episode_raw_return_sum = 0
                episode_won = 0
                episodes_in_report = 0

                elapsed_inference_s_timer = timer_cls(
                    'actor/elapsed_inference_s', 1000)
                last_log_time = timeit.default_timer()
                last_global_step = 0
                while True:
                    tf.summary.experimental.set_step(actor_step)
                    env_output = utils.EnvOutput(reward, done, observation,
                                                 abandoned, episode_step)
                    with elapsed_inference_s_timer:
                        action = client.inference(env_id, run_id, env_output,
                                                  raw_reward)
                    with timer_cls('actor/elapsed_env_step_s', 1000):
                        observation, reward, done, info = batched_env.step(
                            action.numpy())
                    if is_rendering_enabled:
                        batched_env.render()
                    for i in range(env_batch_size):
                        episode_step[i] += 1
                        episode_return[i] += reward[i]
                        raw_reward[i] = float(
                            (info[i] or {}).get('score_reward', reward[i]))
                        episode_raw_return[i] += raw_reward[i]
                        # If the info dict contains an entry abandoned=True and the
                        # episode was ended (done=True), then we need to specially handle
                        # the final transition as per the explanations below.
                        abandoned[i] = (info[i] or {}).get('abandoned', False)
                        assert done[i] if abandoned[i] else True
                        if done[i]:
                            # If the episode was abandoned, we need to report the final
                            # transition including the final observation as if the episode has
                            # not terminated yet. This way, learning algorithms can use the
                            # transition for learning.
                            if abandoned[i]:
                                # We do not signal yet that the episode was abandoned. This will
                                # happen for the transition from the terminal state to the
                                # resetted state.
                                assert env_batch_size == 1 and i == 0, (
                                    'Mixing of batched and non-batched inference calls is not '
                                    'yet supported')
                                env_output = utils.EnvOutput(
                                    reward, np.array([False]), observation,
                                    np.array([False]), episode_step)
                                with elapsed_inference_s_timer:
                                    # action is ignored
                                    client.inference(env_id, run_id,
                                                     env_output, raw_reward)
                                reward[i] = 0.0
                                raw_reward[i] = 0.0

                            # Periodically log statistics.
                            current_time = timeit.default_timer()
                            episode_step_sum += episode_step[i]
                            episode_return_sum += episode_return[i]
                            episode_raw_return_sum += episode_raw_return[i]
                            global_step += episode_step[i]
                            episode_won += (info[i]
                                            or {}).get('battle_won', False)
                            episodes_in_report += 1

                            if FLAGS.task == 0 and \
                                    current_time - last_replay_time > replay_period:
                                replay_period = min(
                                    replay_period_max,
                                    replay_period * replay_period_growth)
                                last_replay_time = current_time
                                batched_env.envs[0].save_replay()

                            if current_time - last_log_time > log_period:
                                log_period = min(
                                    log_period_max,
                                    log_period * log_period_growth)
                                logging.info(
                                    'Actor steps: %i, Return: %f Raw return: %f '
                                    'Episode steps: %f, Speed: %f steps/s, Won: %.2f',
                                    global_step,
                                    episode_return_sum / episodes_in_report,
                                    episode_raw_return_sum /
                                    episodes_in_report,
                                    episode_step_sum / episodes_in_report,
                                    (global_step - last_global_step) /
                                    (current_time - last_log_time),
                                    episode_won / episodes_in_report)
                                tf.summary.scalar('episodes win rate',
                                                  episode_won /
                                                  episodes_in_report,
                                                  step=global_step)
                                if FLAGS.task == 0:
                                    experiment.log_metric(
                                        log_name='episode win rate',
                                        x=global_step,
                                        y=episode_won / episodes_in_report)

                                last_global_step = global_step
                                episode_return_sum = 0
                                episode_raw_return_sum = 0
                                episode_step_sum = 0
                                episode_won = 0
                                episodes_in_report = 0
                                last_log_time = current_time

                            episode_step[i] = 0
                            episode_return[i] = 0
                            episode_raw_return[i] = 0

                    # Finally, we reset the episode which will report the transition
                    # from the terminal state to the resetted state in the next loop
                    # iteration (with zero rewards).
                    with timer_cls('actor/elapsed_env_reset_s', 10):
                        observation = batched_env.reset_if_done(done)

                    if is_rendering_enabled and done[0]:
                        batched_env.render()

                    actor_step += 1
            except (tf.errors.UnavailableError, tf.errors.CancelledError) as e:
                logging.exception(e)
                batched_env.close()
Example #11
def actor_loop(create_env_fn):
    """Main actor loop.

  Args:
    create_env_fn: Callable (taking the task ID as argument) that must return a
      newly created environment.
  """
    logging.info('Starting actor loop')
    if are_summaries_enabled():
        summary_writer = tf.summary.create_file_writer(os.path.join(
            FLAGS.logdir, 'actor_{}'.format(FLAGS.task)),
                                                       flush_millis=20000,
                                                       max_queue=1000)
        timer_cls = profiling.ExportingTimer
    else:
        summary_writer = tf.summary.create_noop_writer()
        timer_cls = utils.nullcontext

    actor_step = 0
    with summary_writer.as_default():
        while True:
            try:
                # Client to communicate with the learner.
                client = grpc.Client(FLAGS.server_address)

                env = create_env_fn(FLAGS.task)

                # Unique ID to identify a specific run of an actor.
                run_id = np.random.randint(np.iinfo(np.int64).max)
                observation = env.reset()
                reward = 0.0
                raw_reward = 0.0
                done = False
                abandoned = False

                global_step = 0
                episode_step = 0
                episode_step_sum = 0
                episode_return_sum = 0
                episode_raw_return_sum = 0
                episodes_in_report = 0

                elapsed_inference_s_timer = timer_cls(
                    'actor/elapsed_inference_s', 1000)
                last_log_time = timeit.default_timer()
                while True:
                    tf.summary.experimental.set_step(actor_step)
                    env_output = utils.EnvOutput(reward, done, observation,
                                                 abandoned, episode_step)
                    with elapsed_inference_s_timer:
                        action = client.inference(FLAGS.task, run_id,
                                                  env_output, raw_reward)
                    with timer_cls('actor/elapsed_env_step_s', 1000):
                        observation, reward, done, info = env.step(
                            action.numpy())
                    if is_rendering_enabled():
                        env.render()
                    episode_step += 1
                    episode_return_sum += reward
                    raw_reward = float((info
                                        or {}).get('score_reward', reward))
                    episode_raw_return_sum += raw_reward
                    # If the info dict contains an entry abandoned=True and the
                    # episode was ended (done=True), then we need to specially handle
                    # the final transition as per the explanations below.
                    abandoned = (info or {}).get('abandoned', False)
                    assert done if abandoned else True
                    if done:
                        # If the episode was abandoned, we need to report the final
                        # transition including the final observation as if the episode has
                        # not terminated yet. This way, learning algorithms can use the
                        # transition for learning.
                        if abandoned:
                            # We do not signal yet that the episode was abandoned. This will
                            # happen for the transition from the terminal state to the
                            # resetted state.
                            env_output = utils.EnvOutput(
                                reward, False, observation, False,
                                episode_step)
                            with elapsed_inference_s_timer:
                                action = client.inference(
                                    FLAGS.task, run_id, env_output, raw_reward)
                            reward = 0.0
                            raw_reward = 0.0

                        # Periodically log statistics.
                        current_time = timeit.default_timer()
                        episode_step_sum += episode_step
                        global_step += episode_step
                        episodes_in_report += 1
                        if current_time - last_log_time > 1:
                            logging.info(
                                'Actor steps: %i, Return: %f Raw return: %f Episode steps: %f',
                                global_step,
                                episode_return_sum / episodes_in_report,
                                episode_raw_return_sum / episodes_in_report,
                                episode_step_sum / episodes_in_report)
                            episode_return_sum = 0
                            episode_raw_return_sum = 0
                            episode_step_sum = 0
                            episodes_in_report = 0
                            last_log_time = current_time

                        # Finally, we reset the episode which will report the transition
                        # from the terminal state to the resetted state in the next loop
                        # iteration (with zero rewards).
                        with timer_cls('actor/elapsed_env_reset_s', 10):
                            observation = env.reset()
                            episode_step = 0
                        if is_rendering_enabled():
                            env.render()
                    actor_step += 1
            except (tf.errors.UnavailableError, tf.errors.CancelledError) as e:
                logging.exception(e)
                env.close()
Example #12
def actor_loop(create_env_fn):
  """Main actor loop.

  Args:
    create_env_fn: Callable (taking the task ID as argument) that must return a
      newly created environment.
  """
  logging.info('Starting actor loop')
  if are_summaries_enabled():
    summary_writer = tf.summary.create_file_writer(
        os.path.join(FLAGS.logdir, 'actor_{}'.format(FLAGS.task)),
        flush_millis=20000, max_queue=1000)
    timer_cls = profiling.ExportingTimer
  else:
    summary_writer = tf.summary.create_noop_writer()
    timer_cls = utils.nullcontext

  actor_step = 0
  with summary_writer.as_default():
    while True:
      try:
        # Client to communicate with the learner.
        client = grpc.Client(FLAGS.server_address)

        env = create_env_fn(FLAGS.task)

        # Unique ID to identify a specific run of an actor.
        run_id = np.random.randint(np.iinfo(np.int64).max)
        run_id1 = np.random.randint(np.iinfo(np.int64).max)
        observation = env.reset()
        reward = 0.0
        raw_reward = 0.0
        done = False

        episode_step = 0
        episode_return = 0

        color_state = 0
        episode_end = False

        while True:
          tf.summary.experimental.set_step(actor_step)

          env_output = utils.EnvOutput(
              tf.cast(reward, tf.float32), done, tf.cast(observation, tf.float32))
          # color_state 0 routes this move through the actor's own slot
          # (FLAGS.task, run_id); color_state 1 routes it through the offset slot
          # (FLAGS.num_actors / 2 + FLAGS.task, run_id1), so the two sides of the
          # self-play game are served separately by the learner.
          if color_state == 0:
            with timer_cls('actor/elapsed_inference_s', 1000):
              action = client.inference(
                  FLAGS.task, run_id, env_output, reward)

            with timer_cls('actor/elapsed_env_step_s', 1000):
              observation, _reward, _done, info = env.step(action.numpy())

          else:
            with timer_cls('actor/elapsed_inference_s', 1000):
              action = client.inference(
                  int(FLAGS.num_actors / 2 + FLAGS.task), run_id1, env_output,
                  reward)
            with timer_cls('actor/elapsed_env_step_s', 1000):
              observation, _reward, _done, info = env.step(action.numpy())

          episode_step += 1
          if _done:
            random_num_ = np.random.random()
            if random_num_ > 0.98:
              if is_rendering_enabled():
                env.render()

            with timer_cls('actor/elapsed_env_reset_s', 10):
              observation = env.reset()

            color_state = 0
          else:
            color_state = 1 - color_state

          if episode_end:
            # This move must belong to the second player (white).
            assert color_state == 1
            if random_num_ > 0.98:
              logging.info('Return: %f Steps: %i', episode_return, episode_step)
            episode_step = 0
            episode_return = 0

            # Hand the terminal signal to the other player with the negated
            # reward, then refresh episode_end from the step just taken.
            done = episode_end
            reward = -reward

            episode_end = _done
          else:
            reward = _reward
            episode_end = _done
            done = episode_end
            episode_return += reward

          actor_step += 1
          
      except (tf.errors.UnavailableError, tf.errors.CancelledError) as e:
        logging.exception(e)
        env.close()