def __init__(
        self,
        env,
        global_step,
        root_dir,
        step_metrics,
        name='Agent',
        is_environment=False,
        use_tf_functions=True,
        max_steps=250,
        replace_reward=True,
        non_negative_regret=False,
        id_num=0,
        block_budget_weight=0.,

        # Architecture hparams
        use_rnn=True,
        learning_rate=1e-4,
        actor_fc_layers=(32, 32),
        value_fc_layers=(32, 32),
        lstm_size=(128, ),
        conv_filters=8,
        conv_kernel=3,
        scalar_fc=5,
        entropy_regularization=0.,
        xy_dim=None,

        # Training & logging settings
        num_epochs=25,
        num_eval_episodes=5,
        num_parallel_envs=5,
        replay_buffer_capacity=1001,
        debug_summaries=True,
        summarize_grads_and_vars=True,
    ):
        """Initializes agent, replay buffer, metrics, and checkpointing.

    Args:
      env: An AdversarialTFPyEnvironment providing both standard and adversary
        specs.
      global_step: A tf variable tracking the global step.
      root_dir: Path to directory where metrics and checkpoints should be saved.
      step_metrics: A list of tf-agents metrics which represent the x-axis
        during training, such as the number of episodes or the number of
        environment steps.
      name: The name of this agent, e.g. 'Adversary'.
      is_environment: If True, will use the adversary specs from the environment
        and construct a network with additional inputs for the adversary.
      use_tf_functions: If True, will use tf.function to wrap the agent's train
        function.
      max_steps: The maximum number of steps the agent is allowed to take in the
        environment during each data collection loop.
      replace_reward: If False, will not modify the reward stored in the agent's
        trajectories. This means the agent will be trained with the default
        environment reward rather than regret.
      non_negative_regret: If True, will ensure that the regret reward cannot
        be below 0.
      id_num: The ID number of this agent within the population of agents of
        the same type, e.g. adversary agent 3.
      block_budget_weight: Weight to place on the adversary's block budget
        reward. Default is 0 for no block budget.
      use_rnn: If True, will use an RNN within the network architecture.
      learning_rate: The learning rate used to initialize the optimizer for this
        agent.
      actor_fc_layers: The number and size of fully connected layers in the
        policy.
      value_fc_layers: The number and size of fully connected layers in the
        critic / value network.
      lstm_size: The number of LSTM cells in the RNN.
      conv_filters: The number of convolution filters.
      conv_kernel: The width of the convolution kernel.
      scalar_fc: The width of the fully-connected layer that takes a scalar
        input.
      entropy_regularization: Entropy regularization coefficient.
      xy_dim: Certain adversaries take in the current (x,y) position as a
        one-hot vector. In this case, the maximum value for x or y is required
        to create the one-hot representation.
      num_epochs: Number of epochs for computing PPO policy updates.
      num_eval_episodes: Number of evaluation episodes per eval step, used as
        batch size to initialize eval metrics.
      num_parallel_envs: Number of parallel environments used in training, used
        as batch size for training metrics and rewards.
      replay_buffer_capacity: Capacity of this agent's replay buffer.
      debug_summaries: Log additional summaries from the PPO agent.
      summarize_grads_and_vars: If True, logs gradient norms and variances in
        PPO agent.
    """
        self.name = name
        self.id = id_num
        self.max_steps = max_steps
        self.is_environment = is_environment
        self.replace_reward = replace_reward
        self.non_negative_regret = non_negative_regret
        self.block_budget_weight = block_budget_weight

        with tf.name_scope(self.name):
            self.optimizer = tf.compat.v1.train.AdamOptimizer(
                learning_rate=learning_rate)

            logging.info('\tCalculating specs and building networks...')
            if is_environment:
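                # Environment-generating adversary: its networks are built from
                # the adversary specs, with a random noise vector (random_z) and
                # the scalar 'time_step' observation as additional inputs.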
                self.time_step_spec = env.adversary_time_step_spec
                self.action_spec = env.adversary_action_spec
                self.observation_spec = env.adversary_observation_spec

                (self.actor_net, self.value_net
                 ) = multigrid_networks.construct_multigrid_networks(
                     self.observation_spec,
                     self.action_spec,
                     use_rnns=use_rnn,
                     actor_fc_layers=actor_fc_layers,
                     value_fc_layers=value_fc_layers,
                     lstm_size=lstm_size,
                     conv_filters=conv_filters,
                     conv_kernel=conv_kernel,
                     scalar_fc=scalar_fc,
                     scalar_name='time_step',
                     scalar_dim=self.observation_spec['time_step'].maximum + 1,
                     random_z=True,
                     xy_dim=xy_dim)
            else:
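                # Protagonist / antagonist agent: its networks are built from
                # the standard environment specs.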
                self.time_step_spec = env.time_step_spec()
                self.action_spec = env.action_spec()
                self.observation_spec = env.observation_spec()

                (self.actor_net, self.value_net
                 ) = multigrid_networks.construct_multigrid_networks(
                     self.observation_spec,
                     self.action_spec,
                     use_rnns=use_rnn,
                     actor_fc_layers=actor_fc_layers,
                     value_fc_layers=value_fc_layers,
                     lstm_size=lstm_size,
                     conv_filters=conv_filters,
                     conv_kernel=conv_kernel,
                     scalar_fc=scalar_fc)

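            # All agent types are trained with clipped PPO (ratio clip of 0.2),
            # using GAE and no observation or reward normalization.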
            self.tf_agent = ppo_clip_agent.PPOClipAgent(
                self.time_step_spec,
                self.action_spec,
                self.optimizer,
                actor_net=self.actor_net,
                value_net=self.value_net,
                entropy_regularization=entropy_regularization,
                importance_ratio_clipping=0.2,
                normalize_observations=False,
                normalize_rewards=False,
                use_gae=True,
                num_epochs=num_epochs,
                debug_summaries=debug_summaries,
                summarize_grads_and_vars=summarize_grads_and_vars,
                train_step_counter=global_step)
            self.tf_agent.initialize()
            self.eval_policy = self.tf_agent.policy
            self.collect_policy = self.tf_agent.collect_policy

            logging.info('\tAllocating replay buffer ...')
            self.replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
                self.tf_agent.collect_data_spec,
                batch_size=num_parallel_envs,
                max_length=replay_buffer_capacity)
            logging.info('\t\tRB capacity: %i', self.replay_buffer.capacity)
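            # Per-parallel-environment buffers, initialized to zero and updated
            # later with this agent's final episode reward and the opposing
            # agent's best return (used for the regret-based reward).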
            self.final_reward = tf.zeros(shape=(num_parallel_envs,),
                                         dtype=tf.float32)
            self.enemy_max = tf.zeros(shape=(num_parallel_envs,),
                                      dtype=tf.float32)

            # Creates train metrics
            self.step_metrics = step_metrics
            self.train_metrics = step_metrics + [
                tf_metrics.AverageEpisodeLengthMetric(
                    batch_size=num_parallel_envs,
                    name=name + '_AverageEpisodeLength')
            ]
            self.eval_metrics = [
                tf_metrics.AverageEpisodeLengthMetric(
                    batch_size=num_eval_episodes,
                    name=name + '_AverageEpisodeLength')
            ]
            if is_environment:
                self.env_train_metric = adversarial_eval.AdversarialEnvironmentScalar(
                    batch_size=num_parallel_envs,
                    name=name + '_AdversaryReward')
                self.env_eval_metric = adversarial_eval.AdversarialEnvironmentScalar(
                    batch_size=num_eval_episodes,
                    name=name + '_AdversaryReward')
            else:
                self.train_metrics.append(
                    tf_metrics.AverageReturnMetric(
                        batch_size=num_parallel_envs,
                        name=name + '_AverageReturn'))
                self.eval_metrics.append(
                    tf_metrics.AverageReturnMetric(
                        batch_size=num_eval_episodes,
                        name=name + '_AverageReturn'))

            self.metrics_group = metric_utils.MetricsGroup(
                self.train_metrics, name + '_train_metrics')
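            # Observers invoked on each collected trajectory batch: update the
            # train metrics and write the batch into the replay buffer.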
            self.observers = self.train_metrics + [
                self.replay_buffer.add_batch
            ]

            self.train_dir = os.path.join(root_dir, 'train', name, str(id_num))
            self.eval_dir = os.path.join(root_dir, 'eval', name, str(id_num))
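            # Three forms of persistence: a full train checkpoint (agent, global
            # step, and metrics), a policy-only checkpoint, and a SavedModel
            # export of the eval policy.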
            self.train_checkpointer = common.Checkpointer(
                ckpt_dir=self.train_dir,
                agent=self.tf_agent,
                global_step=global_step,
                metrics=self.metrics_group,
            )
            self.policy_checkpointer = common.Checkpointer(
                ckpt_dir=os.path.join(self.train_dir, 'policy'),
                policy=self.eval_policy,
                global_step=global_step)
            self.saved_model = policy_saver.PolicySaver(self.eval_policy,
                                                        train_step=global_step)
            self.saved_model_dir = os.path.join(root_dir, 'policy_saved_model',
                                                name, str(id_num))

            self.train_checkpointer.initialize_or_restore()

            if use_tf_functions:
                self.tf_agent.train = common.function(self.tf_agent.train,
                                                      autograph=False)

            self.total_loss = None
            self.extra_loss = None
            self.loss_divergence_counter = 0


# Example 2


def train_eval(
        root_dir,
        env_name='MultiGrid-Adversarial-v0',
        random_seed=None,
        # PAIRED parameters
        agents_learn_with_regret=True,
        non_negative_regret=True,
        unconstrained_adversary=False,
        domain_randomization=False,
        percent_random_episodes=0.,
        protagonist_episode_length=None,
        flexible_protagonist=False,
        adversary_population_size=1,
        protagonist_population_size=1,
        antagonist_population_size=1,
        combined_population=False,
        block_budget_weight=0,
        # Agent architecture params
        actor_fc_layers=(32, 32),
        value_fc_layers=(32, 32),
        lstm_size=(128, ),
        conv_filters=8,
        conv_kernel=3,
        direction_fc=5,
        entropy_regularization=0.,
        # Adversary architecture params
        adversary_env_rnn=True,
        adv_actor_fc_layers=(32, 32),
        adv_value_fc_layers=(32, 32),
        adv_lstm_size=(128, ),
        adv_conv_filters=16,
        adv_conv_kernel=3,
        adv_timestep_fc=10,
        adv_entropy_regularization=0.,
        # Params for collect
        num_train_steps=500000,
        collect_episodes_per_iteration=30,
        num_parallel_envs=5,
        replay_buffer_capacity=1001,  # Per-environment
        # Params for train
        num_epochs=25,
        learning_rate=1e-4,
        # Params for eval
        num_eval_episodes=5,
        eval_interval=10,
        # Params for summaries and logging
        train_checkpoint_interval=100,
        policy_checkpoint_interval=100,
        log_interval=5,
        summary_interval=5,
        summaries_flush_secs=1,
        use_tf_functions=True,
        debug_summaries=True,
        summarize_grads_and_vars=True,
        eval_metrics_callback=None,
        debug=True):
    """Adversarial environment train and eval."""
    tf.compat.v1.enable_v2_behavior()

    if debug:
        logging.info('In debug mode. Disabling tf functions.')
        use_tf_functions = False

    if combined_population:
        # The number of train steps per environment episode depends on how many
        # agents are trained per episode. With a combined population, the number
        # of agents trained per episode changes from 3 (protagonist, antagonist,
        # adversary) to protagonist population size + 1 adversary, so the step
        # budget is scaled accordingly.
        num_train_steps = num_train_steps / 3 * (protagonist_population_size +
                                                 1)
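        # For example, with the default num_train_steps=500000 and
        # protagonist_population_size=5, the budget becomes
        # 500000 / 3 * 6 = 1000000 train steps.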

    if root_dir is None:
        raise AttributeError('train_eval requires a root_dir.')

    gym_env = adversarial_env.load(env_name)

    # Set up logging
    root_dir = os.path.expanduser(root_dir)
    train_dir = os.path.join(root_dir, 'train')
    eval_dir = os.path.join(root_dir, 'eval')

    train_summary_writer = tf.compat.v2.summary.create_file_writer(
        train_dir, flush_millis=summaries_flush_secs * 1000)
    train_summary_writer.set_as_default()

    eval_summary_writer = tf.compat.v2.summary.create_file_writer(
        eval_dir, flush_millis=summaries_flush_secs * 1000)

    # Initialize global step and random seed
    global_step = tf.compat.v1.train.get_or_create_global_step()
    with tf.compat.v2.summary.record_if(
            lambda: tf.math.equal(global_step % summary_interval, 0)):
        if random_seed is not None:
            tf.compat.v1.set_random_seed(random_seed)

        # Create environments
        logging.info('Creating %d environments...', num_parallel_envs)
        eval_tf_env = adversarial_env.AdversarialTFPyEnvironment(
            adversarial_env_parallel.AdversarialParallelPyEnvironment(
                [lambda: adversarial_env.load(env_name)] * num_eval_episodes))
        tf_env = adversarial_env.AdversarialTFPyEnvironment(
            adversarial_env_parallel.AdversarialParallelPyEnvironment(
                [lambda: adversarial_env.load(env_name)] * num_parallel_envs))
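        # Each TF environment wraps a batch of gym environments stepped in
        # parallel: batch size num_eval_episodes for eval and num_parallel_envs
        # for training.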

        logging.info('Preparing to train...')
        environment_steps_metric = tf_metrics.EnvironmentSteps()
        step_metrics = [
            tf_metrics.NumberOfEpisodes(),
            environment_steps_metric,
        ]

        # Logging for special environment metrics
        env_metrics_names = [
            'DistanceToGoal',
            'NumBlocks',
            'DeliberatePlacement',
            'NumEnvEpisodes',
            'GoalX',
            'GoalY',
            'IsPassable',
            'ShortestPathLength',
            'ShortestPassablePathLength',
            'SolvedPathLength',
            'TrainEpisodesCollected',
        ]
        env_train_metrics = []
        env_eval_metrics = []
        for mname in env_metrics_names:
            env_train_metrics.append(
                adversarial_eval.AdversarialEnvironmentScalar(
                    batch_size=num_parallel_envs, name=mname))
            env_eval_metrics.append(
                adversarial_eval.AdversarialEnvironmentScalar(
                    batch_size=num_eval_episodes, name=mname))

        # Create (populations of) both agents that learn to navigate the environment
        agents = {}
        for agent_name in ['agent', 'adversary_agent']:
            if (agent_name == 'adversary_agent'
                    and (domain_randomization or unconstrained_adversary
                         or combined_population)):
                # Antagonist agent not needed for baselines
                continue

            max_steps = gym_env.max_steps
            if protagonist_episode_length is not None and agent_name == 'agent':
                max_steps = protagonist_episode_length

            if agent_name == 'agent':
                population_size = protagonist_population_size
            else:
                population_size = antagonist_population_size

            agents[agent_name] = []
            for i in range(population_size):
                logging.info('Creating agent... %s %d', agent_name, i)
                agents[agent_name].append(
                    agent_train_package.AgentTrainPackage(
                        tf_env,
                        global_step,
                        root_dir,
                        step_metrics,
                        name=agent_name,
                        use_tf_functions=use_tf_functions,
                        max_steps=max_steps,
                        replace_reward=(not unconstrained_adversary
                                        and agents_learn_with_regret),
                        id_num=i,

                        # Architecture hparams
                        learning_rate=learning_rate,
                        actor_fc_layers=actor_fc_layers,
                        value_fc_layers=value_fc_layers,
                        lstm_size=lstm_size,
                        conv_filters=conv_filters,
                        conv_kernel=conv_kernel,
                        scalar_fc=direction_fc,
                        entropy_regularization=entropy_regularization,

                        # Training & logging settings
                        num_epochs=num_epochs,
                        num_eval_episodes=num_eval_episodes,
                        num_parallel_envs=num_parallel_envs,
                        replay_buffer_capacity=replay_buffer_capacity,
                        debug_summaries=debug_summaries,
                        summarize_grads_and_vars=summarize_grads_and_vars))

        if not domain_randomization:
            xy_dim = None
            if 'Reparam' in env_name:
                xy_dim = gym_env.width

            # Create (population of) environment-generating adversaries
            agents['adversary_env'] = []
            for i in range(adversary_population_size):
                logging.info('Creating adversary environment %d', i)
                agents['adversary_env'].append(
                    agent_train_package.AgentTrainPackage(
                        tf_env,
                        global_step,
                        root_dir,
                        step_metrics,
                        name='adversary_env',
                        is_environment=True,
                        use_rnn=adversary_env_rnn,
                        use_tf_functions=use_tf_functions,
                        max_steps=gym_env.adversary_max_steps,
                        replace_reward=True,
                        non_negative_regret=non_negative_regret,
                        xy_dim=xy_dim,
                        id_num=i,
                        block_budget_weight=block_budget_weight,

                        # Architecture hparams
                        learning_rate=learning_rate,
                        actor_fc_layers=adv_actor_fc_layers,
                        value_fc_layers=adv_value_fc_layers,
                        lstm_size=adv_lstm_size,
                        conv_filters=adv_conv_filters,
                        conv_kernel=adv_conv_kernel,
                        scalar_fc=adv_timestep_fc,
                        entropy_regularization=adv_entropy_regularization,

                        # Training & logging settings
                        num_epochs=num_epochs,
                        num_eval_episodes=num_eval_episodes,
                        num_parallel_envs=num_parallel_envs,
                        replay_buffer_capacity=replay_buffer_capacity,
                        debug_summaries=debug_summaries,
                        summarize_grads_and_vars=summarize_grads_and_vars))

        logging.info('Creating adversarial drivers')
        if unconstrained_adversary or domain_randomization or combined_population:
            adversary_agent = None
        else:
            adversary_agent = agents['adversary_agent']

        if domain_randomization:
            adversary_env = None
        else:
            adversary_env = agents['adversary_env']
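        # Two drivers over the same agent populations: one collects training
        # trajectories (collect=True), the other only runs evaluation episodes
        # (collect=False).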

        collect_driver = adversarial_driver.AdversarialDriver(
            tf_env,
            agents['agent'],
            adversary_agent,
            adversary_env,
            env_metrics=env_train_metrics,
            collect=True,
            disable_tf_function=True,  # TODO(natashajaques): enable tf functions
            debug=debug,
            combined_population=combined_population,
            flexible_protagonist=flexible_protagonist)
        eval_driver = adversarial_driver.AdversarialDriver(
            eval_tf_env,
            agents['agent'],
            adversary_agent,
            adversary_env,
            env_metrics=env_eval_metrics,
            collect=False,
            disable_tf_function=True,  # TODO(natashajaques): enable tf functions
            debug=False,
            combined_population=combined_population,
            flexible_protagonist=flexible_protagonist)

        collect_time = 0
        train_time = 0
        timed_at_step = global_step.numpy()

        # Save operative config as late as possible to include used configurables.
        if global_step.numpy() == 0:
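            # A global step of 0 indicates a fresh run (nothing was restored
            # from a checkpoint), so the gin config is recorded exactly once.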
            config_filename = os.path.join(
                train_dir,
                'operative_config-{}.gin'.format(global_step.numpy()))
            with tf.io.gfile.GFile(config_filename, 'wb') as f:
                f.write(gin.operative_config_str())

        total_episodes = 0
        logging.info('Commencing train loop!')
        # Note that if there are N agents, the global step increases at N times
        # the rate for the same number of train episodes, because it is
        # incremented once per agent trained. It is therefore important to
        # divide the train steps by N when plotting.
        while global_step.numpy() <= num_train_steps:
            global_step_val = global_step.numpy()

            # Evaluation
            if global_step_val % eval_interval == 0:
                if debug:
                    logging.info('Performing evaluation at step %d',
                                 global_step_val)
                results = adversarial_eval.eager_compute(
                    eval_driver,
                    agents,
                    env_metrics=env_eval_metrics,
                    train_step=global_step,
                    summary_writer=eval_summary_writer,
                    summary_prefix='Metrics')
                if eval_metrics_callback is not None:
                    eval_metrics_callback(results, global_step.numpy())
                adversarial_eval.log_metrics(agents, env_eval_metrics)

            # Used to interleave randomized episodes with adversarial training
            random_episodes = False
            if percent_random_episodes > 0:
                chance_random = random.random()
                if chance_random < percent_random_episodes:
                    random_episodes = True
                    if debug: logging.info('RANDOM EPISODE')

            # Collect data
            if debug: logging.info('Collecting at step %d', global_step_val)
            start_time = time.time()
            train_idxs = collect_driver.run(random_episodes=random_episodes)
            collect_time += time.time() - start_time
            if debug:
                logging.info('Trained agents: %s', ', '.join(train_idxs))

            # Log total episodes collected
            total_episodes += collect_episodes_per_iteration
            eps_metric = [
                tf.convert_to_tensor(total_episodes, dtype=tf.float32)
            ]
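            # env_metrics_names ends with 'TrainEpisodesCollected', so index -1
            # updates that running episode counter in both metric sets.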
            env_train_metrics[-1](eps_metric)
            env_eval_metrics[-1](eps_metric)
            if debug:
                logging.info('Have collected a total of %d episodes',
                             total_episodes)

            # Train
            if debug: logging.info('Training at step %d', global_step_val)
            start_time = time.time()
            for name, agent_list in agents.items():
                if random_episodes and name == 'adversary_env':
                    # Don't train the adversary on randomly generated episodes
                    continue

                # Train the agents selected by the driver this training run
                for agent_idx in train_idxs[name]:
                    agent = agent_list[agent_idx]
                    if debug: logging.info('\tTraining %s %d', name, agent_idx)
                    agent.total_loss, agent.extra_loss = agent.train_step()
                    agent.replay_buffer.clear()

                    # Check for exploding losses.
                    if (math.isnan(agent.total_loss)
                            or math.isinf(agent.total_loss)
                            or agent.total_loss > MAX_LOSS):
                        agent.loss_divergence_counter += 1
                        if agent.loss_divergence_counter > TERMINATE_AFTER_DIVERGED_STEPS:
                            logging.info(
                                'Loss diverged for too many timesteps, breaking...'
                            )
                            break
                    else:
                        agent.loss_divergence_counter = 0

                # Log train metrics to tensorboard
                for train_metric in agent.train_metrics:
                    train_metric.tf_summaries(train_step=global_step,
                                              step_metrics=step_metrics)
                if agent.is_environment:
                    agent.env_train_metric.tf_summaries(
                        train_step=global_step, step_metrics=step_metrics)

            # Global environment stats logging
            for metric in env_train_metrics:
                metric.tf_summaries(train_step=global_step,
                                    step_metrics=step_metrics)
            if debug:
                logging.info('Train metrics for step %d', global_step_val)
                adversarial_eval.log_metrics(agents, env_train_metrics)

            train_time += time.time() - start_time

            # Print output logging statements
            if global_step_val % log_interval == 0:
                for name, agent_list in agents.items():
                    for i, agent in enumerate(agent_list):
                        print('Loss for', name, i, '=', agent.total_loss)
                steps_per_sec = ((global_step_val - timed_at_step) /
                                 (collect_time + train_time))
                logging.info('%.3f steps/sec', steps_per_sec)
                logging.info('collect_time = %.3f, train_time = %.3f',
                             collect_time, train_time)
                with tf.compat.v2.summary.record_if(True):
                    tf.compat.v2.summary.scalar(name='global_steps_per_sec',
                                                data=steps_per_sec,
                                                step=global_step)

                # Save checkpoints for all agent types and population members
                if global_step_val % train_checkpoint_interval == 0:
                    for name, agent_list in agents.items():
                        for i, agent in enumerate(agent_list):
                            if debug:
                                logging.info(
                                    'Saving checkpoint for agent %s %d', name,
                                    i)
                            agent.train_checkpointer.save(
                                global_step=global_step_val)
                if global_step_val % policy_checkpoint_interval == 0:
                    for name, agent_list in agents.items():
                        for i, agent in enumerate(agent_list):
                            agent.policy_checkpointer.save(
                                global_step=global_step_val)
                            saved_model_path = os.path.join(
                                agent.saved_model_dir,
                                'policy_' + ('%d' % global_step_val).zfill(9))
                            agent.saved_model.save(saved_model_path)

                timed_at_step = global_step_val
                collect_time = 0
                train_time = 0

        if total_episodes > 0:
            # Save one final checkpoint for all agent types and population members
            for name, agent_list in agents.items():
                for i, agent in enumerate(agent_list):
                    if debug:
                        logging.info('Saving checkpoint for agent %s %d', name,
                                     i)
                    agent.train_checkpointer.save(global_step=global_step_val)
            for name, agent_list in agents.items():
                for i, agent in enumerate(agent_list):
                    agent.policy_checkpointer.save(global_step=global_step_val)
                    saved_model_path = os.path.join(
                        agent.saved_model_dir,
                        'policy_' + ('%d' % global_step_val).zfill(9))
                    agent.saved_model.save(saved_model_path)

            # One final eval before exiting.
            results = adversarial_eval.eager_compute(
                eval_driver,
                agents,
                env_metrics=env_eval_metrics,
                train_step=global_step,
                summary_writer=eval_summary_writer,
                summary_prefix='Metrics')
            if eval_metrics_callback is not None:
                eval_metrics_callback(results, global_step.numpy())
            adversarial_eval.log_metrics(agents, env_eval_metrics)
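

# A minimal usage sketch (not part of the original module): assuming the
# module's imports and the MultiGrid adversarial environments are available,
# train_eval could be launched roughly as below. The root_dir path and the
# reduced step/episode counts are illustrative placeholders only.
if __name__ == '__main__':
    train_eval(
        root_dir='/tmp/paired_smoke_test',  # hypothetical output directory
        env_name='MultiGrid-Adversarial-v0',
        num_train_steps=1000,  # small budget for a quick smoke test
        num_parallel_envs=2,
        num_eval_episodes=2,
        eval_interval=5,
        debug=True)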