Example no. 1
def run(bsuite_id: str) -> str:
    """Runs A2C agent on a single bsuite environment, logging to CSV."""

    env = bsuite.load_and_record(
        bsuite_id=bsuite_id,
        save_path=FLAGS.save_path,
        logging_mode=FLAGS.logging_mode,
        overwrite=FLAGS.overwrite,
    )

    hidden_sizes = [FLAGS.num_units] * FLAGS.num_hidden_layers
    network = actor_critic.PolicyValueNet(
        hidden_sizes=hidden_sizes,
        action_spec=env.action_spec(),
    )
    agent = actor_critic.ActorCritic(
        obs_spec=env.observation_spec(),
        action_spec=env.action_spec(),
        network=network,
        optimizer=snt.optimizers.Adam(learning_rate=FLAGS.learning_rate),
        max_sequence_length=FLAGS.sequence_length,
        td_lambda=FLAGS.td_lambda,
        discount=FLAGS.discount,
        seed=FLAGS.seed,
    )

    num_episodes = FLAGS.num_episodes or getattr(env, 'bsuite_num_episodes')
    experiment.run(agent=agent,
                   environment=env,
                   num_episodes=num_episodes,
                   verbose=FLAGS.verbose)

    return bsuite_id
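Since run() returns its bsuite_id, it can be mapped over the whole sweep. Below is a minimal driver sketch, not part of the example above; it assumes the same module-level run() and flag setup, and the pool size and print are purely illustrative.

from multiprocessing import Pool

from bsuite import sweep


def main():
    # sweep.SWEEP enumerates every bsuite_id, e.g. 'deep_sea/0', 'catch/3', ...
    with Pool(processes=4) as pool:
        for finished_id in pool.imap_unordered(run, sweep.SWEEP):
            print('finished:', finished_id)


if __name__ == '__main__':
    main()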
Example no. 2
def run_random_agent(bsuite_id, save_path=SAVE_PATH_RAND, overwrite=True):
    """Runs a uniformly random agent on a single bsuite environment."""
    env = bsuite.load_and_record(bsuite_id, save_path, overwrite=overwrite)
    print('bsuite_id={}, settings={}, num_episodes={}'.format(
        bsuite_id, sweep.SETTINGS[bsuite_id], env.bsuite_num_episodes))
    agent = random.default_agent(obs_spec=env.observation_spec(),
                                 action_spec=env.action_spec())
    experiment.run(agent, env, num_episodes=env.bsuite_num_episodes)
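Once a sweep has been recorded this way, the CSV results can be loaded back for analysis with bsuite's own loaders. A minimal sketch, assuming the results live under SAVE_PATH_RAND as above; the 'random' key is just an arbitrary label for this run.

from bsuite.experiments import summary_analysis
from bsuite.logging import csv_load

# Map an arbitrary agent label to the directory that load_and_record wrote to.
experiments = {'random': SAVE_PATH_RAND}
DF, SWEEP_VARS = csv_load.load_bsuite(experiments)

# Aggregate the per-episode logs into the standard bsuite scores.
BSUITE_SCORE = summary_analysis.bsuite_score(DF, SWEEP_VARS)
print(BSUITE_SCORE.head())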
Example no. 3
def run_agent(bsuite_id, save_path=SAVE_PATH_RAND, overwrite=True):
    """Runs the recurrent actor-critic baseline on a single bsuite environment."""
    # Load environment
    env = bsuite.load_and_record(bsuite_id, save_path, overwrite=overwrite)
    agent = actor_critic_rnn.default_agent(
        obs_spec=env.observation_spec(),
        action_spec=env.action_spec()
    )
    experiment.run(agent, env, num_episodes=env.bsuite_num_episodes)
Example no. 4
def run_agent(bsuite_id, save_path=SAVE_PATH_RAND, overwrite=True):
    """Runs a TF2 DQN agent on a single bsuite environment, printing start info."""
    # Load environment
    env = bsuite.load_and_record(bsuite_id, save_path, overwrite=overwrite)
    print('bsuite_id={}, settings={}, num_episodes={}, start={}'.format(
        bsuite_id, sweep.SETTINGS[bsuite_id], env.bsuite_num_episodes,
        datetime.now().strftime("%H:%M:%S")))
    agent = DQNTF2.default_agent(obs_spec=env.observation_spec(),
                                 action_spec=env.action_spec())
    experiment.run(agent, env, num_episodes=env.bsuite_num_episodes)
Example no. 5
 def _load_env():
     """Loads and records the bsuite environment, wrapped as a Gym environment."""
     raw_env = bsuite.load_and_record(
         bsuite_id=bsuite_id,
         save_path=FLAGS.save_path,
         logging_mode=FLAGS.logging_mode,
         overwrite=FLAGS.overwrite,
     )
     if FLAGS.verbose:
         raw_env = terminal_logging.wrap_environment(raw_env,
                                                     log_every=True)  # pytype: disable=wrong-arg-types
     return gym_wrapper.GymFromDMEnv(raw_env)
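Because _load_env() returns a Gym-style environment, it can be driven with the classic Gym step loop. The sketch below is illustrative only and assumes the enclosing run() has already defined bsuite_id and FLAGS as in the other examples.

env = _load_env()
obs = env.reset()
done = False
while not done:
    # GymFromDMEnv exposes a discrete action space, so uniform sampling is valid here.
    action = env.action_space.sample()
    obs, reward, done, info = env.step(action)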
Example no. 6
def run(bsuite_id: str) -> str:
    """Runs a DQN agent on a given bsuite environment, logging to CSV."""

    env = bsuite.load_and_record(
        bsuite_id=bsuite_id,
        save_path=FLAGS.save_path,
        logging_mode=FLAGS.logging_mode,
        overwrite=FLAGS.overwrite,
    )
    action_spec = env.action_spec()

    # Define network.
    prior_scale = 5.
    hidden_sizes = [50, 50]

    def network(inputs: jnp.ndarray) -> jnp.ndarray:
        """Simple Q-network with randomized prior function."""
        net = hk.nets.MLP([*hidden_sizes, action_spec.num_values])
        prior_net = hk.nets.MLP([*hidden_sizes, action_spec.num_values])
        x = hk.Flatten()(inputs)
        return net(x) + prior_scale * lax.stop_gradient(prior_net(x))

    optimizer = optix.adam(learning_rate=1e-3)

    agent = boot_dqn.BootstrappedDqn(
        obs_spec=env.observation_spec(),
        action_spec=action_spec,
        network=network,
        optimizer=optimizer,
        num_ensemble=FLAGS.num_ensemble,
        batch_size=128,
        discount=.99,
        replay_capacity=10000,
        min_replay_size=128,
        sgd_period=1,
        target_update_period=4,
        mask_prob=1.0,
        noise_scale=0.,
    )

    num_episodes = FLAGS.num_episodes or getattr(env, 'bsuite_num_episodes')
    experiment.run(agent=agent,
                   environment=env,
                   num_episodes=num_episodes,
                   verbose=FLAGS.verbose)

    return bsuite_id
Example no. 7
def run(bsuite_id: str) -> str:
    """Runs a DQN agent on a given bsuite environment, logging to CSV."""

    raw_env = bsuite.load_and_record(
        bsuite_id=bsuite_id,
        save_path=FLAGS.save_path,
        logging_mode=FLAGS.logging_mode,
        overwrite=FLAGS.overwrite,
    )
    if FLAGS.verbose:
        raw_env = terminal_logging.wrap_environment(raw_env, log_every=True)  # pytype: disable=wrong-arg-types
    env = gym_wrapper.GymFromDMEnv(raw_env)

    num_episodes = FLAGS.num_episodes or getattr(raw_env,
                                                 'bsuite_num_episodes')

    def callback(lcl, unused_glb):
        # Terminate after `num_episodes`.
        try:
            return lcl['num_episodes'] > num_episodes
        except KeyError:
            return False

    # Note: we should never run for this many steps as we end after `num_episodes`
    total_timesteps = FLAGS.total_timesteps

    deepq.learn(
        env=env,
        network='mlp',
        hiddens=[FLAGS.num_units] * FLAGS.num_hidden_layers,
        batch_size=FLAGS.batch_size,
        lr=FLAGS.learning_rate,
        total_timesteps=total_timesteps,
        buffer_size=FLAGS.replay_capacity,
        exploration_fraction=1. / total_timesteps,  # i.e. immediately anneal.
        exploration_final_eps=FLAGS.epsilon,  # constant epsilon.
        print_freq=None,  # pylint: disable=wrong-arg-types
        learning_starts=FLAGS.min_replay_size,
        target_network_update_freq=FLAGS.target_update_period,
        callback=callback,  # pytype: disable=wrong-arg-types
        gamma=FLAGS.agent_discount,
        checkpoint_freq=None,
    )

    return bsuite_id
Example no. 8
 def __init__(self,
              id: str,
              exp_kwargs: dict = None,
              external_logging: str = 'none',
              save_path: str = '',
              overwrite: bool = True):
     assert (id in VALID_ENV_SWEEP_IDS) or (
         id in VALID_ENV_IDS and exp_kwargs is not None
     )  # Either using one of presets or using base experiment with other settings
     aug_path = osp.join(LOG_DIR, save_path)  # LOG_DIR + save_path
     if id in VALID_ENV_SWEEP_IDS:  # Pre-parameterized experiments
         if external_logging == 'none':
             env = bsuite.load_from_id(id)  # No recording
         else:
             env = bsuite.load_and_record(
                 id, aug_path, external_logging, overwrite=overwrite
             )  # Record in sql or csv. same sql for each id
         self.num_episodes = env.bsuite_num_episodes
     else:
         noise_scale = exp_kwargs.pop('noise_scale', 0.)
         noise_scale_seed = exp_kwargs.pop('noise_scale_seed', 0.)
         reward_scale = exp_kwargs.pop('reward_scale', 0.)
         env = bsuite.load(id, **exp_kwargs)
         if noise_scale:
             env = RewardNoise(env, noise_scale, noise_scale_seed)
         if reward_scale: env = RewardScale(env, reward_scale)
         self.num_episodes = int(1e4)  # Default
     self.env = env
     self._action_space = IntBox(low=0,
                                 high=self.env.action_spec().num_values)
     o_spec = self.env.observation_spec()
     if isinstance(o_spec, specs.BoundedArray):
         self._observation_space = FloatBox(low=o_spec.minimum.item(),
                                            high=o_spec.maximum.item(),
                                            shape=o_spec.shape,
                                            dtype=o_spec.dtype)
     else:
         self._observation_space = FloatBox(low=-float('inf'),
                                            high=float('inf'),
                                            shape=o_spec.shape,
                                            dtype=o_spec.dtype)
     self._last_observation = None
     self.game_over = False
     self.viewer = None
Example no. 9
def run(bsuite_id: str) -> str:
    """Runs a DQN agent on a given bsuite environment, logging to CSV."""

    env = bsuite.load_and_record(
        bsuite_id=bsuite_id,
        save_path=FLAGS.save_path,
        logging_mode=FLAGS.logging_mode,
        overwrite=FLAGS.overwrite,
    )

    agent = q_learning.QLearning(env)

    num_episodes = FLAGS.num_episodes or getattr(env, 'bsuite_num_episodes')
    experiment.run(agent=agent,
                   environment=env,
                   num_episodes=num_episodes,
                   verbose=FLAGS.verbose)

    return bsuite_id
Example no. 10
def run(bsuite_id: str) -> str:
    """Runs a DQN agent on a given bsuite environment, logging to CSV."""

    env = bsuite.load_and_record(
        bsuite_id=bsuite_id,
        save_path=FLAGS.save_path,
        logging_mode=FLAGS.logging_mode,
        overwrite=FLAGS.overwrite,
    )

    agent = dqn.default_agent(env.observation_spec(), env.action_spec())

    num_episodes = FLAGS.num_episodes or getattr(env, 'bsuite_num_episodes')
    experiment.run(agent=agent,
                   environment=env,
                   num_episodes=num_episodes,
                   verbose=FLAGS.verbose)

    return bsuite_id
Example no. 11
def run(bsuite_id: str) -> str:
    """Runs a random agent on a given bsuite environment, logging to CSV."""

    env = bsuite.load_and_record(
        bsuite_id=bsuite_id,
        save_path=FLAGS.save_path,
        logging_mode=FLAGS.logging_mode,
        overwrite=FLAGS.overwrite,
    )
    agent = random.default_agent(obs_spec=env.observation_spec(),
                                 action_spec=env.action_spec(),
                                 seed=FLAGS.seed)

    experiment.run(
        agent=agent,
        environment=env,
        num_episodes=FLAGS.num_episodes or env.bsuite_num_episodes,  # pytype: disable=attribute-error
        verbose=FLAGS.verbose)

    return bsuite_id
Example no. 12
def run(bsuite_id: str) -> str:
  """Runs a DQN agent on a given bsuite environment, logging to CSV."""

  env = bsuite.load_and_record(
      bsuite_id=bsuite_id,
      save_path=FLAGS.save_path,
      logging_mode=FLAGS.logging_mode,
      overwrite=FLAGS.overwrite,
  )

  # Making the networks.
  hidden_units = [FLAGS.num_units] * FLAGS.num_hidden_layers
  network = snt.Sequential([
      snt.Flatten(),
      snt.nets.MLP(hidden_units + [env.action_spec().num_values]),
  ])
  optimizer = snt.optimizers.Adam(learning_rate=FLAGS.learning_rate)

  agent = dqn.DQN(
      action_spec=env.action_spec(),
      network=network,
      batch_size=FLAGS.batch_size,
      discount=FLAGS.discount,
      replay_capacity=FLAGS.replay_capacity,
      min_replay_size=FLAGS.min_replay_size,
      sgd_period=FLAGS.sgd_period,
      target_update_period=FLAGS.target_update_period,
      optimizer=optimizer,
      epsilon=FLAGS.epsilon,
      seed=FLAGS.seed,
  )

  num_episodes = FLAGS.num_episodes or getattr(env, 'bsuite_num_episodes')
  experiment.run(
      agent=agent,
      environment=env,
      num_episodes=num_episodes,
      verbose=FLAGS.verbose)

  return bsuite_id
Example no. 13
def run(bsuite_id: str) -> str:
    """Runs a BDQN agent on a given bsuite environment, logging to CSV."""

    env = bsuite.load_and_record(
        bsuite_id=bsuite_id,
        save_path=FLAGS.save_path,
        logging_mode=FLAGS.logging_mode,
        overwrite=FLAGS.overwrite,
    )

    ensemble = boot_dqn.make_ensemble(
        num_actions=env.action_spec().num_values,
        num_ensemble=FLAGS.num_ensemble,
        num_hidden_layers=FLAGS.num_hidden_layers,
        num_units=FLAGS.num_units,
        prior_scale=FLAGS.prior_scale)

    agent = boot_dqn.BootstrappedDqn(
        obs_spec=env.observation_spec(),
        action_spec=env.action_spec(),
        ensemble=ensemble,
        batch_size=FLAGS.batch_size,
        discount=FLAGS.discount,
        replay_capacity=FLAGS.replay_capacity,
        min_replay_size=FLAGS.min_replay_size,
        sgd_period=FLAGS.sgd_period,
        target_update_period=FLAGS.target_update_period,
        optimizer=snt.optimizers.Adam(learning_rate=FLAGS.learning_rate),
        mask_prob=FLAGS.mask_prob,
        noise_scale=FLAGS.noise_scale,
        epsilon_fn=lambda x: FLAGS.epsilon,
        seed=FLAGS.seed)

    num_episodes = FLAGS.num_episodes or getattr(env, 'bsuite_num_episodes')
    experiment.run(agent=agent,
                   environment=env,
                   num_episodes=num_episodes,
                   verbose=FLAGS.verbose)

    return bsuite_id
Example no. 14
def run(bsuite_id: str) -> str:
    """
    Runs a bsuite experiment and saves the results as csv files

    Args:
        bsuite_id: string, the id of the bsuite experiment to run

    Returns: none

    """
    env = bsuite.load_and_record(
        bsuite_id=bsuite_id,
        save_path=save_path,
        logging_mode='csv',
        overwrite=True,
    )

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    # Settings for the neural network
    qnet_settings = {
        'layers_sizes': [50],
        'batch_size': 64,
        'noisy_nets': True,
        'distributional': True,
        'vmin': 0,
        'vmax': 1000,
        'number_atoms': 51
    }

    # Settings for the specific agent
    settings = {
        'batch_size': qnet_settings["batch_size"],
        'epsilon_start': 0.0,
        'epsilon_decay': 0.00,
        'epsilon_min': 0.00,
        'gamma': 0.99,
        'buffer_size': 2**16,
        'lr': 1e-3,
        'qnet_settings': qnet_settings,
        'start_optimization': 64,
        'update_qnet_every': 2,
        'update_target_every': 50,
        'ddqn': True,
        'n_steps': 4,
        'duelling_dqn': True,
        'prioritized_buffer': True,
        'alpha': 0.6,
        'beta0': 0.4,
        'beta_increment': 1e-6
    }

    agent = Agent(action_spec=env.action_spec(),
                  observation_spec=env.observation_spec(),
                  device=device,
                  settings=settings)

    experiment.run(agent=agent,
                   environment=env,
                   num_episodes=env.bsuite_num_episodes,
                   verbose=False)
    return bsuite_id
Example no. 15
def run(bsuite_id: str) -> str:
    """Runs Dopamine DQN on a given bsuite environment, logging to CSV."""

    raw_env = bsuite.load_and_record(
        bsuite_id=bsuite_id,
        save_path=FLAGS.save_path,
        logging_mode=FLAGS.logging_mode,
        overwrite=FLAGS.overwrite,
    )

    class Network(tf.keras.Model):
        """Build deep network compatible with dopamine/discrete_domains/gym_lib."""
        def __init__(self, num_actions: int, name='Network'):
            super(Network, self).__init__(name=name)
            self.forward_fn = tf.keras.Sequential(
                [tf.keras.layers.Flatten()] + [
                    tf.keras.layers.Dense(FLAGS.num_units,
                                          activation=tf.keras.activations.relu)
                    for _ in range(FLAGS.num_hidden_layers)
                ] + [tf.keras.layers.Dense(num_actions, activation=None)])

        def call(self, state):
            """Creates the output tensor/op given the state tensor as input."""
            x = tf.cast(state, tf.float32)
            x = self.forward_fn(x)
            return atari_lib.DQNNetworkType(x)

    def create_agent(sess: tf.Session,
                     environment: gym.Env,
                     summary_writer=None):
        """Factory method for agent initialization in Dopmamine."""
        del summary_writer
        return dqn_agent.DQNAgent(
            sess=sess,
            num_actions=environment.action_space.n,
            observation_shape=OBSERVATION_SHAPE,
            observation_dtype=tf.float32,
            stack_size=1,
            network=Network,
            gamma=FLAGS.agent_discount,
            update_horizon=1,
            min_replay_history=FLAGS.min_replay_size,
            update_period=FLAGS.sgd_period,
            target_update_period=FLAGS.target_update_period,
            epsilon_decay_period=FLAGS.epsilon_decay_period,
            epsilon_train=FLAGS.epsilon,
            optimizer=tf.train.AdamOptimizer(FLAGS.learning_rate),
        )

    def create_environment() -> gym.Env:
        """Factory method for environment initialization in Dopmamine."""
        env = wrappers.ImageObservation(raw_env, OBSERVATION_SHAPE)
        if FLAGS.verbose:
            env = terminal_logging.wrap_environment(env, log_every=True)  # pytype: disable=wrong-arg-types
        env = gym_wrapper.GymFromDMEnv(env)
        env.game_over = False  # Dopamine looks for this
        return env

    runner = run_experiment.Runner(
        base_dir=FLAGS.base_dir,
        create_agent_fn=create_agent,
        create_environment_fn=create_environment,
    )

    num_episodes = FLAGS.num_episodes or getattr(raw_env,
                                                 'bsuite_num_episodes')
    for _ in range(num_episodes):
        runner._run_one_episode()  # pylint: disable=protected-access

    return bsuite_id
Example no. 16
    if not os.path.exists(result_path):
        os.makedirs(result_path)
    if not os.path.exists(agent_path):
        os.makedirs(agent_path)

    # use cartpole_swingup/19 from bsuite and set environment parameters to the ones used in the paper:
    bsuite_id = 'cartpole_swingup/19'
    sweep.SETTINGS[bsuite_id]['x_reward_threshold'] = 1.0
    sweep.SETTINGS[bsuite_id]['x_threshold'] = 5.
    sweep.SETTINGS[bsuite_id]['move_cost'] = 0.05

    # train agent over multiple seeds
    for seed in trange(81, 86, 1):
        env = bsuite.load_and_record(bsuite_id,
                                     result_path + str(seed),
                                     overwrite=True)
        np.random.seed(seed)
        torch.manual_seed(seed)

        agent = IndDQNAgent(
            action_set=[0, 1, 2],
            reward_function=functools.partial(cartpole_reward_function,
                                              reward_type='sparse'),
            # use the feature from bsuite without any modification
            feature_extractor=CartpoleIdentityFeature(),
            hidden_dims=[50, 50, 50],
            hidden_dims_std=[50, 50, 50],
            learning_rate=1e-3,
            buffer_size=int(1e6),
            batch_size=64,