Example #1
def main(_):
    # Create an environment, grab the spec, and use it to create networks.
    environment = helpers.make_environment()
    environment_spec = specs.make_environment_spec(environment)
    agent_networks = d4pg.make_default_networks(environment_spec.actions)

    # Construct the agent.
    agent = d4pg.D4PG(
        environment_spec=environment_spec,
        policy_network=agent_networks['policy'],
        critic_network=agent_networks['critic'],
        observation_network=agent_networks['observation'],  # pytype: disable=wrong-arg-types
    )

    # Create the environment loop used for training.
    train_loop = acme.EnvironmentLoop(environment, agent, label='train_loop')

    # Create the evaluation policy.
    eval_policy = snt.Sequential([
        agent_networks['observation'],
        agent_networks['policy'],
    ])

    # Create the evaluation actor and loop.
    eval_actor = actors.FeedForwardActor(policy_network=eval_policy)
    eval_env = helpers.make_environment()
    eval_loop = acme.EnvironmentLoop(eval_env, eval_actor, label='eval_loop')

    for _ in range(FLAGS.num_episodes // FLAGS.num_episodes_per_eval):
        train_loop.run(num_episodes=FLAGS.num_episodes_per_eval)
        eval_loop.run(num_episodes=1)
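
The script above omits its surrounding boilerplate. Below is a minimal sketch of the imports, flag definitions and entry point it appears to assume, given that it uses Sonnet modules and therefore the TF variant of the D4PG agent; the flag defaults and the local helpers module are illustrative assumptions, not part of the original.

# Hypothetical boilerplate around Example #1; flag defaults are made up.
import acme
from acme import specs
from acme.agents.tf import actors
from acme.agents.tf import d4pg
from absl import app
from absl import flags
import sonnet as snt

import helpers  # local helper module providing make_environment()

flags.DEFINE_integer('num_episodes', 100, 'Total number of training episodes.')
flags.DEFINE_integer('num_episodes_per_eval', 10,
                     'Number of training episodes between evaluation runs.')
FLAGS = flags.FLAGS

# ... def main(_) as in the example above ...

if __name__ == '__main__':
  app.run(main)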
Example #2
def build_experiment_config():
  """Builds D4PG experiment config which can be executed in different ways."""

  # Parse the suite and task from the environment name; the environment and
  # networks themselves are created lazily by the factories below.
  suite, task = FLAGS.env_name.split(':', 1)

  # Bound of the distributional critic. The reward is normalized for control
  # environments but not for gym locomotion environments, hence the different
  # scales.
  vmax_values = {
      'gym': 1000.,
      'control': 150.,
  }
  vmax = vmax_values[suite]

  def network_factory(spec) -> d4pg.D4PGNetworks:
    return d4pg.make_networks(
        spec,
        policy_layer_sizes=(256, 256, 256),
        critic_layer_sizes=(256, 256, 256),
        vmin=-vmax,
        vmax=vmax,
    )

  # Configure the agent.
  d4pg_config = d4pg.D4PGConfig(learning_rate=3e-4, sigma=0.2)

  return experiments.ExperimentConfig(
      builder=d4pg.D4PGBuilder(d4pg_config),
      environment_factory=lambda seed: helpers.make_environment(suite, task),
      network_factory=network_factory,
      seed=FLAGS.seed,
      max_num_actor_steps=FLAGS.num_steps)
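
An ExperimentConfig by itself does not run anything; it is handed to a runner. A minimal sketch of a single-process entry point, assuming the acme.jax.experiments runner; the evaluation settings are illustrative.

# Hypothetical entry point for Example #2; eval settings are made up.
from absl import app
from acme.jax import experiments


def main(_):
  config = build_experiment_config()
  # Run actor, learner and evaluator together in a single process.
  experiments.run_experiment(
      experiment=config, eval_every=10_000, num_eval_episodes=5)


if __name__ == '__main__':
  app.run(main)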
Example #3
def main(_):
    # Create an environment, grab the spec, and use it to create networks.
    environment = helpers.make_environment(task=FLAGS.env_name)
    environment_spec = specs.make_environment_spec(environment)
    agent_networks = ppo.make_continuous_networks(environment_spec)

    # Construct the agent.
    config = ppo.PPOConfig(unroll_length=FLAGS.unroll_length,
                           num_minibatches=FLAGS.num_minibatches,
                           num_epochs=FLAGS.num_epochs,
                           batch_size=FLAGS.batch_size)

    learner_logger = experiment_utils.make_experiment_logger(
        label='learner', steps_key='learner_steps')
    agent = ppo.PPO(environment_spec,
                    agent_networks,
                    config=config,
                    seed=FLAGS.seed,
                    counter=counting.Counter(prefix='learner'),
                    logger=learner_logger)

    # Create the environment loop used for training.
    train_logger = experiment_utils.make_experiment_logger(
        label='train', steps_key='train_steps')
    train_loop = acme.EnvironmentLoop(environment,
                                      agent,
                                      counter=counting.Counter(prefix='train'),
                                      logger=train_logger)

    # Create the evaluation actor and loop.
    eval_logger = experiment_utils.make_experiment_logger(
        label='eval', steps_key='eval_steps')
    eval_actor = agent.builder.make_actor(
        random_key=jax.random.PRNGKey(FLAGS.seed),
        policy_network=ppo.make_inference_fn(agent_networks, evaluation=True),
        variable_source=agent)
    eval_env = helpers.make_environment(task=FLAGS.env_name)
    eval_loop = acme.EnvironmentLoop(eval_env,
                                     eval_actor,
                                     counter=counting.Counter(prefix='eval'),
                                     logger=eval_logger)

    assert FLAGS.num_steps % FLAGS.eval_every == 0
    for _ in range(FLAGS.num_steps // FLAGS.eval_every):
        eval_loop.run(num_episodes=5)
        train_loop.run(num_steps=FLAGS.eval_every)
    eval_loop.run(num_episodes=5)
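
This example, and several of the later examples that follow the same train/eval-loop pattern, reads a common set of absl flags. A sketch of the definitions they rely on; the default values are illustrative assumptions only.

# Hypothetical flag definitions for the single-process JAX examples.
from absl import flags

flags.DEFINE_string('env_name', 'HalfCheetah-v2', 'Name of the task/environment.')
flags.DEFINE_integer('seed', 0, 'Random seed.')
flags.DEFINE_integer('num_steps', 1_000_000, 'Total number of actor steps.')
flags.DEFINE_integer('eval_every', 50_000,
                     'Actor steps between evaluations; must divide num_steps.')
flags.DEFINE_integer('unroll_length', 16, 'PPO unroll length.')
flags.DEFINE_integer('num_minibatches', 32, 'PPO minibatches per epoch.')
flags.DEFINE_integer('num_epochs', 10, 'PPO epochs per update.')
flags.DEFINE_integer('batch_size', 128, 'PPO batch size (number of unrolls).')
FLAGS = flags.FLAGS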
Example #4
def main(_):
  env = helpers.make_environment(FLAGS.level)
  env_spec = acme.make_environment_spec(env)
  network = networks.DQNAtariNetwork(env_spec.actions.num_values)

  agent = dqn.DQN(env_spec, network)

  loop = acme.EnvironmentLoop(env, agent)
  loop.run(FLAGS.num_episodes)
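
Example #4 is the smallest of the set and shows the bare Acme pattern: environment, spec, network, agent, EnvironmentLoop. A sketch of the imports, flag definitions and entry point it presupposes, assuming the TF DQN agent and Atari network; the flag defaults are illustrative.

# Hypothetical boilerplate around Example #4; flag defaults are made up.
import acme
from acme.agents.tf import dqn
from acme.tf import networks
from absl import app
from absl import flags

import helpers  # local helper module providing make_environment()

flags.DEFINE_string('level', 'PongNoFrameskip-v4', 'Which Atari level to play.')
flags.DEFINE_integer('num_episodes', 1000, 'Number of episodes to train for.')
FLAGS = flags.FLAGS

if __name__ == '__main__':
  app.run(main)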
Example #5
def main(_):
    # Create an environment, grab the spec, and use it to create networks.
    environment = helpers.make_environment(task=FLAGS.env_name)
    environment_spec = specs.make_environment_spec(environment)
    agent_networks = value_dice.make_networks(environment_spec)

    # Construct the agent.
    config = value_dice.ValueDiceConfig(
        num_sgd_steps_per_step=FLAGS.num_sgd_steps_per_step)
    agent = value_dice.ValueDice(environment_spec,
                                 agent_networks,
                                 config=config,
                                 make_demonstrations=functools.partial(
                                     helpers.make_demonstration_iterator,
                                     dataset_name=FLAGS.dataset_name),
                                 seed=FLAGS.seed)

    # Create the environment loop used for training.
    train_logger = experiment_utils.make_experiment_logger(
        label='train', steps_key='train_steps')
    train_loop = acme.EnvironmentLoop(environment,
                                      agent,
                                      counter=counting.Counter(prefix='train'),
                                      logger=train_logger)

    # Create the evaluation actor and loop.
    eval_logger = experiment_utils.make_experiment_logger(
        label='eval', steps_key='eval_steps')
    eval_actor = agent.builder.make_actor(
        random_key=jax.random.PRNGKey(FLAGS.seed),
        policy_network=value_dice.apply_policy_and_sample(agent_networks,
                                                          eval_mode=True),
        variable_source=agent)
    eval_env = helpers.make_environment(task=FLAGS.env_name)
    eval_loop = acme.EnvironmentLoop(eval_env,
                                     eval_actor,
                                     counter=counting.Counter(prefix='eval'),
                                     logger=eval_logger)

    assert FLAGS.num_steps % FLAGS.eval_every == 0
    for _ in range(FLAGS.num_steps // FLAGS.eval_every):
        eval_loop.run(num_episodes=5)
        train_loop.run(num_steps=FLAGS.eval_every)
    eval_loop.run(num_episodes=5)
Example #6
def main(_):
    # Create an environment, grab the spec, and use it to create networks.
    environment = helpers.make_environment(task=FLAGS.env_name)
    environment_spec = specs.make_environment_spec(environment)
    agent_networks = td3.make_networks(environment_spec)

    # Construct the agent.
    config = td3.TD3Config(num_sgd_steps_per_step=FLAGS.num_sgd_steps_per_step)
    agent = td3.TD3(environment_spec,
                    agent_networks,
                    config=config,
                    seed=FLAGS.seed)

    # Create the environment loop used for training.
    train_logger = experiment_utils.make_experiment_logger(
        label='train', steps_key='train_steps')
    train_loop = acme.EnvironmentLoop(environment,
                                      agent,
                                      counter=counting.Counter(prefix='train'),
                                      logger=train_logger)

    # Create the evaluation actor and loop.
    eval_logger = experiment_utils.make_experiment_logger(
        label='eval', steps_key='eval_steps')
    eval_actor = agent.builder.make_actor(
        random_key=jax.random.PRNGKey(FLAGS.seed),
        policy_network=td3.get_default_behavior_policy(
            agent_networks, environment_spec.actions, sigma=0.),
        variable_source=agent)
    eval_env = helpers.make_environment(task=FLAGS.env_name)
    eval_loop = acme.EnvironmentLoop(eval_env,
                                     eval_actor,
                                     counter=counting.Counter(prefix='eval'),
                                     logger=eval_logger)

    assert FLAGS.num_steps % FLAGS.eval_every == 0
    for _ in range(FLAGS.num_steps // FLAGS.eval_every):
        eval_loop.run(num_episodes=5)
        train_loop.run(num_steps=FLAGS.eval_every)
    eval_loop.run(num_episodes=5)
Example #7
def main(_):
    task = FLAGS.task
    environment_factory = lambda seed: helpers.make_environment(task)
    program = sac.DistributedSAC(
        environment_factory=environment_factory,
        network_factory=sac.make_networks,
        config=sac.SACConfig(num_sgd_steps_per_step=64),
        num_actors=4,
        seed=1,
        max_number_of_steps=100).build()

    # Launch experiment.
    lp.launch(program, xm_resources=lp_utils.make_xm_docker_resources(program))
Example #8
def main(_):
    task = FLAGS.task
    environment_factory = lambda seed: helpers.make_environment(task)
    sac_config = sac.SACConfig(num_sgd_steps_per_step=64)
    sac_builder = sac.SACBuilder(sac_config)

    ail_config = ail.AILConfig(direct_rl_batch_size=sac_config.batch_size *
                               sac_config.num_sgd_steps_per_step)

    def network_factory(spec: specs.EnvironmentSpec) -> ail.AILNetworks:
        def discriminator(*args, **kwargs) -> networks_lib.Logits:
            return ail.DiscriminatorModule(environment_spec=spec,
                                           use_action=True,
                                           use_next_obs=True,
                                           network_core=ail.DiscriminatorMLP(
                                               [4, 4], ))(*args, **kwargs)

        discriminator_transformed = hk.without_apply_rng(
            hk.transform_with_state(discriminator))

        return ail.AILNetworks(ail.make_discriminator(
            spec, discriminator_transformed),
                               imitation_reward_fn=ail.rewards.gail_reward(),
                               direct_rl_networks=sac.make_networks(spec))

    def policy_network(
            network: ail.AILNetworks,
            eval_mode: bool = False) -> actor_core_lib.FeedForwardPolicy:
        return sac.apply_policy_and_sample(network.direct_rl_networks,
                                           eval_mode=eval_mode)

    program = ail.DistributedAIL(
        environment_factory=environment_factory,
        rl_agent=sac_builder,
        config=ail_config,
        network_factory=network_factory,
        seed=0,
        batch_size=sac_config.batch_size * sac_config.num_sgd_steps_per_step,
        make_demonstrations=functools.partial(
            helpers.make_demonstration_iterator,
            dataset_name=FLAGS.dataset_name),
        policy_network=policy_network,
        evaluator_policy_network=(lambda n: policy_network(n, eval_mode=True)),
        num_actors=4,
        max_number_of_steps=100,
        discriminator_loss=ail.losses.gail_loss()).build()

    # Launch experiment.
    lp.launch(program, xm_resources=lp_utils.make_xm_docker_resources(program))
Example #9
def build_experiment_config():
  """Builds PPO experiment config which can be executed in different ways."""
  # Parse the suite and task from the environment name; the environment and
  # networks themselves are created lazily by the factories below.
  suite, task = FLAGS.env_name.split(':', 1)

  config = ppo.PPOConfig(entropy_cost=0, learning_rate=1e-4)
  ppo_builder = ppo.PPOBuilder(config)

  layer_sizes = (256, 256, 256)
  return experiments.ExperimentConfig(
      builder=ppo_builder,
      environment_factory=lambda seed: helpers.make_environment(suite, task),
      network_factory=lambda spec: ppo.make_networks(spec, layer_sizes),
      seed=FLAGS.seed,
      max_num_actor_steps=FLAGS.num_steps)
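
The same config can also be executed as a distributed Launchpad program instead of a single process. A sketch, assuming the acme.jax.experiments helpers; the actor count and launch type are illustrative.

# Hypothetical distributed launch for Example #9.
from absl import app
from acme.jax import experiments
import launchpad as lp


def main(_):
  program = experiments.make_distributed_experiment(
      experiment=build_experiment_config(), num_actors=4)
  lp.launch(program, launch_type=lp.LaunchType.LOCAL_MULTI_PROCESSING)


if __name__ == '__main__':
  app.run(main)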
Example #10
def main(_):
  task = FLAGS.task
  env_factory = lambda seed: helpers.make_environment(task)

  # The factory ignores its seed argument, so any value yields the same spec.
  environment_spec = specs.make_environment_spec(env_factory(0))
  program = td3.DistributedTD3(
      environment_factory=env_factory,
      environment_spec=environment_spec,
      network_factory=td3.make_networks,
      config=td3.TD3Config(),
      num_actors=4,
      seed=1,
      max_number_of_steps=100).build()

  lp.launch(program, xm_resources=lp_utils.make_xm_docker_resources(program))
Example #11
def main(_):
    task = FLAGS.task
    environment_factory = lambda seed: helpers.make_environment(task)
    config = ppo.PPOConfig(unroll_length=16,
                           num_minibatches=32,
                           num_epochs=10,
                           batch_size=2048 // 16)
    program = ppo.DistributedPPO(environment_factory=environment_factory,
                                 network_factory=ppo.make_continuous_networks,
                                 config=config,
                                 seed=FLAGS.seed,
                                 num_actors=4,
                                 max_number_of_steps=100).build()

    # Launch experiment.
    lp.launch(program, xm_resources=lp_utils.make_xm_docker_resources(program))
Example #12
def main(_):
    env = helpers.make_environment(level=FLAGS.level, oar_wrapper=True)
    env_spec = acme.make_environment_spec(env)

    config = r2d2.R2D2Config(batch_size=16,
                             trace_length=20,
                             burn_in_length=10,
                             sequence_period=10)

    agent = r2d2.R2D2(env_spec,
                      networks=r2d2.make_atari_networks(
                          config.batch_size, env_spec),
                      config=config,
                      seed=FLAGS.seed)

    loop = acme.EnvironmentLoop(env, agent)
    loop.run(FLAGS.num_episodes)
Example #13
def main(_):
    task = FLAGS.env_name
    environment_factory = lambda seed: helpers.make_environment(task)
    config = value_dice.ValueDiceConfig(num_sgd_steps_per_step=64)
    agent = value_dice.DistributedValueDice(
        environment_factory=environment_factory,
        network_factory=value_dice.make_networks,
        config=config,
        num_actors=4,
        log_to_bigtable=True,
        max_number_of_steps=100,
        seed=1,
        make_demonstrations=functools.partial(
            helpers.make_demonstration_iterator,
            dataset_name=FLAGS.dataset_name))
    program = agent.build()

    # Launch experiment.
    lp.launch(program, xm_resources=lp_utils.make_xm_docker_resources(program))
Example #14
def build_experiment_config():
    """Builds TD3 experiment config which can be executed in different ways."""
    # Parse the suite and task from the environment name; the environment and
    # networks themselves are created lazily by the factories below.

    suite, task = FLAGS.env_name.split(':', 1)
    network_factory = (lambda spec: td3.make_networks(
        spec, hidden_layer_sizes=(256, 256, 256)))

    # Construct the agent.
    config = td3.TD3Config(
        policy_learning_rate=3e-4,
        critic_learning_rate=3e-4,
    )
    td3_builder = td3.TD3Builder(config)
    # pylint:disable=g-long-lambda
    return experiments.ExperimentConfig(
        builder=td3_builder,
        environment_factory=lambda seed: helpers.make_environment(suite, task),
        network_factory=network_factory,
        seed=FLAGS.seed,
        max_num_actor_steps=FLAGS.num_steps)
Example #15
def main(_):
    env = helpers.make_environment(level=FLAGS.level, oar_wrapper=True)
    env_spec = acme.make_environment_spec(env)

    config = impala.IMPALAConfig(
        batch_size=16,
        sequence_period=10,
        seed=FLAGS.seed,
    )

    networks = impala.make_atari_networks(env_spec)
    agent = impala.IMPALAFromConfig(
        environment_spec=env_spec,
        forward_fn=networks.forward_fn,
        unroll_init_fn=networks.unroll_init_fn,
        unroll_fn=networks.unroll_fn,
        initial_state_init_fn=networks.initial_state_init_fn,
        initial_state_fn=networks.initial_state_fn,
        config=config,
    )

    loop = acme.EnvironmentLoop(env, agent)
    loop.run(FLAGS.num_episodes)
Example #16
def main(_):
    # Access flag value.
    level = FLAGS.task
    environment_factory = (
        lambda seed: helpers.make_environment(level=level, oar_wrapper=True))
    config = r2d2.R2D2Config()

    def net_factory(spec: specs.EnvironmentSpec):
        return r2d2_networks.make_atari_networks(config.batch_size,
                                                 env_spec=spec)

    # The factory ignores its seed argument; we only need an env for the spec.
    env = environment_factory(0)
    env_spec = acme.make_environment_spec(env)

    program = r2d2.DistributedR2D2FromConfig(
        seed=0,
        environment_factory=environment_factory,
        network_factory=net_factory,
        config=config,
        num_actors=FLAGS.num_actors,
        environment_spec=env_spec,
    ).build()

    lp.launch(program, lp.LaunchType.LOCAL_MULTI_PROCESSING)
Example #17
def main(_):
    # Create an environment, grab the spec, and use it to create networks.
    environment = helpers.make_environment(task=FLAGS.env_name)
    environment_spec = specs.make_environment_spec(environment)

    # Construct the agent.
    # The local layout makes sure that we populate the buffer with
    # min_replay_size initial transitions and that there is no need for a
    # tolerance rate. To avoid deadlocks we need to disable the rate limiting
    # that happens inside the TD3Builder; this is achieved by the
    # min_replay_size and samples_per_insert_tolerance_rate arguments.
    td3_config = td3.TD3Config(
        num_sgd_steps_per_step=FLAGS.num_sgd_steps_per_step,
        min_replay_size=1,
        samples_per_insert_tolerance_rate=float('inf'))
    td3_networks = td3.make_networks(environment_spec)
    if FLAGS.pretrain:
        td3_networks = add_bc_pretraining(td3_networks)

    ail_config = ail.AILConfig(direct_rl_batch_size=td3_config.batch_size *
                               td3_config.num_sgd_steps_per_step)
    dac_config = ail.DACConfig(ail_config, td3_config)

    def discriminator(*args, **kwargs) -> networks_lib.Logits:
        return ail.DiscriminatorModule(environment_spec=environment_spec,
                                       use_action=True,
                                       use_next_obs=True,
                                       network_core=ail.DiscriminatorMLP(
                                           [4, 4], ))(*args, **kwargs)

    discriminator_transformed = hk.without_apply_rng(
        hk.transform_with_state(discriminator))

    ail_network = ail.AILNetworks(
        ail.make_discriminator(environment_spec, discriminator_transformed),
        imitation_reward_fn=ail.rewards.gail_reward(),
        direct_rl_networks=td3_networks)

    agent = ail.DAC(spec=environment_spec,
                    network=ail_network,
                    config=dac_config,
                    seed=FLAGS.seed,
                    batch_size=td3_config.batch_size *
                    td3_config.num_sgd_steps_per_step,
                    make_demonstrations=functools.partial(
                        helpers.make_demonstration_iterator,
                        dataset_name=FLAGS.dataset_name),
                    policy_network=td3.get_default_behavior_policy(
                        td3_networks,
                        action_specs=environment_spec.actions,
                        sigma=td3_config.sigma))

    # Create the environment loop used for training.
    train_logger = experiment_utils.make_experiment_logger(
        label='train', steps_key='train_steps')
    train_loop = acme.EnvironmentLoop(environment,
                                      agent,
                                      counter=counting.Counter(prefix='train'),
                                      logger=train_logger)

    # Create the evaluation actor and loop.
    # TODO(lukstafi): sigma=0 for eval?
    eval_logger = experiment_utils.make_experiment_logger(
        label='eval', steps_key='eval_steps')
    eval_actor = agent.builder.make_actor(
        random_key=jax.random.PRNGKey(FLAGS.seed),
        policy_network=td3.get_default_behavior_policy(
            td3_networks, action_specs=environment_spec.actions, sigma=0.),
        variable_source=agent)
    eval_env = helpers.make_environment(task=FLAGS.env_name)
    eval_loop = acme.EnvironmentLoop(eval_env,
                                     eval_actor,
                                     counter=counting.Counter(prefix='eval'),
                                     logger=eval_logger)

    assert FLAGS.num_steps % FLAGS.eval_every == 0
    for _ in range(FLAGS.num_steps // FLAGS.eval_every):
        eval_loop.run(num_episodes=5)
        train_loop.run(num_steps=FLAGS.eval_every)
    eval_loop.run(num_episodes=5)
Example #18
def main(_):
    # Create an environment and grab the spec. The networks are created once,
    # below, so that the (optionally pretrained) ppo_networks are used for both
    # training and evaluation.
    environment = helpers.make_environment(task=FLAGS.env_name)
    environment_spec = specs.make_environment_spec(environment)

    # Construct the agent.
    ppo_config = ppo.PPOConfig(unroll_length=FLAGS.unroll_length,
                               num_minibatches=FLAGS.ppo_num_minibatches,
                               num_epochs=FLAGS.ppo_num_epochs,
                               batch_size=FLAGS.transition_batch_size //
                               FLAGS.unroll_length,
                               learning_rate=0.0003,
                               entropy_cost=0,
                               gae_lambda=0.8,
                               value_cost=0.25)
    ppo_networks = ppo.make_continuous_networks(environment_spec)
    if FLAGS.pretrain:
        ppo_networks = add_bc_pretraining(ppo_networks)

    discriminator_batch_size = FLAGS.transition_batch_size
    ail_config = ail.AILConfig(
        direct_rl_batch_size=ppo_config.batch_size * ppo_config.unroll_length,
        discriminator_batch_size=discriminator_batch_size,
        is_sequence_based=True,
        num_sgd_steps_per_step=FLAGS.num_discriminator_steps_per_step,
        share_iterator=FLAGS.share_iterator,
    )

    def discriminator(*args, **kwargs) -> networks_lib.Logits:
        # Note: an observation embedding is not needed for e.g. MuJoCo.
        return ail.DiscriminatorModule(
            environment_spec=environment_spec,
            use_action=True,
            use_next_obs=True,
            network_core=ail.DiscriminatorMLP([4, 4], ),
        )(*args, **kwargs)

    discriminator_transformed = hk.without_apply_rng(
        hk.transform_with_state(discriminator))

    ail_network = ail.AILNetworks(
        ail.make_discriminator(environment_spec, discriminator_transformed),
        imitation_reward_fn=ail.rewards.gail_reward(),
        direct_rl_networks=ppo_networks)

    agent = ail.GAIL(spec=environment_spec,
                     network=ail_network,
                     config=ail.GAILConfig(ail_config, ppo_config),
                     seed=FLAGS.seed,
                     batch_size=ppo_config.batch_size,
                     make_demonstrations=functools.partial(
                         helpers.make_demonstration_iterator,
                         dataset_name=FLAGS.dataset_name),
                     policy_network=ppo.make_inference_fn(ppo_networks))

    # Create the environment loop used for training.
    train_logger = experiment_utils.make_experiment_logger(
        label='train', steps_key='train_steps')
    train_loop = acme.EnvironmentLoop(environment,
                                      agent,
                                      counter=counting.Counter(prefix='train'),
                                      logger=train_logger)

    # Create the evaluation actor and loop.
    eval_logger = experiment_utils.make_experiment_logger(
        label='eval', steps_key='eval_steps')
    eval_actor = agent.builder.make_actor(
        random_key=jax.random.PRNGKey(FLAGS.seed),
        policy_network=ppo.make_inference_fn(ppo_networks, evaluation=True),
        variable_source=agent)
    eval_env = helpers.make_environment(task=FLAGS.env_name)
    eval_loop = acme.EnvironmentLoop(eval_env,
                                     eval_actor,
                                     counter=counting.Counter(prefix='eval'),
                                     logger=eval_logger)

    assert FLAGS.num_steps % FLAGS.eval_every == 0
    for _ in range(FLAGS.num_steps // FLAGS.eval_every):
        eval_loop.run(num_episodes=5)
        train_loop.run(num_steps=FLAGS.eval_every)
    eval_loop.run(num_episodes=5)