def main(_):
  # Create an environment, grab the spec, and use it to create networks.
  environment = helpers.make_environment()
  environment_spec = specs.make_environment_spec(environment)
  agent_networks = d4pg.make_default_networks(environment_spec.actions)

  # Construct the agent.
  agent = d4pg.D4PG(
      environment_spec=environment_spec,
      policy_network=agent_networks['policy'],
      critic_network=agent_networks['critic'],
      observation_network=agent_networks['observation'],  # pytype: disable=wrong-arg-types
  )

  # Create the environment loop used for training.
  train_loop = acme.EnvironmentLoop(environment, agent, label='train_loop')

  # Create the evaluation policy.
  eval_policy = snt.Sequential([
      agent_networks['observation'],
      agent_networks['policy'],
  ])

  # Create the evaluation actor and loop.
  eval_actor = actors.FeedForwardActor(policy_network=eval_policy)
  eval_env = helpers.make_environment()
  eval_loop = acme.EnvironmentLoop(eval_env, eval_actor, label='eval_loop')

  for _ in range(FLAGS.num_episodes // FLAGS.num_episodes_per_eval):
    train_loop.run(num_episodes=FLAGS.num_episodes_per_eval)
    eval_loop.run(num_episodes=1)
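# Each of the scripts in this file assumes the usual absl-py scaffolding
# around `main`: flag definitions plus an `app.run(main)` entry point. A
# minimal sketch of that scaffolding is shown here; the flag names match the
# snippet above, but the default values are illustrative only.
from absl import app
from absl import flags

FLAGS = flags.FLAGS
flags.DEFINE_integer('num_episodes', 100, 'Number of training episodes.')
flags.DEFINE_integer('num_episodes_per_eval', 10,
                     'Training episodes to run between evaluation episodes.')

if __name__ == '__main__':
  app.run(main)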
def build_experiment_config(): """Builds D4PG experiment config which can be executed in different ways.""" # Create an environment, grab the spec, and use it to create networks. suite, task = FLAGS.env_name.split(':', 1) # Bound of the distributional critic. The reward for control environments is # normalized, not for gym locomotion environments hence the different scales. vmax_values = { 'gym': 1000., 'control': 150., } vmax = vmax_values[suite] def network_factory(spec) -> d4pg.D4PGNetworks: return d4pg.make_networks( spec, policy_layer_sizes=(256, 256, 256), critic_layer_sizes=(256, 256, 256), vmin=-vmax, vmax=vmax, ) # Configure the agent. d4pg_config = d4pg.D4PGConfig(learning_rate=3e-4, sigma=0.2) return experiments.ExperimentConfig( builder=d4pg.D4PGBuilder(d4pg_config), environment_factory=lambda seed: helpers.make_environment(suite, task), network_factory=network_factory, seed=FLAGS.seed, max_num_actor_steps=FLAGS.num_steps)
def main(_):
  # Create an environment, grab the spec, and use it to create networks.
  environment = helpers.make_environment(task=FLAGS.env_name)
  environment_spec = specs.make_environment_spec(environment)
  agent_networks = ppo.make_continuous_networks(environment_spec)

  # Construct the agent.
  config = ppo.PPOConfig(
      unroll_length=FLAGS.unroll_length,
      num_minibatches=FLAGS.num_minibatches,
      num_epochs=FLAGS.num_epochs,
      batch_size=FLAGS.batch_size)
  learner_logger = experiment_utils.make_experiment_logger(
      label='learner', steps_key='learner_steps')
  agent = ppo.PPO(
      environment_spec,
      agent_networks,
      config=config,
      seed=FLAGS.seed,
      counter=counting.Counter(prefix='learner'),
      logger=learner_logger)

  # Create the environment loop used for training.
  train_logger = experiment_utils.make_experiment_logger(
      label='train', steps_key='train_steps')
  train_loop = acme.EnvironmentLoop(
      environment,
      agent,
      counter=counting.Counter(prefix='train'),
      logger=train_logger)

  # Create the evaluation actor and loop.
  eval_logger = experiment_utils.make_experiment_logger(
      label='eval', steps_key='eval_steps')
  eval_actor = agent.builder.make_actor(
      random_key=jax.random.PRNGKey(FLAGS.seed),
      policy_network=ppo.make_inference_fn(agent_networks, evaluation=True),
      variable_source=agent)
  eval_env = helpers.make_environment(task=FLAGS.env_name)
  eval_loop = acme.EnvironmentLoop(
      eval_env,
      eval_actor,
      counter=counting.Counter(prefix='eval'),
      logger=eval_logger)

  assert FLAGS.num_steps % FLAGS.eval_every == 0
  for _ in range(FLAGS.num_steps // FLAGS.eval_every):
    eval_loop.run(num_episodes=5)
    train_loop.run(num_steps=FLAGS.eval_every)
  eval_loop.run(num_episodes=5)
def main(_):
  env = helpers.make_environment(FLAGS.level)
  env_spec = acme.make_environment_spec(env)
  network = networks.DQNAtariNetwork(env_spec.actions.num_values)

  agent = dqn.DQN(env_spec, network)

  loop = acme.EnvironmentLoop(env, agent)
  loop.run(FLAGS.num_episodes)
def main(_):
  # Create an environment, grab the spec, and use it to create networks.
  environment = helpers.make_environment(task=FLAGS.env_name)
  environment_spec = specs.make_environment_spec(environment)
  agent_networks = value_dice.make_networks(environment_spec)

  # Construct the agent.
  config = value_dice.ValueDiceConfig(
      num_sgd_steps_per_step=FLAGS.num_sgd_steps_per_step)
  agent = value_dice.ValueDice(
      environment_spec,
      agent_networks,
      config=config,
      make_demonstrations=functools.partial(
          helpers.make_demonstration_iterator,
          dataset_name=FLAGS.dataset_name),
      seed=FLAGS.seed)

  # Create the environment loop used for training.
  train_logger = experiment_utils.make_experiment_logger(
      label='train', steps_key='train_steps')
  train_loop = acme.EnvironmentLoop(
      environment,
      agent,
      counter=counting.Counter(prefix='train'),
      logger=train_logger)

  # Create the evaluation actor and loop.
  eval_logger = experiment_utils.make_experiment_logger(
      label='eval', steps_key='eval_steps')
  eval_actor = agent.builder.make_actor(
      random_key=jax.random.PRNGKey(FLAGS.seed),
      policy_network=value_dice.apply_policy_and_sample(
          agent_networks, eval_mode=True),
      variable_source=agent)
  eval_env = helpers.make_environment(task=FLAGS.env_name)
  eval_loop = acme.EnvironmentLoop(
      eval_env,
      eval_actor,
      counter=counting.Counter(prefix='eval'),
      logger=eval_logger)

  assert FLAGS.num_steps % FLAGS.eval_every == 0
  for _ in range(FLAGS.num_steps // FLAGS.eval_every):
    eval_loop.run(num_episodes=5)
    train_loop.run(num_steps=FLAGS.eval_every)
  eval_loop.run(num_episodes=5)
def main(_):
  # Create an environment, grab the spec, and use it to create networks.
  environment = helpers.make_environment(task=FLAGS.env_name)
  environment_spec = specs.make_environment_spec(environment)
  agent_networks = td3.make_networks(environment_spec)

  # Construct the agent.
  config = td3.TD3Config(num_sgd_steps_per_step=FLAGS.num_sgd_steps_per_step)
  agent = td3.TD3(
      environment_spec, agent_networks, config=config, seed=FLAGS.seed)

  # Create the environment loop used for training.
  train_logger = experiment_utils.make_experiment_logger(
      label='train', steps_key='train_steps')
  train_loop = acme.EnvironmentLoop(
      environment,
      agent,
      counter=counting.Counter(prefix='train'),
      logger=train_logger)

  # Create the evaluation actor and loop.
  eval_logger = experiment_utils.make_experiment_logger(
      label='eval', steps_key='eval_steps')
  eval_actor = agent.builder.make_actor(
      random_key=jax.random.PRNGKey(FLAGS.seed),
      policy_network=td3.get_default_behavior_policy(
          agent_networks, environment_spec.actions, sigma=0.),
      variable_source=agent)
  eval_env = helpers.make_environment(task=FLAGS.env_name)
  eval_loop = acme.EnvironmentLoop(
      eval_env,
      eval_actor,
      counter=counting.Counter(prefix='eval'),
      logger=eval_logger)

  assert FLAGS.num_steps % FLAGS.eval_every == 0
  for _ in range(FLAGS.num_steps // FLAGS.eval_every):
    eval_loop.run(num_episodes=5)
    train_loop.run(num_steps=FLAGS.eval_every)
  eval_loop.run(num_episodes=5)
def main(_):
  task = FLAGS.task
  environment_factory = lambda seed: helpers.make_environment(task)
  program = sac.DistributedSAC(
      environment_factory=environment_factory,
      network_factory=sac.make_networks,
      config=sac.SACConfig(num_sgd_steps_per_step=64),
      num_actors=4,
      seed=1,
      max_number_of_steps=100).build()

  # Launch experiment.
  lp.launch(program, xm_resources=lp_utils.make_xm_docker_resources(program))
def main(_):
  task = FLAGS.task
  environment_factory = lambda seed: helpers.make_environment(task)

  sac_config = sac.SACConfig(num_sgd_steps_per_step=64)
  sac_builder = sac.SACBuilder(sac_config)
  ail_config = ail.AILConfig(
      direct_rl_batch_size=sac_config.batch_size *
      sac_config.num_sgd_steps_per_step)

  def network_factory(spec: specs.EnvironmentSpec) -> ail.AILNetworks:

    def discriminator(*args, **kwargs) -> networks_lib.Logits:
      return ail.DiscriminatorModule(
          environment_spec=spec,
          use_action=True,
          use_next_obs=True,
          network_core=ail.DiscriminatorMLP([4, 4],))(*args, **kwargs)

    discriminator_transformed = hk.without_apply_rng(
        hk.transform_with_state(discriminator))

    return ail.AILNetworks(
        ail.make_discriminator(spec, discriminator_transformed),
        imitation_reward_fn=ail.rewards.gail_reward(),
        direct_rl_networks=sac.make_networks(spec))

  def policy_network(
      network: ail.AILNetworks,
      eval_mode: bool = False) -> actor_core_lib.FeedForwardPolicy:
    return sac.apply_policy_and_sample(
        network.direct_rl_networks, eval_mode=eval_mode)

  program = ail.DistributedAIL(
      environment_factory=environment_factory,
      rl_agent=sac_builder,
      config=ail_config,
      network_factory=network_factory,
      seed=0,
      batch_size=sac_config.batch_size * sac_config.num_sgd_steps_per_step,
      make_demonstrations=functools.partial(
          helpers.make_demonstration_iterator,
          dataset_name=FLAGS.dataset_name),
      policy_network=policy_network,
      evaluator_policy_network=(lambda n: policy_network(n, eval_mode=True)),
      num_actors=4,
      max_number_of_steps=100,
      discriminator_loss=ail.losses.gail_loss()).build()

  # Launch experiment.
  lp.launch(program, xm_resources=lp_utils.make_xm_docker_resources(program))
def build_experiment_config(): """Builds PPO experiment config which can be executed in different ways.""" # Create an environment, grab the spec, and use it to create networks. suite, task = FLAGS.env_name.split(':', 1) config = ppo.PPOConfig(entropy_cost=0, learning_rate=1e-4) ppo_builder = ppo.PPOBuilder(config) layer_sizes = (256, 256, 256) return experiments.ExperimentConfig( builder=ppo_builder, environment_factory=lambda seed: helpers.make_environment(suite, task), network_factory=lambda spec: ppo.make_networks(spec, layer_sizes), seed=FLAGS.seed, max_num_actor_steps=FLAGS.num_steps)
def main(_):
  task = FLAGS.task
  env_factory = lambda seed: helpers.make_environment(task)
  environment_spec = specs.make_environment_spec(env_factory(True))
  program = td3.DistributedTD3(
      environment_factory=env_factory,
      environment_spec=environment_spec,
      network_factory=td3.make_networks,
      config=td3.TD3Config(),
      num_actors=4,
      seed=1,
      max_number_of_steps=100).build()

  lp.launch(program, xm_resources=lp_utils.make_xm_docker_resources(program))
def main(_):
  task = FLAGS.task
  environment_factory = lambda seed: helpers.make_environment(task)
  config = ppo.PPOConfig(
      unroll_length=16,
      num_minibatches=32,
      num_epochs=10,
      batch_size=2048 // 16)
  program = ppo.DistributedPPO(
      environment_factory=environment_factory,
      network_factory=ppo.make_continuous_networks,
      config=config,
      seed=FLAGS.seed,
      num_actors=4,
      max_number_of_steps=100).build()

  # Launch experiment.
  lp.launch(program, xm_resources=lp_utils.make_xm_docker_resources(program))
def main(_):
  env = helpers.make_environment(level=FLAGS.level, oar_wrapper=True)
  env_spec = acme.make_environment_spec(env)

  config = r2d2.R2D2Config(
      batch_size=16,
      trace_length=20,
      burn_in_length=10,
      sequence_period=10)

  agent = r2d2.R2D2(
      env_spec,
      networks=r2d2.make_atari_networks(config.batch_size, env_spec),
      config=config,
      seed=FLAGS.seed)

  loop = acme.EnvironmentLoop(env, agent)
  loop.run(FLAGS.num_episodes)
def main(_):
  task = FLAGS.env_name
  environment_factory = lambda seed: helpers.make_environment(task)
  config = value_dice.ValueDiceConfig(num_sgd_steps_per_step=64)
  agent = value_dice.DistributedValueDice(
      environment_factory=environment_factory,
      network_factory=value_dice.make_networks,
      config=config,
      num_actors=4,
      log_to_bigtable=True,
      max_number_of_steps=100,
      seed=1,
      make_demonstrations=functools.partial(
          helpers.make_demonstration_iterator,
          dataset_name=FLAGS.dataset_name))
  program = agent.build()

  # Launch experiment.
  lp.launch(program, xm_resources=lp_utils.make_xm_docker_resources(program))
def build_experiment_config(): """Builds TD3 experiment config which can be executed in different ways.""" # Create an environment, grab the spec, and use it to create networks. suite, task = FLAGS.env_name.split(':', 1) network_factory = (lambda spec: td3.make_networks( spec, hidden_layer_sizes=(256, 256, 256))) # Construct the agent. config = td3.TD3Config( policy_learning_rate=3e-4, critic_learning_rate=3e-4, ) td3_builder = td3.TD3Builder(config) # pylint:disable=g-long-lambda return experiments.ExperimentConfig( builder=td3_builder, environment_factory=lambda seed: helpers.make_environment(suite, task), network_factory=network_factory, seed=FLAGS.seed, max_num_actor_steps=FLAGS.num_steps)
def main(_):
  env = helpers.make_environment(level=FLAGS.level, oar_wrapper=True)
  env_spec = acme.make_environment_spec(env)

  config = impala.IMPALAConfig(
      batch_size=16,
      sequence_period=10,
      seed=FLAGS.seed,
  )

  networks = impala.make_atari_networks(env_spec)
  agent = impala.IMPALAFromConfig(
      environment_spec=env_spec,
      forward_fn=networks.forward_fn,
      unroll_init_fn=networks.unroll_init_fn,
      unroll_fn=networks.unroll_fn,
      initial_state_init_fn=networks.initial_state_init_fn,
      initial_state_fn=networks.initial_state_fn,
      config=config,
  )

  loop = acme.EnvironmentLoop(env, agent)
  loop.run(FLAGS.num_episodes)
def main(_):
  # Access flag value.
  level = FLAGS.task
  environment_factory = (
      lambda seed: helpers.make_environment(level=level, oar_wrapper=True))
  config = r2d2.R2D2Config()

  def net_factory(spec: specs.EnvironmentSpec):
    return r2d2_networks.make_atari_networks(config.batch_size, env_spec=spec)

  env = environment_factory(False)
  env_spec = acme.make_environment_spec(env)
  program = r2d2.DistributedR2D2FromConfig(
      seed=0,
      environment_factory=environment_factory,
      network_factory=net_factory,
      config=config,
      num_actors=FLAGS.num_actors,
      environment_spec=env_spec,
  ).build()

  lp.launch(program, lp.LaunchType.LOCAL_MULTI_PROCESSING)
def main(_):
  # Create an environment, grab the spec, and use it to create networks.
  environment = helpers.make_environment(task=FLAGS.env_name)
  environment_spec = specs.make_environment_spec(environment)

  # Construct the agent.
  # The local layout makes sure that we populate the buffer with
  # min_replay_size initial transitions and that there's no need for
  # tolerance_rate. To avoid deadlocks we need to disable the rate limiting
  # that happens inside the TD3Builder. This is achieved by the
  # min_replay_size and samples_per_insert_tolerance_rate arguments.
  td3_config = td3.TD3Config(
      num_sgd_steps_per_step=FLAGS.num_sgd_steps_per_step,
      min_replay_size=1,
      samples_per_insert_tolerance_rate=float('inf'))
  td3_networks = td3.make_networks(environment_spec)
  if FLAGS.pretrain:
    td3_networks = add_bc_pretraining(td3_networks)

  ail_config = ail.AILConfig(
      direct_rl_batch_size=td3_config.batch_size *
      td3_config.num_sgd_steps_per_step)
  dac_config = ail.DACConfig(ail_config, td3_config)

  def discriminator(*args, **kwargs) -> networks_lib.Logits:
    return ail.DiscriminatorModule(
        environment_spec=environment_spec,
        use_action=True,
        use_next_obs=True,
        network_core=ail.DiscriminatorMLP([4, 4],))(*args, **kwargs)

  discriminator_transformed = hk.without_apply_rng(
      hk.transform_with_state(discriminator))

  ail_network = ail.AILNetworks(
      ail.make_discriminator(environment_spec, discriminator_transformed),
      imitation_reward_fn=ail.rewards.gail_reward(),
      direct_rl_networks=td3_networks)

  agent = ail.DAC(
      spec=environment_spec,
      network=ail_network,
      config=dac_config,
      seed=FLAGS.seed,
      batch_size=td3_config.batch_size * td3_config.num_sgd_steps_per_step,
      make_demonstrations=functools.partial(
          helpers.make_demonstration_iterator,
          dataset_name=FLAGS.dataset_name),
      policy_network=td3.get_default_behavior_policy(
          td3_networks,
          action_specs=environment_spec.actions,
          sigma=td3_config.sigma))

  # Create the environment loop used for training.
  train_logger = experiment_utils.make_experiment_logger(
      label='train', steps_key='train_steps')
  train_loop = acme.EnvironmentLoop(
      environment,
      agent,
      counter=counting.Counter(prefix='train'),
      logger=train_logger)

  # Create the evaluation actor and loop.
  # TODO(lukstafi): sigma=0 for eval?
  eval_logger = experiment_utils.make_experiment_logger(
      label='eval', steps_key='eval_steps')
  eval_actor = agent.builder.make_actor(
      random_key=jax.random.PRNGKey(FLAGS.seed),
      policy_network=td3.get_default_behavior_policy(
          td3_networks, action_specs=environment_spec.actions, sigma=0.),
      variable_source=agent)
  eval_env = helpers.make_environment(task=FLAGS.env_name)
  eval_loop = acme.EnvironmentLoop(
      eval_env,
      eval_actor,
      counter=counting.Counter(prefix='eval'),
      logger=eval_logger)

  assert FLAGS.num_steps % FLAGS.eval_every == 0
  for _ in range(FLAGS.num_steps // FLAGS.eval_every):
    eval_loop.run(num_episodes=5)
    train_loop.run(num_steps=FLAGS.eval_every)
  eval_loop.run(num_episodes=5)
def main(_):
  # Create an environment, grab the spec, and use it to create networks.
  environment = helpers.make_environment(task=FLAGS.env_name)
  environment_spec = specs.make_environment_spec(environment)
  agent_networks = ppo.make_continuous_networks(environment_spec)

  # Construct the agent.
  ppo_config = ppo.PPOConfig(
      unroll_length=FLAGS.unroll_length,
      num_minibatches=FLAGS.ppo_num_minibatches,
      num_epochs=FLAGS.ppo_num_epochs,
      batch_size=FLAGS.transition_batch_size // FLAGS.unroll_length,
      learning_rate=0.0003,
      entropy_cost=0,
      gae_lambda=0.8,
      value_cost=0.25)
  ppo_networks = ppo.make_continuous_networks(environment_spec)
  if FLAGS.pretrain:
    ppo_networks = add_bc_pretraining(ppo_networks)

  discriminator_batch_size = FLAGS.transition_batch_size
  ail_config = ail.AILConfig(
      direct_rl_batch_size=ppo_config.batch_size * ppo_config.unroll_length,
      discriminator_batch_size=discriminator_batch_size,
      is_sequence_based=True,
      num_sgd_steps_per_step=FLAGS.num_discriminator_steps_per_step,
      share_iterator=FLAGS.share_iterator,
  )

  def discriminator(*args, **kwargs) -> networks_lib.Logits:
    # Note: observation embedding is not needed for e.g. Mujoco.
    return ail.DiscriminatorModule(
        environment_spec=environment_spec,
        use_action=True,
        use_next_obs=True,
        network_core=ail.DiscriminatorMLP([4, 4],),
    )(*args, **kwargs)

  discriminator_transformed = hk.without_apply_rng(
      hk.transform_with_state(discriminator))

  ail_network = ail.AILNetworks(
      ail.make_discriminator(environment_spec, discriminator_transformed),
      imitation_reward_fn=ail.rewards.gail_reward(),
      direct_rl_networks=ppo_networks)

  agent = ail.GAIL(
      spec=environment_spec,
      network=ail_network,
      config=ail.GAILConfig(ail_config, ppo_config),
      seed=FLAGS.seed,
      batch_size=ppo_config.batch_size,
      make_demonstrations=functools.partial(
          helpers.make_demonstration_iterator,
          dataset_name=FLAGS.dataset_name),
      policy_network=ppo.make_inference_fn(ppo_networks))

  # Create the environment loop used for training.
  train_logger = experiment_utils.make_experiment_logger(
      label='train', steps_key='train_steps')
  train_loop = acme.EnvironmentLoop(
      environment,
      agent,
      counter=counting.Counter(prefix='train'),
      logger=train_logger)

  # Create the evaluation actor and loop.
  eval_logger = experiment_utils.make_experiment_logger(
      label='eval', steps_key='eval_steps')
  eval_actor = agent.builder.make_actor(
      random_key=jax.random.PRNGKey(FLAGS.seed),
      policy_network=ppo.make_inference_fn(agent_networks, evaluation=True),
      variable_source=agent)
  eval_env = helpers.make_environment(task=FLAGS.env_name)
  eval_loop = acme.EnvironmentLoop(
      eval_env,
      eval_actor,
      counter=counting.Counter(prefix='eval'),
      logger=eval_logger)

  assert FLAGS.num_steps % FLAGS.eval_every == 0
  for _ in range(FLAGS.num_steps // FLAGS.eval_every):
    eval_loop.run(num_episodes=5)
    train_loop.run(num_steps=FLAGS.eval_every)
  eval_loop.run(num_episodes=5)