def __init__(self, env_id, agent, verbose=True, log_interval=100, eval=False):
    '''
    PARAMETERS:
        'env_id'       - Environment ID, e.g. environments.CARTPOLE
        'agent'        - Instance of an Agent class with the necessary methods implemented
        'verbose'      - True: prints logs, False: doesn't print logs
        'log_interval' - Interval between episodes to print logs at
        'eval'         - Use the custom private results path as the results dir
    '''
    self.agent = agent
    self.env_id = env_id
    self.verbose = verbose
    self.log_interval = log_interval
    if eval:
        results_dir = os.environ.get('PRIVATE_RESULTS_DIR')
    else:
        results_dir = os.environ.get('RESULTS_DIR')
    env = bsuite.load_and_record_to_csv(env_id, results_dir=results_dir, overwrite=True)
    self.env = gym_wrapper.GymFromDMEnv(env)

def make_bsuite_environment(bsuite_id: str = 'deep_sea/0',
                            results_dir: str = '/tmp/bsuite',
                            overwrite: bool = False) -> dm_env.Environment:
    raw_environment = bsuite.load_and_record_to_csv(
        bsuite_id=bsuite_id,
        results_dir=results_dir,
        overwrite=overwrite,
    )
    return wrappers.SinglePrecisionWrapper(raw_environment)

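# A minimal usage sketch for the helper above, assuming bsuite and acme's
# wrappers are installed ('/tmp/bsuite' is just an illustrative path).
# dm_env environments are driven through TimeStep objects rather than the
# Gym reset/step API.
env = make_bsuite_environment(bsuite_id='deep_sea/0',
                              results_dir='/tmp/bsuite',
                              overwrite=True)
timestep = env.reset()
while not timestep.last():
    action = 0  # placeholder; a real agent would sample from env.action_spec()
    timestep = env.step(action)
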
def get_env(*args, **kwargs):
    # Build either a bsuite environment (exposed through the Gym API) or a
    # plain Gym environment, stack the last 4 observations, and flatten the
    # stack into a single vector. Note: lazy_frames._frames is a private
    # attribute of gym's LazyFrames; np.asarray(lazy_frames) is the public
    # way to materialize the stack.
    if not gym_id:
        env = gym_wrapper.GymFromDMEnv(
            bsuite.load_and_record_to_csv(
                bsuite_id=bsuite_id,
                results_dir=results_dir,
                overwrite=True,
            ))
    else:
        env = gym.make(gym_id)
    env = FrameStack(env=env, num_stack=4)
    env = TransformObservation(
        env=env,
        f=lambda lazy_frames: np.reshape(np.stack(lazy_frames._frames), -1))
    return GymEnvWrapper(env)

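# Hypothetical context for the factory above: get_env closes over bsuite_id,
# results_dir and gym_id from the enclosing scope (none are parameters), so a
# caller might set them like this. The concrete values are illustrative only.
bsuite_id = 'catch/0'
results_dir = '/tmp/bsuite'
gym_id = None  # set to e.g. 'CartPole-v1' to use a plain Gym env instead of bsuite
env = get_env()
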
def run_random():
    for env_name in sweep.SWEEP:  # Or for a specific suite: sweep.DEEP_SEA
        dm_env = bsuite.load_and_record_to_csv(env_name,
                                               results_dir=RANDOM_RESULTS_PATH,
                                               overwrite=True)

        # Instantiate the agent
        env = gym_wrapper.GymWrapper(dm_env)
        env = ch.envs.Runner(env)
        policy = ch.models.RandomPolicy(env)

        # Generate the results
        print('Running', env_name)
        env.run(policy, episodes=env.bsuite_num_episodes)

def main(_):
    # Create an environment and grab the spec.
    raw_environment = bsuite.load_and_record_to_csv(
        bsuite_id=FLAGS.bsuite_id,
        results_dir=FLAGS.results_dir,
        overwrite=FLAGS.overwrite,
    )
    environment = wrappers.SinglePrecisionWrapper(raw_environment)
    environment_spec = specs.make_environment_spec(environment)

    network = snt.Sequential([
        snt.Flatten(),
        snt.nets.MLP([50, 50, environment_spec.actions.num_values]),
    ])

    # Construct the agent.
    agent = dqn.DQN(environment_spec=environment_spec, network=network)

    # Run the environment loop.
    loop = acme.EnvironmentLoop(environment, agent)
    loop.run(num_episodes=environment.bsuite_num_episodes)  # pytype: disable=attribute-error

def main(_):
    # Create an environment and grab the spec.
    raw_environment = bsuite.load_and_record_to_csv(
        bsuite_id=FLAGS.bsuite_id,
        results_dir=FLAGS.results_dir,
        overwrite=FLAGS.overwrite,
    )
    environment = wrappers.SinglePrecisionWrapper(raw_environment)
    environment_spec = specs.make_environment_spec(environment)

    # Construct the agent.
    agent = dqfd.DQfD(
        environment_spec=environment_spec,
        network=make_network(environment_spec.actions),
        demonstration_dataset=bsuite_demonstrations.make_dataset(raw_environment),
        demonstration_ratio=FLAGS.demonstration_ratio,
        samples_per_insert=FLAGS.samples_per_insert,
        learning_rate=FLAGS.learning_rate)

    # Run the environment loop.
    loop = acme.EnvironmentLoop(environment, agent)
    loop.run(num_episodes=environment.bsuite_num_episodes)  # pytype: disable=attribute-error

def run_trpo():
    ch.debug.debug()
    for i, env_name in enumerate(sweep.SWEEP):
        dm_env = bsuite.load_and_record_to_csv(env_name,
                                               results_dir=TRPO_RESULTS_PATH,
                                               overwrite=True)

        # Instantiate the env and agent
        env = gym_wrapper.GymWrapper(dm_env)
        env = ch.envs.Torch(env)
        env = ch.envs.Runner(env)
        policy = Policy(env)
        baseline = LinearValue(env.state_size)

        # Generate the results
        replay = ch.ExperienceReplay()
        for episode in tqdm(range(1, 1 + env.bsuite_num_episodes), desc=env_name):
            replay += env.run(policy, episodes=1)
            if episode % 10 == 0:
                trpo_update(replay, policy, baseline)
                replay.empty()

def main(_):
    # Create an environment and grab the spec.
    raw_environment = bsuite.load_and_record_to_csv(
        bsuite_id=FLAGS.bsuite_id,
        results_dir=FLAGS.results_dir,
        overwrite=FLAGS.overwrite,
    )
    environment = wrappers.SinglePrecisionWrapper(raw_environment)
    environment_spec = specs.make_environment_spec(environment)

    # Create the networks to optimize.
    network = make_network(environment_spec.actions)

    agent = impala.IMPALA(
        environment_spec=environment_spec,
        network=network,
        sequence_length=3,
        sequence_period=3,
    )

    # Run the environment loop.
    loop = acme.EnvironmentLoop(environment, agent)
    loop.run(num_episodes=environment.bsuite_num_episodes)  # pytype: disable=attribute-error

def main(_):
    # Create an environment and grab the spec.
    raw_environment = bsuite.load_and_record_to_csv(
        bsuite_id=FLAGS.bsuite_id,
        results_dir=FLAGS.results_dir,
        overwrite=FLAGS.overwrite,
    )
    environment = single_precision.SinglePrecisionWrapper(raw_environment)
    environment_spec = specs.make_environment_spec(environment)

    # Build demonstration dataset.
    if hasattr(raw_environment, 'raw_env'):
        raw_environment = raw_environment.raw_env
    batch_dataset = bsuite_demonstrations.make_dataset(raw_environment,
                                                       stochastic=False)

    # Convert demonstration episodes into n-step transitions.
    transition = functools.partial(_n_step_transition_from_episode,
                                   n_step=1,
                                   additional_discount=1.)
    dataset = batch_dataset.map(transition)

    # Batch and prefetch.
    dataset = dataset.batch(FLAGS.batch_size, drop_remainder=True)
    dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

    # Create the networks to optimize.
    policy_network = make_policy_network(environment_spec.actions)

    # If the agent is non-autoregressive use epsilon=0 which will be a greedy
    # policy.
    evaluator_network = snt.Sequential([
        policy_network,
        lambda q: trfl.epsilon_greedy(q, epsilon=FLAGS.epsilon).sample(),
    ])

    # Ensure that we create the variables before proceeding (maybe not needed).
    tf2_utils.create_variables(policy_network, [environment_spec.observations])

    counter = counting.Counter()
    learner_counter = counting.Counter(counter, prefix='learner')

    # Create the actor which defines how we take actions.
    evaluation_network = actors.FeedForwardActor(evaluator_network)

    eval_loop = acme.EnvironmentLoop(
        environment=environment,
        actor=evaluation_network,
        counter=counter,
        logger=loggers.TerminalLogger('evaluation', time_delta=1.))

    # The learner updates the parameters (and initializes them).
    learner = learning.BCLearner(
        network=policy_network,
        learning_rate=FLAGS.learning_rate,
        dataset=dataset,
        counter=learner_counter)

    # Run the environment loop.
    while True:
        for _ in range(FLAGS.evaluate_every):
            learner.step()
        learner_counter.increment(learner_steps=FLAGS.evaluate_every)
        eval_loop.run(FLAGS.evaluation_episodes)

plot_ext = "." + plot_format

# envs = sweep.BANDIT
envs = ["bandit/0"]
for bsuite_id in envs:
    b_env = 'bandit'
    env_plot_path = Path(plot_dir + bsuite_id.replace("/", "-") + "/")
    env_plot_path.mkdir(parents=True, exist_ok=True)
    env_plot_path = str(env_plot_path.resolve())
    args = get_args()

    # Initialize the environment
    bsuite_env = load_and_record_to_csv(bsuite_id,
                                        results_dir=csv_dir,
                                        overwrite=True)
    gym_env = gym_wrapper.GymFromDMEnv(bsuite_env)
    env = GymEnv(gym_env)
    env_builder = lambda: env

    algo = setup_test(args, env)
    off_policy_trainer = OffPolicyTrainer()
    off_policy_trainer.train(args, env_builder, algo)

    # Analyze performance
    df, sweep_vars = csv_load.load_bsuite(csv_dir)
    bandit_df = df[df.bsuite_env == b_env].copy()

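# A hedged sketch of how this analysis is typically continued with bsuite's
# summary_analysis utilities (the same API used in bsuite's analysis
# notebook); the output filename is illustrative, not from the source.
from bsuite.experiments import summary_analysis

bsuite_score = summary_analysis.bsuite_score(df, sweep_vars)
bar_plot = summary_analysis.bsuite_bar_plot(bsuite_score, sweep_vars)
bar_plot.save(env_plot_path + "/bsuite_bar" + plot_ext)
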
def main(_):
    # Create an environment and grab the spec.
    raw_environment = bsuite.load_and_record_to_csv(
        bsuite_id=FLAGS.bsuite_id,
        results_dir=FLAGS.results_dir,
        overwrite=FLAGS.overwrite,
    )
    environment = single_precision.SinglePrecisionWrapper(raw_environment)
    environment_spec = specs.make_environment_spec(environment)

    # Build demonstration dataset.
    if hasattr(raw_environment, 'raw_env'):
        raw_environment = raw_environment.raw_env
    batch_dataset = bsuite_demonstrations.make_dataset(raw_environment)

    # Convert demonstration episodes into n-step transitions.
    transition = functools.partial(_n_step_transition_from_episode,
                                   n_step=1,
                                   additional_discount=1.)
    dataset = batch_dataset.map(transition)

    # Batch and prefetch.
    dataset = dataset.batch(FLAGS.batch_size, drop_remainder=True)
    dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
    dataset = tfds.as_numpy(dataset)

    # Create the networks to optimize.
    policy_network = make_policy_network(environment_spec.actions)
    policy_network = hk.without_apply_rng(hk.transform(policy_network))

    # If the agent is non-autoregressive use epsilon=0 which will be a greedy
    # policy.
    def evaluator_network(params: hk.Params, key: jnp.DeviceArray,
                          observation: jnp.DeviceArray) -> jnp.DeviceArray:
        action_values = policy_network.apply(params, observation)
        return rlax.epsilon_greedy(FLAGS.epsilon).sample(key, action_values)

    counter = counting.Counter()
    learner_counter = counting.Counter(counter, prefix='learner')

    # The learner updates the parameters (and initializes them).
    learner = learning.BCLearner(
        network=policy_network,
        optimizer=optax.adam(FLAGS.learning_rate),
        obs_spec=environment.observation_spec(),
        dataset=dataset,
        counter=learner_counter,
        rng=hk.PRNGSequence(FLAGS.seed))

    # Create the actor which defines how we take actions.
    variable_client = variable_utils.VariableClient(learner, '')
    evaluator = actors.FeedForwardActor(
        evaluator_network,
        variable_client=variable_client,
        rng=hk.PRNGSequence(FLAGS.seed))

    eval_loop = acme.EnvironmentLoop(
        environment=environment,
        actor=evaluator,
        counter=counter,
        logger=loggers.TerminalLogger('evaluation', time_delta=1.))

    # Run the environment loop.
    while True:
        for _ in range(FLAGS.evaluate_every):
            learner.step()
        learner_counter.increment(learner_steps=FLAGS.evaluate_every)
        eval_loop.run(FLAGS.evaluation_episodes)