def main(_):
  # Create an environment and create the spec.
  environment, environment_spec = _build_environment(
      FLAGS.environment_name, max_steps=FLAGS.max_steps_per_episode)

  if FLAGS.model_name:
    loaded_network = load_wb_model(FLAGS.model_name, FLAGS.model_tag)

    if FLAGS.stochastic:
      head = networks.StochasticSamplingHead()
    else:
      head = lambda q: trfl.epsilon_greedy(q, epsilon=FLAGS.epsilon).sample()

    policy_network = snt.Sequential([
        loaded_network,
        head,
    ])
    actor = actors.FeedForwardActor(policy_network)
  else:
    actor = RandomActor(environment_spec)

  recorder = DemonstrationRecorder(environment, actor)
  recorder.collect_n_episodes(FLAGS.n_episodes)
  recorder.make_tf_dataset()
  recorder.save(FLAGS.save_dir)
def make_actor(
    self,
    policy_network: snt.Module,
    adder: Optional[adders.Adder] = None,
    variable_source: Optional[core.VariableSource] = None,
):
  """Create an actor instance."""
  if variable_source:
    # Create the variable client responsible for keeping the actor up-to-date.
    variable_client = variable_utils.VariableClient(
        client=variable_source,
        variables={'policy': policy_network.variables},
        update_period=1000,
    )

    # Make sure not to use a random policy after checkpoint restoration by
    # assigning variables before running the environment loop.
    variable_client.update_and_wait()
  else:
    variable_client = None

  # Create the actor which defines how we take actions.
  return actors.FeedForwardActor(
      policy_network=policy_network,
      adder=adder,
      variable_client=variable_client,
  )
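# Hedged usage sketch for the `make_actor` factory above; it is not part of the
# original snippet. It assumes a builder instance named `builder` exposing
# `make_actor`, an already-built Sonnet `policy_network`, a `learner` that
# implements acme's VariableSource interface, and an `environment` -- all four
# names are illustrative assumptions.
eval_actor = builder.make_actor(
    policy_network=policy_network,
    adder=None,                 # evaluation-only: nothing is written to replay
    variable_source=learner,    # keeps the actor's weights in sync
)
acme.EnvironmentLoop(environment, eval_actor).run(num_episodes=10)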
def actor(
    self,
    replay: reverb.Client,
    variable_source: acme.VariableSource,
    counter: counting.Counter,
) -> acme.EnvironmentLoop:
  """The actor process."""
  action_spec = self._environment_spec.actions
  observation_spec = self._environment_spec.observations

  # Create environment and target networks to act with.
  environment = self._environment_factory(False)
  agent_networks = self._network_factory(action_spec, self._num_critic_heads)

  # Make sure observation network is defined.
  observation_network = agent_networks.get('observation', tf.identity)

  # Create a stochastic behavior policy.
  behavior_network = snt.Sequential([
      observation_network,
      agent_networks['policy'],
      networks.StochasticSamplingHead(),
  ])

  # Ensure network variables are created.
  tf2_utils.create_variables(behavior_network, [observation_spec])
  policy_variables = {'policy': behavior_network.variables}

  # Create the variable client responsible for keeping the actor up-to-date.
  variable_client = tf2_variable_utils.VariableClient(
      variable_source, policy_variables, update_period=1000)

  # Make sure not to use a random policy after checkpoint restoration by
  # assigning variables before running the environment loop.
  variable_client.update_and_wait()

  # Component to add things into replay.
  adder = adders.NStepTransitionAdder(
      client=replay,
      n_step=self._n_step,
      max_in_flight_items=self._max_in_flight_items,
      discount=self._additional_discount)

  # Create the agent.
  actor = actors.FeedForwardActor(
      policy_network=behavior_network,
      adder=adder,
      variable_client=variable_client)

  # Create logger and counter; actors will not spam bigtable.
  counter = counting.Counter(counter, 'actor')
  logger = loggers.make_default_logger(
      'actor',
      save_data=False,
      time_delta=self._log_every,
      steps_key='actor_steps')

  # Create the run loop and return it.
  return acme.EnvironmentLoop(environment, actor, counter, logger)
def main(_):
  # Create an environment, grab the spec, and use it to create networks.
  environment = make_environment(FLAGS.task_name)
  environment_spec = specs.make_environment_spec(environment)
  agent_networks = make_networks(environment_spec)

  # Construct the agent.
  agent = sac.SAC(
      environment_spec=environment_spec,
      policy_network=agent_networks['policy'],
      critic_network=agent_networks['critic'],
      encoder_network=agent_networks['observation'],
      # sigma=0.3,  # pytype: disable=wrong-arg-types
  )

  # Create the environment loop used for training.
  train_loop = acme.EnvironmentLoop(environment, agent, label='train_loop')

  # Create the evaluation policy.
  eval_policy = agent.behavior_network

  # Create the evaluation actor and loop.
  eval_actor = actors.FeedForwardActor(policy_network=eval_policy)
  eval_env = make_environment(FLAGS.task_name)
  eval_loop = acme.EnvironmentLoop(eval_env, eval_actor, label='eval_loop')

  for _ in range(FLAGS.num_episodes // FLAGS.num_episodes_per_eval):
    train_loop.run(num_episodes=FLAGS.num_episodes_per_eval)
def main(_):
  # Create an environment, grab the spec, and use it to create networks.
  environment = make_environment()
  environment_spec = specs.make_environment_spec(environment)
  agent_networks = make_networks(environment_spec.actions)

  # Construct the agent.
  agent = d4pg.D4PG(
      environment_spec=environment_spec,
      policy_network=agent_networks['policy'],
      critic_network=agent_networks['critic'],
      observation_network=agent_networks['observation'],  # pytype: disable=wrong-arg-types
  )

  # Create the environment loop used for training.
  train_loop = acme.EnvironmentLoop(environment, agent, label='train_loop')

  # Create the evaluation policy.
  eval_policy = snt.Sequential([
      agent_networks['observation'],
      agent_networks['policy'],
  ])

  # Create the evaluation actor and loop.
  eval_actor = actors.FeedForwardActor(policy_network=eval_policy)
  eval_env = make_environment()
  eval_loop = acme.EnvironmentLoop(eval_env, eval_actor, label='eval_loop')

  for _ in range(FLAGS.num_episodes // FLAGS.num_episodes_per_eval):
    train_loop.run(num_episodes=FLAGS.num_episodes_per_eval)
    eval_loop.run(num_episodes=1)
def main(_):
  # Create an environment and grab the spec.
  environment = atari.environment(FLAGS.game)
  environment_spec = specs.make_environment_spec(environment)

  # Create dataset.
  dataset = atari.dataset(path=FLAGS.dataset_path,
                          game=FLAGS.game,
                          run=FLAGS.run,
                          num_shards=FLAGS.num_shards)
  # Discard extra inputs.
  dataset = dataset.map(lambda x: x._replace(data=x.data[:5]))

  # Batch and prefetch.
  dataset = dataset.batch(FLAGS.batch_size, drop_remainder=True)
  dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

  # Build network.
  g_network = make_network(environment_spec.actions)
  q_network = make_network(environment_spec.actions)
  network = networks.DiscreteFilteredQNetwork(g_network=g_network,
                                              q_network=q_network,
                                              threshold=FLAGS.bcq_threshold)
  tf2_utils.create_variables(network, [environment_spec.observations])

  evaluator_network = snt.Sequential([
      q_network,
      lambda q: trfl.epsilon_greedy(q, epsilon=FLAGS.epsilon).sample(),
  ])

  # Counters.
  counter = counting.Counter()
  learner_counter = counting.Counter(counter, prefix='learner')

  # Create the actor which defines how we take actions.
  evaluation_network = actors.FeedForwardActor(evaluator_network)

  eval_loop = acme.EnvironmentLoop(
      environment=environment,
      actor=evaluation_network,
      counter=counter,
      logger=loggers.TerminalLogger('evaluation', time_delta=1.))

  # The learner updates the parameters (and initializes them).
  learner = bcq.DiscreteBCQLearner(
      network=network,
      dataset=dataset,
      learning_rate=FLAGS.learning_rate,
      discount=FLAGS.discount,
      importance_sampling_exponent=FLAGS.importance_sampling_exponent,
      target_update_period=FLAGS.target_update_period,
      counter=counter)

  # Run the environment loop.
  while True:
    for _ in range(FLAGS.evaluate_every):
      learner.step()
    learner_counter.increment(learner_steps=FLAGS.evaluate_every)
    eval_loop.run(FLAGS.evaluation_episodes)
def evaluator(
    self,
    variable_source: acme.VariableSource,
    counter: counting.Counter,
):
  """The evaluation process."""
  action_spec = self._environment_spec.actions
  observation_spec = self._environment_spec.observations

  # Create environment and target networks to act with.
  environment = self._environment_factory(True)
  agent_networks = self._network_factory(action_spec)

  # Make sure observation network is defined.
  observation_network = agent_networks.get('observation', tf.identity)

  # Create a deterministic evaluation policy (the mean of the policy head).
  evaluator_network = snt.Sequential([
      observation_network,
      agent_networks['policy'],
      networks.StochasticMeanHead(),
  ])

  # Ensure network variables are created.
  tf2_utils.create_variables(evaluator_network, [observation_spec])
  policy_variables = {'policy': evaluator_network.variables}

  # Create the variable client responsible for keeping the actor up-to-date.
  variable_client = tf2_variable_utils.VariableClient(
      variable_source,
      policy_variables,
      update_period=self._variable_update_period)

  # Make sure not to evaluate a random actor by assigning variables before
  # running the environment loop.
  variable_client.update_and_wait()

  # Create the agent.
  evaluator = actors.FeedForwardActor(
      policy_network=evaluator_network, variable_client=variable_client)

  # Create logger and counter.
  counter = counting.Counter(counter, 'evaluator')
  logger = loggers.make_default_logger(
      'evaluator', time_delta=self._log_every, steps_key='evaluator_steps')

  observers = self._make_observers() if self._make_observers else ()

  # Create the run loop and return it.
  return acme.EnvironmentLoop(
      environment, evaluator, counter, logger, observers=observers)
def actor(
    self,
    replay: reverb.Client,
    variable_source: acme.VariableSource,
    counter: counting.Counter,
):
  """The actor process."""
  action_spec = self._environment_spec.actions
  observation_spec = self._environment_spec.observations

  # Create environment and behavior networks.
  environment = self._environment_factory(False)
  agent_networks = self._network_factory(action_spec)

  # Create behavior network by adding some random dithering.
  behavior_network = snt.Sequential([
      agent_networks.get('observation', tf.identity),
      agent_networks.get('policy'),
      networks.ClippedGaussian(self._sigma),
  ])

  # Ensure network variables are created.
  tf2_utils.create_variables(behavior_network, [observation_spec])
  variables = {'policy': behavior_network.variables}

  # Create the variable client responsible for keeping the actor up-to-date.
  variable_client = tf2_variable_utils.VariableClient(
      variable_source, variables, update_period=self._variable_update_period)

  # Make sure not to use a random policy after checkpoint restoration by
  # assigning variables before running the environment loop.
  variable_client.update_and_wait()

  # Component to add things into replay.
  adder = adders.NStepTransitionAdder(
      client=replay, n_step=self._n_step, discount=self._discount)

  # Create the agent.
  actor = actors.FeedForwardActor(
      behavior_network, adder=adder, variable_client=variable_client)

  # Create logger and counter; actors will not spam bigtable.
  counter = counting.Counter(counter, 'actor')
  logger = loggers.make_default_logger(
      'actor',
      save_data=False,
      time_delta=self._log_every,
      steps_key='actor_steps')

  # Create the loop to connect environment and agent.
  return acme.EnvironmentLoop(environment, actor, counter, logger)
def main(_):
  wb_run = init_or_resume()

  if FLAGS.seed:
    tf.random.set_seed(FLAGS.seed)

  # Create an environment and grab the spec.
  environment, env_spec = _build_environment(FLAGS.environment_name)

  # Load demonstration dataset.
  raw_dataset = load_tf_dataset(directory=FLAGS.dataset_dir)
  dataset = preprocess_dataset(raw_dataset, FLAGS.batch_size,
                               FLAGS.n_step_returns, FLAGS.discount)

  # Create the policy network (behavioral cloning needs no separate critic).
  policy_network = networks.get_default_critic(env_spec)

  # Ensure that we create the variables before proceeding (maybe not needed).
  tf2_utils.create_variables(policy_network, [env_spec.observations])

  # If the agent is non-autoregressive use epsilon=0 which will be a greedy
  # policy.
  evaluator_network = snt.Sequential([
      policy_network,
      lambda q: trfl.epsilon_greedy(q, epsilon=FLAGS.epsilon).sample(),
  ])

  # Create the actor which defines how we take actions.
  evaluation_actor = actors.FeedForwardActor(evaluator_network)

  counter = counting.Counter()

  disp, disp_loop = _build_custom_loggers(wb_run)

  eval_loop = EnvironmentLoop(
      environment=environment,
      actor=evaluation_actor,
      counter=counter,
      logger=disp_loop)

  # The learner updates the parameters (and initializes them).
  learner = BCLearner(
      network=policy_network,
      learning_rate=FLAGS.learning_rate,
      dataset=dataset,
      counter=counter)

  # Run the environment loop.
  for _ in tqdm(range(FLAGS.epochs)):
    for _ in range(FLAGS.evaluate_every):
      learner.step()
    eval_loop.run(FLAGS.evaluation_episodes)

  learner.save(tag=FLAGS.logs_tag)
def main(_):
  # Initialize Neptune and create an experiment.
  neptune.init(FLAGS.neptune_project_name)
  experiment = neptune.create_experiment(name='Acme example')

  # Create an environment, grab the spec, and use it to create networks.
  environment = make_environment()
  environment_spec = specs.make_environment_spec(environment)
  agent_networks = make_networks(environment_spec.actions)

  # Construct the agent.
  agent = d4pg.D4PG(
      environment_spec=environment_spec,
      policy_network=agent_networks['policy'],
      critic_network=agent_networks['critic'],
      observation_network=agent_networks['observation'],
      sigma=1.0,  # pytype: disable=wrong-arg-types
      logger=make_logger(experiment, prefix='learner'),
  )

  # Create the environment loop used for training.
  train_loop = acme.EnvironmentLoop(
      environment,
      agent,
      label='train_loop',
      logger=make_logger(experiment, prefix='train',
                         smoothing_regex='return'))

  # Create the evaluation policy.
  eval_policy = snt.Sequential([
      agent_networks['observation'],
      agent_networks['policy'],
  ])

  # Create the evaluation actor and loop.
  eval_actor = actors.FeedForwardActor(policy_network=eval_policy)
  eval_env = make_environment()
  eval_logger = make_logger(experiment, prefix='eval',
                            aggregate_regex='return')
  eval_loop = acme.EnvironmentLoop(
      eval_env,
      eval_actor,
      label='eval_loop',
      logger=eval_logger,
  )

  for _ in range(FLAGS.num_episodes // FLAGS.num_episodes_per_eval):
    train_loop.run(num_episodes=FLAGS.num_episodes_per_eval)
    eval_loop.run(num_episodes=5)
    eval_logger.dump()
def evaluator(
    self,
    variable_source: acme.VariableSource,
    counter: counting.Counter,
):
  """The evaluation process."""
  action_spec = self._environment_spec.actions
  observation_spec = self._environment_spec.observations

  # Create environment and evaluator networks.
  environment = self._environment_factory(True)
  agent_networks = self._network_factory(action_spec)

  # Create evaluator network.
  evaluator_network = snt.Sequential([
      agent_networks.get('observation', tf.identity),
      agent_networks.get('policy'),
  ])

  # Ensure network variables are created.
  tf2_utils.create_variables(evaluator_network, [observation_spec])
  variables = {'policy': evaluator_network.variables}

  # Create the variable client responsible for keeping the actor up-to-date.
  variable_client = tf2_variable_utils.VariableClient(
      variable_source, variables, update_period=self._variable_update_period)

  # Make sure not to evaluate a random actor by assigning variables before
  # running the environment loop.
  variable_client.update_and_wait()

  # Create the evaluator; note it will not add experience to replay.
  evaluator = actors.FeedForwardActor(
      evaluator_network, variable_client=variable_client)

  # Create logger and counter.
  counter = counting.Counter(counter, 'evaluator')
  logger = loggers.make_default_logger(
      'evaluator', time_delta=self._log_every, steps_key='evaluator_steps')

  # Create the run loop and return it.
  return acme.EnvironmentLoop(environment, evaluator, counter, logger)
def actor(
    self,
    replay: reverb.Client,
    variable_source: acme.VariableSource,
    counter: counting.Counter,
    epsilon: float,
) -> acme.EnvironmentLoop:
  """The actor process."""
  environment = self._environment_factory(False)
  network = self._network_factory(self._env_spec.actions)

  # Just inline the policy network here.
  policy_network = snt.Sequential([
      network,
      lambda q: trfl.epsilon_greedy(q, epsilon=epsilon).sample(),
  ])

  tf2_utils.create_variables(policy_network, [self._env_spec.observations])

  variable_client = tf2_variable_utils.VariableClient(
      client=variable_source,
      variables={'policy': policy_network.trainable_variables},
      update_period=self._variable_update_period)

  # Make sure not to use a random policy after checkpoint restoration by
  # assigning variables before running the environment loop.
  variable_client.update_and_wait()

  # Component to add things into replay.
  adder = adders.NStepTransitionAdder(
      client=replay,
      n_step=self._n_step,
      discount=self._discount,
  )

  # Create the agent.
  actor = actors.FeedForwardActor(policy_network, adder, variable_client)

  # Create the loop to connect environment and agent.
  counter = counting.Counter(counter, 'actor')
  logger = loggers.make_default_logger(
      'actor', save_data=False, steps_key='actor_steps')
  return acme.EnvironmentLoop(environment, actor, counter, logger)
def evaluator(
    self,
    variable_source: acme.VariableSource,
    counter: counting.Counter,
):
  """The evaluation process."""
  # Create environment and target networks to act with.
  environment = self._environment_factory(True)
  agent_networks = self._network_factory(self._environment_spec)

  # Create a deterministic evaluation policy (the mean of the policy head).
  evaluator_network = snt.Sequential([
      agent_networks['observation'],
      agent_networks['policy'],
      networks.StochasticMeanHead(),
  ])

  # Create the variable client responsible for keeping the actor up-to-date.
  variable_client = tf2_variable_utils.VariableClient(
      variable_source,
      variables={'policy': evaluator_network.variables},
      update_period=1000)

  # Make sure not to evaluate a random actor by assigning variables before
  # running the environment loop.
  variable_client.update_and_wait()

  # Create the agent.
  evaluator = actors.FeedForwardActor(
      policy_network=evaluator_network, variable_client=variable_client)

  # Create logger and counter.
  counter = counting.Counter(counter, 'evaluator')
  logger = loggers.make_default_logger('evaluator', time_delta=self._log_every)

  # Create the run loop and return it.
  return acme.EnvironmentLoop(environment, evaluator, counter, logger)
def evaluator(
    self,
    variable_source: acme.VariableSource,
    counter: counting.Counter,
):
  """The evaluation process."""
  environment = self._environment_factory(True)
  network = self._network_factory(self._env_spec.actions)

  # Just inline the policy network here.
  policy_network = snt.Sequential([
      network,
      lambda q: trfl.epsilon_greedy(q, self._evaluator_epsilon).sample(),
  ])

  tf2_utils.create_variables(policy_network, [self._env_spec.observations])

  variable_client = tf2_variable_utils.VariableClient(
      client=variable_source,
      variables={'policy': policy_network.trainable_variables},
      update_period=self._variable_update_period)

  # Make sure not to use a random policy after checkpoint restoration by
  # assigning variables before running the environment loop.
  variable_client.update_and_wait()

  # Create the agent.
  actor = actors.FeedForwardActor(
      policy_network, variable_client=variable_client)

  # Create the run loop and return it.
  logger = loggers.make_default_logger(
      'evaluator', steps_key='evaluator_steps')
  counter = counting.Counter(counter, 'evaluator')
  return acme.EnvironmentLoop(
      environment, actor, counter=counter, logger=logger)
def cal_mse(value_func, policy_net, environment, mse_samples, discount):
  sample_count = 0
  actor = actors.FeedForwardActor(policy_network=policy_net)
  timestep = environment.reset()
  actor.observe_first(timestep)
  mse = 0.0

  while sample_count < mse_samples:
    current_obs = timestep.observation
    action = actor.select_action(current_obs)
    timestep = environment.step(action)
    actor.observe(action, next_timestep=timestep)
    next_obs = timestep.observation
    reward = timestep.reward

    if timestep.last():
      timestep = environment.reset()
      actor.observe_first(timestep)
      current_obs = tf2_utils.add_batch_dim(current_obs)
      action = tf2_utils.add_batch_dim(action)
      mse_one = (reward - value_func(current_obs, action))**2
      print(value_func(current_obs, action).numpy().squeeze())
      print(f'reward = {reward}')
      print('=====End Episode=====')
    else:
      next_action = tf2_utils.add_batch_dim(actor.select_action(next_obs))
      action = tf2_utils.add_batch_dim(action)
      current_obs = tf2_utils.add_batch_dim(current_obs)
      next_obs = tf2_utils.add_batch_dim(next_obs)
      mse_one = (reward + discount * value_func(next_obs, next_action) -
                 value_func(current_obs, action))**2
      print(value_func(current_obs, action).numpy().squeeze())

    mse = mse + mse_one.numpy()
    sample_count += 1

  return mse.squeeze() / mse_samples
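# Hedged usage sketch for cal_mse above; it is not part of the original source.
# It assumes a trained critic `value_func(obs, action)` that returns a batched
# value, a Sonnet `policy_net`, and a dm_env-style `environment` -- all three
# names are assumptions made purely for illustration.
bellman_mse = cal_mse(
    value_func=value_func,
    policy_net=policy_net,
    environment=environment,
    mse_samples=1000,
    discount=0.99)
print(f'Average squared TD error over 1000 samples: {bellman_mse}')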
def main(_):
  # Create an environment and grab the spec.
  raw_environment = bsuite.load_and_record_to_csv(
      bsuite_id=FLAGS.bsuite_id,
      results_dir=FLAGS.results_dir,
      overwrite=FLAGS.overwrite,
  )
  environment = single_precision.SinglePrecisionWrapper(raw_environment)
  environment_spec = specs.make_environment_spec(environment)

  # Build demonstration dataset.
  if hasattr(raw_environment, 'raw_env'):
    raw_environment = raw_environment.raw_env

  batch_dataset = bsuite_demonstrations.make_dataset(raw_environment,
                                                     stochastic=False)
  # Combine with demonstration dataset.
  transition = functools.partial(_n_step_transition_from_episode,
                                 n_step=1,
                                 additional_discount=1.)

  dataset = batch_dataset.map(transition)

  # Batch and prefetch.
  dataset = dataset.batch(FLAGS.batch_size, drop_remainder=True)
  dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

  # Create the networks to optimize.
  policy_network = make_policy_network(environment_spec.actions)

  # If the agent is non-autoregressive use epsilon=0 which will be a greedy
  # policy.
  evaluator_network = snt.Sequential([
      policy_network,
      lambda q: trfl.epsilon_greedy(q, epsilon=FLAGS.epsilon).sample(),
  ])

  # Ensure that we create the variables before proceeding (maybe not needed).
  tf2_utils.create_variables(policy_network, [environment_spec.observations])

  counter = counting.Counter()
  learner_counter = counting.Counter(counter, prefix='learner')

  # Create the actor which defines how we take actions.
  evaluation_network = actors.FeedForwardActor(evaluator_network)

  eval_loop = acme.EnvironmentLoop(
      environment=environment,
      actor=evaluation_network,
      counter=counter,
      logger=loggers.TerminalLogger('evaluation', time_delta=1.))

  # The learner updates the parameters (and initializes them).
  learner = learning.BCLearner(
      network=policy_network,
      learning_rate=FLAGS.learning_rate,
      dataset=dataset,
      counter=learner_counter)

  # Run the environment loop.
  while True:
    for _ in range(FLAGS.evaluate_every):
      learner.step()
    learner_counter.increment(learner_steps=FLAGS.evaluate_every)
    eval_loop.run(FLAGS.evaluation_episodes)
def __init__(
    self,
    environment_spec: specs.EnvironmentSpec,
    network: snt.Module,
    batch_size: int = 256,
    prefetch_size: int = 4,
    target_update_period: int = 100,
    samples_per_insert: float = 32.0,
    min_replay_size: int = 1000,
    max_replay_size: int = 1000000,
    importance_sampling_exponent: float = 0.2,
    priority_exponent: float = 0.6,
    n_step: int = 5,
    epsilon: tf.Tensor = None,
    learning_rate: float = 1e-3,
    discount: float = 0.99,
    cql_alpha: float = 1.,
    logger: loggers.Logger = None,
    counter: counting.Counter = None,
    checkpoint_subpath: str = '~/acme/',
):
  """Initialize the agent.

  Args:
    environment_spec: description of the actions, observations, etc.
    network: the online Q network (the one being optimized)
    batch_size: batch size for updates.
    prefetch_size: size to prefetch from replay.
    target_update_period: number of learner steps to perform before updating
      the target networks.
    samples_per_insert: number of samples to take from replay for every
      insert that is made.
    min_replay_size: minimum replay size before updating. This and all
      following arguments are related to dataset construction and will be
      ignored if a dataset argument is passed.
    max_replay_size: maximum replay size.
    importance_sampling_exponent: power to which importance weights are
      raised before normalizing.
    priority_exponent: exponent used in prioritized sampling.
    n_step: number of steps to squash into a single transition.
    epsilon: probability of taking a random action; ignored if a policy
      network is given.
    learning_rate: learning rate for the q-network update.
    discount: discount to use for TD updates.
    logger: logger object to be used by learner.
    checkpoint: boolean indicating whether to checkpoint the learner.
    checkpoint_subpath: directory for the checkpoint.
  """
  # Create a replay server to add data to. This uses no limiter behavior in
  # order to allow the Agent interface to handle it.
  replay_table = reverb.Table(
      name=adders.DEFAULT_PRIORITY_TABLE,
      sampler=reverb.selectors.Prioritized(priority_exponent),
      remover=reverb.selectors.Fifo(),
      max_size=max_replay_size,
      rate_limiter=reverb.rate_limiters.MinSize(1),
      signature=adders.NStepTransitionAdder.signature(environment_spec))
  self._server = reverb.Server([replay_table], port=None)

  # The adder is used to insert observations into replay.
  address = f'localhost:{self._server.port}'
  adder = adders.NStepTransitionAdder(
      client=reverb.Client(address), n_step=n_step, discount=discount)

  # The dataset provides an interface to sample from replay.
  replay_client = reverb.TFClient(address)
  dataset = datasets.make_reverb_dataset(
      client=replay_client,
      environment_spec=environment_spec,
      batch_size=batch_size,
      prefetch_size=prefetch_size,
      transition_adder=True)

  # Use constant 0.05 epsilon greedy policy by default.
  if epsilon is None:
    epsilon = tf.Variable(0.05, trainable=False)
  policy_network = snt.Sequential([
      network,
      lambda q: trfl.epsilon_greedy(q, epsilon=epsilon).sample(),
  ])

  # Create a target network.
  target_network = copy.deepcopy(network)

  # Ensure that we create the variables before proceeding (maybe not needed).
  tf2_utils.create_variables(network, [environment_spec.observations])
  tf2_utils.create_variables(target_network, [environment_spec.observations])

  # Create the actor which defines how we take actions.
  actor = actors.FeedForwardActor(policy_network, adder)

  # The learner updates the parameters (and initializes them).
  learner = CQLLearner(
      network=network,
      discount=discount,
      importance_sampling_exponent=importance_sampling_exponent,
      learning_rate=learning_rate,
      cql_alpha=cql_alpha,
      target_update_period=target_update_period,
      dataset=dataset,
      replay_client=replay_client,
      logger=logger,
      counter=counter,
      checkpoint_subpath=checkpoint_subpath)

  super().__init__(
      actor=actor,
      learner=learner,
      min_observations=max(batch_size, min_replay_size),
      observations_per_step=float(batch_size) / samples_per_insert)
def __init__(
    self,
    environment_spec: specs.EnvironmentSpec,
    policy_network: snt.Module,
    critic_network: snt.Module,
    observation_network: types.TensorTransformation = tf.identity,
    discount: float = 0.99,
    batch_size: int = 256,
    prefetch_size: int = 4,
    target_policy_update_period: int = 100,
    target_critic_update_period: int = 100,
    min_replay_size: int = 1000,
    max_replay_size: int = 1000000,
    samples_per_insert: float = 32.0,
    policy_loss_module: snt.Module = None,
    policy_optimizer: snt.Optimizer = None,
    critic_optimizer: snt.Optimizer = None,
    n_step: int = 5,
    num_samples: int = 20,
    clipping: bool = True,
    logger: loggers.Logger = None,
    counter: counting.Counter = None,
    checkpoint: bool = True,
    replay_table_name: str = adders.DEFAULT_PRIORITY_TABLE,
):
  """Initialize the agent.

  Args:
    environment_spec: description of the actions, observations, etc.
    policy_network: the online (optimized) policy.
    critic_network: the online critic.
    observation_network: optional network to transform the observations
      before they are fed into any network.
    discount: discount to use for TD updates.
    batch_size: batch size for updates.
    prefetch_size: size to prefetch from replay.
    target_policy_update_period: number of updates to perform before updating
      the target policy network.
    target_critic_update_period: number of updates to perform before updating
      the target critic network.
    min_replay_size: minimum replay size before updating.
    max_replay_size: maximum replay size.
    samples_per_insert: number of samples to take from replay for every
      insert that is made.
    policy_loss_module: configured MPO loss function for the policy
      optimization; defaults to sensible values on the control suite. See
      `acme/tf/losses/mpo.py` for more details.
    policy_optimizer: optimizer to be used on the policy.
    critic_optimizer: optimizer to be used on the critic.
    n_step: number of steps to squash into a single transition.
    num_samples: number of actions to sample when doing a Monte Carlo
      integration with respect to the policy.
    clipping: whether to clip gradients by global norm.
    logger: logging object used to write to logs.
    counter: counter object used to keep track of steps.
    checkpoint: boolean indicating whether to checkpoint the learner.
    replay_table_name: string indicating what name to give the replay table.
  """
  # Create a replay server to add data to.
  replay_table = reverb.Table(
      name=adders.DEFAULT_PRIORITY_TABLE,
      sampler=reverb.selectors.Uniform(),
      remover=reverb.selectors.Fifo(),
      max_size=max_replay_size,
      rate_limiter=reverb.rate_limiters.MinSize(min_size_to_sample=1),
      signature=adders.NStepTransitionAdder.signature(environment_spec))
  self._server = reverb.Server([replay_table], port=None)

  # The adder is used to insert observations into replay.
  address = f'localhost:{self._server.port}'
  adder = adders.NStepTransitionAdder(
      client=reverb.Client(address), n_step=n_step, discount=discount)

  # The dataset object to learn from.
  dataset = datasets.make_reverb_dataset(
      table=replay_table_name,
      client=reverb.TFClient(address),
      batch_size=batch_size,
      prefetch_size=prefetch_size,
      environment_spec=environment_spec,
      transition_adder=True)

  # Make sure observation network is a Sonnet Module.
  observation_network = tf2_utils.to_sonnet_module(observation_network)

  # Create target networks before creating online/target network variables.
  target_policy_network = copy.deepcopy(policy_network)
  target_critic_network = copy.deepcopy(critic_network)
  target_observation_network = copy.deepcopy(observation_network)

  # Get observation and action specs.
  act_spec = environment_spec.actions
  obs_spec = environment_spec.observations
  emb_spec = tf2_utils.create_variables(observation_network, [obs_spec])

  # Create the behavior policy.
  behavior_network = snt.Sequential([
      observation_network,
      policy_network,
      networks.StochasticSamplingHead(),
  ])

  # Create variables.
  tf2_utils.create_variables(policy_network, [emb_spec])
  tf2_utils.create_variables(critic_network, [emb_spec, act_spec])
  tf2_utils.create_variables(target_policy_network, [emb_spec])
  tf2_utils.create_variables(target_critic_network, [emb_spec, act_spec])
  tf2_utils.create_variables(target_observation_network, [obs_spec])

  # Create the actor which defines how we take actions.
  actor = actors.FeedForwardActor(
      policy_network=behavior_network, adder=adder)

  # Create optimizers.
  policy_optimizer = policy_optimizer or snt.optimizers.Adam(1e-4)
  critic_optimizer = critic_optimizer or snt.optimizers.Adam(1e-4)

  # The learner updates the parameters (and initializes them).
  learner = learning.MPOLearner(
      policy_network=policy_network,
      critic_network=critic_network,
      observation_network=observation_network,
      target_policy_network=target_policy_network,
      target_critic_network=target_critic_network,
      target_observation_network=target_observation_network,
      policy_loss_module=policy_loss_module,
      policy_optimizer=policy_optimizer,
      critic_optimizer=critic_optimizer,
      clipping=clipping,
      discount=discount,
      num_samples=num_samples,
      target_policy_update_period=target_policy_update_period,
      target_critic_update_period=target_critic_update_period,
      dataset=dataset,
      logger=logger,
      counter=counter,
      checkpoint=checkpoint)

  super().__init__(
      actor=actor,
      learner=learner,
      min_observations=max(batch_size, min_replay_size),
      observations_per_step=float(batch_size) / samples_per_insert)
def main(_):
  # TODO(yutian): Create environment.
  # # Create an environment and grab the spec.
  # raw_environment = bsuite.load_and_record_to_csv(
  #     bsuite_id=FLAGS.bsuite_id,
  #     results_dir=FLAGS.results_dir,
  #     overwrite=FLAGS.overwrite,
  # )
  # environment = single_precision.SinglePrecisionWrapper(raw_environment)
  # environment_spec = specs.make_environment_spec(environment)

  # TODO(yutian): Create dataset.
  # Build the dataset.
  # if hasattr(raw_environment, 'raw_env'):
  #   raw_environment = raw_environment.raw_env
  #
  # batch_dataset = bsuite_demonstrations.make_dataset(raw_environment)
  # # Combine with demonstration dataset.
  # transition = functools.partial(
  #     _n_step_transition_from_episode, n_step=1, additional_discount=1.)
  #
  # dataset = batch_dataset.map(transition)
  #
  # # Batch and prefetch.
  # dataset = dataset.batch(FLAGS.batch_size, drop_remainder=True)
  # dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

  # Create the networks to optimize.
  networks = make_networks(environment_spec.actions)
  treatment_net = networks['treatment_net']
  instrumental_net = networks['instrumental_net']
  policy_net = networks['policy_net']

  # If the agent is non-autoregressive use epsilon=0 which will be a greedy
  # policy.
  evaluator_net = snt.Sequential([
      policy_net,
      # Sample actions.
      acme_nets.StochasticSamplingHead()
  ])

  # Ensure that we create the variables before proceeding (maybe not needed).
  tf2_utils.create_variables(policy_net, [environment_spec.observations])
  # TODO(liyuan): set the proper input spec using
  # environment_spec.observations and environment_spec.actions.
  tf2_utils.create_variables(treatment_net, [environment_spec.observations])
  tf2_utils.create_variables(
      instrumental_net,
      [environment_spec.observations, environment_spec.actions])

  counter = counting.Counter()
  learner_counter = counting.Counter(counter, prefix='learner')

  # Create the actor which defines how we take actions.
  evaluator_net = actors.FeedForwardActor(evaluator_net)

  eval_loop = acme.EnvironmentLoop(
      environment=environment,
      actor=evaluator_net,
      counter=counter,
      logger=loggers.TerminalLogger('evaluation', time_delta=1.))

  # The learner updates the parameters (and initializes them).
  learner = learning.DFIVLearner(
      treatment_net=treatment_net,
      instrumental_net=instrumental_net,
      policy_net=policy_net,
      treatment_learning_rate=FLAGS.treatment_learning_rate,
      instrumental_learning_rate=FLAGS.instrumental_learning_rate,
      policy_learning_rate=FLAGS.policy_learning_rate,
      dataset=dataset,
      counter=learner_counter)

  # Run the environment loop.
  while True:
    for _ in range(FLAGS.evaluate_every):
      learner.step()
    learner_counter.increment(learner_steps=FLAGS.evaluate_every)
    eval_loop.run(FLAGS.evaluation_episodes)
def __init__(
    self,
    environment_spec: specs.EnvironmentSpec,
    network: snt.Module,
    params=None,
    logger: loggers.Logger = None,
    checkpoint: bool = True,
    paths: Save_paths = None,
):
  """Initialize the agent.

  Args:
    environment_spec: description of the actions, observations, etc.
    network: the online Q network (the one being optimized)
    batch_size: batch size for updates.
    prefetch_size: size to prefetch from replay.
    target_update_period: number of learner steps to perform before updating
      the target networks.
    samples_per_insert: number of samples to take from replay for every
      insert that is made.
    min_replay_size: minimum replay size before updating. This and all
      following arguments are related to dataset construction and will be
      ignored if a dataset argument is passed.
    max_replay_size: maximum replay size.
    importance_sampling_exponent: power to which importance weights are
      raised before normalizing.
    priority_exponent: exponent used in prioritized sampling.
    n_step: number of steps to squash into a single transition.
    epsilon: probability of taking a random action; ignored if a policy
      network is given.
    learning_rate: learning rate for the q-network update.
    discount: discount to use for TD updates.
    logger: logger object to be used by learner.
    checkpoint: boolean indicating whether to checkpoint the learner.
  """
  # Create a replay server to add data to. This uses no limiter behavior in
  # order to allow the Agent interface to handle it.
  if params is None:
    params = {
        'batch_size': 256,
        'prefetch_size': 4,
        'target_update_period': 100,
        'samples_per_insert': 32.0,
        'min_replay_size': 1000,
        'max_replay_size': 1000000,
        'importance_sampling_exponent': 0.2,
        'priority_exponent': 0.6,
        'n_step': 5,
        'epsilon': 0.05,
        'learning_rate': 1e-3,
        'discount': 0.99,
    }

  replay_table = reverb.Table(
      name=adders.DEFAULT_PRIORITY_TABLE,
      sampler=reverb.selectors.Prioritized(params['priority_exponent']),
      remover=reverb.selectors.Fifo(),
      max_size=params['max_replay_size'],
      rate_limiter=reverb.rate_limiters.MinSize(1))
  self._server = reverb.Server([replay_table], port=None)

  # The adder is used to insert observations into replay.
  address = f'localhost:{self._server.port}'
  adder = adders.NStepTransitionAdder(
      client=reverb.Client(address),
      n_step=params['n_step'],
      discount=params['discount'])

  # The dataset provides an interface to sample from replay.
  replay_client = reverb.TFClient(address)
  dataset = datasets.make_reverb_dataset(
      client=replay_client,
      environment_spec=environment_spec,
      batch_size=params['batch_size'],
      prefetch_size=params['prefetch_size'],
      transition_adder=True)

  # Use constant 0.05 epsilon greedy policy by default.
  epsilon = tf.Variable(params['epsilon'], trainable=False)
  policy_network = snt.Sequential([
      network,
      lambda q: trfl.epsilon_greedy(q, epsilon=epsilon).sample(),
  ])

  # Create a target network.
  target_network = copy.deepcopy(network)

  # Ensure that we create the variables before proceeding (maybe not needed).
  # tf2_utils.create_variables(network, [environment_spec.observations])
  # tf2_utils.create_variables(target_network,
  #                            [environment_spec.observations])

  # Create the actor which defines how we take actions.
  actor = actors.FeedForwardActor(policy_network, adder)

  # The learner updates the parameters (and initializes them).
  learner = learning.DQNLearner(
      network=network,
      target_network=target_network,
      discount=params['discount'],
      importance_sampling_exponent=params['importance_sampling_exponent'],
      learning_rate=params['learning_rate'],
      target_update_period=params['target_update_period'],
      dataset=dataset,
      replay_client=replay_client,
      logger=logger,
      checkpoint=checkpoint)

  if checkpoint:
    self._checkpointer = tf2_savers.Checkpointer(
        add_uid=False,
        objects_to_save=learner.state,
        directory=paths.data_dir,
        subdirectory=paths.experiment_name,
        time_delta_minutes=60.)
  else:
    self._checkpointer = None

  super().__init__(
      actor=actor,
      learner=learner,
      min_observations=max(params['batch_size'], params['min_replay_size']),
      observations_per_step=float(params['batch_size']) /
      params['samples_per_insert'])
    max_replay_size=args.replay_table_max_replay_size,
    min_replay_size=args.min_replay_size,
    shutdown_table_name=args.shutdown_table_name,
    device_placement=args.learner_device_placement,
    batch_size=args.batch_size,
    broadcaster_table_name=args.broadcaster_table_name)

# Create the evaluation policy.
with tf.device(args.learner_device_placement):
  # Create the behavior policy.
  eval_policy = snt.Sequential([
      agent_networks['observation'],
      agent_networks['policy'],
  ])
  eval_actor = actors.FeedForwardActor(policy_network=eval_policy)
  eval_env = make_environment(args.taskstr)
  eval_loop = CustomEnvironmentLoop(eval_env, eval_actor,
                                    label='%s/' % (args.logpath))

def broadcast_shutdown(should_shutdown):
  learner.client.insert(should_shutdown, {args.shutdown_table_name: 1.0})

steps = 0

def broadcast_variables(weights):
  if weights is None:
    weights = [
        tf2_utils.to_numpy(v)
def main(_):
  problem_config = FLAGS.problem_config

  # Load the offline dataset and environment.
  _, _, environment = utils.load_data_and_env(
      task_name=problem_config['task_name'],
      noise_level=problem_config['noise_level'],
      near_policy_dataset=problem_config['near_policy_dataset'],
      dataset_path=FLAGS.dataset_path,
      batch_size=1)
  environment_spec = specs.make_environment_spec(environment)

  # Load pretrained target policy network.
  policy_net = utils.load_policy_net(
      task_name=problem_config['task_name'],
      noise_level=problem_config['noise_level'],
      near_policy_dataset=problem_config['near_policy_dataset'],
      dataset_path=FLAGS.dataset_path,
      environment_spec=environment_spec)

  actor = actors.FeedForwardActor(policy_network=policy_net)

  logger = loggers.TerminalLogger('ground_truth')

  discount = problem_config['discount']

  returns = []
  lengths = []

  t_start = time.time()
  timestep = environment.reset()
  actor.observe_first(timestep)
  cur_return = 0.
  cur_step = 0
  while len(returns) < FLAGS.num_episodes:
    action = actor.select_action(timestep.observation)
    timestep = environment.step(action)

    # Have the agent observe the timestep and let the actor update itself.
    actor.observe(action, next_timestep=timestep)

    cur_return += pow(discount, cur_step) * timestep.reward
    cur_step += 1

    if timestep.last():
      # Append return of the current episode, and reset the environment.
      returns.append(cur_return)
      lengths.append(cur_step)
      timestep = environment.reset()
      actor.observe_first(timestep)
      cur_return = 0.
      cur_step = 0

      if len(returns) % (FLAGS.num_episodes // 10) == 0:
        print(f'Run time {time.time() - t_start:0.0f} secs, '
              f'evaluated episode {len(returns)} / {FLAGS.num_episodes}')

  # Returned data include problem configs.
  results = {
      '_'.join(keys): value
      for keys, value in tree.flatten_with_path(problem_config)
  }
  # And computed results.
  results.update({
      'metric_value': np.mean(returns),
      'metric_std_dev': np.std(returns, ddof=0),
      'metric_std_err': np.std(returns, ddof=0) / np.sqrt(len(returns)),
      'length_mean': np.mean(lengths),
      'length_std': np.std(lengths, ddof=0),
      'num_episodes': len(returns),
  })
  logger.write(results)
def _generate_data(
    policy_net,
    environment,
    n_samples,
    batch_size,
    shuffle,
    include_terminal=False,  # Include terminal absorbing state.
    ignore_d_tm1=False  # Set d_tm1 as constant 1.0 if True.
):
  sample_count = 0
  actor = actors.FeedForwardActor(policy_network=policy_net)
  timestep = environment.reset()
  actor.observe_first(timestep)

  current_obs_list = []
  action_list = []
  next_obs_list = []
  reward_list = []
  discount_list = []
  nonterminal_list = []

  while sample_count < n_samples:
    current_obs = timestep.observation
    action = actor.select_action(current_obs)
    timestep = environment.step(action)
    actor.observe(action, next_timestep=timestep)
    next_obs = timestep.observation
    reward = timestep.reward
    discount = np.array(1.0, dtype=np.float32)
    if timestep.last() and not include_terminal:
      discount = np.array(0.0, dtype=np.float32)

    current_obs_list.append(tf2_utils.add_batch_dim(current_obs))
    action_list.append(tf2_utils.add_batch_dim(action))
    reward_list.append(tf2_utils.add_batch_dim(reward))
    discount_list.append(tf2_utils.add_batch_dim(discount))
    next_obs_list.append(tf2_utils.add_batch_dim(next_obs))
    nonterminal_list.append(
        tf2_utils.add_batch_dim(np.array(1.0, dtype=np.float32)))

    if timestep.last():
      if include_terminal:
        # Make another transition tuple from s, a -> s, a with 0 reward.
        current_obs = next_obs
        # action = actor.select_action(current_obs)
        reward = np.zeros_like(timestep.reward)
        discount = np.array(1.0, dtype=np.float32)
        next_obs = current_obs
        if ignore_d_tm1:
          d_tm1 = np.array(1.0, dtype=np.float32)
        else:
          d_tm1 = np.array(0.0, dtype=np.float32)

        for i in range(environment.action_spec().num_values):
          action_ = np.array(i, dtype=action.dtype).reshape(action.shape)
          current_obs_list.append(tf2_utils.add_batch_dim(current_obs))
          action_list.append(tf2_utils.add_batch_dim(action_))
          reward_list.append(tf2_utils.add_batch_dim(reward))
          discount_list.append(tf2_utils.add_batch_dim(discount))
          next_obs_list.append(tf2_utils.add_batch_dim(next_obs))
          nonterminal_list.append(tf2_utils.add_batch_dim(d_tm1))

      timestep = environment.reset()
      actor.observe_first(timestep)

    sample_count += 1

  current_obs_data = tf.concat(current_obs_list, axis=0)
  action_data = tf.concat(action_list, axis=0)
  next_obs_data = tf.concat(next_obs_list, axis=0)
  reward_data = tf.concat(reward_list, axis=0)
  discount_data = tf.concat(discount_list, axis=0)
  nonterminal_data = tf.concat(nonterminal_list, axis=0)

  dataset = tf.data.Dataset.from_tensor_slices((
      current_obs_data,
      action_data,
      reward_data,
      discount_data,
      next_obs_data,
      # The last action is not valid and should not be used.
      action_data,
      nonterminal_data))

  def _reverb_sample(*data_tuple):
    info = reverb.SampleInfo(
        key=tf.constant(0, tf.uint64),
        probability=tf.constant(1.0, tf.float64),
        table_size=tf.constant(0, tf.int64),
        priority=tf.constant(1.0, tf.float64))
    return reverb.ReplaySample(info=info, data=data_tuple)

  dataset = dataset.map(
      _reverb_sample, num_parallel_calls=tf.data.experimental.AUTOTUNE)

  dataset = dataset.cache()
  if shuffle:
    dataset = dataset.shuffle(batch_size * 10)
  dataset = dataset.repeat()
  dataset = dataset.batch(batch_size, drop_remainder=True)

  return dataset
def __init__(
    self,
    environment_spec: specs.EnvironmentSpec,
    network: snt.Module,
    demonstration_dataset: tf.data.Dataset,
    demonstration_ratio: float,
    batch_size: int = 256,
    prefetch_size: int = 4,
    target_update_period: int = 100,
    samples_per_insert: float = 32.0,
    min_replay_size: int = 1000,
    max_replay_size: int = 1000000,
    importance_sampling_exponent: float = 0.2,
    n_step: int = 5,
    epsilon: tf.Tensor = None,
    learning_rate: float = 1e-3,
    discount: float = 0.99,
):
  """Initialize the agent.

  Args:
    environment_spec: description of the actions, observations, etc.
    network: the online Q network (the one being optimized)
    demonstration_dataset: tf.data.Dataset producing (timestep, action)
      tuples containing full episodes.
    demonstration_ratio: Ratio of transitions coming from demonstrations.
    batch_size: batch size for updates.
    prefetch_size: size to prefetch from replay.
    target_update_period: number of learner steps to perform before updating
      the target networks.
    samples_per_insert: number of samples to take from replay for every
      insert that is made.
    min_replay_size: minimum replay size before updating. This and all
      following arguments are related to dataset construction and will be
      ignored if a dataset argument is passed.
    max_replay_size: maximum replay size.
    importance_sampling_exponent: power to which importance weights are
      raised before normalizing.
    n_step: number of steps to squash into a single transition.
    epsilon: probability of taking a random action; ignored if a policy
      network is given.
    learning_rate: learning rate for the q-network update.
    discount: discount to use for TD updates.
  """
  # Create a replay server to add data to. This uses no limiter behavior in
  # order to allow the Agent interface to handle it.
  replay_table = reverb.Table(
      name=adders.DEFAULT_PRIORITY_TABLE,
      sampler=reverb.selectors.Uniform(),
      remover=reverb.selectors.Fifo(),
      max_size=max_replay_size,
      rate_limiter=reverb.rate_limiters.MinSize(1))
  self._server = reverb.Server([replay_table], port=None)

  # The adder is used to insert observations into replay.
  address = f'localhost:{self._server.port}'
  adder = adders.NStepTransitionAdder(
      client=reverb.Client(address), n_step=n_step, discount=discount)

  # The dataset provides an interface to sample from replay.
  replay_client = reverb.TFClient(address)
  dataset = datasets.make_reverb_dataset(
      client=replay_client,
      environment_spec=environment_spec,
      transition_adder=True)

  # Combine with demonstration dataset.
  transition = functools.partial(
      _n_step_transition_from_episode, n_step=n_step, discount=discount)
  dataset_demos = demonstration_dataset.map(transition)
  dataset = tf.data.experimental.sample_from_datasets(
      [dataset, dataset_demos],
      [1 - demonstration_ratio, demonstration_ratio])

  # Batch and prefetch.
  dataset = dataset.batch(batch_size, drop_remainder=True)
  dataset = dataset.prefetch(prefetch_size)

  # Use constant 0.05 epsilon greedy policy by default.
  if epsilon is None:
    epsilon = tf.Variable(0.05, trainable=False)
  policy_network = snt.Sequential([
      network,
      lambda q: trfl.epsilon_greedy(q, epsilon=epsilon).sample(),
  ])

  # Create a target network.
  target_network = copy.deepcopy(network)

  # Ensure that we create the variables before proceeding (maybe not needed).
  tf2_utils.create_variables(network, [environment_spec.observations])
  tf2_utils.create_variables(target_network, [environment_spec.observations])

  # Create the actor which defines how we take actions.
  actor = actors.FeedForwardActor(policy_network, adder)

  # The learner updates the parameters (and initializes them).
  learner = dqn.DQNLearner(
      network=network,
      target_network=target_network,
      discount=discount,
      importance_sampling_exponent=importance_sampling_exponent,
      learning_rate=learning_rate,
      target_update_period=target_update_period,
      dataset=dataset,
      replay_client=replay_client)

  super().__init__(
      actor=actor,
      learner=learner,
      min_observations=max(batch_size, min_replay_size),
      observations_per_step=float(batch_size) / samples_per_insert)
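# Hedged usage sketch for the demonstration-augmented DQN agent assembled
# above; it is not part of the original source. `DQfD` stands in for whatever
# class owns the __init__ shown, and `environment`, `environment_spec`,
# `network`, and `demonstrations` are assumed to exist -- all names here are
# illustrative only.
agent = DQfD(
    environment_spec=environment_spec,
    network=network,
    demonstration_dataset=demonstrations,
    demonstration_ratio=0.5)
# The agent implements the Actor interface, so it drops into an environment loop.
acme.EnvironmentLoop(environment, agent).run(num_episodes=100)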
def main(_):
  wb_run = init_or_resume()

  if FLAGS.seed:
    tf.random.set_seed(FLAGS.seed)

  # Create an environment and grab the spec.
  environment, env_spec = _build_environment(
      FLAGS.environment_name, max_steps=FLAGS.max_eval_episode_len)

  # Load demonstration dataset.
  raw_dataset = load_tf_dataset(directory=FLAGS.dataset_dir)
  empirical_policy = compute_empirical_policy(raw_dataset)

  dataset = preprocess_dataset(raw_dataset, FLAGS.batch_size,
                               FLAGS.n_step_returns, FLAGS.discount)

  # Create the policy and critic networks.
  critic_network = networks.get_default_critic(env_spec)
  policy_network = snt.Sequential(
      [copy.deepcopy(critic_network), tfp.distributions.Categorical])

  if FLAGS.greedy:
    head = networks.GreedyHead()
  else:
    head = StochasticSamplingHead()

  behaviour_network = snt.Sequential([policy_network, head])

  # Ensure that we create the variables before proceeding (maybe not needed).
  tf2_utils.create_variables(policy_network, [env_spec.observations])
  tf2_utils.create_variables(critic_network, [env_spec.observations])

  # Create the actor which defines how we take actions.
  evaluation_actor = actors.FeedForwardActor(behaviour_network)

  counter = counting.Counter()

  disp, disp_loop = _build_custom_loggers(wb_run)

  eval_loop = EnvironmentLoop(
      environment=environment,
      actor=evaluation_actor,
      counter=counter,
      logger=disp_loop)

  learner = CRRLearner(
      policy_network=policy_network,
      critic_network=critic_network,
      dataset=dataset,
      discount=0.99,
      policy_improvement_modes=FLAGS.policy_improvement_mode,
      beta=FLAGS.crr_beta,
      cql_alpha=FLAGS.cql_alpha,
      empirical_policy=empirical_policy,
      logger=disp,
      counter=counter)

  # Run the environment loop.
  for e in tqdm(range(FLAGS.epochs)):
    for _ in range(FLAGS.evaluate_every):
      learner.step()
    eval_loop.run(FLAGS.evaluation_episodes)

    # Visualization of the policy.
    Q = evaluate_q(learner._critic_network, environment)
    plot = visualize_policy(Q, environment)
    wb_run.log({'chart': plot, 'epoch_counter': e})

  learner.save(tag=FLAGS.logs_tag)
def __init__(
    self,
    environment_spec: specs.EnvironmentSpec,
    network: snt.Module,
    batch_size: int = 32,
    prefetch_size: int = 4,
    target_update_period: int = 100,
    samples_per_insert: float = 32.0,
    min_replay_size: int = 1000,
    max_replay_size: int = 100000,
    importance_sampling_exponent: float = 0.2,
    priority_exponent: float = 0.6,
    n_step: int = 5,
    epsilon: Optional[float] = 0.05,
    learning_rate: float = 1e-3,
    discount: float = 0.99,
    logger: loggers.Logger = None,
    max_gradient_norm: Optional[float] = None,
    expert_data: List[Dict] = None,
) -> None:
  """Initialize the agent."""
  # Create a replay server to add data to. This uses no limiter behavior
  # in order to allow the Agent interface to handle it.
  replay_table = reverb.Table(
      name=adders.DEFAULT_PRIORITY_TABLE,
      sampler=reverb.selectors.Prioritized(priority_exponent),
      remover=reverb.selectors.Fifo(),
      max_size=max_replay_size,
      rate_limiter=reverb.rate_limiters.MinSize(1),
      signature=adders.NStepTransitionAdder.signature(environment_spec))
  self._server = reverb.Server([replay_table], port=None)

  # The adder is used to insert observations into replay.
  address = f'localhost:{self._server.port}'
  adder = adders.NStepTransitionAdder(
      client=reverb.Client(address), n_step=n_step, discount=discount)

  # Adding expert data to the replay memory:
  if expert_data is not None:
    for d in expert_data:
      adder.add_first(d["first"])
      for (action, next_ts) in d["mid"]:
        adder.add(np.int32(action), next_ts)

  # The dataset provides an interface to sample from replay.
  replay_client = reverb.TFClient(address)
  dataset = datasets.make_reverb_dataset(
      server_address=address,
      batch_size=batch_size,
      prefetch_size=prefetch_size)

  # Creating the epsilon greedy policy network:
  epsilon = tf.Variable(epsilon)
  policy_network = snt.Sequential([
      network,
      lambda q: trfl.epsilon_greedy(q, epsilon=epsilon).sample(),
  ])

  # Create a target network.
  target_network = copy.deepcopy(network)

  # Ensure that we create the variables before proceeding (maybe not
  # needed).
  tf2_utils.create_variables(network, [environment_spec.observations])
  tf2_utils.create_variables(target_network, [environment_spec.observations])

  # Create the actor which defines how we take actions.
  actor = actors.FeedForwardActor(policy_network, adder)

  # The learner updates the parameters (and initializes them).
  learner = learning.DQNLearner(
      network=network,
      target_network=target_network,
      discount=discount,
      importance_sampling_exponent=importance_sampling_exponent,
      learning_rate=learning_rate,
      target_update_period=target_update_period,
      dataset=dataset,
      replay_client=replay_client,
      max_gradient_norm=max_gradient_norm,
      logger=logger,
  )

  super().__init__(
      actor=actor,
      learner=learner,
      min_observations=max(batch_size, min_replay_size),
      observations_per_step=float(batch_size) / samples_per_insert)
behavior_network = snt.Sequential([
    observation_network,
    policy_network,
    networks.ClippedGaussian(0.3),  # sigma = 0.3
    networks.ClipToSpec(act_spec),
])

# We must create the variables in the networks before passing them to the
# learner.
# Create variables.
tf2_utils.create_variables(policy_network, [emb_spec])
tf2_utils.create_variables(critic_network, [emb_spec, act_spec])
tf2_utils.create_variables(target_policy_network, [emb_spec])
tf2_utils.create_variables(target_critic_network, [emb_spec, act_spec])
tf2_utils.create_variables(target_observation_network, [obs_spec])

actor = actors.FeedForwardActor(behavior_network, adder=adder)

learner = d4pg.D4PGLearner(
    policy_network=policy_network,
    critic_network=critic_network,
    observation_network=observation_network,
    target_policy_network=target_policy_network,
    target_critic_network=target_critic_network,
    target_observation_network=target_observation_network,
    dataset=dataset,
    discount=0.99,
    clipping=True,
    target_update_period=100,
    policy_optimizer=snt.optimizers.Adam(1e-4),
    critic_optimizer=snt.optimizers.Adam(1e-4),
    # Log learner updates to console every 10 seconds.
    logger=loggers.TerminalLogger(time_delta=10.),
def main(_):
  wb_run = init_or_resume()

  if FLAGS.seed:
    tf.random.set_seed(FLAGS.seed)

  # Create an environment and grab the spec.
  environment, env_spec = _build_environment(
      FLAGS.environment_name, max_steps=FLAGS.max_eval_episode_len)

  # Load demonstration dataset.
  raw_dataset = load_tf_dataset(directory=FLAGS.dataset_dir)
  empirical_policy = compute_empirical_policy(raw_dataset)

  dataset = preprocess_dataset(raw_dataset, FLAGS.batch_size,
                               FLAGS.n_step_returns, FLAGS.discount)

  # Create the main critic network.
  critic_network = networks.get_default_critic(env_spec)
  policy_network = snt.Sequential([
      critic_network,
      lambda q: trfl.epsilon_greedy(q, epsilon=FLAGS.epsilon).sample(),
  ])

  tf2_utils.create_variables(critic_network, [env_spec.observations])

  # Create the actor which defines how we take actions.
  evaluation_actor = actors.FeedForwardActor(policy_network)

  counter = counting.Counter()

  disp, disp_loop = _build_custom_loggers(wb_run)

  eval_loop = EnvironmentLoop(
      environment=environment,
      actor=evaluation_actor,
      counter=counter,
      logger=disp_loop)

  learner = CQLLearner(
      network=critic_network,
      dataset=dataset,
      discount=FLAGS.discount,
      importance_sampling_exponent=0.2,
      learning_rate=FLAGS.learning_rate,
      cql_alpha=FLAGS.cql_alpha,
      translate_lse=FLAGS.translate_lse,
      target_update_period=100,
      empirical_policy=empirical_policy,
      logger=disp,
      counter=counter)

  # Run the environment loop.
  for e in tqdm(range(FLAGS.epochs)):
    for _ in range(FLAGS.evaluate_every):
      learner.step()
    eval_loop.run(FLAGS.evaluation_episodes)

    # Visualization of the policy.
    Q = evaluate_q(learner._network, environment)
    plot = visualize_policy(Q, environment)
    wb_run.log({'chart': plot, 'epoch_counter': e})

  learner.save(tag=FLAGS.logs_tag)
def __init__(self,
             environment_spec: specs.EnvironmentSpec,
             policy_network: snt.Module,
             critic_network: snt.Module,
             observation_network: types.TensorTransformation = tf.identity,
             discount: float = 0.99,
             batch_size: int = 256,
             prefetch_size: int = 4,
             target_update_period: int = 100,
             min_replay_size: int = 1000,
             max_replay_size: int = 1000000,
             samples_per_insert: float = 32.0,
             n_step: int = 5,
             sigma: float = 0.3,
             clipping: bool = True,
             logger: loggers.Logger = None,
             counter: counting.Counter = None,
             checkpoint: bool = True,
             replay_table_name: str = adders.DEFAULT_PRIORITY_TABLE):
  """Initialize the agent.

  Args:
    environment_spec: description of the actions, observations, etc.
    policy_network: the online (optimized) policy.
    critic_network: the online critic.
    observation_network: optional network to transform the observations before
      they are fed into any network.
    discount: discount to use for TD updates.
    batch_size: batch size for updates.
    prefetch_size: size to prefetch from replay.
    target_update_period: number of learner steps to perform before updating
      the target networks.
    min_replay_size: minimum replay size before updating.
    max_replay_size: maximum replay size.
    samples_per_insert: number of samples to take from replay for every insert
      that is made.
    n_step: number of steps to squash into a single transition.
    sigma: standard deviation of zero-mean, Gaussian exploration noise.
    clipping: whether to clip gradients by global norm.
    logger: logger object to be used by learner.
    counter: counter object used to keep track of steps.
    checkpoint: boolean indicating whether to checkpoint the learner.
    replay_table_name: string indicating what name to give the replay table.
  """
  # Create a replay server to add data to. This uses no limiter behavior in
  # order to allow the Agent interface to handle it.
  replay_table = reverb.Table(
      name=replay_table_name,
      sampler=reverb.selectors.Uniform(),
      remover=reverb.selectors.Fifo(),
      max_size=max_replay_size,
      rate_limiter=reverb.rate_limiters.MinSize(1),
      signature=adders.NStepTransitionAdder.signature(environment_spec))
  self._server = reverb.Server([replay_table], port=None)

  # The adder is used to insert observations into replay.
  address = f'localhost:{self._server.port}'
  adder = adders.NStepTransitionAdder(
      priority_fns={replay_table_name: lambda x: 1.},
      client=reverb.Client(address),
      n_step=n_step,
      discount=discount)

  # The dataset provides an interface to sample from replay.
  dataset = datasets.make_reverb_dataset(
      table=replay_table_name,
      client=reverb.TFClient(address),
      environment_spec=environment_spec,
      batch_size=batch_size,
      prefetch_size=prefetch_size,
      transition_adder=True)

  # Get observation and action specs.
  act_spec = environment_spec.actions
  obs_spec = environment_spec.observations
  emb_spec = tf2_utils.create_variables(observation_network, [obs_spec])  # pytype: disable=wrong-arg-types

  # Make sure observation network is a Sonnet Module.
  observation_network = tf2_utils.to_sonnet_module(observation_network)

  # Create target networks.
  target_policy_network = copy.deepcopy(policy_network)
  target_critic_network = copy.deepcopy(critic_network)
  target_observation_network = copy.deepcopy(observation_network)

  # Create the behavior policy.
  behavior_network = snt.Sequential([
      observation_network,
      policy_network,
      networks.ClippedGaussian(sigma),
      networks.ClipToSpec(act_spec),
  ])

  # Create variables.
  tf2_utils.create_variables(policy_network, [emb_spec])
  tf2_utils.create_variables(critic_network, [emb_spec, act_spec])
  tf2_utils.create_variables(target_policy_network, [emb_spec])
  tf2_utils.create_variables(target_critic_network, [emb_spec, act_spec])
  tf2_utils.create_variables(target_observation_network, [obs_spec])

  # Create the actor which defines how we take actions.
  actor = actors.FeedForwardActor(behavior_network, adder=adder)

  # Create optimizers.
  policy_optimizer = snt.optimizers.Adam(learning_rate=1e-4)
  critic_optimizer = snt.optimizers.Adam(learning_rate=1e-4)

  # The learner updates the parameters (and initializes them).
  learner = learning.DDPGLearner(
      policy_network=policy_network,
      critic_network=critic_network,
      observation_network=observation_network,
      target_policy_network=target_policy_network,
      target_critic_network=target_critic_network,
      target_observation_network=target_observation_network,
      policy_optimizer=policy_optimizer,
      critic_optimizer=critic_optimizer,
      clipping=clipping,
      discount=discount,
      target_update_period=target_update_period,
      dataset=dataset,
      counter=counter,
      logger=logger,
      checkpoint=checkpoint,
  )

  super().__init__(
      actor=actor,
      learner=learner,
      min_observations=max(batch_size, min_replay_size),
      observations_per_step=float(batch_size) / samples_per_insert)
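The closing super().__init__ call above determines how acting and learning interleave. A small arithmetic sketch of the schedule implied by the defaults in this constructor (batch_size=256, samples_per_insert=32.0, min_replay_size=1000); it only restates the two expressions passed to the parent agent:

batch_size = 256
samples_per_insert = 32.0
min_replay_size = 1000

# Environment steps collected before the first learner update.
min_observations = max(batch_size, min_replay_size)             # 1000
# Environment steps taken per learner step once updates begin.
observations_per_step = float(batch_size) / samples_per_insert  # 8.0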
def __init__(
    self,
    environment_spec: specs.EnvironmentSpec,
    network: snt.Module,
    batch_size: int = 256,
    prefetch_size: int = 4,
    target_update_period: int = 100,
    samples_per_insert: float = 32.0,
    min_replay_size: int = 20,
    max_replay_size: int = 1000000,
    importance_sampling_exponent: float = 0.2,
    priority_exponent: float = 0.6,
    n_step: int = 5,
    epsilon_init: float = 1.0,
    epsilon_final: float = 0.01,
    epsilon_schedule_timesteps: int = 20000,
    learning_rate: float = 1e-3,
    discount: float = 0.99,
    max_gradient_norm: Optional[float] = None,
    logger: loggers.Logger = None,
):
  """Initialize the agent.

  Args:
    environment_spec: description of the actions, observations, etc.
    network: the online Q network (the one being optimized).
    batch_size: batch size for updates.
    prefetch_size: size to prefetch from replay.
    target_update_period: number of learner steps to perform before updating
      the target networks.
    samples_per_insert: number of samples to take from replay for every insert
      that is made.
    min_replay_size: minimum replay size before updating. This and all
      following arguments are related to dataset construction and will be
      ignored if a dataset argument is passed.
    max_replay_size: maximum replay size.
    importance_sampling_exponent: power to which importance weights are raised
      before normalizing (beta). See https://arxiv.org/pdf/1710.02298.pdf
    priority_exponent: exponent used in prioritized sampling (omega). See
      https://arxiv.org/pdf/1710.02298.pdf
    n_step: number of steps to squash into a single transition.
    epsilon_init: initial epsilon value (probability of taking a random
      action).
    epsilon_final: final epsilon value (probability of taking a random
      action).
    epsilon_schedule_timesteps: timesteps to decay epsilon from 'epsilon_init'
      to 'epsilon_final'.
    learning_rate: learning rate for the q-network update.
    discount: discount to use for TD updates.
    max_gradient_norm: used for gradient clipping.
    logger: logger object to be used by learner.
  """
  # Create a replay server to add data to. This uses no limiter behavior in
  # order to allow the Agent interface to handle it.
  replay_table = reverb.Table(
      name=adders.DEFAULT_PRIORITY_TABLE,
      sampler=reverb.selectors.Prioritized(priority_exponent),
      remover=reverb.selectors.Fifo(),
      max_size=max_replay_size,
      rate_limiter=reverb.rate_limiters.MinSize(1),
      signature=adders.NStepTransitionAdder.signature(environment_spec))
  self._server = reverb.Server([replay_table], port=None)

  # The adder is used to insert observations into replay.
  address = f'localhost:{self._server.port}'
  self._adder = adders.NStepTransitionAdder(
      client=reverb.Client(address),
      n_step=n_step,
      discount=discount)

  # The dataset provides an interface to sample from replay.
  replay_client = reverb.TFClient(address)
  dataset = make_reverb_dataset(
      server_address=address,
      batch_size=batch_size,
      prefetch_size=prefetch_size)

  policy_network = snt.Sequential([
      network,
      EpsilonGreedyExploration(
          epsilon_init=epsilon_init,
          epsilon_final=epsilon_final,
          epsilon_schedule_timesteps=epsilon_schedule_timesteps)
  ])

  # Create a target network.
  target_network = copy.deepcopy(network)

  # Ensure that we create the variables before proceeding (maybe not needed).
  tf2_utils.create_variables(network, [environment_spec.observations])
  tf2_utils.create_variables(target_network, [environment_spec.observations])

  # Create the actor which defines how we take actions.
  actor = actors_tf2.FeedForwardActor(policy_network, self._adder)

  # The learner updates the parameters (and initializes them).
  learner = learning.DQNLearner(
      network=network,
      target_network=target_network,
      discount=discount,
      importance_sampling_exponent=importance_sampling_exponent,
      learning_rate=learning_rate,
      target_update_period=target_update_period,
      dataset=dataset,
      replay_client=replay_client,
      max_gradient_norm=max_gradient_norm,
      logger=logger,
      checkpoint=False)

  self._saver = tf2_savers.Saver(learner.state)

  # Deterministic (max-Q) actor.
  max_Q_network = snt.Sequential([
      network,
      lambda q: trfl.epsilon_greedy(q, epsilon=0.0).sample(),
  ])
  self._deterministic_actor = actors_tf2.FeedForwardActor(max_Q_network)

  super().__init__(
      actor=actor,
      learner=learner,
      min_observations=max(batch_size, min_replay_size),
      observations_per_step=float(batch_size) / samples_per_insert)
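EpsilonGreedyExploration itself is not defined in this listing. A minimal sketch, assuming it applies the usual linear decay from epsilon_init to epsilon_final over epsilon_schedule_timesteps (an assumption based only on the constructor arguments above, not on the module's actual code):

def linear_epsilon(step,
                   epsilon_init=1.0,
                   epsilon_final=0.01,
                   epsilon_schedule_timesteps=20000):
  # Fraction of the schedule elapsed so far, clipped to [0, 1].
  fraction = min(float(step) / epsilon_schedule_timesteps, 1.0)
  return epsilon_init + fraction * (epsilon_final - epsilon_init)

assert abs(linear_epsilon(0) - 1.0) < 1e-6        # fully exploratory at the start
assert abs(linear_epsilon(20000) - 0.01) < 1e-6   # fully decayed at the end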