def action_spec(self):
  return tensor_spec.from_spec(
      array_spec.BoundedArraySpec((), np.int32, minimum=0, maximum=2))
def __init__(self,
             time_step_spec: ts.TimeStep,
             action_spec: types.NestedTensorSpec,
             q_network: network.Network,
             emit_log_probability: bool = False,
             observation_and_action_constraint_splitter: Optional[
                 types.Splitter] = None,
             validate_action_spec_and_network: bool = True,
             name: Optional[Text] = None):
  """Builds a Q-Policy given a q_network.

  Args:
    time_step_spec: A `TimeStep` spec of the expected time_steps.
    action_spec: A nest of BoundedTensorSpec representing the actions.
    q_network: An instance of a `tf_agents.network.Network`, callable via
      `network(observation, step_type) -> (output, final_state)`.
    emit_log_probability: Whether to emit log-probs in info of `PolicyStep`.
    observation_and_action_constraint_splitter: A function used to process
      observations with action constraints. These constraints can indicate,
      for example, a mask of valid/invalid actions for a given state of the
      environment. The function takes in a full observation and returns a
      tuple consisting of 1) the part of the observation intended as input to
      the network and 2) the constraint. An example
      `observation_and_action_constraint_splitter` could be as simple as:
      ```
      def observation_and_action_constraint_splitter(observation):
        return observation['network_input'], observation['constraint']
      ```
      *Note*: when using `observation_and_action_constraint_splitter`, make
      sure the provided `q_network` is compatible with the network-specific
      half of the output of the `observation_and_action_constraint_splitter`.
      In particular, `observation_and_action_constraint_splitter` will be
      called on the observation before passing to the network. If
      `observation_and_action_constraint_splitter` is None, action constraints
      are not applied.
    validate_action_spec_and_network: If `True` (default), action_spec is
      checked to make sure it is a single scalar spec with a minimum of zero.
      Also validates that the network's output matches the spec.
    name: The name of this policy. All variables in this module will fall
      under that name. Defaults to the class name.

  Raises:
    ValueError: If `q_network.action_spec` exists and is not compatible with
      `action_spec`.
    NotImplementedError: If `action_spec` contains more than one
      `BoundedTensorSpec`.
  """
  action_spec = tensor_spec.from_spec(action_spec)
  time_step_spec = tensor_spec.from_spec(time_step_spec)

  network_action_spec = getattr(q_network, 'action_spec', None)
  if network_action_spec is not None:
    action_spec = cast(tf.TypeSpec, action_spec)
    if not action_spec.is_compatible_with(network_action_spec):
      raise ValueError(
          'action_spec must be compatible with q_network.action_spec; '
          'instead got action_spec=%s, q_network.action_spec=%s' %
          (action_spec, network_action_spec))

  flat_action_spec = tf.nest.flatten(action_spec)
  if len(flat_action_spec) > 1:
    raise ValueError(
        'Only scalar actions are supported now, but action spec is: {}'
        .format(action_spec))

  if validate_action_spec_and_network:
    spec = flat_action_spec[0]
    if spec.shape.rank > 0:
      raise ValueError(
          'Only scalar actions are supported now, but action spec is: {}'
          .format(action_spec))

    if spec.minimum != 0:
      raise ValueError(
          'Action specs should have minimum of 0, but saw: {0}'.format(spec))

    num_actions = spec.maximum - spec.minimum + 1
    network_utils.check_single_floating_network_output(
        q_network.create_variables(), (num_actions,), str(q_network))

  # We need to maintain the flat action spec for dtype, shape and range.
  self._flat_action_spec = flat_action_spec[0]
  self._q_network = q_network
  super(QPolicy, self).__init__(
      time_step_spec,
      action_spec,
      policy_state_spec=q_network.state_spec,
      clip=False,
      emit_log_probability=emit_log_probability,
      observation_and_action_constraint_splitter=(
          observation_and_action_constraint_splitter),
      name=name)
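# --- Hedged usage sketch (not part of the library source above) ---
# A minimal example of constructing a QPolicy with the action-constraint
# splitter described in the docstring. The spec shapes, layer sizes, and
# dictionary keys below are illustrative assumptions, not values required by
# the API.
import tensorflow as tf
from tf_agents.networks import q_network
from tf_agents.policies import q_policy
from tf_agents.specs import tensor_spec
from tf_agents.trajectories import time_step as ts

observation_spec = {
    'network_input': tensor_spec.TensorSpec([4], tf.float32),
    'constraint': tensor_spec.BoundedTensorSpec([3], tf.int32, minimum=0,
                                                maximum=1),
}
action_spec = tensor_spec.BoundedTensorSpec((), tf.int32, minimum=0, maximum=2)


def splitter(observation):
  return observation['network_input'], observation['constraint']


# The network only sees the 'network_input' half of the observation.
q_net = q_network.QNetwork(
    observation_spec['network_input'], action_spec, fc_layer_params=(32,))
policy = q_policy.QPolicy(
    ts.time_step_spec(observation_spec),
    action_spec,
    q_network=q_net,
    observation_and_action_constraint_splitter=splitter)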
def __init__(
    self,
    root_dir,
    env_name,
    num_iterations=200,
    max_episode_frames=108000,  # ALE frames
    terminal_on_life_loss=False,
    conv_layer_params=((32, (8, 8), 4), (64, (4, 4), 2), (64, (3, 3), 1)),
    fc_layer_params=(512,),
    # Params for collect
    initial_collect_steps=80000,  # ALE frames
    epsilon_greedy=0.01,
    epsilon_decay_period=1000000,  # ALE frames
    replay_buffer_capacity=1000000,
    # Params for train
    train_steps_per_iteration=1000000,  # ALE frames
    update_period=16,  # ALE frames
    target_update_tau=1.0,
    target_update_period=32000,  # ALE frames
    batch_size=32,
    learning_rate=2.5e-4,
    n_step_update=2,
    gamma=0.99,
    reward_scale_factor=1.0,
    gradient_clipping=None,
    # Params for eval
    do_eval=True,
    eval_steps_per_iteration=500000,  # ALE frames
    eval_epsilon_greedy=0.001,
    # Params for checkpoints, summaries, and logging
    log_interval=1000,
    summary_interval=1000,
    summaries_flush_secs=10,
    debug_summaries=True,
    summarize_grads_and_vars=True,
    eval_metrics_callback=None):
  """A simple Atari train and eval for DQN.

  Args:
    root_dir: Directory to write log files to.
    env_name: Fully-qualified name of the Atari environment (e.g. Pong-v0).
    num_iterations: Number of train/eval iterations to run.
    max_episode_frames: Maximum length of a single episode, in ALE frames.
    terminal_on_life_loss: Whether to simulate an episode termination when a
      life is lost.
    conv_layer_params: Params for convolutional layers of QNetwork.
    fc_layer_params: Params for fully connected layers of QNetwork.
    initial_collect_steps: Number of ALE frames to process before beginning to
      train. Since this is in ALE frames, there will be
      initial_collect_steps/4 items in the replay buffer when training starts.
    epsilon_greedy: Final epsilon value to decay to for training.
    epsilon_decay_period: Period over which to decay epsilon, from 1.0 to
      epsilon_greedy (defined above).
    replay_buffer_capacity: Maximum number of items to store in the replay
      buffer.
    train_steps_per_iteration: Number of ALE frames to run through for each
      iteration of training.
    update_period: Run a train operation every update_period ALE frames.
    target_update_tau: Coefficient for soft target network updates (1.0 ==
      hard updates).
    target_update_period: Period, in ALE frames, to copy the live network to
      the target network.
    batch_size: Number of frames to include in each training batch.
    learning_rate: RMS optimizer learning rate.
    n_step_update: The number of steps to consider when computing TD error
      and TD loss. Applies standard single-step updates when set to 1.
    gamma: Discount for future rewards.
    reward_scale_factor: Scaling factor for rewards.
    gradient_clipping: Norm length to clip gradients.
    do_eval: If True, run an eval every iteration. If False, skip eval.
    eval_steps_per_iteration: Number of ALE frames to run through for each
      iteration of evaluation.
    eval_epsilon_greedy: Epsilon value to use for the evaluation policy (0 ==
      totally greedy policy).
    log_interval: Log stats to the terminal every log_interval training steps.
    summary_interval: Write TF summaries every summary_interval training
      steps.
    summaries_flush_secs: Flush summaries to disk every summaries_flush_secs
      seconds.
    debug_summaries: If True, write additional summaries for debugging (see
      dqn_agent for which summaries are written).
    summarize_grads_and_vars: Include gradients in summaries.
    eval_metrics_callback: A callback function that takes (metric_dict,
      global_step) as parameters. Called after every eval with the results of
      the evaluation.
""" self._update_period = update_period / ATARI_FRAME_SKIP self._train_steps_per_iteration = (train_steps_per_iteration / ATARI_FRAME_SKIP) self._do_eval = do_eval self._eval_steps_per_iteration = eval_steps_per_iteration / ATARI_FRAME_SKIP self._eval_epsilon_greedy = eval_epsilon_greedy self._initial_collect_steps = initial_collect_steps / ATARI_FRAME_SKIP self._summary_interval = summary_interval self._num_iterations = num_iterations self._log_interval = log_interval self._eval_metrics_callback = eval_metrics_callback with gin.unlock_config(): gin.bind_parameter(('tf_agents.environments.atari_preprocessing.' 'AtariPreprocessing.terminal_on_life_loss'), terminal_on_life_loss) root_dir = os.path.expanduser(root_dir) train_dir = os.path.join(root_dir, 'train') eval_dir = os.path.join(root_dir, 'eval') train_summary_writer = tf.compat.v2.summary.create_file_writer( train_dir, flush_millis=summaries_flush_secs * 1000) train_summary_writer.set_as_default() self._train_summary_writer = train_summary_writer self._eval_summary_writer = None if self._do_eval: self._eval_summary_writer = tf.compat.v2.summary.create_file_writer( eval_dir, flush_millis=summaries_flush_secs * 1000) self._eval_metrics = [ py_metrics.AverageReturnMetric(name='PhaseAverageReturn', buffer_size=np.inf), py_metrics.AverageEpisodeLengthMetric( name='PhaseAverageEpisodeLength', buffer_size=np.inf), ] self._global_step = tf.compat.v1.train.get_or_create_global_step() with tf.compat.v2.summary.record_if(lambda: tf.math.equal( self._global_step % self._summary_interval, 0)): self._env = suite_atari.load( env_name, max_episode_steps=max_episode_frames / ATARI_FRAME_SKIP, gym_env_wrappers=suite_atari. DEFAULT_ATARI_GYM_WRAPPERS_WITH_STACKING) self._env = batched_py_environment.BatchedPyEnvironment( [self._env]) observation_spec = tensor_spec.from_spec( self._env.observation_spec()) time_step_spec = ts.time_step_spec(observation_spec) action_spec = tensor_spec.from_spec(self._env.action_spec()) with tf.device('/cpu:0'): epsilon = tf.compat.v1.train.polynomial_decay( 1.0, self._global_step, epsilon_decay_period / ATARI_FRAME_SKIP / self._update_period, end_learning_rate=epsilon_greedy) with tf.device('/gpu:0'): optimizer = tf.compat.v1.train.RMSPropOptimizer( learning_rate=learning_rate, decay=0.95, momentum=0.0, epsilon=0.00001, centered=True) categorical_q_net = AtariCategoricalQNetwork( observation_spec, action_spec, conv_layer_params=conv_layer_params, fc_layer_params=fc_layer_params) agent = categorical_dqn_agent.CategoricalDqnAgent( time_step_spec, action_spec, categorical_q_network=categorical_q_net, optimizer=optimizer, epsilon_greedy=epsilon, n_step_update=n_step_update, target_update_tau=target_update_tau, target_update_period=(target_update_period / ATARI_FRAME_SKIP / self._update_period), gamma=gamma, reward_scale_factor=reward_scale_factor, gradient_clipping=gradient_clipping, debug_summaries=debug_summaries, summarize_grads_and_vars=summarize_grads_and_vars, train_step_counter=self._global_step) self._collect_policy = py_tf_policy.PyTFPolicy( agent.collect_policy) if self._do_eval: self._eval_policy = py_tf_policy.PyTFPolicy( epsilon_greedy_policy.EpsilonGreedyPolicy( policy=agent.policy, epsilon=self._eval_epsilon_greedy)) py_observation_spec = self._env.observation_spec() py_time_step_spec = ts.time_step_spec(py_observation_spec) py_action_spec = policy_step.PolicyStep( self._env.action_spec()) data_spec = trajectory.from_transition(py_time_step_spec, py_action_spec, py_time_step_spec) self._replay_buffer = 
py_hashed_replay_buffer.PyHashedReplayBuffer( data_spec=data_spec, capacity=replay_buffer_capacity) with tf.device('/cpu:0'): ds = self._replay_buffer.as_dataset( sample_batch_size=batch_size, num_steps=n_step_update + 1) ds = ds.prefetch(4) ds = ds.apply( tf.data.experimental.prefetch_to_device('/gpu:0')) with tf.device('/gpu:0'): self._ds_itr = tf.compat.v1.data.make_one_shot_iterator(ds) experience = self._ds_itr.get_next() self._train_op = agent.train(experience) self._env_steps_metric = py_metrics.EnvironmentSteps() self._step_metrics = [ py_metrics.NumberOfEpisodes(), self._env_steps_metric, ] self._train_metrics = self._step_metrics + [ py_metrics.AverageReturnMetric(buffer_size=10), py_metrics.AverageEpisodeLengthMetric(buffer_size=10), ] # The _train_phase_metrics average over an entire train iteration, # rather than the rolling average of the last 10 episodes. self._train_phase_metrics = [ py_metrics.AverageReturnMetric(name='PhaseAverageReturn', buffer_size=np.inf), py_metrics.AverageEpisodeLengthMetric( name='PhaseAverageEpisodeLength', buffer_size=np.inf), ] self._iteration_metric = py_metrics.CounterMetric( name='Iteration') # Summaries written from python should run every time they are # generated. with tf.compat.v2.summary.record_if(True): self._steps_per_second_ph = tf.compat.v1.placeholder( tf.float32, shape=(), name='steps_per_sec_ph') self._steps_per_second_summary = tf.compat.v2.summary.scalar( name='global_steps_per_sec', data=self._steps_per_second_ph, step=self._global_step) for metric in self._train_metrics: metric.tf_summaries(train_step=self._global_step, step_metrics=self._step_metrics) for metric in self._train_phase_metrics: metric.tf_summaries( train_step=self._global_step, step_metrics=(self._iteration_metric, )) self._iteration_metric.tf_summaries( train_step=self._global_step) if self._do_eval: with self._eval_summary_writer.as_default(): for metric in self._eval_metrics: metric.tf_summaries( train_step=self._global_step, step_metrics=(self._iteration_metric, )) self._train_checkpointer = common.Checkpointer( ckpt_dir=train_dir, agent=agent, global_step=self._global_step, optimizer=optimizer, metrics=metric_utils.MetricsGroup( self._train_metrics + self._train_phase_metrics + [self._iteration_metric], 'train_metrics')) self._policy_checkpointer = common.Checkpointer( ckpt_dir=os.path.join(train_dir, 'policy'), policy=agent.policy, global_step=self._global_step) self._rb_checkpointer = common.Checkpointer( ckpt_dir=os.path.join(train_dir, 'replay_buffer'), max_to_keep=1, replay_buffer=self._replay_buffer) self._init_agent_op = agent.initialize()
def train_agent(iterations, modeldir, logdir, policydir):
  """Train and convert the model using TF Agents."""
  train_py_env = planestrike_py_environment.PlaneStrikePyEnvironment(
      board_size=BOARD_SIZE, discount=DISCOUNT, max_steps=BOARD_SIZE**2)
  eval_py_env = planestrike_py_environment.PlaneStrikePyEnvironment(
      board_size=BOARD_SIZE, discount=DISCOUNT, max_steps=BOARD_SIZE**2)
  train_env = tf_py_environment.TFPyEnvironment(train_py_env)
  eval_env = tf_py_environment.TFPyEnvironment(eval_py_env)

  # Alternatively you could use ActorDistributionNetwork as actor_net
  actor_net = tfa.networks.Sequential(
      [
          tfa.keras_layers.InnerReshape([BOARD_SIZE, BOARD_SIZE],
                                        [BOARD_SIZE**2]),
          tf.keras.layers.Dense(FC_LAYER_PARAMS, activation='relu'),
          tf.keras.layers.Dense(BOARD_SIZE**2),
          tf.keras.layers.Lambda(
              lambda t: tfp.distributions.Categorical(logits=t)),
      ],
      input_spec=train_py_env.observation_spec())

  optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)
  train_step_counter = tf.Variable(0)

  tf_agent = reinforce_agent.ReinforceAgent(
      train_env.time_step_spec(),
      train_env.action_spec(),
      actor_network=actor_net,
      optimizer=optimizer,
      normalize_returns=True,
      train_step_counter=train_step_counter)
  tf_agent.initialize()

  eval_policy = tf_agent.policy
  collect_policy = tf_agent.collect_policy
  tf_policy_saver = policy_saver.PolicySaver(collect_policy)

  # Use reverb as replay buffer
  replay_buffer_signature = tensor_spec.from_spec(tf_agent.collect_data_spec)
  table = reverb.Table(
      REPLAY_BUFFER_TABLE_NAME,
      max_size=REPLAY_BUFFER_CAPACITY,
      sampler=reverb.selectors.Uniform(),
      remover=reverb.selectors.Fifo(),
      rate_limiter=reverb.rate_limiters.MinSize(1),
      signature=replay_buffer_signature
  )  # specify signature here for validation at insertion time

  reverb_server = reverb.Server([table])
  replay_buffer = reverb_replay_buffer.ReverbReplayBuffer(
      tf_agent.collect_data_spec,
      sequence_length=None,
      table_name=REPLAY_BUFFER_TABLE_NAME,
      local_server=reverb_server)

  replay_buffer_observer = reverb_utils.ReverbAddEpisodeObserver(
      replay_buffer.py_client, REPLAY_BUFFER_TABLE_NAME,
      REPLAY_BUFFER_CAPACITY)

  # Optimize by wrapping some of the code in a graph using TF function.
  tf_agent.train = common.function(tf_agent.train)

  # Evaluate the agent's policy once before training.
  avg_return = compute_avg_return_and_steps(eval_env, tf_agent.policy,
                                            NUM_EVAL_EPISODES)

  summary_writer = tf.summary.create_file_writer(logdir)

  for i in range(iterations):
    # Collect a few episodes using collect_policy and save to the replay
    # buffer.
    collect_episode(train_py_env, collect_policy,
                    COLLECT_EPISODES_PER_ITERATION, replay_buffer_observer)

    # Use data from the buffer and update the agent's network.
    iterator = iter(replay_buffer.as_dataset(sample_batch_size=1))
    trajectories, _ = next(iterator)
    tf_agent.train(experience=trajectories)
    replay_buffer.clear()

    logger = tf.get_logger()
    if i % EVAL_INTERVAL == 0:
      avg_return, avg_episode_length = compute_avg_return_and_steps(
          eval_env, eval_policy, NUM_EVAL_EPISODES)
      with summary_writer.as_default():
        tf.summary.scalar('Average return', avg_return, step=i)
        tf.summary.scalar('Average episode length', avg_episode_length,
                          step=i)
        summary_writer.flush()
      logger.info(
          'iteration = {0}: Average Return = {1}, Average Episode Length = {2}'
          .format(i, avg_return, avg_episode_length))

  summary_writer.close()

  tf_policy_saver.save(policydir)

  # Convert to tflite model
  converter = tf.lite.TFLiteConverter.from_saved_model(
      policydir, signature_keys=['action'])
  converter.target_spec.supported_ops = [
      tf.lite.OpsSet.TFLITE_BUILTINS,  # enable TensorFlow Lite ops.
      tf.lite.OpsSet.SELECT_TF_OPS  # enable TensorFlow ops.
  ]
  tflite_policy = converter.convert()
  with open(os.path.join(modeldir, 'planestrike_tf_agents.tflite'), 'wb') as f:
    f.write(tflite_policy)
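# --- Hedged follow-up sketch (not from the original snippet) ---
# After the conversion above, the .tflite policy can be loaded with the TFLite
# interpreter. This assumes `modeldir` is the same directory used in
# train_agent; the exact input tensor names of the 'action' signature depend
# on the exported specs, so inspect them before feeding observations.
import os
import tensorflow as tf

interpreter = tf.lite.Interpreter(
    model_path=os.path.join(modeldir, 'planestrike_tf_agents.tflite'))
interpreter.allocate_tensors()
action_runner = interpreter.get_signature_runner('action')
print(action_runner.get_input_details())  # map of input name -> tensor details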
def testFromStringSpec(self):
  spec = tensor_spec.from_spec(array_spec.ArraySpec([1], np.string_))
  self.assertEqual(tf.string, spec.dtype)
def main(_): logging.set_verbosity(logging.INFO) # Wait for the collect policy to become available, then load it. collect_policy_dir = os.path.join(FLAGS.root_dir, learner.POLICY_SAVED_MODEL_DIR, learner.COLLECT_POLICY_SAVED_MODEL_DIR) collect_policy = train_utils.wait_for_policy(collect_policy_dir, load_specs_from_pbtxt=True) samples_per_insert = FLAGS.samples_per_insert min_table_size_before_sampling = FLAGS.min_table_size_before_sampling # Create the signature for the variable container holding the policy weights. train_step = train_utils.create_train_step() variables = { reverb_variable_container.POLICY_KEY: collect_policy.variables(), reverb_variable_container.TRAIN_STEP_KEY: train_step } variable_container_signature = tf.nest.map_structure( lambda variable: tf.TensorSpec(variable.shape, dtype=variable.dtype), variables) logging.info('Signature of variables: \n%s', variable_container_signature) # Create the signature for the replay buffer holding observed experience. replay_buffer_signature = tensor_spec.from_spec( collect_policy.collect_data_spec) replay_buffer_signature = tf.nest.map_structure( lambda s: tf.TensorSpec((None, ) + s.shape, s.dtype, s.name), replay_buffer_signature) logging.info('Signature of experience: \n%s', replay_buffer_signature) if samples_per_insert is not None: # Use SamplesPerInsertRatio limiter samples_per_insert_tolerance = (_SAMPLES_PER_INSERT_TOLERANCE_RATIO * samples_per_insert) error_buffer = min_table_size_before_sampling * samples_per_insert_tolerance experience_rate_limiter = reverb.rate_limiters.SampleToInsertRatio( min_size_to_sample=min_table_size_before_sampling, samples_per_insert=samples_per_insert, error_buffer=error_buffer) else: # Use MinSize limiter experience_rate_limiter = reverb.rate_limiters.MinSize( min_table_size_before_sampling) # Crete and start the replay buffer and variable container server. server = reverb.Server( tables=[ reverb.Table( # Replay buffer storing experience. name=reverb_replay_buffer.DEFAULT_TABLE, sampler=reverb.selectors.Uniform(), remover=reverb.selectors.Fifo(), rate_limiter=experience_rate_limiter, max_size=FLAGS.replay_buffer_capacity, max_times_sampled=0, signature=replay_buffer_signature, ), reverb.Table( # Variable container storing policy parameters. name=reverb_variable_container.DEFAULT_TABLE, sampler=reverb.selectors.Uniform(), remover=reverb.selectors.Fifo(), rate_limiter=reverb.rate_limiters.MinSize(1), max_size=1, max_times_sampled=0, signature=variable_container_signature, ), ], port=FLAGS.port) server.wait()
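# --- Hedged sketch of the consumer side (not part of the server script) ---
# A collector or learner process would typically connect to the server started
# above through a ReverbReplayBuffer. The sequence length, batch size, and
# localhost address below are illustrative assumptions.
client_replay_buffer = reverb_replay_buffer.ReverbReplayBuffer(
    collect_policy.collect_data_spec,
    sequence_length=2,
    table_name=reverb_replay_buffer.DEFAULT_TABLE,
    server_address='localhost:{}'.format(FLAGS.port))
dataset = client_replay_buffer.as_dataset(sample_batch_size=64, num_steps=2)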
def observation_spec(self):
  return tensor_spec.from_spec(self._envs[0].tf_env.observation_spec())
initial_collect_steps = 100  # @param {type:"integer"}
collect_steps_per_iteration = 1  # @param {type:"integer"}
replay_buffer_max_length = 100000  # @param {type:"integer"}

batch_size = 64  # @param {type:"integer"}
learning_rate = 1e-3  # @param {type:"number"}
log_interval = 5  # @param {type:"integer"}

num_eval_episodes = 10  # @param {type:"integer"}
eval_interval = 5  # @param {type:"integer"}

# Consider layers that go big -> small -> big, e.g.
fc_layer_params = (1024, 256, 64, 256, 1024)
# fc_layer_params = (100, 50)

# Maybe change env -> tf_env
action_tensor_spec = tensor_spec.from_spec(train_env.action_spec())
num_actions = action_tensor_spec.maximum - action_tensor_spec.minimum + 1


# Define a helper function to create Dense layers configured with the right
# activation and kernel initializer.
def dense_layer(num_units):
  return tf.keras.layers.Dense(
      num_units,
      activation=tf.keras.activations.relu,
      kernel_initializer=tf.keras.initializers.VarianceScaling(
          scale=2.0, mode='fan_in', distribution='truncated_normal'))


# QNetwork consists of a sequence of Dense layers followed by a dense layer
# with `num_actions` units to generate one q_value per available action as
# its output.
flatten_layer = tf.keras.layers.Flatten()
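# --- Possible continuation (hedged sketch) ---
# One way to finish the QNetwork described above: a stack of the dense_layer
# helpers capped by a linear layer emitting one Q-value per action. The
# initializer constants mirror common DQN tutorial defaults, and
# `sequential.Sequential` is assumed to be imported from tf_agents.networks.
q_values_layer = tf.keras.layers.Dense(
    num_actions,
    activation=None,
    kernel_initializer=tf.keras.initializers.RandomUniform(
        minval=-0.03, maxval=0.03),
    bias_initializer=tf.keras.initializers.Constant(-0.2))
q_net = sequential.Sequential(
    [flatten_layer] +
    [dense_layer(num_units) for num_units in fc_layer_params] +
    [q_values_layer])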
def __init__(self,
             time_step_spec: ts.TimeStep,
             action_spec: types.NestedTensorSpec,
             policy_state_spec: types.NestedTensorSpec = (),
             info_spec: types.NestedTensorSpec = (),
             clip: bool = True,
             emit_log_probability: bool = False,
             automatic_state_reset: bool = True,
             observation_and_action_constraint_splitter: Optional[
                 types.Splitter] = None,
             validate_args: bool = True,
             name: Optional[Text] = None):
  """Initialization of TFPolicy class.

  Args:
    time_step_spec: A `TimeStep` spec of the expected time_steps. Usually
      provided by the user to the subclass.
    action_spec: A nest of BoundedTensorSpec representing the actions. Usually
      provided by the user to the subclass.
    policy_state_spec: A nest of TensorSpec representing the policy_state.
      Provided by the subclass, not directly by the user.
    info_spec: A nest of TensorSpec representing the policy info. Provided by
      the subclass, not directly by the user.
    clip: Whether to clip actions to spec before returning them. Default
      True. Most policy-based algorithms (PCL, PPO, REINFORCE) use unclipped
      continuous actions for training.
    emit_log_probability: Emit log-probabilities of actions, if supported. If
      True, policy_step.info will have CommonFields.LOG_PROBABILITY set.
      Please consult utility methods provided in policy_step for setting and
      retrieving these. When working with custom policies, either provide a
      dictionary info_spec or a namedtuple with the field 'log_probability'.
    automatic_state_reset: If `True`, then `get_initial_policy_state` is used
      to clear state in `action()` and `distribution()` for time steps where
      `time_step.is_first()`.
    observation_and_action_constraint_splitter: A function used to process
      observations with action constraints. These constraints can indicate,
      for example, a mask of valid/invalid actions for a given state of the
      environment. The function takes in a full observation and returns a
      tuple consisting of 1) the part of the observation intended as input to
      the network and 2) the constraint. An example
      `observation_and_action_constraint_splitter` could be as simple as:
      ```
      def observation_and_action_constraint_splitter(observation):
        return observation['network_input'], observation['constraint']
      ```
      *Note*: when using `observation_and_action_constraint_splitter`, make
      sure the provided `q_network` is compatible with the network-specific
      half of the output of the `observation_and_action_constraint_splitter`.
      In particular, `observation_and_action_constraint_splitter` will be
      called on the observation before passing to the network. If
      `observation_and_action_constraint_splitter` is None, action constraints
      are not applied.
    validate_args: Python bool. Whether to verify inputs to, and outputs of,
      functions like `action` and `distribution` against spec structures,
      dtypes, and shapes. Research code may prefer to set this value to
      `False` to allow iterating on input and output structures without being
      hamstrung by overly rigid checking (at the cost of harder-to-debug
      errors). See also `TFAgent.validate_args`.
    name: A name for this module. Defaults to the class name.
  """
  super(TFPolicy, self).__init__(name=name)
  common.check_tf1_allowed()
  common.tf_agents_gauge.get_cell('TFAPolicy').set(True)
  common.assert_members_are_not_overridden(base_cls=TFPolicy, instance=self)
  if not isinstance(time_step_spec, ts.TimeStep):
    raise ValueError(
        'The `time_step_spec` must be an instance of `TimeStep`, but is `{}`.'
        .format(type(time_step_spec)))

  self._time_step_spec = tensor_spec.from_spec(time_step_spec)
  self._action_spec = tensor_spec.from_spec(action_spec)
  self._policy_state_spec = tensor_spec.from_spec(policy_state_spec)
  self._emit_log_probability = emit_log_probability
  self._validate_args = validate_args

  if emit_log_probability:
    log_probability_spec = tensor_spec.BoundedTensorSpec(
        shape=(),
        dtype=tf.float32,
        maximum=0,
        minimum=-float('inf'),
        name='log_probability')
    log_probability_spec = tf.nest.map_structure(
        lambda _: log_probability_spec, action_spec)
    info_spec = policy_step.set_log_probability(
        info_spec, log_probability_spec)  # pytype: disable=wrong-arg-types

  self._info_spec = tensor_spec.from_spec(info_spec)
  self._setup_specs()
  self._clip = clip
  self._action_fn = common.function_in_tf1()(self._action)
  self._automatic_state_reset = automatic_state_reset
  self._observation_and_action_constraint_splitter = (
      observation_and_action_constraint_splitter)
memory_features = True
lose_on_illegal_move = False
drawless = False
conv_features = True

train_py_environment = gofish_env.GoFishEnv(
    bot,
    max_visible_opponent_hand_size=10,
    max_visible_deck_size=10,
    drawless=drawless,
    lose_on_illegal_move=lose_on_illegal_move,
    memory_features=memory_features,
    conv_features=conv_features)

print('Validating env.')
utils.validate_py_environment(train_py_environment, episodes=5)
print('Validation complete.')

eval_py_environment = gofish_env.GoFishEnv(
    bot,
    max_visible_opponent_hand_size=10,
    max_visible_deck_size=10,
    drawless=drawless,
    lose_on_illegal_move=lose_on_illegal_move,
    memory_features=memory_features,
    conv_features=conv_features)

train_env = tf_py_environment.TFPyEnvironment(train_py_environment)
eval_env = tf_py_environment.TFPyEnvironment(eval_py_environment)

action_tensor_spec = tensor_spec.from_spec(train_py_environment.action_spec())
num_actions = action_tensor_spec.maximum - action_tensor_spec.minimum + 1

if conv_features:
  pass  # network construction for the convolutional feature layout (not shown)
else:
  pass  # network construction for the flat feature layout (not shown)


# Define a helper function to create Dense layers configured with the right
# activation and kernel initializer.
def dense_layer(num_units):
  return tf.keras.layers.Dense(
      num_units,
      activation=tf.keras.activations.relu,
      kernel_initializer=tf.keras.initializers.VarianceScaling(
          scale=2.0, mode='fan_in', distribution='truncated_normal'))


# QNetwork consists of a sequence of Dense layers followed by a dense layer
py_env = suite_pybullet.load('AntBulletEnv-v0')
py_env.render(mode="human")
env = tf_py_environment.TFPyEnvironment(py_env)

strategy = strategy_utils.get_strategy(tpu=False, use_gpu=True)

replay_buffer_capacity = 2000
learning_rate = 1e-3
fc_layer_params = [128, 64, 64]

num_iterations = 100
log_interval = 2
eval_interval = 2

action_tensor_spec = tensor_spec.from_spec(env.action_spec())
num_actions = action_tensor_spec.shape[0]

with strategy.scope():
  collect_policy = tf.saved_model.load('/tmp/models/expert/AntBulletEnv-v0')
  dense_layers = [
      Dense(num_units, activation=relu) for num_units in fc_layer_params
  ]
  output_layer = Dense(num_actions, activation=None)
  cloning_net = Sequential(dense_layers + [output_layer])
  optimizer = Adam(learning_rate=learning_rate)
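# --- Hedged sketch of the next step (not from the original snippet) ---
# With the cloning network and optimizer defined under the strategy scope, a
# BehavioralCloningAgent can be assembled as below; `behavioral_cloning_agent`
# (tf_agents.agents.behavioral_cloning) and `train_utils`
# (tf_agents.train.utils) are assumed imports.
with strategy.scope():
  train_step = train_utils.create_train_step()
  bc_agent = behavioral_cloning_agent.BehavioralCloningAgent(
      env.time_step_spec(),
      env.action_spec(),
      cloning_network=cloning_net,
      optimizer=optimizer,
      train_step_counter=train_step)
  bc_agent.initialize()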
def _action(self, time_step, policy_state, seed): observation_and_action_constraint_splitter = ( self.observation_and_action_constraint_splitter) outer_dims = nest_utils.get_outer_shape(time_step, self._time_step_spec) if observation_and_action_constraint_splitter is not None: observation, mask = observation_and_action_constraint_splitter( time_step.observation) if self._stationary_mask is not None: mask = mask * self._stationary_mask action_spec = tensor_spec.from_spec(self.action_spec) action_spec = cast(tensor_spec.BoundedTensorSpec, action_spec) zero_logits = tf.cast(tf.zeros_like(mask), tf.float32) masked_categorical = masked.MaskedCategorical(zero_logits, mask) action_ = tf.cast(masked_categorical.sample() + action_spec.minimum, action_spec.dtype) # If the action spec says each action should be shaped (1,), add another # dimension so the final shape is (B, 1) rather than (B,). if action_spec.shape.rank == 1: action_ = tf.expand_dims(action_, axis=-1) policy_info = tensor_spec.sample_spec_nest( self._info_spec, outer_dims=outer_dims) else: observation = time_step.observation action_spec = cast(tensor_spec.BoundedTensorSpec, self.action_spec) if self._accepts_per_arm_features: max_num_arms = action_spec.maximum - action_spec.minimum + 1 batch_size = tf.shape(time_step.step_type)[0] num_actions = observation.get( bandit_spec_utils.NUM_ACTIONS_FEATURE_KEY, tf.ones(shape=(batch_size,), dtype=tf.int32) * max_num_arms) mask = tf.sequence_mask(num_actions, max_num_arms) zero_logits = tf.cast(tf.zeros_like(mask), tf.float32) masked_categorical = masked.MaskedCategorical(zero_logits, mask) action_ = tf.nest.map_structure( lambda t: tf.cast(masked_categorical.sample() + t.minimum, t.dtype), action_spec) elif self._stationary_mask is not None: batch_size = tf.shape(time_step.step_type)[0] mask = tf.tile(self._stationary_mask, [batch_size, 1]) zero_logits = tf.cast(tf.zeros_like(mask), tf.float32) masked_categorical = masked.MaskedCategorical(zero_logits, mask) action_ = tf.cast(masked_categorical.sample() + action_spec.minimum, action_spec.dtype) else: action_ = tensor_spec.sample_spec_nest( self._action_spec, seed=seed, outer_dims=outer_dims) policy_info = tensor_spec.sample_spec_nest( self._info_spec, outer_dims=outer_dims) # Update policy info with chosen arm features. if self._accepts_per_arm_features: def _gather_fn(t): return tf.gather(params=t, indices=action_, batch_dims=1) chosen_arm_features = tf.nest.map_structure( _gather_fn, observation[bandit_spec_utils.PER_ARM_FEATURE_KEY]) if policy_utilities.has_chosen_arm_features(self._info_spec): policy_info = policy_info._replace( chosen_arm_features=chosen_arm_features) # TODO(b/78181147): Investigate why this control dependency is required. 
def _maybe_convert_sparse_tensor(t): if isinstance(t, tf.SparseTensor): return tf.sparse.to_dense(t) else: return t if time_step is not None: with tf.control_dependencies( tf.nest.flatten(tf.nest.map_structure(_maybe_convert_sparse_tensor, time_step))): action_ = tf.nest.map_structure(tf.identity, action_) if self.emit_log_probability: if (self._accepts_per_arm_features or observation_and_action_constraint_splitter is not None or self._stationary_mask is not None): action_spec = cast(tensor_spec.BoundedTensorSpec, self.action_spec) log_probability = masked_categorical.log_prob( action_ - action_spec.minimum) else: log_probability = tf.nest.map_structure( lambda s: _calculate_log_probability(outer_dims, s), self._action_spec) policy_info = policy_step.set_log_probability(policy_info, log_probability) step = policy_step.PolicyStep(action_, policy_state, policy_info) return step
def get_action_spec(robot_type):
  return tensor_spec.from_spec(
      specs.BoundedArraySpec(
          shape=(),
          dtype=np.int32,
          minimum=0,
          maximum=(VectorEnv.get_action_space(robot_type) - 1),
          name='action'))
def __init__(self, input_tensor_spec, output_tensor_spec, fc_layer_params=None, dropout_layer_params=None, conv_layer_params=None, activation_fn=tf.keras.activations.relu, kernel_initializer=None, last_kernel_initializer=None, name='ActorNetwork'): """Creates an instance of `ActorNetwork`. Args: input_tensor_spec: A nest of `tensor_spec.TensorSpec` representing the inputs. output_tensor_spec: A nest of `tensor_spec.BoundedTensorSpec` representing the outputs. fc_layer_params: Optional list of fully_connected parameters, where each item is the number of units in the layer. dropout_layer_params: Optional list of dropout layer parameters, each item is the fraction of input units to drop or a dictionary of parameters according to the keras.Dropout documentation. The additional parameter `permanent`, if set to True, allows to apply dropout at inference for approximated Bayesian inference. The dropout layers are interleaved with the fully connected layers; there is a dropout layer after each fully connected layer, except if the entry in the list is None. This list must have the same length of fc_layer_params, or be None. conv_layer_params: Optional list of convolution layers parameters, where each item is a length-three tuple indicating (filters, kernel_size, stride). activation_fn: Activation function, e.g. tf.nn.relu, slim.leaky_relu, ... kernel_initializer: kernel initializer for all layers except for the value regression layer. If None, a VarianceScaling initializer will be used. last_kernel_initializer: kernel initializer for the value regression layer. If None, a RandomUniform initializer will be used. name: A string representing name of the network. Raises: ValueError: If `input_tensor_spec` or `action_spec` contains more than one item, or if the action data type is not `float`. """ super(ActorNetwork, self).__init__( input_tensor_spec=input_tensor_spec, state_spec=(), name=name) output_tensor_spec = tensor_spec.from_spec(output_tensor_spec) if len(tf.nest.flatten(input_tensor_spec)) > 1: raise ValueError('Only a single observation is supported by this network') flat_action_spec = tf.nest.flatten(output_tensor_spec) if len(flat_action_spec) > 1: raise ValueError('Only a single action is supported by this network') self._single_action_spec = flat_action_spec[0] if self._single_action_spec.dtype not in [tf.float32, tf.float64]: raise ValueError('Only float actions are supported by this network.') if kernel_initializer is None: kernel_initializer = tf.compat.v1.keras.initializers.VarianceScaling( scale=1. / 3., mode='fan_in', distribution='uniform') if last_kernel_initializer is None: last_kernel_initializer = tf.keras.initializers.RandomUniform( minval=-0.003, maxval=0.003) # TODO(kbanoop): Replace mlp_layers with encoding networks. self._mlp_layers = utils.mlp_layers( conv_layer_params, fc_layer_params, dropout_layer_params, activation_fn=activation_fn, kernel_initializer=kernel_initializer, name='input_mlp') self._mlp_layers.append( tf.keras.layers.Dense( flat_action_spec[0].shape.num_elements(), activation=tf.keras.activations.tanh, kernel_initializer=last_kernel_initializer, name='action')) self._output_tensor_spec = output_tensor_spec
def _action_space_fixture(gym_space_bound, gym_space_shape):
  gym_space = gym.spaces.Box(
      low=-gym_space_bound,
      high=gym_space_bound,
      shape=gym_space_shape,
      dtype=np.float32)
  return tensor_spec.from_spec(spec_from_gym_space(gym_space, name="action"))
def __init__( self, time_step_spec: ts.TimeStep, action_spec: types.NestedTensorSpec, q_network: network.Network, optimizer: types.Optimizer, observation_and_action_constraint_splitter: Optional[ types.Splitter] = None, epsilon_greedy: Optional[types.FloatOrReturningFloat] = 0.1, n_step_update: int = 1, boltzmann_temperature: Optional[ types.FloatOrReturningFloat] = None, emit_log_probability: bool = False, # Params for target network updates target_q_network: Optional[network.Network] = None, target_update_tau: types.Float = 1.0, target_update_period: int = 1, # Params for training. td_errors_loss_fn: Optional[types.LossFn] = None, gamma: types.Float = 1.0, reward_scale_factor: types.Float = 1.0, gradient_clipping: Optional[types.Float] = None, # Params for debugging debug_summaries: bool = False, summarize_grads_and_vars: bool = False, train_step_counter: Optional[tf.Variable] = None, name: Optional[Text] = None): """Creates a DQN Agent. Args: time_step_spec: A `TimeStep` spec of the expected time_steps. action_spec: A nest of BoundedTensorSpec representing the actions. q_network: A `tf_agents.network.Network` to be used by the agent. The network will be called with `call(observation, step_type)` and should emit logits over the action space. optimizer: The optimizer to use for training. observation_and_action_constraint_splitter: A function used to process observations with action constraints. These constraints can indicate, for example, a mask of valid/invalid actions for a given state of the environment. The function takes in a full observation and returns a tuple consisting of 1) the part of the observation intended as input to the network and 2) the constraint. An example `observation_and_action_constraint_splitter` could be as simple as: ``` def observation_and_action_constraint_splitter(observation): return observation['network_input'], observation['constraint'] ``` *Note*: when using `observation_and_action_constraint_splitter`, make sure the provided `q_network` is compatible with the network-specific half of the output of the `observation_and_action_constraint_splitter`. In particular, `observation_and_action_constraint_splitter` will be called on the observation before passing to the network. If `observation_and_action_constraint_splitter` is None, action constraints are not applied. epsilon_greedy: probability of choosing a random action in the default epsilon-greedy collect policy (used only if a wrapper is not provided to the collect_policy method). Only one of epsilon_greedy and boltzmann_temperature should be provided. n_step_update: The number of steps to consider when computing TD error and TD loss. Defaults to single-step updates. Note that this requires the user to call train on Trajectory objects with a time dimension of `n_step_update + 1`. However, note that we do not yet support `n_step_update > 1` in the case of RNNs (i.e., non-empty `q_network.state_spec`). boltzmann_temperature: Temperature value to use for Boltzmann sampling of the actions during data collection. The closer to 0.0, the higher the probability of choosing the best action. Only one of epsilon_greedy and boltzmann_temperature should be provided. emit_log_probability: Whether policies emit log probabilities or not. target_q_network: (Optional.) A `tf_agents.network.Network` to be used as the target network during Q learning. Every `target_update_period` train steps, the weights from `q_network` are copied (possibly with smoothing via `target_update_tau`) to `target_q_network`. 
If `target_q_network` is not provided, it is created by making a copy of `q_network`, which initializes a new network with the same structure and its own layers and weights. Network copying is performed via the `Network.copy` superclass method, and may inadvertently lead to the resulting network to share weights with the original. This can happen if, for example, the original network accepted a pre-built Keras layer in its `__init__`, or accepted a Keras layer that wasn't built, but neglected to create a new copy. In these cases, it is up to you to provide a target Network having weights that are not shared with the original `q_network`. If you provide a `target_q_network` that shares any weights with `q_network`, a warning will be logged but no exception is thrown. Note; shallow copies of Keras layers may be built via the code: ```python new_layer = type(layer).from_config(layer.get_config()) ``` target_update_tau: Factor for soft update of the target networks. target_update_period: Period for soft update of the target networks. td_errors_loss_fn: A function for computing the TD errors loss. If None, a default value of element_wise_huber_loss is used. This function takes as input the target and the estimated Q values and returns the loss for each element of the batch. gamma: A discount factor for future rewards. reward_scale_factor: Multiplicative scale for the reward. gradient_clipping: Norm length to clip gradients. debug_summaries: A bool to gather debug summaries. summarize_grads_and_vars: If True, gradient and network variable summaries will be written during training. train_step_counter: An optional counter to increment every time the train op is run. Defaults to the global_step. name: The name of this agent. All variables in this module will fall under that name. Defaults to the class name. Raises: ValueError: If `action_spec` contains more than one action or action spec minimum is not equal to 0. ValueError: If the q networks do not emit floating point outputs with inner shape matching `action_spec`. NotImplementedError: If `q_network` has non-empty `state_spec` (i.e., an RNN is provided) and `n_step_update > 1`. 
""" tf.Module.__init__(self, name=name) action_spec = tensor_spec.from_spec(action_spec) self._check_action_spec(action_spec) if epsilon_greedy is not None and boltzmann_temperature is not None: raise ValueError( 'Configured both epsilon_greedy value {} and temperature {}, ' 'however only one of them can be used for exploration.'.format( epsilon_greedy, boltzmann_temperature)) self._observation_and_action_constraint_splitter = ( observation_and_action_constraint_splitter) self._q_network = q_network net_observation_spec = time_step_spec.observation if observation_and_action_constraint_splitter: net_observation_spec, _ = observation_and_action_constraint_splitter( net_observation_spec) q_network.create_variables(net_observation_spec) if target_q_network: target_q_network.create_variables(net_observation_spec) self._target_q_network = common.maybe_copy_target_network_with_checks( self._q_network, target_q_network, input_spec=net_observation_spec, name='TargetQNetwork') self._check_network_output(self._q_network, 'q_network') self._check_network_output(self._target_q_network, 'target_q_network') self._epsilon_greedy = epsilon_greedy self._n_step_update = n_step_update self._boltzmann_temperature = boltzmann_temperature self._optimizer = optimizer self._td_errors_loss_fn = (td_errors_loss_fn or common.element_wise_huber_loss) self._gamma = gamma self._reward_scale_factor = reward_scale_factor self._gradient_clipping = gradient_clipping self._update_target = self._get_target_updater(target_update_tau, target_update_period) policy, collect_policy = self._setup_policy(time_step_spec, action_spec, boltzmann_temperature, emit_log_probability) if q_network.state_spec and n_step_update != 1: raise NotImplementedError( 'DqnAgent does not currently support n-step updates with stateful ' 'networks (i.e., RNNs), but n_step_update = {}'.format( n_step_update)) train_sequence_length = (n_step_update + 1 if not q_network.state_spec else None) super(DqnAgent, self).__init__( time_step_spec, action_spec, policy, collect_policy, train_sequence_length=train_sequence_length, debug_summaries=debug_summaries, summarize_grads_and_vars=summarize_grads_and_vars, train_step_counter=train_step_counter, ) if q_network.state_spec: # AsNStepTransition does not support emitting [B, T, ...] tensors, # which we need for DQN-RNN. self._as_transition = data_converter.AsTransition( self.data_context, squeeze_time_dim=False) else: # This reduces the n-step return and removes the extra time dimension, # allowing the rest of the computations to be independent of the # n-step parameter. self._as_transition = data_converter.AsNStepTransition( self.data_context, gamma=gamma, n=n_step_update)
def _nb_actions(self):
  """Return the number of actions."""
  action_tensor_spec = tensor_spec.from_spec(self.env.action_spec())
  return action_tensor_spec.maximum - action_tensor_spec.minimum + 1
def __init__( self, time_step_spec: ts.TimeStep, action_spec: types.NestedTensorSpec, cloning_network: network.Network, optimizer: types.Optimizer, num_outer_dims: Literal[1, 2] = 1, # pylint: disable=bad-whitespace epsilon_greedy: types.Float = 0.1, loss_fn: Optional[Callable[[types.NestedTensor, bool], types.Tensor]] = None, gradient_clipping: Optional[types.Float] = None, # Params for debugging. debug_summaries: bool = False, summarize_grads_and_vars: bool = False, train_step_counter: Optional[tf.Variable] = None, name: Optional[Text] = None): """Creates an instance of a Behavioral Cloning agent. Args: time_step_spec: A `TimeStep` spec of the expected time_steps. action_spec: A nest of BoundedTensorSpec representing the actions. cloning_network: A `tf_agents.networks.Network` to be used by the agent. The network will be called as ``` network(observation, step_type=step_type, network_state=initial_state) ``` and must return a 2-tuple with elements `(output, next_network_state)` optimizer: The optimizer to use for training. num_outer_dims: The number of outer dimensions for the agent. Must be either 1 or 2. If 2, training will require both a batch_size and time dimension on every Tensor; if 1, training will require only a batch_size outer dimension. epsilon_greedy: probability of choosing a random action in the default epsilon-greedy collect policy (used only if actions are discrete) loss_fn: A function for computing the error between the output of the cloning network and the action that was taken. If None, the loss depends on the action dtype. The `loss_fn` is called with parameters: `(experience, training)`, and must return a loss value for each element of the batch. gradient_clipping: Norm length to clip gradients. debug_summaries: A bool to gather debug summaries. summarize_grads_and_vars: If True, gradient and network variable summaries will be written during training. train_step_counter: An optional counter to increment every time the train op is run. Defaults to the global_step. name: The name of this agent. All variables in this module will fall under that name. Defaults to the class name. """ tf.Module.__init__(self, name=name) self._cloning_network = cloning_network self._optimizer = optimizer self._gradient_clipping = gradient_clipping action_spec = tensor_spec.from_spec(action_spec) flat_action_spec = tf.nest.flatten(action_spec) continuous_specs = [ tensor_spec.is_continuous(s) for s in flat_action_spec ] if not flat_action_spec: raise ValueError( 'The `action_spec` must contain at least one action.') single_discrete_scalar_action = ( len(flat_action_spec) == 1 and flat_action_spec[0].shape.rank == 0 and not tensor_spec.is_continuous(flat_action_spec[0])) single_continuous_action = (len(flat_action_spec) == 1 and tensor_spec.is_continuous( flat_action_spec[0])) if (not loss_fn and not single_discrete_scalar_action and not single_continuous_action): raise ValueError( 'A `loss_fn` must be provided unless there is a single, scalar ' 'discrete action or a single (scalar or non-scalar) continuous ' 'action.') self._network_output_spec = cloning_network.create_variables( time_step_spec.observation) # If there is a mix of continuous and discrete actions we want to use an # actor policy so we can use the `setup_as_continuous` method as long as the # user provided a custom loss_fn which we verified above. 
  if any(continuous_specs):
    policy, collect_policy = self._setup_as_continuous(
        time_step_spec, action_spec, loss_fn)
  else:
    policy, collect_policy = self._setup_as_discrete(
        time_step_spec, action_spec, loss_fn, epsilon_greedy)

  super(BehavioralCloningAgent, self).__init__(
      time_step_spec,
      action_spec,
      policy,
      collect_policy,
      train_sequence_length=None,
      debug_summaries=debug_summaries,
      summarize_grads_and_vars=summarize_grads_and_vars,
      train_step_counter=train_step_counter)

  self._as_trajectory = data_converter.AsTrajectory(
      self.data_context, sequence_length=None, num_outer_dims=num_outer_dims)
def __init__(self, time_step_spec: ts.TimeStep, action_spec: types.NestedTensorSpec, policy: tf_policy.TFPolicy, collect_policy: tf_policy.TFPolicy, train_sequence_length: Optional[int], num_outer_dims: int = 2, training_data_spec: Optional[types.NestedTensorSpec] = None, debug_summaries: bool = False, summarize_grads_and_vars: bool = False, enable_summaries: bool = True, train_step_counter: Optional[tf.Variable] = None): """Meant to be called by subclass constructors. Args: time_step_spec: A nest of tf.TypeSpec representing the time_steps. Provided by the user. action_spec: A nest of BoundedTensorSpec representing the actions. Provided by the user. policy: An instance of `tf_policy.TFPolicy` representing the Agent's current policy. collect_policy: An instance of `tf_policy.TFPolicy` representing the Agent's current data collection policy (used to set `self.step_spec`). train_sequence_length: A python integer or `None`, signifying the number of time steps required from tensors in `experience` as passed to `train()`. All tensors in `experience` will be shaped `[B, T, ...]` but for certain agents, `T` should be fixed. For example, DQN requires transitions in the form of 2 time steps, so for a non-RNN DQN Agent, set this value to 2. For agents that don't care, or which can handle `T` unknown at graph build time (i.e. most RNN-based agents), set this argument to `None`. num_outer_dims: The number of outer dimensions for the agent. Must be either 1 or 2. If 2, training will require both a batch_size and time dimension on every Tensor; if 1, training will require only a batch_size outer dimension. training_data_spec: A nest of TensorSpec specifying the structure of data the train() function expects. If None, defaults to the trajectory_spec of the collect_policy. debug_summaries: A bool; if true, subclasses should gather debug summaries. summarize_grads_and_vars: A bool; if true, subclasses should additionally collect gradient and variable summaries. enable_summaries: A bool; if false, subclasses should not gather any summaries (debug or otherwise); subclasses should gate *all* summaries using either `summaries_enabled`, `debug_summaries`, or `summarize_grads_and_vars` properties. train_step_counter: An optional counter to increment every time the train op is run. Defaults to the global_step. Raises: ValueError: If `num_outer_dims` is not in `[1, 2]`. """ common.check_tf1_allowed() common.tf_agents_gauge.get_cell("TFAgent").set(True) common.tf_agents_gauge.get_cell(str(type(self))).set(True) if not isinstance(time_step_spec, ts.TimeStep): raise TypeError( "The `time_step_spec` must be an instance of `TimeStep`, but is `{}`." .format(type(time_step_spec))) if num_outer_dims not in [1, 2]: raise ValueError("num_outer_dims must be in [1, 2].") time_step_spec = tensor_spec.from_spec(time_step_spec) action_spec = tensor_spec.from_spec(action_spec) self._time_step_spec = time_step_spec self._action_spec = action_spec self._policy = policy self._collect_policy = collect_policy self._train_sequence_length = train_sequence_length self._num_outer_dims = num_outer_dims self._debug_summaries = debug_summaries self._summarize_grads_and_vars = summarize_grads_and_vars self._enable_summaries = enable_summaries self._training_data_spec = training_data_spec # Data context for data collected directly from the collect policy. 
self._collect_data_context = data_converter.DataContext( time_step_spec=self._time_step_spec, action_spec=self._action_spec, info_spec=collect_policy.info_spec) # Data context for data passed to train(). May be different if # training_data_spec is provided. if training_data_spec is not None: training_data_spec = tensor_spec.from_spec(training_data_spec) # training_data_spec can be anything; so build a data_context # via best-effort with fall-backs to the collect data spec. training_discount_spec = getattr(training_data_spec, "discount", time_step_spec.discount) training_observation_spec = getattr(training_data_spec, "observation", time_step_spec.observation) training_reward_spec = getattr(training_data_spec, "reward", time_step_spec.reward) training_step_type_spec = getattr(training_data_spec, "step_type", time_step_spec.step_type) training_policy_info_spec = getattr(training_data_spec, "policy_info", collect_policy.info_spec) training_action_spec = getattr(training_data_spec, "action", action_spec) self._data_context = data_converter.DataContext( time_step_spec=ts.TimeStep( discount=training_discount_spec, observation=training_observation_spec, reward=training_reward_spec, step_type=training_step_type_spec), action_spec=training_action_spec, info_spec=training_policy_info_spec) else: self._data_context = data_converter.DataContext( time_step_spec=time_step_spec, action_spec=action_spec, info_spec=collect_policy.info_spec) if train_step_counter is None: train_step_counter = tf.compat.v1.train.get_or_create_global_step() self._train_step_counter = train_step_counter self._train_fn = common.function_in_tf1()(self._train) self._initialize_fn = common.function_in_tf1()(self._initialize) self._preprocess_sequence_fn = common.function_in_tf1()( self._preprocess_sequence) self._loss_fn = common.function_in_tf1()(self._loss)
def main(_): # setting up start_time = time.time() tf.compat.v1.enable_resource_variables() tf.compat.v1.disable_eager_execution() logging.set_verbosity(logging.INFO) global observation_omit_size, goal_coord, sample_count, iter_count, episode_size_buffer, episode_return_buffer root_dir = os.path.abspath(os.path.expanduser(FLAGS.logdir)) if not tf.io.gfile.exists(root_dir): tf.io.gfile.makedirs(root_dir) log_dir = os.path.join(root_dir, FLAGS.environment) if not tf.io.gfile.exists(log_dir): tf.io.gfile.makedirs(log_dir) save_dir = os.path.join(log_dir, "models") if not tf.io.gfile.exists(save_dir): tf.io.gfile.makedirs(save_dir) print("directory for recording experiment data:", log_dir) # in case training is paused and resumed, so can be restored try: sample_count = np.load(os.path.join(log_dir, "sample_count.npy")).tolist() iter_count = np.load(os.path.join(log_dir, "iter_count.npy")).tolist() episode_size_buffer = np.load( os.path.join(log_dir, "episode_size_buffer.npy")).tolist() episode_return_buffer = np.load( os.path.join(log_dir, "episode_return_buffer.npy")).tolist() except: sample_count = 0 iter_count = 0 episode_size_buffer = [] episode_return_buffer = [] train_summary_writer = tf.compat.v2.summary.create_file_writer( os.path.join(log_dir, "train", "in_graph_data"), flush_millis=10 * 1000) train_summary_writer.set_as_default() global_step = tf.compat.v1.train.get_or_create_global_step() with tf.compat.v2.summary.record_if(True): # environment related stuff env = do.get_environment(env_name=FLAGS.environment) py_env = wrap_env( skill_wrapper.SkillWrapper( env, num_latent_skills=FLAGS.num_skills, skill_type=FLAGS.skill_type, preset_skill=None, min_steps_before_resample=FLAGS.min_steps_before_resample, resample_prob=FLAGS.resample_prob, ), max_episode_steps=FLAGS.max_env_steps, ) # all specifications required for all networks and agents py_action_spec = py_env.action_spec() tf_action_spec = tensor_spec.from_spec( py_action_spec) # policy, critic action spec env_obs_spec = py_env.observation_spec() py_env_time_step_spec = ts.time_step_spec( env_obs_spec) # replay buffer time_step spec if observation_omit_size > 0: agent_obs_spec = array_spec.BoundedArraySpec( (env_obs_spec.shape[0] - observation_omit_size, ), env_obs_spec.dtype, minimum=env_obs_spec.minimum, maximum=env_obs_spec.maximum, name=env_obs_spec.name, ) # policy, critic observation spec else: agent_obs_spec = env_obs_spec py_agent_time_step_spec = ts.time_step_spec( agent_obs_spec) # policy, critic time_step spec tf_agent_time_step_spec = tensor_spec.from_spec( py_agent_time_step_spec) if not FLAGS.reduced_observation: skill_dynamics_observation_size = ( py_env_time_step_spec.observation.shape[0] - FLAGS.num_skills) else: skill_dynamics_observation_size = FLAGS.reduced_observation # TODO(architsh): Shift co-ordinate hiding to actor_net and critic_net (good for futher image based processing as well) actor_net = actor_distribution_network.ActorDistributionNetwork( tf_agent_time_step_spec.observation, tf_action_spec, fc_layer_params=(FLAGS.hidden_layer_size, ) * 2, continuous_projection_net=do._normal_projection_net, ) critic_net = critic_network.CriticNetwork( (tf_agent_time_step_spec.observation, tf_action_spec), observation_fc_layer_params=None, action_fc_layer_params=None, joint_fc_layer_params=(FLAGS.hidden_layer_size, ) * 2, ) if (FLAGS.skill_dynamics_relabel_type is not None and "importance_sampling" in FLAGS.skill_dynamics_relabel_type and FLAGS.is_clip_eps > 1.0): reweigh_batches_flag = True else: reweigh_batches_flag 
= False agent = dads_agent.DADSAgent( # DADS parameters save_dir, skill_dynamics_observation_size, observation_modify_fn=do.process_observation, restrict_input_size=observation_omit_size, latent_size=FLAGS.num_skills, latent_prior=FLAGS.skill_type, prior_samples=FLAGS.random_skills, fc_layer_params=(FLAGS.hidden_layer_size, ) * 2, normalize_observations=FLAGS.normalize_data, network_type=FLAGS.graph_type, num_mixture_components=FLAGS.num_components, fix_variance=FLAGS.fix_variance, reweigh_batches=reweigh_batches_flag, skill_dynamics_learning_rate=FLAGS.skill_dynamics_lr, # SAC parameters time_step_spec=tf_agent_time_step_spec, action_spec=tf_action_spec, actor_network=actor_net, critic_network=critic_net, target_update_tau=0.005, target_update_period=1, actor_optimizer=tf.compat.v1.train.AdamOptimizer( learning_rate=FLAGS.agent_lr), critic_optimizer=tf.compat.v1.train.AdamOptimizer( learning_rate=FLAGS.agent_lr), alpha_optimizer=tf.compat.v1.train.AdamOptimizer( learning_rate=FLAGS.agent_lr), td_errors_loss_fn=tf.compat.v1.losses.mean_squared_error, gamma=FLAGS.agent_gamma, reward_scale_factor=1.0 / (FLAGS.agent_entropy + 1e-12), gradient_clipping=None, debug_summaries=FLAGS.debug, train_step_counter=global_step, ) # evaluation policy eval_policy = py_tf_policy.PyTFPolicy(agent.policy) # collection policy if FLAGS.collect_policy == "default": collect_policy = py_tf_policy.PyTFPolicy(agent.collect_policy) elif FLAGS.collect_policy == "ou_noise": collect_policy = py_tf_policy.PyTFPolicy( ou_noise_policy.OUNoisePolicy(agent.collect_policy, ou_stddev=0.2, ou_damping=0.15)) # relabelling policy deals with batches of data, unlike collect and eval relabel_policy = py_tf_policy.PyTFPolicy(agent.collect_policy) # constructing a replay buffer, need a python spec policy_step_spec = policy_step.PolicyStep(action=py_action_spec, state=(), info=()) if (FLAGS.skill_dynamics_relabel_type is not None and "importance_sampling" in FLAGS.skill_dynamics_relabel_type and FLAGS.is_clip_eps > 1.0): policy_step_spec = policy_step_spec._replace( info=policy_step.set_log_probability( policy_step_spec.info, array_spec.ArraySpec( shape=(), dtype=np.float32, name="action_log_prob"), )) trajectory_spec = from_transition(py_env_time_step_spec, policy_step_spec, py_env_time_step_spec) capacity = FLAGS.replay_buffer_capacity # for all the data collected rbuffer = py_uniform_replay_buffer.PyUniformReplayBuffer( capacity=capacity, data_spec=trajectory_spec) if FLAGS.train_skill_dynamics_on_policy: # for on-policy data (if something special is required) on_buffer = py_uniform_replay_buffer.PyUniformReplayBuffer( capacity=FLAGS.initial_collect_steps + FLAGS.collect_steps + 10, data_spec=trajectory_spec, ) # insert experience manually with relabelled rewards and skills agent.build_agent_graph() agent.build_skill_dynamics_graph() agent.create_savers() # saving this way requires the saver to be out the object train_checkpointer = common.Checkpointer( ckpt_dir=os.path.join(save_dir, "agent"), agent=agent, global_step=global_step, ) policy_checkpointer = common.Checkpointer( ckpt_dir=os.path.join(save_dir, "policy"), policy=agent.policy, global_step=global_step, ) rb_checkpointer = common.Checkpointer( ckpt_dir=os.path.join(save_dir, "replay_buffer"), max_to_keep=1, replay_buffer=rbuffer, ) setup_time = time.time() - start_time print("Setup time:", setup_time) with tf.compat.v1.Session().as_default() as sess: eval_policy.session = sess eval_policy.initialize(None) eval_policy.restore(os.path.join(FLAGS.logdir, "models", "policy")) 
plotdir = os.path.join(FLAGS.logdir, "plots") if not os.path.exists(plotdir): os.mkdir(plotdir) do.FLAGS = FLAGS do.eval_loop(eval_dir=plotdir, eval_policy=eval_policy, plot_name="plot")
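The script above only hands the restored `eval_policy` to `do.eval_loop`. A minimal sketch of rolling the policy out by hand inside the same session, assuming the `py_env` skill-wrapper environment built earlier in this script (names reused from the snippet, not new API):
```
# Sketch only: manual rollout with the restored eval_policy in the active session.
time_step = py_env.reset()
policy_state = eval_policy.get_initial_state()
while not time_step.is_last():
    policy_step = eval_policy.action(time_step, policy_state)
    time_step = py_env.step(policy_step.action)
    policy_state = policy_step.state
```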
def action_spec(self): return tensor_spec.from_spec(self._envs[0].tf_env.action_spec())
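For context, `tensor_spec.from_spec` mirrors a Python `ArraySpec` as the equivalent `TensorSpec`, preserving shape, dtype and bounds. A standalone sketch with an illustrative spec:
```
import numpy as np
from tf_agents.specs import array_spec, tensor_spec

py_spec = array_spec.BoundedArraySpec((), np.int32, minimum=0, maximum=3)
tf_spec = tensor_spec.from_spec(py_spec)  # BoundedTensorSpec with the same dtype and bounds
```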
def load(self): # setting up tf.compat.v1.enable_resource_variables() tf.compat.v1.disable_eager_execution() root_dir = os.path.abspath(os.path.expanduser(self.flags.logdir)) if not tf.io.gfile.exists(root_dir): tf.io.gfile.makedirs(root_dir) log_dir = os.path.join(root_dir, self.flags.environment) if not tf.io.gfile.exists(log_dir): tf.io.gfile.makedirs(log_dir) save_dir = os.path.join(log_dir, "models") if not tf.io.gfile.exists(save_dir): tf.io.gfile.makedirs(save_dir) train_summary_writer = tf.compat.v2.summary.create_file_writer( os.path.join(log_dir, "train", "in_graph_data"), flush_millis=10 * 1000) train_summary_writer.set_as_default() global_step = tf.compat.v1.train.get_or_create_global_step() with tf.compat.v2.summary.record_if(True): # environment related stuff env = do.get_environment(env_name=self.flags.environment) py_env = wrap_env( skill_wrapper.SkillWrapper( env, num_latent_skills=self.flags.num_skills, skill_type=self.flags.skill_type, preset_skill=None, min_steps_before_resample=self.flags. min_steps_before_resample, resample_prob=self.flags.resample_prob, ), max_episode_steps=self.flags.max_env_steps, ) # all specifications required for all networks and agents py_action_spec = py_env.action_spec() tf_action_spec = tensor_spec.from_spec( py_action_spec) # policy, critic action spec env_obs_spec = py_env.observation_spec() py_env_time_step_spec = ts.time_step_spec( env_obs_spec) # replay buffer time_step spec if self.flags.observation_omission_size > 0: agent_obs_spec = array_spec.BoundedArraySpec( (env_obs_spec.shape[0] - self.flags.observation_omission_size), env_obs_spec.dtype, minimum=env_obs_spec.minimum, maximum=env_obs_spec.maximum, name=env_obs_spec.name, ) # policy, critic observation spec else: agent_obs_spec = env_obs_spec py_agent_time_step_spec = ts.time_step_spec( agent_obs_spec) # policy, critic time_step spec tf_agent_time_step_spec = tensor_spec.from_spec( py_agent_time_step_spec) if not self.flags.reduced_observation: skill_dynamics_observation_size = ( py_env_time_step_spec.observation.shape[0] - self.flags.num_skills) else: skill_dynamics_observation_size = self.flags.reduced_observation # TODO(architsh): Shift co-ordinate hiding to actor_net and critic_net (good for futher image based processing as well) actor_net = actor_distribution_network.ActorDistributionNetwork( tf_agent_time_step_spec.observation, tf_action_spec, fc_layer_params=(self.flags.hidden_layer_size, ) * 2, continuous_projection_net=do._normal_projection_net, ) critic_net = critic_network.CriticNetwork( (tf_agent_time_step_spec.observation, tf_action_spec), observation_fc_layer_params=None, action_fc_layer_params=None, joint_fc_layer_params=(self.flags.hidden_layer_size, ) * 2, ) if (self.flags.skill_dynamics_relabel_type is not None and "importance_sampling" in self.flags.skill_dynamics_relabel_type and self.flags.is_clip_eps > 1.0): reweigh_batches_flag = True else: reweigh_batches_flag = False agent = dads_agent.DADSAgent( # DADS parameters save_dir, skill_dynamics_observation_size, observation_modify_fn=self.process_observation, restrict_input_size=self.flags.observation_omission_size, latent_size=self.flags.num_skills, latent_prior=self.flags.skill_type, prior_samples=self.flags.random_skills, fc_layer_params=(self.flags.hidden_layer_size, ) * 2, normalize_observations=self.flags.normalize_data, network_type=self.flags.graph_type, num_mixture_components=self.flags.num_components, fix_variance=self.flags.fix_variance, reweigh_batches=reweigh_batches_flag, 
skill_dynamics_learning_rate=self.flags.skill_dynamics_lr, # SAC parameters time_step_spec=tf_agent_time_step_spec, action_spec=tf_action_spec, actor_network=actor_net, critic_network=critic_net, target_update_tau=0.005, target_update_period=1, actor_optimizer=tf.compat.v1.train.AdamOptimizer( learning_rate=self.flags.agent_lr), critic_optimizer=tf.compat.v1.train.AdamOptimizer( learning_rate=self.flags.agent_lr), alpha_optimizer=tf.compat.v1.train.AdamOptimizer( learning_rate=self.flags.agent_lr), td_errors_loss_fn=tf.compat.v1.losses.mean_squared_error, gamma=self.flags.agent_gamma, reward_scale_factor=1.0 / (self.flags.agent_entropy + 1e-12), gradient_clipping=None, debug_summaries=self.flags.debug, train_step_counter=global_step, ) # evaluation policy eval_policy = py_tf_policy.PyTFPolicy(agent.policy) # constructing a replay buffer, need a python spec policy_step_spec = policy_step.PolicyStep(action=py_action_spec, state=(), info=()) if (self.flags.skill_dynamics_relabel_type is not None and "importance_sampling" in self.flags.skill_dynamics_relabel_type and self.flags.is_clip_eps > 1.0): policy_step_spec = policy_step_spec._replace( info=policy_step.set_log_probability( policy_step_spec.info, array_spec.ArraySpec( shape=( ), dtype=np.float32, name="action_log_prob"), )) # insert experience manually with relabelled rewards and skills agent.build_agent_graph() agent.build_skill_dynamics_graph() with tf.compat.v1.Session().as_default() as sess: eval_policy.session = sess eval_policy.initialize(None) eval_policy.restore( os.path.join(self.flags.logdir, "models", "policy")) self.policy = eval_policy
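A hypothetical caller-side sketch of the `load()` method above; the class name `DADSPolicyLoader` is invented for illustration and is not part of the source:
```
loader = DADSPolicyLoader(flags=FLAGS)  # hypothetical class exposing load()
loader.load()                           # builds the agent graph and restores the policy
policy = loader.policy                  # restored PyTFPolicy, ready for policy.action(time_step)
```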
def train_eval( root_dir, env_name='CartPole-v0', # Training params initial_collect_steps=1000, num_iterations=100000, fc_layer_params=(100, ), # Agent params epsilon_greedy=0.1, batch_size=64, learning_rate=1e-3, n_step_update=1, gamma=0.99, target_update_tau=0.05, target_update_period=5, reward_scale_factor=1.0, # Replay params reverb_port=None, replay_capacity=100000, # Others policy_save_interval=1000, eval_interval=1000, eval_episodes=10): """Trains and evaluates DQN.""" collect_env = suite_gym.load(env_name) eval_env = suite_gym.load(env_name) time_step_tensor_spec = tensor_spec.from_spec(collect_env.time_step_spec()) action_tensor_spec = tensor_spec.from_spec(collect_env.action_spec()) train_step = train_utils.create_train_step() num_actions = action_tensor_spec.maximum - action_tensor_spec.minimum + 1 # Define a helper function to create Dense layers configured with the right # activation and kernel initializer. def dense_layer(num_units): return tf.keras.layers.Dense( num_units, activation=tf.keras.activations.relu, kernel_initializer=tf.keras.initializers.VarianceScaling( scale=2.0, mode='fan_in', distribution='truncated_normal')) # QNetwork consists of a sequence of Dense layers followed by a dense layer # with `num_actions` units to generate one q_value per available action as # it's output. dense_layers = [dense_layer(num_units) for num_units in fc_layer_params] q_values_layer = tf.keras.layers.Dense( num_actions, activation=None, kernel_initializer=tf.keras.initializers.RandomUniform(minval=-0.03, maxval=0.03), bias_initializer=tf.keras.initializers.Constant(-0.2)) q_net = sequential.Sequential(dense_layers + [q_values_layer]) agent = dqn_agent.DqnAgent( time_step_tensor_spec, action_tensor_spec, q_network=q_net, epsilon_greedy=epsilon_greedy, n_step_update=n_step_update, target_update_tau=target_update_tau, target_update_period=target_update_period, optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate), td_errors_loss_fn=common.element_wise_squared_loss, gamma=gamma, reward_scale_factor=reward_scale_factor, train_step_counter=train_step) table_name = 'uniform_table' table = reverb.Table(table_name, max_size=replay_capacity, sampler=reverb.selectors.Uniform(), remover=reverb.selectors.Fifo(), rate_limiter=reverb.rate_limiters.MinSize(1)) reverb_server = reverb.Server([table], port=reverb_port) reverb_replay = reverb_replay_buffer.ReverbReplayBuffer( agent.collect_data_spec, sequence_length=2, table_name=table_name, local_server=reverb_server) rb_observer = reverb_utils.ReverbAddTrajectoryObserver( reverb_replay.py_client, table_name, sequence_length=2, stride_length=1) dataset = reverb_replay.as_dataset(num_parallel_calls=3, sample_batch_size=batch_size, num_steps=2).prefetch(3) experience_dataset_fn = lambda: dataset saved_model_dir = os.path.join(root_dir, learner.POLICY_SAVED_MODEL_DIR) env_step_metric = py_metrics.EnvironmentSteps() learning_triggers = [ triggers.PolicySavedModelTrigger( saved_model_dir, agent, train_step, interval=policy_save_interval, metadata_metrics={triggers.ENV_STEP_METADATA_KEY: env_step_metric}), triggers.StepPerSecondLogTrigger(train_step, interval=100), ] dqn_learner = learner.Learner(root_dir, train_step, agent, experience_dataset_fn, triggers=learning_triggers) # If we haven't trained yet make sure we collect some random samples first to # fill up the Replay Buffer with some experience. 
random_policy = random_py_policy.RandomPyPolicy( collect_env.time_step_spec(), collect_env.action_spec()) initial_collect_actor = actor.Actor(collect_env, random_policy, train_step, steps_per_run=initial_collect_steps, observers=[rb_observer]) logging.info('Doing initial collect.') initial_collect_actor.run() tf_collect_policy = agent.collect_policy collect_policy = py_tf_eager_policy.PyTFEagerPolicy(tf_collect_policy, use_tf_function=True) collect_actor = actor.Actor( collect_env, collect_policy, train_step, steps_per_run=1, observers=[rb_observer, env_step_metric], metrics=actor.collect_metrics(10), summary_dir=os.path.join(root_dir, learner.TRAIN_DIR), ) tf_greedy_policy = agent.policy greedy_policy = py_tf_eager_policy.PyTFEagerPolicy(tf_greedy_policy, use_tf_function=True) eval_actor = actor.Actor( eval_env, greedy_policy, train_step, episodes_per_run=eval_episodes, metrics=actor.eval_metrics(eval_episodes), summary_dir=os.path.join(root_dir, 'eval'), ) if eval_interval: logging.info('Evaluating.') eval_actor.run_and_log() logging.info('Training.') for _ in range(num_iterations): collect_actor.run() dqn_learner.run(iterations=1) if eval_interval and dqn_learner.train_step_numpy % eval_interval == 0: logging.info('Evaluating.') eval_actor.run_and_log() rb_observer.close() reverb_server.stop()
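The Reverb-based `train_eval` above is callable as-is once its module imports are available; an illustrative smoke-test invocation (directory and step counts are arbitrary):
```
train_eval(
    root_dir='/tmp/dqn_cartpole',
    num_iterations=2000,
    initial_collect_steps=500,
    eval_interval=1000,
    policy_save_interval=1000)
```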
def create_tensor_specs(data_spec, episode_len):
  spec = tuple([data_spec for _ in range(episode_len)])
  tensor_data_spec = tensor_spec.from_spec(data_spec)
  tensor_episode_spec = tensor_spec.from_spec((spec, spec))
  return tensor_data_spec, tensor_episode_spec
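An illustrative use of `create_tensor_specs`, assuming a simple per-step `BoundedArraySpec` and a fixed episode length:
```
import numpy as np
from tf_agents.specs import array_spec

step_spec = array_spec.BoundedArraySpec((4,), np.float32, minimum=-1.0, maximum=1.0)
data_spec, episode_spec = create_tensor_specs(step_spec, episode_len=3)
# data_spec is a single BoundedTensorSpec; episode_spec converts the
# ((spec,)*3, (spec,)*3) nest, so it is a pair of 3-tuples of that spec.
```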
def train_eval( root_dir, env_name='CartPole-v0', num_iterations=100000, fc_layer_params=(100, ), # Params for collect initial_collect_steps=1000, collect_steps_per_iteration=1, epsilon_greedy=0.1, replay_buffer_capacity=100000, # Params for target update target_update_tau=0.05, target_update_period=5, # Params for train train_steps_per_iteration=1, batch_size=64, learning_rate=1e-3, n_step_update=1, gamma=0.99, reward_scale_factor=1.0, gradient_clipping=None, # Params for eval num_eval_episodes=10, eval_interval=1000, # Params for checkpoints, summaries and logging train_checkpoint_interval=10000, policy_checkpoint_interval=5000, log_interval=1000, summaries_flush_secs=10, debug_summaries=False, summarize_grads_and_vars=False, eval_metrics_callback=None): """A simple train and eval for DQN.""" root_dir = os.path.expanduser(root_dir) train_dir = os.path.join(root_dir, 'train') eval_dir = os.path.join(root_dir, 'eval') train_summary_writer = tf.compat.v2.summary.create_file_writer( train_dir, flush_millis=summaries_flush_secs * 1000) train_summary_writer.set_as_default() eval_summary_writer = tf.compat.v2.summary.create_file_writer( eval_dir, flush_millis=summaries_flush_secs * 1000) eval_metrics = [ py_metrics.AverageReturnMetric(buffer_size=num_eval_episodes), py_metrics.AverageEpisodeLengthMetric(buffer_size=num_eval_episodes), ] # Note this is a python environment. env = batched_py_environment.BatchedPyEnvironment( [suite_gym.load(env_name)]) eval_py_env = suite_gym.load(env_name) # Convert specs to BoundedTensorSpec. action_spec = tensor_spec.from_spec(env.action_spec()) observation_spec = tensor_spec.from_spec(env.observation_spec()) time_step_spec = ts.time_step_spec(observation_spec) q_net = q_network.QNetwork(tensor_spec.from_spec(env.observation_spec()), tensor_spec.from_spec(env.action_spec()), fc_layer_params=fc_layer_params) # The agent must be in graph. global_step = tf.compat.v1.train.get_or_create_global_step() agent = dqn_agent.DqnAgent( time_step_spec, action_spec, q_network=q_net, epsilon_greedy=epsilon_greedy, n_step_update=n_step_update, target_update_tau=target_update_tau, target_update_period=target_update_period, optimizer=tf.compat.v1.train.AdamOptimizer( learning_rate=learning_rate), td_errors_loss_fn=dqn_agent.element_wise_squared_loss, gamma=gamma, reward_scale_factor=reward_scale_factor, gradient_clipping=gradient_clipping, debug_summaries=debug_summaries, summarize_grads_and_vars=summarize_grads_and_vars, train_step_counter=global_step) tf_collect_policy = agent.collect_policy collect_policy = py_tf_policy.PyTFPolicy(tf_collect_policy) greedy_policy = py_tf_policy.PyTFPolicy(agent.policy) random_policy = random_py_policy.RandomPyPolicy(env.time_step_spec(), env.action_spec()) # Python replay buffer. replay_buffer = py_uniform_replay_buffer.PyUniformReplayBuffer( capacity=replay_buffer_capacity, data_spec=tensor_spec.to_nest_array_spec(agent.collect_data_spec)) time_step = env.reset() # Initialize the replay buffer with some transitions. We use the random # policy to initialize the replay buffer to make sure we get a good # distribution of actions. for _ in range(initial_collect_steps): time_step = collect_step(env, time_step, random_policy, replay_buffer) # TODO(b/112041045) Use global_step as counter. 
train_checkpointer = common.Checkpointer(ckpt_dir=train_dir, agent=agent, global_step=global_step) policy_checkpointer = common.Checkpointer(ckpt_dir=os.path.join( train_dir, 'policy'), policy=agent.policy, global_step=global_step) ds = replay_buffer.as_dataset(sample_batch_size=batch_size, num_steps=n_step_update + 1) ds = ds.prefetch(4) itr = tf.compat.v1.data.make_initializable_iterator(ds) experience = itr.get_next() train_op = common.function(agent.train)(experience) with eval_summary_writer.as_default(), \ tf.compat.v2.summary.record_if(True): for eval_metric in eval_metrics: eval_metric.tf_summaries(train_step=global_step) with tf.compat.v1.Session() as session: train_checkpointer.initialize_or_restore(session) common.initialize_uninitialized_variables(session) session.run(itr.initializer) # Copy critic network values to the target critic network. session.run(agent.initialize()) train = session.make_callable(train_op) global_step_call = session.make_callable(global_step) session.run(train_summary_writer.init()) session.run(eval_summary_writer.init()) # Compute initial evaluation metrics. global_step_val = global_step_call() metric_utils.compute_summaries( eval_metrics, eval_py_env, greedy_policy, num_episodes=num_eval_episodes, global_step=global_step_val, log=True, callback=eval_metrics_callback, ) timed_at_step = global_step_val collect_time = 0 train_time = 0 steps_per_second_ph = tf.compat.v1.placeholder(tf.float32, shape=(), name='steps_per_sec_ph') steps_per_second_summary = tf.compat.v2.summary.scalar( name='global_steps_per_sec', data=steps_per_second_ph, step=global_step) for _ in range(num_iterations): start_time = time.time() for _ in range(collect_steps_per_iteration): time_step = collect_step(env, time_step, collect_policy, replay_buffer) collect_time += time.time() - start_time start_time = time.time() for _ in range(train_steps_per_iteration): loss = train() train_time += time.time() - start_time global_step_val = global_step_call() if global_step_val % log_interval == 0: logging.info('step = %d, loss = %f', global_step_val, loss.loss) steps_per_sec = ((global_step_val - timed_at_step) / (collect_time + train_time)) session.run(steps_per_second_summary, feed_dict={steps_per_second_ph: steps_per_sec}) logging.info('%.3f steps/sec', steps_per_sec) logging.info( '%s', 'collect_time = {}, train_time = {}'.format( collect_time, train_time)) timed_at_step = global_step_val collect_time = 0 train_time = 0 if global_step_val % train_checkpoint_interval == 0: train_checkpointer.save(global_step=global_step_val) if global_step_val % policy_checkpoint_interval == 0: policy_checkpointer.save(global_step=global_step_val) if global_step_val % eval_interval == 0: metric_utils.compute_summaries( eval_metrics, eval_py_env, greedy_policy, num_episodes=num_eval_episodes, global_step=global_step_val, log=True, callback=eval_metrics_callback, ) # Reset timing to avoid counting eval time. timed_at_step = global_step_val start_time = time.time()
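The graph-mode `train_eval` above relies on a `collect_step` helper that is not shown here. A minimal sketch of one plausible implementation, an assumption rather than the original code:
```
from tf_agents.trajectories import trajectory

def collect_step(env, time_step, policy, replay_buffer):
    # One environment step with `policy`, stored as a single Trajectory transition.
    action_step = policy.action(time_step)
    next_time_step = env.step(action_step.action)
    replay_buffer.add_batch(
        trajectory.from_transition(time_step, action_step, next_time_step))
    return next_time_step
```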
if __name__ == '__main__': #COMMAND-LINE ARGUMENTS parser = argparse.ArgumentParser('Read-From-Bigtable Script') parser.add_argument('--gcp-project-id', type=str, default='for-robolab-cbai') parser.add_argument('--cbt-instance-id', type=str, default='rab-rl-bigtable') parser.add_argument('--cbt-table-name', type=str, default='cartpole-experience-replay') args = parser.parse_args() #INITIALIZE RL AGENT observation_spec = tensor_spec.BoundedTensorSpec( # Make observation spec manually shape=(len(min_array_obs),), dtype=np.float32, minimum=min_array_obs, maximum=max_array_obs) action_spec = tensor_spec.BoundedTensorSpec( # Make action spec manually shape=(), dtype=np.int32, minimum=0, maximum=max_nb_actions - 1) time_step_spec = ts.time_step_spec(tensor_spec.from_spec(observation_spec)) q_net = q_network.QNetwork( observation_spec, action_spec, fc_layer_params=fc_layer_params) optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate) train_step_counter = tf.compat.v2.Variable(0, dtype='int64') tf_agent = dqn_agent.DqnAgent( time_step_spec, action_spec, q_network=q_net, optimizer=optimizer, td_errors_loss_fn=tf.compat.v2.keras.losses.MSE,
def ai_game(): pygame.init() display = pygame.display.set_mode((HEIGHT, WIDTH)) pygame.display.set_caption("Snake") font = pygame.font.SysFont("Times New Roman", 24) # snake_agent = SnakeAgent() # game(display, snake_agent) time.sleep(5) train_env = SnakeGameEnv(display, font) eval_env = SnakeGameEnv(display, font) # env = CardGameEnv() # utils.validate_py_environment(env) fc_layer_params = (100, 50) action_tensor_spec = tensor_spec.from_spec(train_env.action_spec()) num_actions = action_tensor_spec.maximum - action_tensor_spec.minimum + 1 train_env = tf_py_environment.TFPyEnvironment(train_env) eval_env = tf_py_environment.TFPyEnvironment(eval_env) # QNetwork consists of a sequence of Dense layers followed by a dense layer # with `num_actions` units to generate one q_value per available action as # it's output. dense_layers = [dense_layer(num_units) for num_units in fc_layer_params] q_values_layer = tf.keras.layers.Dense( num_actions, activation=None, kernel_initializer=tf.keras.initializers.RandomUniform( minval=-0.03, maxval=0.03), bias_initializer=tf.keras.initializers.Constant(-0.2)) q_net = sequential.Sequential(dense_layers + [q_values_layer]) optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3) train_step_counter = tf.Variable(0) agent = dqn_agent.DqnAgent( train_env.time_step_spec(), train_env.action_spec(), q_network=q_net, optimizer=optimizer, td_errors_loss_fn=common.element_wise_squared_loss, train_step_counter=train_step_counter) agent.initialize() print('Initialized Agent') random_policy = random_tf_policy.RandomTFPolicy(train_env.time_step_spec(), train_env.action_spec()) print('Reset time spec') time_step = train_env.reset() random_policy.action(time_step) print('Successfully instantiated random policy') replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer( data_spec=agent.collect_data_spec, batch_size=train_env.batch_size, max_length=REPLAY_BUFFER_MAX_LEN) print('Created replay buffer, collecting data ... ') collect_data(train_env, random_policy, replay_buffer, INITIAL_COLLECT_STEPS) dataset = replay_buffer.as_dataset( num_parallel_calls=3, sample_batch_size=BATCH_SIZE, num_steps=2).prefetch(3) print('Collecting data complete') iterator = iter(dataset) # Reset the train step agent.train_step_counter.assign(0) avg_return = compute_avg_return(train_env, agent.policy, NUM_EVAL_EPISODES) returns = [avg_return] print('Beginning to train...') for i in range(NUM_ITERATIONS): collect_data(train_env, agent.collect_policy, replay_buffer, COLLECT_STEPS_PER_ITERATION) experience, unused_info = next(iterator) train_loss = agent.train(experience).loss step = agent.train_step_counter.numpy() #print(train_env.time_step_spec()) print(f"Training agent through iteration {(i / NUM_ITERATIONS) * 100:.2f}%...") if step % LOG_INTERVAL == 0: pass #print('step = {0}: loss = {1}'.format(step, train_loss)) if step % EVAL_INTERVAL == 0: #avg_return = compute_avg_return(train_env, agent.policy, NUM_EVAL_EPISODES) #print('step = {0}: Average Return = {1}'.format(step, avg_return)) #returns.append(avg_return) pass """
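The snake example calls `dense_layer`, `collect_data`, and `compute_avg_return` helpers that are not included in the snippet. Sketches of the latter two in the style of the standard TF-Agents DQN tutorial, an assumption about this project's helpers rather than its actual code:
```
from tf_agents.trajectories import trajectory

def collect_data(env, policy, buffer, steps):
    # Step the TF environment `steps` times, recording each transition.
    for _ in range(steps):
        time_step = env.current_time_step()
        action_step = policy.action(time_step)
        next_time_step = env.step(action_step.action)
        buffer.add_batch(
            trajectory.from_transition(time_step, action_step, next_time_step))

def compute_avg_return(environment, policy, num_episodes=10):
    # Mean undiscounted return over `num_episodes` rollouts of `policy`.
    total_return = 0.0
    for _ in range(num_episodes):
        time_step = environment.reset()
        episode_return = 0.0
        while not time_step.is_last():
            action_step = policy.action(time_step)
            time_step = environment.step(action_step.action)
            episode_return += time_step.reward
        total_return += episode_return
    return total_return / num_episodes
```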
def to_event(s): return (s.event_spec if isinstance(s, DistributionSpecV2) else tensor_spec.from_spec(s))
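An illustrative call to `to_event` with a plain bounded spec; only the non-`DistributionSpecV2` branch is exercised, so it reduces to `tensor_spec.from_spec`:
```
import numpy as np
from tf_agents.specs import array_spec

obs_spec = array_spec.BoundedArraySpec((3,), np.float32, minimum=0.0, maximum=1.0)
event_spec = to_event(obs_spec)  # equivalent BoundedTensorSpec
```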
def __init__(self, environment, check_dims=False, isolation=False):
  """Initializes a new `TFPyEnvironment`.

  Args:
    environment: Environment to interact with, implementing
      `py_environment.PyEnvironment`. Or a `callable` that returns an
      environment of this form. If a `callable` is provided and `isolation`
      is enabled, the callable is executed in the dedicated thread.
    check_dims: Whether to check batch dimensions of actions in `step`.
    isolation: If this value is `False` (default), interactions with the
      environment will occur within whatever thread the methods of the
      `TFPyEnvironment` are run from. For example, in TF graph mode, methods
      like `step` are called from multiple threads created by the TensorFlow
      engine; calls to step the environment are guaranteed to be sequential,
      but not from the same thread. This creates problems for environments
      that are not thread-safe. Using isolation ensures not only that a
      dedicated thread (or thread-pool) is used to interact with the
      environment, but also that interaction with the environment happens in
      a serialized manner. If `isolation == True`, a dedicated thread is
      created for interactions with the environment. If `isolation` is an
      instance of `multiprocessing.pool.Pool` (this includes instances of
      `multiprocessing.pool.ThreadPool`, nee `multiprocessing.dummy.Pool`,
      and `multiprocessing.Pool`), then this pool is used to interact with
      the environment.
      **NOTE** If using `isolation` with a `BatchedPyEnvironment`, ensure
      you create the `BatchedPyEnvironment` with `multithreading=False`,
      since otherwise the multithreading in that wrapper reverses the
      effects of this one.

  Raises:
    TypeError: If `environment` is not an instance of
      `py_environment.PyEnvironment` or subclasses, or is a callable that
      does not return an instance of `PyEnvironment`.
    TypeError: If `isolation` is not `True`, `False`, or an instance of
      `multiprocessing.pool.Pool`.
  """
  if not isolation:
    self._pool = None
  elif isinstance(isolation, pool.Pool):
    self._pool = isolation
  elif isolation is True:
    self._pool = pool.ThreadPool(1)
  else:
    raise TypeError(
        'isolation should be True, False, or an instance of '
        'a multiprocessing Pool or ThreadPool. Saw: {}'.format(isolation))

  if callable(environment):
    environment = self._execute(environment)
  if not isinstance(environment, py_environment.PyEnvironment):
    raise TypeError(
        'Environment should implement py_environment.PyEnvironment')

  if not environment.batched:
    # If executing in an isolated thread, do not enable multiprocessing for
    # this environment.
    environment = batched_py_environment.BatchedPyEnvironment(
        [environment], multithreading=not self._pool)
  self._env = environment
  self._check_dims = check_dims

  if isolation and getattr(self._env, '_parallel_execution', None):
    logging.warning(
        'Wrapped environment is executing in parallel. '
        'Perhaps it is a BatchedPyEnvironment with multithreading=True, '
        'or it is a ParallelPyEnvironment. This conflicts with the '
        '`isolation` arg passed to TFPyEnvironment: interactions with the '
        'wrapped environment are no longer guaranteed to happen in a common '
        'thread. Environment: %s', (self._env,))

  action_spec = tensor_spec.from_spec(self._env.action_spec())
  time_step_spec = tensor_spec.from_spec(self._env.time_step_spec())
  batch_size = self._env.batch_size if self._env.batch_size else 1
  super(TFPyEnvironment, self).__init__(time_step_spec, action_spec,
                                        batch_size)

  # Gather all the dtypes and shapes of the elements in time_step.
  self._time_step_dtypes = [
      s.dtype for s in tf.nest.flatten(self.time_step_spec())
  ]
  self._time_step = None
  self._lock = threading.Lock()
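A minimal usage sketch of the constructor above, wrapping a non-thread-safe Python environment with a dedicated interaction thread; the environment name is illustrative:
```
from tf_agents.environments import suite_gym, tf_py_environment

py_env = suite_gym.load('CartPole-v0')
tf_env = tf_py_environment.TFPyEnvironment(py_env, isolation=True)
time_step = tf_env.reset()  # all interaction with py_env happens on one thread
```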
def observation_spec(self): return tensor_spec.from_spec( array_spec.BoundedArraySpec((1, ), np.int32, minimum=[1], maximum=[2]))