Example #1
 def testFromArraySpecToTensorSpec(self):
     array_spec = specs.ArraySpec([1, 2, 3], np.int32)
     tensor_spec = specs.TensorSpec.from_spec(array_spec)
     self.assertEqual(array_spec.shape, tensor_spec.shape)
     self.assertEqual(array_spec.dtype, tensor_spec.dtype.as_numpy_dtype())
     self.assertEqual(array_spec.name, tensor_spec.name)
     self.assertEqual(type(tensor_spec), specs.tensor_spec.TensorSpec)
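
The same conversion can be exercised outside the test harness; a minimal sketch, assuming the tf_agents `specs` package (`check_array` is the standard `ArraySpec` validator):

import numpy as np
from tf_agents import specs

# An ArraySpec fixes shape and dtype on the NumPy side.
array_spec = specs.ArraySpec([1, 2, 3], np.int32, name='obs')

# Converting to a TensorSpec preserves shape, dtype and name, as the test asserts.
tensor_spec = specs.TensorSpec.from_spec(array_spec)

# check_array reports whether a concrete array conforms to the spec.
print(array_spec.check_array(np.zeros([1, 2, 3], dtype=np.int32)))   # True
print(array_spec.check_array(np.zeros([1, 2, 3], dtype=np.float32)))  # False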
Example #2
    def __init__(self, vocab_size, code_len, seq_len, lstm_size):
        """Create an instance of `TextDecodeNetwork`
        See Methods 2.2.3 of "Unsupervised Predictive Memory in a Goal-Directed
        Agent"

        Args:
            vocab_size (int): vocabulary size
            code_len (int): encoded length
            seq_len (int): output sequence length
            lstm_size (int): lstm size for decoding
        """
        super(TextDecodeNetwork, self).__init__(
            input_tensor_spec=specs.ArraySpec(
                shape=(None, code_len), dtype=float),
            state_spec=(),
            name='TextDecodeNetwork')
        self._vocab_size = vocab_size
        self._seq_len = seq_len
        self._lstm_size = lstm_size
        model = tf.keras.Sequential()

        model.add(tf.keras.layers.RepeatVector(self._seq_len))
        model.add(tf.keras.layers.LSTM(self._lstm_size, return_sequences=True))
        model.add(
            tf.keras.layers.TimeDistributed(
                tf.keras.layers.Dense(
                    units=self._vocab_size, activation="linear")))
        model.add(tf.keras.layers.Softmax())
        self._model = model
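
The decoder stack can be sanity-checked on its own; a minimal standalone Keras sketch using only the layers from the snippet above, with hypothetical sizes:

import numpy as np
import tensorflow as tf

vocab_size, code_len, seq_len, lstm_size = 100, 16, 10, 32  # hypothetical sizes

decoder = tf.keras.Sequential([
    # Repeat the latent code once per output position: (B, code_len) -> (B, seq_len, code_len).
    tf.keras.layers.RepeatVector(seq_len),
    tf.keras.layers.LSTM(lstm_size, return_sequences=True),
    # Per-timestep logits over the vocabulary, normalized by the final softmax.
    tf.keras.layers.TimeDistributed(
        tf.keras.layers.Dense(units=vocab_size, activation="linear")),
    tf.keras.layers.Softmax(),
])

probs = decoder(np.zeros([2, code_len], dtype=np.float32))
print(probs.shape)  # (2, 10, 100)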
Example #3
    def __init__(self, vocab_size, seq_len, embed_size, lstm_size):
        """Create an instance of `TextEncodeNetwork`
        See Methods 2.1.5 of "Unsupervised Predictive Memory in a Goal-Directed
        Agent"

        Args:
            vocab_size (int): vocabulary size
            seq_len (int): sequence length
            embed_size (int): embedding size
            lstm_size (int): lstm size for encoding
        """
        super(TextEncodeNetwork, self).__init__(
            input_tensor_spec=specs.ArraySpec(
                shape=(None, seq_len), dtype=int),
            state_spec=(),
            name='TextEncodeNetwork')
        self._vocab_size = vocab_size
        self._seq_len = seq_len
        self._embed_size = embed_size
        self._lstm_size = lstm_size

        model = tf.keras.Sequential()
        model.add(
            tf.keras.layers.Embedding(
                self._vocab_size, self._embed_size, mask_zero=True))
        model.add(tf.keras.layers.LSTM(self._lstm_size))
        self._model = model
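
Likewise for the encoder; a minimal standalone sketch with hypothetical sizes (`mask_zero=True` treats token id 0 as padding):

import numpy as np
import tensorflow as tf

vocab_size, seq_len, embed_size, lstm_size = 100, 10, 8, 32  # hypothetical sizes

encoder = tf.keras.Sequential([
    # Token ids -> embeddings; id 0 is masked out as padding.
    tf.keras.layers.Embedding(vocab_size, embed_size, mask_zero=True),
    # The final LSTM state summarizes the sequence into a single code vector.
    tf.keras.layers.LSTM(lstm_size),
])

code = encoder(np.array([[5, 7, 2, 0, 0, 0, 0, 0, 0, 0]], dtype=np.int32))
print(code.shape)  # (1, 32)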
Example #4
 def testFromArraySpecToBoundedTensorSpec(self):
   array_spec = specs.ArraySpec([1, 2, 3], np.int32)
   tensor_spec = specs.BoundedTensorSpec.from_spec(array_spec)
   self.assertEqual(array_spec.shape, tensor_spec.shape)
   self.assertEqual(array_spec.dtype, tensor_spec.dtype.as_numpy_dtype(0))
   self.assertEqual(array_spec.name, tensor_spec.name)
   self.assertEqual(tensor_spec.dtype.min, tensor_spec.minimum)
   self.assertEqual(tensor_spec.dtype.max, tensor_spec.maximum)
   self.assertEqual(type(tensor_spec), specs.tensor_spec.BoundedTensorSpec)
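
When the source spec has no bounds, as above, `BoundedTensorSpec.from_spec` falls back to the dtype's min/max; converting from a `BoundedArraySpec` keeps the explicit bounds instead. A small sketch of the latter case, assuming the tf_agents `specs` package:

import numpy as np
from tf_agents import specs

bounded = specs.BoundedArraySpec([1, 2, 3], np.int32, minimum=0, maximum=5)
converted = specs.BoundedTensorSpec.from_spec(bounded)

# The explicit [0, 5] bounds survive the conversion to the tensor-side spec.
print(converted.minimum, converted.maximum)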
Example #5
 def __init__(self, final_state=3):
   self._state = 0
   self._action_spec = specs.BoundedArraySpec([],
                                              np.int32,
                                              minimum=1,
                                              maximum=2,
                                              name='action')
   self._observation_spec = specs.ArraySpec([], np.int64, name='observation')
   self._final_state = final_state
Example #6
 def __init__(self, final_state=3):
   self._state = np.int32(0)
   self._action_spec = specs.BoundedArraySpec([],
                                              np.int32,
                                              minimum=1,
                                              maximum=2,
                                              name='action')
   self._observation_spec = specs.ArraySpec([], np.int32, name='observation')
   self._final_state = final_state
   super(PyEnvironmentMock, self).__init__()
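
Specs declared this way can also be used to generate conforming dummy data; a minimal sketch, assuming the `array_spec` helpers from tf_agents (`sample_spec_nest` draws random values that respect each spec's shape, dtype and bounds):

import numpy as np
from tf_agents.specs import array_spec

action_spec = array_spec.BoundedArraySpec([], np.int32, minimum=1, maximum=2, name='action')
observation_spec = array_spec.ArraySpec([], np.int32, name='observation')

# One random sample per spec; the bounded action stays inside [1, 2].
sample = array_spec.sample_spec_nest((action_spec, observation_spec),
                                     np.random.RandomState(0))
print(sample)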
Example #7
    def setUp(self):

        mock_agent = MagicMock(Agent)
        dataspec = agent_application.DataSpec(
            observation_spec=specs.ArraySpec([1, 2, 3], int),
            action_spec=specs.ArraySpec([1], float))
        conf = agent_application.make_config(AgentConfig(), [])

        today = datetime(date.today().year,
                         date.today().month,
                         date.today().day)
        env = MagicMock()
        self._mock_agent_init = "MOCKED AGENT"
        mock_agent.init_agent = MagicMock(return_value=self._mock_agent_init)
        self._application = agent_application.AgentApplication(
            data_spec=dataspec,
            agent=mock_agent,
            env=env,
            config=conf,
            first_timestep_dt=today,
            training_interval=timedelta(days=1))
Example #8
def create_py_policy_from_table(probability_table, obs_to_index_fn):
    """Creates a callable policy function given a table of state to distribution.

  Args:
    probability_table: A NumPy array determining the action distribution.
    obs_to_index_fn: A function mapping environment observation to index in table.

  Returns:
    policy_fn: A function mapping observations to sampled actions and policy
      info.
    policy_info_spec: A spec that determines the type of objects returned by
      policy info.
  """
    def policy_fn(observation,
                  probability_table=probability_table,
                  obs_to_index_fn=obs_to_index_fn,
                  dtype=np.int32):
        state = obs_to_index_fn(observation)
        distributions = probability_table[state]
        batched = np.ndim(distributions) > 1
        if not batched:
            distributions = distributions[None, :]

        cum_probs = distributions.cumsum(axis=-1)
        uniform_samples = np.random.rand(len(cum_probs), 1)
        actions = (uniform_samples < cum_probs).argmax(axis=1)
        probs = distributions[np.arange(len(actions)), actions]

        if not batched:
            action = actions[0]
            log_prob = np.log(1e-8 + probs[0])
        else:
            action = actions
            log_prob = np.log(1e-8 + probs)

        policy_info = {
            'log_probability': log_prob,
            'distribution': distributions
        }
        return action.astype(dtype), policy_info

    policy_info_spec = {
        'log_probability':
        specs.ArraySpec([], float),
        'distribution':
        specs.BoundedArraySpec([np.shape(probability_table)[-1]],
                               float,
                               minimum=0.0,
                               maximum=1.0)
    }
    return policy_fn, policy_info_spec
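
A quick usage sketch for the helper above, with a hypothetical two-state table and an identity observation-to-index mapping:

import numpy as np

# Two states, three actions; each row is that state's action distribution.
table = np.array([[0.8, 0.1, 0.1],
                  [0.1, 0.1, 0.8]])

policy_fn, policy_info_spec = create_py_policy_from_table(
    table, obs_to_index_fn=lambda obs: obs)

action, info = policy_fn(0)  # observation 0 selects the first row
print(action, info['log_probability'])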
Example #9
    def test_ignore_missing_config_dqn(self, mock_qnetwork, mock_agent):
        params = ["agent.fc_layer_params=[100, 150, 90]"]

        dataspec = agent_application.DataSpec(
            observation_spec=specs.ArraySpec([1, 2, 3], int),
            action_spec=specs.ArraySpec([1], float))
        conf = agent_application.make_config(QConfig(), params)

        agent_trainer = DQNAgent(dataspec, conf)
        agent = agent_trainer.init_agent()

        mock_qnetwork.assert_called_once_with(dataspec.observation_spec,
                                              dataspec.action_spec,
                                              fc_layer_params=[100, 150, 90])
        mock_agent.assert_called_once_with(
            time_step_spec=mock.ANY,  # TODO
            action_spec=dataspec.action_spec,
            q_network=mock_qnetwork.return_value,
            train_step_counter=mock.ANY,  # TODO
            optimizer=mock.ANY,  #TODO
            epsilon_greedy=conf.policy.epsilon_greedy,
            n_step_update=conf.trajectory.n_step)
        self.assertEqual(agent, mock_agent.return_value)
Example #10
 def testLoad(self):
   specs.ArraySpec([1, 2, 3], np.int32)
   specs.BoundedArraySpec([1, 2, 3], np.int32, 0, 1)
   specs.TensorSpec([1, 2, 3], np.int32)
   specs.BoundedTensorSpec([1, 2, 3], np.int32, 0, 1)
Example #11
 def observation_spec(self):
     return specs.ArraySpec([], np.int64, name='observation')
Example #12
def main(_):
  if FLAGS.eager:
    tf.config.experimental_run_functions_eagerly(FLAGS.eager)

  tf.random.set_seed(FLAGS.seed)
  # np.random.seed(FLAGS.seed)
  # random.seed(FLAGS.seed)

  print('Env name: %s' % FLAGS.env_name)

  if 'procgen' in FLAGS.env_name:
    _, env_name, train_levels, _ = FLAGS.env_name.split('-')
    env = procgen_wrappers.TFAgentsParallelProcGenEnv(
        1,
        normalize_rewards=True,
        env_name=env_name,
        num_levels=int(train_levels),
        start_level=0)
    state_env = None

    timestep_spec = trajectories.time_step_spec(
        observation_spec=specs.ArraySpec(env._observation_spec.shape, np.uint8),  # pylint: disable=protected-access
        reward_spec=specs.ArraySpec(shape=(), dtype=np.float32))

    data_spec = trajectory.from_transition(
        timestep_spec,
        policy_step.PolicyStep(
            action=env._action_spec,  # pylint: disable=protected-access
            info=specs.ArraySpec(shape=(), dtype=np.int32)), timestep_spec)

    n_state = None
    # ckpt_steps = [10_000_000,15_000_000,20_000_000,25_000_000]
    ckpt_steps = [25_000_000]
  elif FLAGS.env_name.startswith('pixels-dm'):
    if 'distractor' in FLAGS.env_name:
      _, _, domain_name, _, _ = FLAGS.env_name.split('-')
    else:
      _, _, domain_name, _ = FLAGS.env_name.split('-')

    if domain_name in ['cartpole']:
      FLAGS.set_default('action_repeat', 8)
    elif domain_name in ['reacher', 'cheetah', 'ball_in_cup', 'hopper']:
      FLAGS.set_default('action_repeat', 4)
    elif domain_name in ['finger', 'walker']:
      FLAGS.set_default('action_repeat', 2)

    env, state_env = utils.load_env(FLAGS.env_name, FLAGS.seed,
                                    FLAGS.action_repeat, FLAGS.frame_stack,
                                    FLAGS.obs_type)

    if FLAGS.obs_type == 'pixels':
      data_spec = trajectory.from_transition(
          env.time_step_spec(), policy_step.PolicyStep(env.action_spec()),
          env.time_step_spec())
      ckpt_steps = FLAGS.ckpt_timesteps[0]
    else:
      data_spec = trajectory.from_transition(
          state_env.time_step_spec(),
          policy_step.PolicyStep(state_env.action_spec()),
          state_env.time_step_spec())
      n_state = state_env.observation_spec().shape[0]
      ckpt_steps = FLAGS.ckpt_timesteps[0]

  if FLAGS.numpy_dataset:
    tf.io.gfile.makedirs(os.path.join(FLAGS.save_dir, 'datasets'))
    def shard_fn(shard):
      return os.path.join(
          FLAGS.save_dir, 'datasets', FLAGS.env_name + '__%d__%d__%d.npy' %
          (int(ckpt_steps[-1]), FLAGS.max_timesteps, shard))

    observer = tf_utils.NumpyObserver(shard_fn, env)
    observer.allocate_arrays(FLAGS.max_timesteps)
  else:
    shard_fn = os.path.join(
        FLAGS.save_dir, 'datasets',
        FLAGS.env_name + '__%d__%d.tfrecord.shard-%d-of-%d' %
        (int(ckpt_steps[-1]), FLAGS.max_timesteps, FLAGS.worker_id,
         FLAGS.total_workers))
    observer = DummyObserver(
        shard_fn, data_spec, py_mode=True, compress_image=True)

  def load_model(checkpoint):
    checkpoint = int(checkpoint)
    print(checkpoint)
    if FLAGS.env_name.startswith('procgen'):
      env_id = [i for i, name in enumerate(
          PROCGEN_ENVS) if name == env_name][0]+1
      if checkpoint == 10_000_000:
        ckpt_iter = '0000020480'
      elif checkpoint == 15_000_000:
        ckpt_iter = '0000030720'
      elif checkpoint == 20_000_000:
        ckpt_iter = '0000040960'
      elif checkpoint == 25_000_000:
        ckpt_iter = '0000051200'
      policy_weights_dir = ('ppo_darts/'
                            '2021-06-22-16-36-54/%d/policies/checkpoints/'
                            'policy_checkpoint_%s/' % (env_id, ckpt_iter))
      policy_def_dir = ('ppo_darts/'
                        '2021-06-22-16-36-54/%d/policies/policy/' % (env_id))
      model = py_tf_eager_policy.SavedModelPyTFEagerPolicy(
          policy_def_dir,
          time_step_spec=env._time_step_spec,  # pylint: disable=protected-access
          action_spec=env._action_spec,  # pylint: disable=protected-access
          policy_state_spec=env._observation_spec,  # pylint: disable=protected-access
          info_spec=tf.TensorSpec(shape=(None,)),
          load_specs_from_pbtxt=False)
      model.update_from_checkpoint(policy_weights_dir)
      model.actor = model.action
    else:
      if 'ddpg' in FLAGS.algo_name:
        model = ddpg.DDPG(
            env.observation_spec(),
            env.action_spec(),
            cross_norm='crossnorm' in FLAGS.algo_name)
      elif 'crr' in FLAGS.algo_name:
        model = awr.AWR(
            env.observation_spec(),
            env.action_spec(), f='bin_max')
      elif 'awr' in FLAGS.algo_name:
        model = awr.AWR(
            env.observation_spec(),
            env.action_spec(), f='exp_mean')
      elif 'sac_v1' in FLAGS.algo_name:
        model = sac_v1.SAC(
            env.observation_spec(),
            env.action_spec(),
            target_entropy=-env.action_spec().shape[0])
      elif 'asac' in FLAGS.algo_name:
        model = asac.ASAC(
            env.observation_spec(),
            env.action_spec(),
            target_entropy=-env.action_spec().shape[0])
      elif 'sac' in FLAGS.algo_name:
        model = sac.SAC(
            env.observation_spec(),
            env.action_spec(),
            target_entropy=-env.action_spec().shape[0],
            cross_norm='crossnorm' in FLAGS.algo_name,
            pcl_actor_update='pc' in FLAGS.algo_name)
      elif 'pcl' in FLAGS.algo_name:
        model = pcl.PCL(
            env.observation_spec(),
            env.action_spec(),
            target_entropy=-env.action_spec().shape[0])
      if 'distractor' in FLAGS.env_name:
        ckpt_path = os.path.join(
            ('experiments/'
             '20210622_2023.policy_weights_sac_1M_dmc_distractor_hard_pixel/'),
            'results', FLAGS.env_name+'__'+str(checkpoint))
      else:
        ckpt_path = os.path.join(
            ('experiments/'
             '20210607_2023.policy_weights_dmc_1M_SAC_pixel'), 'results',
            FLAGS.env_name + '__' + str(checkpoint))

      model.load_weights(ckpt_path)
    print('Loaded model weights')
    return model

  # previous_time = time.time()
  timestep = env.reset()
  episode_return = 0
  episode_timesteps = 0

  actions = []
  time_steps = []

  def get_state_or_pixels(obs, obs_type):
    # obs of shape 1 x 84 x 84 x (n_state*frame_stack + 3*frame_stack)
    if len(obs.shape) == 4:
      obs = obs[0]
    if obs_type == 'state':
      obs = obs[0, 0, :n_state]
    else:
      obs_tmp = []
      for i in range(FLAGS.frame_stack):
        obs_tmp.append(obs[:, :, (i + 1) * (n_state) +
                           i * 3:((i + 1) * (n_state) + (i + 1) * 3)])
      obs = np.concatenate(obs_tmp, axis=-1)
    return obs

  k_model = 0
  model = load_model(ckpt_steps[k_model])
  reload_model = False

  def linear_scheduling(t):  # pylint: disable=unused-variable
    return 0.1 - 3.96e-9 * t

  mixture_freq = FLAGS.max_timesteps // len(ckpt_steps)
  for i in tqdm.tqdm(range(FLAGS.max_timesteps)):
    if (i % mixture_freq) == 0 and i > 0:
      reload_model = True
    if np.all(timestep.is_last()):
      if FLAGS.env_name.startswith('procgen'):
        timestep = trajectories.TimeStep(
            timestep.step_type[0], timestep.reward[0], timestep.discount[0],
            (timestep.observation[0] * 255).astype(np.uint8))

      time_steps.append(
          ts.termination(
              get_state_or_pixels(timestep.observation[0], 'state')
              if FLAGS.obs_type == 'state' else timestep.observation,
              timestep.reward if timestep.reward is not None else 1.0))
      # Write the episode into the TF Record

      for l in range(len(time_steps) - 1):
        t_ = min(l + FLAGS.n_step_returns, len(time_steps) - 1)
        n_step_return = 0.
        for j in range(l, t_):
          if len(time_steps[j].reward.shape) == 1:
            r_t = time_steps[j].reward[0]
          else:
            r_t = time_steps[j].reward

          n_step_return += FLAGS.discount**j * r_t

        t_ = min(l + 1 + FLAGS.n_step_returns, len(time_steps) - 1)
        n_step_return_tp1 = 0.
        for j in range(l + 1, t_):
          if len(time_steps[j].reward.shape) == 1:
            r_t = time_steps[j].reward[0]
          else:
            r_t = time_steps[j].reward

          n_step_return_tp1 += FLAGS.discount**j * r_t

        if len(time_steps[l].observation.shape) == 4:
          if len(time_steps[l].reward.shape) == 1:
            time_steps[l] = trajectories.TimeStep(time_steps[l].step_type[0],
                                                  n_step_return,
                                                  time_steps[l].discount[0],
                                                  time_steps[l].observation[0])
          else:
            time_steps[l] = trajectories.TimeStep(time_steps[l].step_type,
                                                  n_step_return,
                                                  time_steps[l].discount,
                                                  time_steps[l].observation[0])
        if len(time_steps[l + 1].observation.shape) == 4:
          if len(time_steps[l + 1].reward.shape) == 1:
            time_steps[l + 1] = trajectories.TimeStep(
                time_steps[l + 1].step_type[0], n_step_return_tp1,
                time_steps[l + 1].discount[0], time_steps[l + 1].observation[0])
          else:
            time_steps[l + 1] = trajectories.TimeStep(
                time_steps[l + 1].step_type, n_step_return_tp1,
                time_steps[l + 1].discount, time_steps[l + 1].observation[0])
        traj = trajectory.from_transition(time_steps[l], actions[l],
                                          time_steps[l + 1])
        if FLAGS.numpy_dataset:
          traj = Traj(traj, next_obs=time_steps[l+1].observation)
          observer(traj)
        else:
          observer(traj)

      timestep = env.reset()
      print(episode_return)
      episode_return = 0
      episode_timesteps = 0
      # previous_time = time.time()

      actions = []
      time_steps = []

      if reload_model:
        k_model += 1
        model = load_model(ckpt_steps[k_model])
        reload_model = False
    if FLAGS.env_name.startswith('procgen'):
      timestep = trajectories.TimeStep(
          timestep.step_type[0], timestep.reward[0], timestep.discount[0],
          (timestep.observation[0] * 255).astype(np.uint8))

    if episode_timesteps == 0:
      time_steps.append(
          ts.restart(
              get_state_or_pixels(timestep.observation, 'state') if FLAGS
              .obs_type == 'state' else (timestep.observation)))
    elif not timestep.is_last():
      time_steps.append(
          ts.transition(
              get_state_or_pixels(timestep.observation[0], 'state')
              if FLAGS.obs_type == 'state' else (timestep.observation),
              timestep.reward if timestep.reward is not None else 0.0,
              timestep.discount))

    if FLAGS.env_name.startswith('procgen'):
      # eps_t = linear_scheduling(i)
      eps_t = 0
      u = np.random.uniform(0, 1, size=1)
      if u > eps_t:
        timestep_act = trajectories.TimeStep(
            timestep.step_type, timestep.reward, timestep.discount,
            timestep.observation.astype(np.float32) / 255.)
        action = model.actor(timestep_act)
        action = action.action
      else:
        action = np.random.choice(
            env.action_spec().maximum.item() + 1, size=1)[0]
      next_timestep = env.step(action)
      info_arr = np.array(env._infos[0]['level_seed'], dtype=np.int32)  # pylint: disable=protected-access
      actions.append(policy_step.PolicyStep(action=action, state=(),
                                            info=info_arr))
    else:
      action = model.actor(
          tf.expand_dims(
              get_state_or_pixels(timestep.observation[0], 'pixel')
              if FLAGS.obs_type == 'state' else (timestep.observation[0]), 0),
          sample=True)
      next_timestep = env.step(action)
      actions.append(
          policy_step.PolicyStep(action=action.numpy()[0], state=(), info=()))

    episode_return += next_timestep.reward[0]
    episode_timesteps += 1

    timestep = next_timestep

  if FLAGS.numpy_dataset:
    observer.save(n_shards=10)
Example #13
 def reward_spec(self):
     return {
         'reward': specs.ArraySpec([], np.float32, name='reward'),
         'constraint': specs.ArraySpec([], np.float32, name='constraint')
     }
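
A dict-valued reward spec like this can be validated field by field; a minimal sketch using `ArraySpec.check_array`, with the same keys as the spec above:

import numpy as np
from tf_agents import specs

reward_spec = {
    'reward': specs.ArraySpec([], np.float32, name='reward'),
    'constraint': specs.ArraySpec([], np.float32, name='constraint')
}

# A conforming reward dict matches each sub-spec's (scalar) shape and dtype.
reward = {'reward': np.asarray(1.0, dtype=np.float32),
          'constraint': np.asarray(0.0, dtype=np.float32)}
print(all(reward_spec[k].check_array(reward[k]) for k in reward_spec))  # True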