Example #1
def main(argv):
    del argv

    # Create the task environment.
    test_w = [float(x) for x in FLAGS.test_w]
    env_config = configs.get_fig5_task_config(test_w)
    env = scavenger.Scavenger(**env_config)
    env = environment_wrappers.EnvironmentWithLogging(env)

    # Create the flat agent.
    agent = dqn_agent.Agent(obs_spec=env.observation_spec(),
                            action_spec=env.action_spec(),
                            network_kwargs=dict(
                                output_sizes=(64, 128),
                                activate_final=True,
                            ),
                            epsilon=0.1,
                            additional_discount=0.9,
                            batch_size=10,
                            optimizer_name="AdamOptimizer",
                            optimizer_kwargs=dict(learning_rate=3e-4, ))

    _, ema_returns = experiment.run(env,
                                    agent,
                                    num_episodes=FLAGS.num_episodes,
                                    report_every=FLAGS.report_every)
    if FLAGS.output_path:
        experiment.write_returns_to_file(FLAGS.output_path, ema_returns)
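
This snippet reads its configuration from absl flags (test_w, num_episodes, report_every, output_path) that the original script defines at module level. A minimal sketch of that preamble, with default values assumed purely for illustration:

from absl import app
from absl import flags

FLAGS = flags.FLAGS

# Defaults below are assumptions for illustration; the original script may differ.
flags.DEFINE_list("test_w", ["1.0", "1.0"], "Task weight vector w to evaluate.")
flags.DEFINE_integer("num_episodes", 10000, "Number of training episodes.")
flags.DEFINE_integer("report_every", 200, "How often to report EMA returns.")
flags.DEFINE_string("output_path", "", "Optional path for writing EMA returns.")

if __name__ == "__main__":
    app.run(main)
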
Example #2
def main(argv):
    del argv

    # Load the keyboard.
    keyboard = smart_module.SmartModuleImport(hub.Module(FLAGS.keyboard_path))

    # Create the task environment.
    base_env_config = configs.get_fig4_task_config()
    base_env = scavenger.Scavenger(**base_env_config)
    base_env = environment_wrappers.EnvironmentWithLogging(base_env)

    # Wrap the task environment with the keyboard.
    additional_discount = 0.9
    env = environment_wrappers.EnvironmentWithKeyboardDirect(
        env=base_env,
        keyboard=keyboard,
        keyboard_ckpt_path=None,
        additional_discount=additional_discount,
        call_and_return=False)

    # Create the player agent.
    agent = regressed_agent.Agent(
        batch_size=10,
        optimizer_name="AdamOptimizer",
        optimizer_kwargs=dict(learning_rate=3e-2, ),
        init_w=np.random.normal(size=keyboard.num_cumulants) * 0.1,
    )

    _, ema_returns = experiment.run(env,
                                    agent,
                                    num_episodes=FLAGS.num_episodes,
                                    report_every=FLAGS.report_every,
                                    num_eval_reps=100)
    if FLAGS.output_path:
        experiment.write_returns_to_file(FLAGS.output_path, ema_returns)
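
These examples share a common set of module-level imports. The sketch below lists the third-party ones implied by the identifiers used throughout; the exact import lines, and the package paths of the project modules (configs, scavenger, environment_wrappers, dqn_agent, regressed_agent, keyboard_agent, experiment, smart_module, keyboard_utils), are inferred from usage rather than copied from the original files:

# Assumed imports, inferred from identifiers used across these snippets.
import csv
import os
import random

import numpy as np
import sonnet as snt                # snt.Linear (Example #11)
import tensorflow.compat.v1 as tf   # tf.Session / tf.variable_scope imply TF1-style graphs
import tensorflow_hub as hub        # hub.Module for loading exported keyboards
import tree                         # dm-tree: tree.map_structure / tree.flatten

from absl import flags
from absl import logging

FLAGS = flags.FLAGS
# gfile (used in Example #8) is typically tf.io.gfile or tf.compat.v1.gfile;
# the exact import is not shown in the source.
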
Example #3
def create_and_train_keyboard_with_phi(num_episodes,
                                       phi_model_path,
                                       policy_weights,
                                       export_path=None):
    """Train an option keyboard."""
    env_config = configs.get_pretrain_config()
    env = scavenger.Scavenger(**env_config)
    env = environment_wrappers.EnvironmentWithLogging(env)
    env = environment_wrappers.EnvironmentWithLearnedPhi(env, phi_model_path)

    agent = keyboard_agent.Agent(obs_spec=env.observation_spec(),
                                 action_spec=env.action_spec(),
                                 policy_weights=policy_weights,
                                 network_kwargs=dict(
                                     output_sizes=(64, 128),
                                     activate_final=True,
                                 ),
                                 epsilon=0.1,
                                 additional_discount=0.9,
                                 batch_size=10,
                                 optimizer_name="AdamOptimizer",
                                 optimizer_kwargs=dict(learning_rate=3e-4, ))

    if num_episodes:
        experiment.run(env, agent, num_episodes=num_episodes)
        agent.export(export_path)

    return agent
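
A hedged usage sketch for the helper above: the paths and episode count are placeholders, and the identity policy weights mirror the next example (one maximising option per cumulant):

# Illustrative call only; the phi-model path, episode count and export path
# are placeholders rather than values from the original code.
trained_keyboard_agent = create_and_train_keyboard_with_phi(
    num_episodes=20000,
    phi_model_path="/tmp/option_keyboard/phi_model",
    policy_weights=np.eye(2, dtype=np.float32),
    export_path="/tmp/option_keyboard/keyboard_with_phi")
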
Example #4
def _train_keyboard(num_episodes):
    """Train an option keyboard."""
    env_config = configs.get_pretrain_config()
    env = scavenger.Scavenger(**env_config)
    env = environment_wrappers.EnvironmentWithLogging(env)

    agent = keyboard_agent.Agent(obs_spec=env.observation_spec(),
                                 action_spec=env.action_spec(),
                                 policy_weights=np.array([
                                     [1.0, 0.0],
                                     [0.0, 1.0],
                                 ]),
                                 network_kwargs=dict(
                                     output_sizes=(64, 128),
                                     activate_final=True,
                                 ),
                                 epsilon=0.1,
                                 additional_discount=0.9,
                                 batch_size=10,
                                 optimizer_name="AdamOptimizer",
                                 optimizer_kwargs=dict(learning_rate=3e-4, ))

    experiment.run(env, agent, num_episodes=num_episodes)

    return agent
Example #5
def main(argv):
  del argv

  # Load the keyboard.
  keyboard = smart_module.SmartModuleImport(hub.Module(FLAGS.keyboard_path))

  # Create the task environment.
  base_env_config = configs.get_fig4_task_config()
  base_env = scavenger.Scavenger(**base_env_config)
  base_env = environment_wrappers.EnvironmentWithLogging(base_env)

  # Wrap the task environment with the keyboard.
  additional_discount = 0.9
  env = environment_wrappers.EnvironmentWithKeyboardDirect(
      env=base_env,
      keyboard=keyboard,
      keyboard_ckpt_path=None,
      additional_discount=additional_discount,
      call_and_return=False)

  # Create the player agent.
  agent = regressed_agent.Agent(
      batch_size=10,
      optimizer_name="AdamOptimizer",
      # Disable training.
      optimizer_kwargs=dict(learning_rate=0.0,),
      init_w=[1., -1.])

  returns = []
  for _ in range(FLAGS.num_episodes):
    returns.append(experiment.run_episode(env, agent))
  tf.logging.info("#" * 80)
  tf.logging.info(
      f"Avg. return over {FLAGS.num_episodes} episodes is {np.mean(returns)}")
  tf.logging.info("#" * 80)
Example #6
def main(argv):
    del argv

    # Pretrain the keyboard and save a checkpoint.
    if FLAGS.keyboard_path:
        keyboard_path = FLAGS.keyboard_path
    else:
        with tf.Graph().as_default():
            export_path = "/tmp/option_keyboard/keyboard"
            _ = keyboard_utils.create_and_train_keyboard(
                num_episodes=FLAGS.num_pretrain_episodes,
                export_path=export_path)
            keyboard_path = os.path.join(export_path, "tfhub")

    # Load the keyboard.
    keyboard = smart_module.SmartModuleImport(hub.Module(keyboard_path))

    # Create the task environment.
    base_env_config = configs.get_task_config()
    base_env = scavenger.Scavenger(**base_env_config)
    base_env = environment_wrappers.EnvironmentWithLogging(base_env)

    # Wrap the task environment with the keyboard.
    additional_discount = 0.9
    env = environment_wrappers.EnvironmentWithKeyboard(
        env=base_env,
        keyboard=keyboard,
        keyboard_ckpt_path=None,
        n_actions_per_dim=3,
        additional_discount=additional_discount,
        call_and_return=False)

    # Create the player agent.
    agent = dqn_agent.Agent(obs_spec=env.observation_spec(),
                            action_spec=env.action_spec(),
                            network_kwargs=dict(
                                output_sizes=(64, 128),
                                activate_final=True,
                            ),
                            epsilon=0.1,
                            additional_discount=additional_discount,
                            batch_size=10,
                            optimizer_name="AdamOptimizer",
                            optimizer_kwargs=dict(learning_rate=3e-4, ))

    _, ema_returns = experiment.run(env,
                                    agent,
                                    num_episodes=FLAGS.num_episodes,
                                    report_every=FLAGS.report_every)
    if FLAGS.output_path:
        experiment.write_returns_to_file(FLAGS.output_path, ema_returns)
Example #7
def evaluate_keyboard(keyboard_path):
  """Evaluate a keyboard."""

  angles_to_sweep = np.deg2rad(np.linspace(-90, 180, num=19, endpoint=True))
  weights_to_sweep = np.stack(
      [np.cos(angles_to_sweep),
       np.sin(angles_to_sweep)], axis=-1)
  weights_to_sweep /= np.sum(
      np.maximum(weights_to_sweep, 0.0), axis=-1, keepdims=True)
  weights_to_sweep = np.clip(weights_to_sweep, -1000, 1000)
  tf.logging.info(weights_to_sweep)

  # Load the keyboard.
  keyboard = smart_module.SmartModuleImport(hub.Module(keyboard_path))

  # Create the task environment.
  all_returns = []
  for w_to_sweep in weights_to_sweep.tolist():
    base_env_config = configs.get_fig5_task_config(w_to_sweep)
    base_env = scavenger.Scavenger(**base_env_config)
    base_env = environment_wrappers.EnvironmentWithLogging(base_env)

    # Wrap the task environment with the keyboard.
    with tf.variable_scope(None, default_name="inner_loop"):
      additional_discount = 0.9
      env = environment_wrappers.EnvironmentWithKeyboardDirect(
          env=base_env,
          keyboard=keyboard,
          keyboard_ckpt_path=None,
          additional_discount=additional_discount,
          call_and_return=False)

      # Create the player agent.
      agent = regressed_agent.Agent(
          batch_size=10,
          optimizer_name="AdamOptimizer",
          # Disable training.
          optimizer_kwargs=dict(learning_rate=0.0,),
          init_w=w_to_sweep)

    returns = []
    for _ in range(FLAGS.num_episodes):
      returns.append(experiment.run_episode(env, agent))
    tf.logging.info(f"Task: {w_to_sweep}, mean returns over "
                    f"{FLAGS.num_episodes} episodes is {np.mean(returns)}")
    all_returns.append(returns)

  return all_returns, weights_to_sweep
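
A small driver sketch for evaluate_keyboard; the keyboard path is a placeholder, and the summary simply averages the per-task episode returns it collects:

# Illustrative driver; the keyboard path is a placeholder.
all_returns, weights = evaluate_keyboard("/tmp/option_keyboard/keyboard/tfhub")
mean_returns = np.mean(all_returns, axis=-1)
for w, ret in zip(weights.tolist(), mean_returns):
  tf.logging.info("w=%s  mean return=%.3f", np.round(w, 3).tolist(), ret)
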
Example #8
def main(argv):
    del argv

    # Load the keyboard.
    keyboard = smart_module.SmartModuleImport(hub.Module(FLAGS.keyboard_path))

    # Create the task environment.
    base_env_config = configs.get_task_config()
    base_env = scavenger.Scavenger(**base_env_config)
    base_env = environment_wrappers.EnvironmentWithLogging(base_env)

    # Wrap the task environment with the keyboard.
    additional_discount = 0.9
    env = environment_wrappers.EnvironmentWithKeyboardDirect(
        env=base_env,
        keyboard=keyboard,
        keyboard_ckpt_path=None,
        additional_discount=additional_discount,
        call_and_return=False)

    # Create the player agent.
    agent = regressed_agent.Agent(
        batch_size=10,
        optimizer_name="AdamOptimizer",
        # Disable training.
        optimizer_kwargs=dict(learning_rate=0.0, ),
        init_w=[float(x) for x in FLAGS.test_w])

    returns = []
    for _ in range(FLAGS.num_episodes):
        returns.append(experiment.run_episode(env, agent))
    tf.logging.info("#" * 80)
    tf.logging.info(
        f"Avg. return over {FLAGS.num_episodes} episodes is {np.mean(returns)}"
    )
    tf.logging.info("#" * 80)

    if FLAGS.output_path:
        with gfile.GFile(FLAGS.output_path, "w") as file:
            writer = csv.writer(file, delimiter=" ", quoting=csv.QUOTE_MINIMAL)
            writer.writerow(["return"])
            for val in returns:
                writer.writerow([val])
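
For completeness, a sketch of reading that file back; it matches the writer above (space-delimited, one "return" header row, one value per line):

# Illustrative reader matching the writer above.
def read_returns(path):
    with gfile.GFile(path) as f:
        rows = list(csv.reader(f, delimiter=" "))
    return [float(row[0]) for row in rows[1:]]  # skip the "return" header row
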
Example #9
def main(argv):
    del argv

    # Create the task environment.
    env_config = configs.get_fig4_task_config()
    env = scavenger.Scavenger(**env_config)
    env = environment_wrappers.EnvironmentWithLogging(env)

    # Create the flat agent.
    agent = dqn_agent.Agent(obs_spec=env.observation_spec(),
                            action_spec=env.action_spec(),
                            network_kwargs=dict(
                                output_sizes=(64, 128),
                                activate_final=True,
                            ),
                            epsilon=0.1,
                            additional_discount=0.9,
                            batch_size=10,
                            optimizer_name="AdamOptimizer",
                            optimizer_kwargs=dict(learning_rate=3e-4, ))

    experiment.run(env, agent, num_episodes=FLAGS.num_episodes)
Example #10
def main(argv):
    del argv

    # Pretrain the keyboard and save a checkpoint.
    pretrain_agent = _train_keyboard(num_episodes=FLAGS.num_pretrain_episodes)
    keyboard_ckpt_path = "/tmp/option_keyboard/keyboard.ckpt"
    pretrain_agent.export(keyboard_ckpt_path)

    # Create the task environment.
    base_env_config = configs.get_task_config()
    base_env = scavenger.Scavenger(**base_env_config)
    base_env = environment_wrappers.EnvironmentWithLogging(base_env)

    # Wrap the task environment with the keyboard.
    additional_discount = 0.9
    env = environment_wrappers.EnvironmentWithKeyboard(
        env=base_env,
        keyboard=pretrain_agent.keyboard,
        keyboard_ckpt_path=keyboard_ckpt_path,
        n_actions_per_dim=3,
        additional_discount=additional_discount,
        call_and_return=True)

    # Create the player agent.
    agent = dqn_agent.Agent(obs_spec=env.observation_spec(),
                            action_spec=env.action_spec(),
                            network_kwargs=dict(
                                output_sizes=(64, 128),
                                activate_final=True,
                            ),
                            epsilon=0.1,
                            additional_discount=additional_discount,
                            batch_size=10,
                            optimizer_name="AdamOptimizer",
                            optimizer_kwargs=dict(learning_rate=3e-4, ))

    experiment.run(env, agent, num_episodes=FLAGS.num_episodes)
Example #11
def main(argv):
    del argv

    if FLAGS.use_random_tasks:
        tasks = np.random.normal(size=(8, 2))
    else:
        tasks = [
            [1.0, 0.0],
            [0.0, 1.0],
            [-1.0, 0.0],
            [0.0, -1.0],
            [0.7, 0.3],
            [-0.3, -0.7],
        ]

    if FLAGS.normalisation == "L1":
        tasks /= np.sum(np.abs(tasks), axis=-1, keepdims=True)
    elif FLAGS.normalisation == "L2":
        tasks /= np.linalg.norm(tasks, axis=-1, keepdims=True)
    else:
        raise ValueError("Unknown normlisation_method {}".format(
            FLAGS.normalisation))

    logging.info("Tasks: %s", tasks)

    env_config = dict(arena_size=11,
                      num_channels=2,
                      max_num_steps=100,
                      num_init_objects=10,
                      object_priors=[1.0, 1.0],
                      egocentric=True,
                      default_w=None,
                      aux_tasks_w=tasks)
    env = scavenger.Scavenger(**env_config)
    num_actions = env.action_spec().maximum + 1

    model_config = dict(
        n_actions=num_actions,
        n_phis=FLAGS.num_phis,
        network_kwargs=dict(
            output_sizes=(64, 128),
            activate_final=True,
        ),
    )
    model = smart_module.SmartModuleExport(lambda: PhiModel(**model_config))

    dummy_steps = collect_experience(env, num_episodes=10, verbose=True)
    num_rewards = dummy_steps.rewards.shape[-1]

    # Placeholders
    steps_ph = tree.map_structure(create_ph, dummy_steps)

    phis = model(steps_ph.obs, steps_ph.actions)
    phis_to_rewards = snt.Linear(num_rewards,
                                 initializers=dict(w=tf.zeros),
                                 use_bias=False)
    preds = phis_to_rewards(phis)
    loss_per_batch = tf.square(preds - steps_ph.rewards)
    loss_op = tf.reduce_mean(loss_per_batch)

    replay = []

    # Optimizer and train op.
    with tf.variable_scope("optimizer"):
        optimizer = tf.train.AdamOptimizer(learning_rate=FLAGS.learning_rate)
        train_op = optimizer.minimize(loss_op)
        # Add normalisation of weights in phis_to_rewards
        if FLAGS.normalisation == "L1":
            w_norm = tf.reduce_sum(tf.abs(phis_to_rewards.w),
                                   axis=0,
                                   keepdims=True)
        elif FLAGS.normalisation == "L2":
            w_norm = tf.norm(phis_to_rewards.w, axis=0, keepdims=True)
        else:
            raise ValueError("Unknown normlisation_method {}".format(
                FLAGS.normalisation))

        normalise_w = tf.assign(phis_to_rewards.w,
                                phis_to_rewards.w / tf.maximum(w_norm, 1e-6))

    def filter_steps(steps):
        mask = np.sum(np.abs(steps.rewards), axis=-1) > 0.1
        nonzero_inds = np.where(mask)[0]
        zero_inds = np.where(np.logical_not(mask))[0]
        zero_inds = np.random.choice(zero_inds,
                                     size=len(nonzero_inds),
                                     replace=False)
        selected_inds = np.concatenate([nonzero_inds, zero_inds])
        selected_steps = tree.map_structure(lambda x: x[selected_inds], steps)
        return selected_steps, selected_inds

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        step = 0
        while step < FLAGS.num_train_steps:
            step += 1
            steps_output = collect_experience(env, num_episodes=10)
            selected_step_outputs, selected_inds = filter_steps(steps_output)

            if len(replay) > FLAGS.min_replay_size:
                # Do training.
                for _ in range(FLAGS.num_train_repeats):
                    train_samples = random.choices(replay, k=128)
                    train_samples = tree.map_structure(
                        lambda *x: np.stack(x, axis=0), *train_samples)
                    train_samples = tree.unflatten_as(steps_ph, train_samples)
                    feed_dict = dict(
                        zip(tree.flatten(steps_ph),
                            tree.flatten(train_samples)))
                    _, train_loss = sess.run([train_op, loss_op],
                                             feed_dict=feed_dict)
                    sess.run(normalise_w)

                # Do evaluation.
                if step % 50 == 0:
                    feed_dict = dict(
                        zip(tree.flatten(steps_ph),
                            tree.flatten(selected_step_outputs)))
                    eval_loss = sess.run(loss_op, feed_dict=feed_dict)
                    logging.info(
                        "Step %d,   train loss %f,   eval loss %f,   replay %s",
                        step, train_loss, eval_loss, len(replay))
                    print(sess.run(phis_to_rewards.get_variables())[0].T)

                    values = dict(step=step,
                                  train_loss=train_loss,
                                  eval_loss=eval_loss)
                    logging.info(values)

            # Add to replay.
            if step <= FLAGS.num_replay_steps:

                def select_fn(ind):
                    return lambda x: x[ind]

                for idx in range(len(selected_inds)):
                    replay.append(
                        tree.flatten(
                            tree.map_structure(select_fn(idx),
                                               selected_step_outputs)))

        # Export trained model.
        if FLAGS.export_path:
            model.export(FLAGS.export_path, sess, overwrite=True)
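
Once exported, the phi model feeds the keyboard-pretraining path shown in Example #3. A hedged sketch of that hand-off, assuming the Example #3 helper lives in keyboard_utils alongside create_and_train_keyboard (not confirmed here); the episode count and export path are placeholders:

# Illustrative hand-off to keyboard pretraining (see Example #3).
# phi_model_path is assumed to be the export directory; the exact layout
# expected by EnvironmentWithLearnedPhi is not shown in these snippets.
keyboard_agent_with_phi = keyboard_utils.create_and_train_keyboard_with_phi(
    num_episodes=20000,  # placeholder
    phi_model_path=FLAGS.export_path,
    policy_weights=np.eye(FLAGS.num_phis, dtype=np.float32),
    export_path="/tmp/option_keyboard/keyboard_with_phi")  # placeholder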