Example No. 1
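Loads a pre-trained dueling DQN for BreakoutNoFrameskip-v4 and evaluates it for 100 episodes; the training call is left commented out, so the weights are restored from 'atari_Breakout_duel' instead.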
def main():
    """
    Run the atari test
    """
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--env', help='environment ID', default='BreakoutNoFrameskip-v4')
    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
    parser.add_argument('--prioritized', type=int, default=1)
    parser.add_argument('--dueling', type=int, default=1)
    parser.add_argument('--prioritized-replay-alpha', type=float, default=0.6)
    parser.add_argument('--num-timesteps', type=int, default=int(1e7))

    args = parser.parse_args()
    logger.configure()
    set_global_seeds(args.seed)
    env = make_atari(args.env)
    env = bench.Monitor(env, logger.get_dir())
    env = wrap_atari_dqn(env)
    # Note: this partial is only used by the commented-out DQN construction below;
    # the active construction passes CnnPolicy and dueling=True directly.
    policy = partial(CnnPolicy, dueling=args.dueling == 1)

    # model = DQN(
    #     env=env,
    #     policy=policy,
    #     learning_rate=1e-4,
    #     buffer_size=10000,
    #     exploration_fraction=0.1,
    #     exploration_final_eps=0.01,
    #     train_freq=4,
    #     learning_starts=10000,
    #     target_network_update_freq=1000,
    #     gamma=0.99,
    #     prioritized_replay=bool(args.prioritized),
    #     prioritized_replay_alpha=args.prioritized_replay_alpha,
    # )
    model = DQN(
        env=env,
        policy_class=CnnPolicy,
        learning_rate=1e-4,
        buffer_size=10000,
        double_q=False,
        prioritized_replay=True,
        prioritized_replay_alpha=0.6,
        dueling=True,
        train_freq=4,
        learning_starts=10000,
        exploration_fraction=0.1,
        exploration_final_eps=0.01,
        target_network_update_freq=1000,
        model_path='atari_Breakout_duel'
    )
    # model.learn(total_timesteps=args.num_timesteps, seed=args.seed)
    model.load('atari_Breakout_duel')
    model.evaluate(100)
    env.close()
Example No. 2
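Builds a Gym environment (plus an optional evaluation environment), parses the comma-separated noise_type string into parameter and/or action noise, seeds everything, and hands the actor, critic, and replay memory to the DDPG training loop.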
def run(env_id, seed, noise_type, layer_norm, evaluation, **kwargs):
    # Create envs.
    env = gym.make(env_id)
    env = Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(0)))

    if evaluation:
        eval_env = gym.make(env_id)
        eval_env = Monitor(eval_env, os.path.join(logger.get_dir(), 'gym_eval'))
        env = Monitor(env, None)
    else:
        eval_env = None

    # Parse noise_type
    action_noise = None
    param_noise = None
    nb_actions = env.action_space.shape[-1]
    for current_noise_type in noise_type.split(','):
        current_noise_type = current_noise_type.strip()
        if current_noise_type == 'none':
            pass
        elif 'adaptive-param' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev), desired_action_stddev=float(stddev))
        elif 'normal' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = NormalActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions))
        elif 'ou' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions),
                                                        sigma=float(stddev) * np.ones(nb_actions))
        else:
            raise RuntimeError('unknown noise type "{}"'.format(current_noise_type))

    # Configure components.
    memory = Memory(limit=int(1e6), action_shape=env.action_space.shape, observation_shape=env.observation_space.shape)
    critic = Critic(layer_norm=layer_norm)
    actor = Actor(nb_actions, layer_norm=layer_norm)

    # Seed everything to make things reproducible.
    logger.info('seed={}, logdir={}'.format(seed, logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    env.seed(seed)
    if eval_env is not None:
        eval_env.seed(seed)

    start_time = time.time()
    training.train(env=env, eval_env=eval_env, param_noise=param_noise,
                   action_noise=action_noise, actor=actor, critic=critic, memory=memory, **kwargs)
    env.close()
    if eval_env is not None:
        eval_env.close()
    logger.info('total runtime: {}s'.format(time.time() - start_time))
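The noise_type argument above is a comma-separated list of specs such as 'adaptive-param_0.2,ou_0.1'. A minimal, self-contained sketch of that convention (parse_noise_spec is a hypothetical helper shown only to illustrate the string format; it does not build the actual noise objects):

def parse_noise_spec(noise_type):
    """Split a comma-separated noise spec into (kind, stddev) pairs."""
    specs = []
    for current in noise_type.split(','):
        current = current.strip()
        if current == 'none':
            continue
        kind, stddev = current.split('_')
        specs.append((kind, float(stddev)))
    return specs

# parse_noise_spec('adaptive-param_0.2,ou_0.1') -> [('adaptive-param', 0.2), ('ou', 0.1)]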
Example No. 3
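Same Breakout setup as Example No. 1, but here the DQN is trained from scratch for --num-timesteps steps with prioritized replay before the environment is closed.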
def main():
    """
    Run the atari test
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--env',
                        help='environment ID',
                        default='BreakoutNoFrameskip-v4')
    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
    parser.add_argument('--prioritized', type=int, default=1)
    parser.add_argument('--dueling', type=int, default=1)
    parser.add_argument('--prioritized-replay-alpha', type=float, default=0.6)
    parser.add_argument('--num-timesteps', type=int, default=int(1e7))

    args = parser.parse_args()
    logger.configure()
    set_global_seeds(args.seed)

    env = make_atari(args.env)
    env.action_space.seed(args.seed)
    env = bench.Monitor(env, logger.get_dir())
    env = wrap_atari_dqn(env)

    model = DQN(env=env,
                policy_class=CnnPolicy,
                buffer_size=10000,
                learning_rate=1e-4,
                learning_starts=10000,
                target_network_update_freq=1000,
                train_freq=4,
                exploration_final_eps=0.01,
                exploration_fraction=0.1,
                prioritized_replay=True,
                model_path='atari_test_Breakout')
    model.learn(total_timesteps=args.num_timesteps)
    env.close()
Example No. 4
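Rollout worker for a multi-process DDPG setup: it collects episodes with cyclically decayed exploration noise, pushes each finished episode onto a shared queue for the learner threads, and tracks the best reward seen so far.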
def play_single_thread(
        actor, critic, target_actor, target_critic, args, prepare_fn,
        global_episode, global_update_step, episodes_queue,
        best_reward):
    workerseed = args.seed + 241 * args.thread
    set_global_seeds(workerseed)

    args.logdir = "{}/thread_{}".format(args.logdir, args.thread)
    create_if_need(args.logdir)

    act_fn, _, save_fn = prepare_fn(actor, critic, target_actor, target_critic, args)

    logger = Logger(args.logdir)
    env = create_env(args)
    random_process = create_random_process(args)

    epsilon_cycle_len = random.randint(args.epsilon_cycle_len // 2, args.epsilon_cycle_len * 2)

    epsilon_decay_fn = create_decay_fn(
        "cycle",
        initial_value=args.initial_epsilon,
        final_value=args.final_epsilon,
        cycle_len=epsilon_cycle_len,
        num_cycles=args.max_episodes // epsilon_cycle_len)

    episode = 1
    step = 0
    start_time = time.time()
    while global_episode.value < args.max_episodes * (args.num_threads - args.num_train_threads) \
            and global_update_step.value < args.max_update_steps * args.num_train_threads:
        if episode % 100 == 0:
            env = create_env(args)
        seed = random.randrange(2 ** 32 - 2)

        epsilon = min(args.initial_epsilon, max(args.final_epsilon, epsilon_decay_fn(episode)))

        episode_metrics = {
            "reward": 0.0,
            "step": 0,
            "epsilon": epsilon
        }

        observation = env.reset(seed=seed, difficulty=args.difficulty)
        random_process.reset_states()
        done = False

        replay = []
        while not done:
            action = act_fn(observation, noise=epsilon * random_process.sample())
            next_observation, reward, done, _ = env.step(action)

            replay.append((observation, action, reward, next_observation, done))
            episode_metrics["reward"] += reward
            episode_metrics["step"] += 1

            observation = next_observation

        episodes_queue.put(replay)

        episode += 1
        global_episode.value += 1

        if episode_metrics["reward"] > best_reward.value:
            best_reward.value = episode_metrics["reward"]
            logger.scalar_summary("best reward", best_reward.value, episode)

            if episode_metrics["reward"] > 15.0 * args.reward_scale:
                save_fn(episode)

        step += episode_metrics["step"]
        elapsed_time = time.time() - start_time

        for key, value in episode_metrics.items():
            logger.scalar_summary(key, value, episode)
        logger.scalar_summary(
            "episode per minute",
            episode / elapsed_time * 60,
            episode)
        logger.scalar_summary(
            "step per second",
            step / elapsed_time,
            episode)

        if elapsed_time > 86400 * args.max_train_days:
            global_episode.value = args.max_episodes * (args.num_threads - args.num_train_threads) + 1

    raise KeyboardInterrupt
Example No. 5
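Learner worker that pairs with Example No. 4: it drains episodes from the shared queue into a replay buffer and runs actor/critic updates with linearly decayed learning rates and, when prioritized replay is enabled, a linearly annealed beta.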
def train_single_thread(
        actor, critic, target_actor, target_critic, args, prepare_fn,
        global_episode, global_update_step, episodes_queue):
    workerseed = args.seed + 241 * args.thread
    set_global_seeds(workerseed)

    args.logdir = "{}/thread_{}".format(args.logdir, args.thread)
    create_if_need(args.logdir)

    _, update_fn, save_fn = prepare_fn(actor, critic, target_actor, target_critic, args)

    logger = Logger(args.logdir)

    buffer = create_buffer(args)

    if args.prioritized_replay:
        beta_decay_fn = create_decay_fn(
            "linear",
            initial_value=args.prioritized_replay_beta0,
            final_value=1.0,
            max_step=args.max_update_steps)

    actor_learning_rate_decay_fn = create_decay_fn(
        "linear",
        initial_value=args.actor_lr,
        final_value=args.actor_lr_end,
        max_step=args.max_update_steps)
    critic_learning_rate_decay_fn = create_decay_fn(
        "linear",
        initial_value=args.critic_lr,
        final_value=args.critic_lr_end,
        max_step=args.max_update_steps)

    update_step = 0
    received_examples = 1  # start at 1 to avoid division by zero in the "updates per example" summary
    while global_episode.value < args.max_episodes * (args.num_threads - args.num_train_threads) \
            and global_update_step.value < args.max_update_steps * args.num_train_threads:
        actor_lr = actor_learning_rate_decay_fn(update_step)
        critic_lr = critic_learning_rate_decay_fn(update_step)

        actor_lr = min(args.actor_lr, max(args.actor_lr_end, actor_lr))
        critic_lr = min(args.critic_lr, max(args.critic_lr_end, critic_lr))

        while True:
            try:
                replay = episodes_queue.get_nowait()
                for (observation, action, reward, next_observation, done) in replay:
                    buffer.add(observation, action, reward, next_observation, done)
                received_examples += len(replay)
            except py_queue.Empty:
                break

        if len(buffer) >= args.train_steps:
            if args.prioritized_replay:
                beta = beta_decay_fn(update_step)
                beta = min(1.0, max(args.prioritized_replay_beta0, beta))
                (tr_observations, tr_actions, tr_rewards, tr_next_observations, tr_dones,
                 weights, batch_idxes) = \
                    buffer.sample(
                        batch_size=args.batch_size,
                        beta=beta)
            else:
                (tr_observations, tr_actions, tr_rewards, tr_next_observations, tr_dones) = \
                    buffer.sample(batch_size=args.batch_size)
                weights, batch_idxes = np.ones_like(tr_rewards), None

            step_metrics, step_info = update_fn(
                tr_observations, tr_actions, tr_rewards,
                tr_next_observations, tr_dones,
                weights, actor_lr, critic_lr)

            update_step += 1
            global_update_step.value += 1

            if args.prioritized_replay:
                new_priorities = np.abs(step_info["td_error"]) + 1e-6
                buffer.update_priorities(batch_idxes, new_priorities)

            for key, value in step_metrics.items():
                value = to_numpy(value)[0]
                logger.scalar_summary(key, value, update_step)

            logger.scalar_summary("actor lr", actor_lr, update_step)
            logger.scalar_summary("critic lr", critic_lr, update_step)

            if update_step % args.save_step == 0:
                save_fn(update_step)
        else:
            time.sleep(1)

        logger.scalar_summary("buffer size", len(buffer), global_episode.value)
        logger.scalar_summary(
            "updates per example",
            update_step * args.batch_size / received_examples,
            global_episode.value)

    save_fn(update_step)

    raise KeyboardInterrupt
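Examples No. 4 and No. 5 are meant to run in separate processes that share an episode counter, an update-step counter, and a queue of collected episodes. A self-contained toy of those coordination primitives (names and values are illustrative only; the real workers also share best_reward):

import multiprocessing as mp

def _toy_actor(global_episode, episodes_queue):
    # Pretend to roll out three one-step episodes and hand them to the learner.
    for _ in range(3):
        episodes_queue.put([("obs", 0, 1.0, "next_obs", False)])
        with global_episode.get_lock():
            global_episode.value += 1

if __name__ == "__main__":
    global_episode = mp.Value("i", 0)
    episodes_queue = mp.Queue()
    worker = mp.Process(target=_toy_actor, args=(global_episode, episodes_queue))
    worker.start()
    worker.join()
    print(global_episode.value)    # 3 episodes reported
    print(episodes_queue.qsize())  # 3 replays waiting (qsize is unsupported on macOS)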
Example No. 6
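Single-process variant that interleaves acting and learning in one loop: every environment step adds a transition to the replay buffer and, once enough samples have accumulated, triggers an actor/critic update.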
def train_multi_thread(actor, critic, target_actor, target_critic, args, prepare_fn, best_reward):
    workerseed = args.seed + 241 * args.thread
    set_global_seeds(workerseed)

    args.logdir = "{}/thread_{}".format(args.logdir, args.thread)
    create_if_need(args.logdir)

    act_fn, update_fn, save_fn = prepare_fn(actor, critic, target_actor, target_critic, args)
    logger = Logger(args.logdir)

    buffer = create_buffer(args)
    if args.prioritized_replay:
        beta_decay_fn = create_decay_fn(
            "linear",
            initial_value=args.prioritized_replay_beta0,
            final_value=1.0,
            max_step=args.max_episodes)

    env = create_env(args)
    random_process = create_random_process(args)

    actor_learning_rate_decay_fn = create_decay_fn(
        "linear",
        initial_value=args.actor_lr,
        final_value=args.actor_lr_end,
        max_step=args.max_episodes)
    critic_learning_rate_decay_fn = create_decay_fn(
        "linear",
        initial_value=args.critic_lr,
        final_value=args.critic_lr_end,
        max_step=args.max_episodes)

    epsilon_cycle_len = random.randint(args.epsilon_cycle_len // 2, args.epsilon_cycle_len * 2)

    epsilon_decay_fn = create_decay_fn(
        "cycle",
        initial_value=args.initial_epsilon,
        final_value=args.final_epsilon,
        cycle_len=epsilon_cycle_len,
        num_cycles=args.max_episodes // epsilon_cycle_len)

    episode = 0
    step = 0
    start_time = time.time()
    while episode < args.max_episodes:
        if episode % 100 == 0:
            env = create_env(args)
        seed = random.randrange(2 ** 32 - 2)

        actor_lr = actor_learning_rate_decay_fn(episode)
        critic_lr = critic_learning_rate_decay_fn(episode)
        epsilon = min(args.initial_epsilon, max(args.final_epsilon, epsilon_decay_fn(episode)))

        episode_metrics = {
            "value_loss": 0.0,
            "policy_loss": 0.0,
            "reward": 0.0,
            "step": 0,
            "epsilon": epsilon
        }

        observation = env.reset(seed=seed, difficulty=args.difficulty)
        random_process.reset_states()
        done = False

        while not done:
            action = act_fn(observation, noise=epsilon*random_process.sample())
            next_observation, reward, done, _ = env.step(action)

            buffer.add(observation, action, reward, next_observation, done)
            episode_metrics["reward"] += reward
            episode_metrics["step"] += 1

            if len(buffer) >= args.train_steps:

                if args.prioritized_replay:
                    (tr_observations, tr_actions, tr_rewards, tr_next_observations, tr_dones,
                     weights, batch_idxes) = \
                        buffer.sample(batch_size=args.batch_size, beta=beta_decay_fn(episode))
                else:
                    (tr_observations, tr_actions, tr_rewards, tr_next_observations, tr_dones) = \
                        buffer.sample(batch_size=args.batch_size)
                    weights, batch_idxes = np.ones_like(tr_rewards), None

                step_metrics, step_info = update_fn(
                    tr_observations, tr_actions, tr_rewards,
                    tr_next_observations, tr_dones,
                    weights, actor_lr, critic_lr)

                if args.prioritized_replay:
                    new_priorities = np.abs(step_info["td_error"]) + 1e-6
                    buffer.update_priorities(batch_idxes, new_priorities)

                for key, value in step_metrics.items():
                    value = to_numpy(value)[0]
                    episode_metrics[key] += value

            observation = next_observation

        episode += 1

        if episode_metrics["reward"] > 15.0 * args.reward_scale \
                and episode_metrics["reward"] > best_reward.value:
            best_reward.value = episode_metrics["reward"]
            logger.scalar_summary("best reward", best_reward.value, episode)
            save_fn(episode)

        step += episode_metrics["step"]
        elapsed_time = time.time() - start_time

        for key, value in episode_metrics.items():
            value = value if "loss" not in key else value / episode_metrics["step"]
            logger.scalar_summary(key, value, episode)
        logger.scalar_summary(
            "episode per minute",
            episode / elapsed_time * 60,
            episode)
        logger.scalar_summary(
            "step per second",
            step / elapsed_time,
            episode)
        logger.scalar_summary("actor lr", actor_lr, episode)
        logger.scalar_summary("critic lr", critic_lr, episode)

        if episode % args.save_step == 0:
            save_fn(episode)

        if elapsed_time > 86400 * args.max_train_days:
            episode = args.max_episodes + 1

    save_fn(episode)

    raise KeyboardInterrupt
Example No. 7
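DQfD-style training: demonstration trajectories are loaded into a prioritized replay buffer (optionally as n-step transitions), the agent is pre-trained on demonstrations alone, and then environment interaction is interleaved with further updates, target-network syncs, and checkpointing.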
def learn(env,
          network,
          seed=None,
          lr=5e-5,
          total_timesteps=100000,
          buffer_size=500000,
          exploration_fraction=0.1,
          exploration_final_eps=0.01,
          train_freq=1,
          batch_size=32,
          print_freq=10,
          checkpoint_freq=100000,
          checkpoint_path=None,
          learning_starts=0,
          gamma=0.99,
          target_network_update_freq=10000,
          prioritized_replay=True,
          prioritized_replay_alpha=0.4,
          prioritized_replay_beta0=0.6,
          prioritized_replay_beta_iters=None,
          prioritized_replay_eps=1e-3,
          param_noise=False,
          callback=None,
          load_path=None,
          load_idx=None,
          demo_path=None,
          n_step=10,
          demo_prioritized_replay_eps=1.0,
          pre_train_timesteps=750000,
          epsilon_schedule="constant",
          **network_kwargs):
    # Create all the functions necessary to train the model
    set_global_seeds(seed)
    q_func = build_q_func(network, **network_kwargs)

    with tf.device('/GPU:0'):
        model = DQfD(q_func=q_func,
                     observation_shape=env.observation_space.shape,
                     num_actions=env.action_space.n,
                     lr=lr,
                     grad_norm_clipping=10,
                     gamma=gamma,
                     param_noise=param_noise)

    # Load model from checkpoint
    if load_path is not None:
        load_path = osp.expanduser(load_path)
        ckpt = tf.train.Checkpoint(model=model)
        manager = tf.train.CheckpointManager(ckpt, load_path, max_to_keep=None)
        if load_idx is None:
            ckpt.restore(manager.latest_checkpoint)
            print("Restoring from {}".format(manager.latest_checkpoint))
        else:
            ckpt.restore(manager.checkpoints[load_idx])
            print("Restoring from {}".format(manager.checkpoints[load_idx]))

    # Setup demo trajectory
    assert demo_path is not None
    with open(demo_path, "rb") as f:
        trajectories = pickle.load(f)

    # Create the replay buffer
    replay_buffer = PrioritizedReplayBuffer(buffer_size,
                                            prioritized_replay_alpha)
    if prioritized_replay_beta_iters is None:
        prioritized_replay_beta_iters = total_timesteps
    beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                   initial_p=prioritized_replay_beta0,
                                   final_p=1.0)
    temp_buffer = deque(maxlen=n_step)
    is_demo = True
    for epi in trajectories:
        for obs, action, rew, new_obs, done in epi:
            obs = np.expand_dims(np.array(obs), axis=0)
            new_obs = np.expand_dims(np.array(new_obs), axis=0)
            if n_step:
                temp_buffer.append((obs, action, rew, new_obs, done, is_demo))
                if len(temp_buffer) == n_step:
                    n_step_sample = get_n_step_sample(temp_buffer, gamma)
                    replay_buffer.demo_len += 1
                    replay_buffer.add(*n_step_sample)
            else:
                replay_buffer.demo_len += 1
                replay_buffer.add(obs[0], action, rew, new_obs[0], float(done),
                                  float(is_demo))
    logger.log("trajectory length:", replay_buffer.demo_len)
    # Create the schedule for exploration
    if epsilon_schedule == "constant":
        exploration = ConstantSchedule(exploration_final_eps)
    else:  # not used
        exploration = LinearSchedule(
            schedule_timesteps=int(exploration_fraction * total_timesteps),
            initial_p=1.0,
            final_p=exploration_final_eps)

    model.update_target()

    # ============================================== pre-training ======================================================
    start = time()
    num_episodes = 0
    temp_buffer = deque(maxlen=n_step)
    for t in tqdm(range(pre_train_timesteps)):
        # sample and train
        experience = replay_buffer.sample(batch_size,
                                          beta=prioritized_replay_beta0)
        batch_idxes = experience[-1]
        if experience[6] is None:  # for n_step = 0
            obses_t, actions, rewards, obses_tp1, dones, is_demos = tuple(
                map(tf.constant, experience[:6]))
            obses_tpn, rewards_n, dones_n = None, None, None
            weights = tf.constant(experience[-2])
        else:
            obses_t, actions, rewards, obses_tp1, dones, is_demos, obses_tpn, rewards_n, dones_n, weights = tuple(
                map(tf.constant, experience[:-1]))
        td_errors, n_td_errors, loss_dq, loss_n, loss_E, loss_l2, weighted_error = model.train(
            obses_t, actions, rewards, obses_tp1, dones, is_demos, weights,
            obses_tpn, rewards_n, dones_n)

        # Update priorities
        new_priorities = np.abs(td_errors) + np.abs(
            n_td_errors) + demo_prioritized_replay_eps
        replay_buffer.update_priorities(batch_idxes, new_priorities)

        # Update target network periodically
        if t > 0 and t % target_network_update_freq == 0:
            model.update_target()

        # Logging
        elapsed_time = timedelta(seconds=time() - start)
        if print_freq is not None and t % 10000 == 0:
            logger.record_tabular("steps", t)
            logger.record_tabular("episodes", num_episodes)
            logger.record_tabular("mean 100 episode reward", 0)
            logger.record_tabular("max 100 episode reward", 0)
            logger.record_tabular("min 100 episode reward", 0)
            logger.record_tabular("demo sample rate", 1)
            logger.record_tabular("epsilon", 0)
            logger.record_tabular("loss_td", np.mean(loss_dq.numpy()))
            logger.record_tabular("loss_n_td", np.mean(loss_n.numpy()))
            logger.record_tabular("loss_margin", np.mean(loss_E.numpy()))
            logger.record_tabular("loss_l2", np.mean(loss_l2.numpy()))
            logger.record_tabular("losses_all", weighted_error.numpy())
            logger.record_tabular("% time spent exploring",
                                  int(100 * exploration.value(t)))
            logger.record_tabular("pre_train", True)
            logger.record_tabular("elapsed time", elapsed_time)
            logger.dump_tabular()

    # ============================================== exploring =========================================================
    sample_counts = 0
    demo_used_counts = 0
    episode_rewards = deque(maxlen=100)
    this_episode_reward = 0.
    best_score = 0.
    saved_mean_reward = None
    is_demo = False
    obs = env.reset()
    # Always mimic the vectorized env
    obs = np.expand_dims(np.array(obs), axis=0)
    reset = True
    for t in tqdm(range(total_timesteps)):
        if callback is not None:
            if callback(locals(), globals()):
                break
        kwargs = {}
        if not param_noise:
            update_eps = tf.constant(exploration.value(t))
            update_param_noise_threshold = 0.
        else:  # not used
            update_eps = tf.constant(0.)
            update_param_noise_threshold = -np.log(1. - exploration.value(t) +
                                                   exploration.value(t) /
                                                   float(env.action_space.n))
            kwargs['reset'] = reset
            kwargs['update_param_noise_threshold'] = update_param_noise_threshold
            kwargs['update_param_noise_scale'] = True
        action, epsilon, _, _ = model.step(tf.constant(obs),
                                           update_eps=update_eps,
                                           **kwargs)
        action = action[0].numpy()
        reset = False
        new_obs, rew, done, _ = env.step(action)

        # Store transition in the replay buffer.
        new_obs = np.expand_dims(np.array(new_obs), axis=0)
        if n_step:
            temp_buffer.append((obs, action, rew, new_obs, done, is_demo))
            if len(temp_buffer) == n_step:
                n_step_sample = get_n_step_sample(temp_buffer, gamma)
                replay_buffer.add(*n_step_sample)
        else:
            replay_buffer.add(obs[0], action, rew, new_obs[0], float(done), 0.)
        obs = new_obs

        # invert log scaled score for logging
        this_episode_reward += np.sign(rew) * (np.exp(np.sign(rew) * rew) - 1.)
        if done:
            num_episodes += 1
            obs = env.reset()
            obs = np.expand_dims(np.array(obs), axis=0)
            episode_rewards.append(this_episode_reward)
            reset = True
            if this_episode_reward > best_score:
                best_score = this_episode_reward
                ckpt = tf.train.Checkpoint(model=model)
                manager = tf.train.CheckpointManager(ckpt,
                                                     './best_model',
                                                     max_to_keep=1)
                manager.save(t)
                logger.log("saved best model")
            this_episode_reward = 0.0

        if t % train_freq == 0:
            experience = replay_buffer.sample(batch_size,
                                              beta=beta_schedule.value(t))
            batch_idxes = experience[-1]
            if experience[6] is None:  # for n_step = 0
                obses_t, actions, rewards, obses_tp1, dones, is_demos = tuple(
                    map(tf.constant, experience[:6]))
                obses_tpn, rewards_n, dones_n = None, None, None
                weights = tf.constant(experience[-2])
            else:
                obses_t, actions, rewards, obses_tp1, dones, is_demos, obses_tpn, rewards_n, dones_n, weights = tuple(
                    map(tf.constant, experience[:-1]))
            td_errors, n_td_errors, loss_dq, loss_n, loss_E, loss_l2, weighted_error = model.train(
                obses_t, actions, rewards, obses_tp1, dones, is_demos, weights,
                obses_tpn, rewards_n, dones_n)
            new_priorities = (np.abs(td_errors) + np.abs(n_td_errors)
                              + demo_prioritized_replay_eps * is_demos
                              + prioritized_replay_eps * (1. - is_demos))
            replay_buffer.update_priorities(batch_idxes, new_priorities)

            # for logging
            sample_counts += batch_size
            demo_used_counts += np.sum(is_demos)

        if t % target_network_update_freq == 0:
            # Update target network periodically.
            model.update_target()

        if t % checkpoint_freq == 0:
            save_path = checkpoint_path
            ckpt = tf.train.Checkpoint(model=model)
            manager = tf.train.CheckpointManager(ckpt,
                                                 save_path,
                                                 max_to_keep=10)
            manager.save(t)
            logger.log("saved checkpoint")

        elapsed_time = timedelta(seconds=time() - start)
        if done and num_episodes > 0 and num_episodes % print_freq == 0:
            logger.record_tabular("steps", t)
            logger.record_tabular("episodes", num_episodes)
            logger.record_tabular("mean 100 episode reward",
                                  np.mean(episode_rewards))
            logger.record_tabular("max 100 episode reward",
                                  np.max(episode_rewards))
            logger.record_tabular("min 100 episode reward",
                                  np.min(episode_rewards))
            logger.record_tabular("demo sample rate",
                                  demo_used_counts / sample_counts)
            logger.record_tabular("epsilon", epsilon.numpy())
            logger.record_tabular("loss_td", np.mean(loss_dq.numpy()))
            logger.record_tabular("loss_n_td", np.mean(loss_n.numpy()))
            logger.record_tabular("loss_margin", np.mean(loss_E.numpy()))
            logger.record_tabular("loss_l2", np.mean(loss_l2.numpy()))
            logger.record_tabular("losses_all", weighted_error.numpy())
            logger.record_tabular("% time spent exploring",
                                  int(100 * exploration.value(t)))
            logger.record_tabular("pre_train", False)
            logger.record_tabular("elapsed time", elapsed_time)
            logger.dump_tabular()

    return model
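For context on how beta is annealed above, a short illustration assuming the OpenAI Baselines schedule classes (the values come from learn()'s defaults: prioritized_replay_beta0=0.6 and prioritized_replay_beta_iters falling back to total_timesteps=100000):

from baselines.common.schedules import LinearSchedule

beta_schedule = LinearSchedule(100000, initial_p=0.6, final_p=1.0)
print(beta_schedule.value(0))       # 0.6
print(beta_schedule.value(50000))   # 0.8, halfway through the annealing
print(beta_schedule.value(100000))  # 1.0, fully annealed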
Example No. 8
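MPI-aware version of Example No. 2: each rank derives its own seed as seed + 1000000 * rank, only rank 0 keeps logging and the evaluation environment, and every rank runs the same DDPG training loop.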
def run(env_id, seed, noise_type, layer_norm, evaluation, **kwargs):
    # Configure things.
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)

    # Create envs.
    env = gym.make(env_id)
    env = Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))

    if evaluation and rank == 0:
        eval_env = gym.make(env_id)
        eval_env = Monitor(eval_env, os.path.join(logger.get_dir(), 'gym_eval'))
        env = Monitor(env, None)
    else:
        eval_env = None

    # Parse noise_type
    action_noise = None
    param_noise = None
    nb_actions = env.action_space.shape[-1]
    for current_noise_type in noise_type.split(','):
        current_noise_type = current_noise_type.strip()
        if current_noise_type == 'none':
            pass
        elif 'adaptive-param' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev), desired_action_stddev=float(stddev))
        elif 'normal' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = NormalActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions))
        elif 'ou' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions),
                                                        sigma=float(stddev) * np.ones(nb_actions))
        else:
            raise RuntimeError('unknown noise type "{}"'.format(current_noise_type))

    # Configure components.
    memory = Memory(limit=int(1e6), action_shape=env.action_space.shape, observation_shape=env.observation_space.shape)
    critic = Critic(layer_norm=layer_norm)
    actor = Actor(nb_actions, layer_norm=layer_norm)

    # Seed everything to make things reproducible.
    seed = seed + 1000000 * rank
    logger.info('rank {}: seed={}, logdir={}'.format(rank, seed, logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    env.seed(seed)
    if eval_env is not None:
        eval_env.seed(seed)

    # Disable logging for rank != 0 to avoid noise.
    if rank == 0:
        start_time = time.time()
    training.train(env=env, eval_env=eval_env, param_noise=param_noise,
                   action_noise=action_noise, actor=actor, critic=critic, memory=memory, **kwargs)
    env.close()
    if eval_env is not None:
        eval_env.close()
    if rank == 0:
        logger.info('total runtime: {}s'.format(time.time() - start_time))
Example No. 9
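Entry point for a DDPG agent on the custom ProsEnvMon environment: it configures MPI and logging, derives per-rank seeds, creates training and evaluation environments, builds the agent, wires the agent's mimic-reward interface into each environment, and dispatches to training, sampling, or testing depending on mode.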
def run(mode, render, render_eval, verbose_eval, sanity_run, env_kwargs,
        model_kwargs, train_kwargs):
    if sanity_run:
        # Mode to sanity check the basic code.
        # Fixed seed and logging dir.
        # Dynamic setting of nb_rollout_steps and nb_train_steps in training.train() is disabled.
        print('SANITY CHECK MODE!!!')

    # Configure MPI, logging, random seeds, etc.
    mpi_rank = MPI.COMM_WORLD.Get_rank()
    mpi_size = MPI.COMM_WORLD.Get_size()

    if mpi_rank == 0:
        logger.configure(dir='logs' if sanity_run
                         else datetime.datetime.now().strftime("train_%m%d_%H%M"))
        logdir = logger.get_dir()
    else:
        logger.set_level(logger.DISABLED)
        logdir = None
    logdir = MPI.COMM_WORLD.bcast(logdir, root=0)

    start_time = time.time()
    # fixed seed when running sanity check, same seed hourly for training.
    seed = 1000000 * mpi_rank
    seed += int(start_time) // 3600 if not sanity_run else 0

    seed_list = MPI.COMM_WORLD.gather(seed, root=0)
    logger.info('mpi_size {}: seeds={}, logdir={}'.format(
        mpi_size, seed_list, logger.get_dir()))

    # Create envs.
    envs = []
    if mode in [MODE_TRAIN]:
        train_env = cust_env.ProsEnvMon(
            visualize=render,
            seed=seed,
            fn_step=None,
            fn_epis=logdir and os.path.join(logdir, '%d' % mpi_rank),
            reset_dflt_interval=2,
            **env_kwargs)
        logger.info('action, observation space:', train_env.action_space.shape,
                    train_env.observation_space.shape)
        envs.append(train_env)
    else:
        train_env = None

    # Always run eval_env, either in evaluation mode during MODE_TRAIN, or MODE_SAMPLE, MODE_TEST.
    # Reset to random states (reset_dflt_interval=0) in MODE_SAMPLE ,
    # Reset to default state (reset_dflt_interval=1) in evaluation of MODE_TRAIN, or MODE_TEST
    reset_dflt_interval = 0 if mode in [MODE_SAMPLE] else 1
    eval_env = cust_env.ProsEnvMon(
        visualize=render_eval,
        seed=seed,
        fn_step=logdir and os.path.join(logdir, 'eval_step_%d.csv' % mpi_rank),
        fn_epis=logdir and os.path.join(logdir, 'eval_%d' % mpi_rank),
        reset_dflt_interval=reset_dflt_interval,
        verbose=verbose_eval,
        **env_kwargs)
    envs.append(eval_env)

    # Create DDPG agent
    tf.reset_default_graph()
    set_global_seeds(seed)
    assert (eval_env is not None), 'Empty Eval Environment!'

    action_range = (min(eval_env.action_space.low),
                    max(eval_env.action_space.high))
    logger.info('\naction_range', action_range)
    nb_demo_kine, nb_key_states = eval_env.obs_cust_params
    agent = ddpg.DDPG(eval_env.observation_space.shape,
                      eval_env.action_space.shape,
                      nb_demo_kine,
                      nb_key_states,
                      action_range=action_range,
                      save_ckpt=mpi_rank == 0,
                      **model_kwargs)
    logger.debug('Using agent with the following configuration:')
    logger.debug(str(agent.__dict__.items()))

    # Set up agent mimic reward interface, for environment
    for env in envs:
        env.set_agent_intf_fp(agent.get_mimic_rwd)

    # Run..
    logger.info('\nEnv params:', env_kwargs)
    logger.info('Model params:', model_kwargs)
    if mode == MODE_TRAIN:
        logger.info('Start training', train_kwargs)
        training.train(train_env,
                       eval_env,
                       agent,
                       render=render,
                       render_eval=render_eval,
                       sanity_run=sanity_run,
                       **train_kwargs)

    elif mode == MODE_SAMPLE:
        sampling.sample(eval_env, agent, render=render_eval, **train_kwargs)
    else:
        training.test(eval_env, agent, render_eval=render_eval, **train_kwargs)

    # Close up.
    if train_env:
        train_env.close()
    if eval_env:
        eval_env.close()

    mpi_complete(start_time, mpi_rank, mpi_size, non_blocking_mpi=True)
Example No. 10
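Runs TRPO or PPO inside a single-threaded TensorFlow session with per-worker seeding; rank 0 dumps the run arguments to args.json, and the environment is closed even if training is interrupted.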
def train(args):
    import baselines.baselines_common.tf_util as U

    sess = U.single_threaded_session()
    sess.__enter__()

    if args.restore_args_from is not None:
        args = restore_params(args)

    rank = MPI.COMM_WORLD.Get_rank()

    workerseed = args.seed + 241 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)

    def policy_fn(name, ob_space, ac_space):
        return Actor(name=name,
                     ob_space=ob_space,
                     ac_space=ac_space,
                     hid_size=args.hid_size,
                     num_hid_layers=args.num_hid_layers,
                     noise_type=args.noise_type)

    env = create_env(args)
    env.seed(workerseed)

    if rank == 0:
        create_if_need(args.logdir)
        with open("{}/args.json".format(args.logdir), "w") as fout:
            json.dump(vars(args),
                      fout,
                      indent=4,
                      ensure_ascii=False,
                      sort_keys=True)

    try:
        args.thread = rank
        if args.agent == "trpo":
            trpo.learn(env,
                       policy_fn,
                       args,
                       timesteps_per_batch=1024,
                       gamma=args.gamma,
                       lam=0.98,
                       max_kl=0.01,
                       cg_iters=10,
                       cg_damping=0.1,
                       vf_iters=5,
                       vf_stepsize=1e-3)
        elif args.agent == "ppo":
            # optimal settings:
            # timesteps_per_batch = optim_epochs *  optim_batchsize
            ppo.learn(env,
                      policy_fn,
                      args,
                      timesteps_per_batch=256,
                      gamma=args.gamma,
                      lam=0.95,
                      clip_param=0.2,
                      entcoeff=0.0,
                      optim_epochs=4,
                      optim_stepsize=3e-4,
                      optim_batchsize=64,
                      schedule='constant')
        else:
            raise NotImplementedError
    except KeyboardInterrupt:
        print("closing envs...")

    env.close()