Example No. 1
def sample(eval_env,
           agent,
           render_eval=False,
           start_ckpt=None,
           demo_files=None,
           nb_epochs=1000,
           **kwargs):
    with tf_util.single_threaded_session() as sess:
        # Run demo network
        agent.initialize(sess, start_ckpt=start_ckpt)
        sess.graph.finalize()
        agent.reset()
        dbg_tf_init(sess, agent.dbg_vars)
        total_nb_train = 0
        pretrain_demo(agent,
                      eval_env,
                      demo_files,
                      total_nb_train,
                      train_params=[(100, 1.0)],
                      start_ckpt=start_ckpt)

        # run sampling
        # agent reward can be disabled.
        # eval_env.set_intf_fp(None)
        sample_fd1norm2(eval_env,
                        render_eval=render_eval,
                        target_nb_resets=nb_epochs)
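
For context on the helper used throughout these examples: in OpenAI-baselines-style code, tf_util.single_threaded_session() is a TensorFlow 1.x session restricted to a single CPU thread. A minimal sketch of such a helper, assuming the TF 1.x API (not necessarily the exact implementation these examples import):

import tensorflow as tf

def single_threaded_session():
    """Return a tf.Session limited to one CPU thread (TF 1.x API)."""
    config = tf.ConfigProto(
        inter_op_parallelism_threads=1,
        intra_op_parallelism_threads=1)
    return tf.Session(config=config)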
Example No. 2
def test_multikwargs():
    with tf.Graph().as_default():
        x = tf.placeholder(tf.int32, (), name="x")
        with tf.variable_scope("other"):
            x2 = tf.placeholder(tf.int32, (), name="x")
        z = 3 * x + 2 * x2

        lin = function([x, x2], z, givens={x2: 0})
        with single_threaded_session():
            initialize()
            assert lin(2) == 6
            assert lin(2, 2) == 10
Example No. 3
def test_function():
    with tf.Graph().as_default():
        x = tf.placeholder(tf.int32, (), name="x")
        y = tf.placeholder(tf.int32, (), name="y")
        z = 3 * x + 2 * y
        lin = function([x, y], z, givens={y: 0})

        with single_threaded_session():
            initialize()

            assert lin(2) == 6
            assert lin(x=3) == 9
            assert lin(2, 2) == 10
            assert lin(x=2, y=3) == 12
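
Examples No. 2 and No. 3 exercise tf_util.function together with givens: values in givens act as default feeds, and positional or keyword arguments override them. Below is only a simplified sketch of that behaviour (make_function is a hypothetical name, not the baselines implementation):

import tensorflow as tf

def make_function(inputs, output, givens=None):
    """Simplified stand-in: the returned callable feeds `inputs` from its
    positional/keyword arguments, falling back to the defaults in `givens`,
    and evaluates `output` in the default session."""
    givens = dict(givens or {})

    def call(*args, **kwargs):
        feed = dict(givens)                                    # defaults first
        feed.update(dict(zip(inputs, args)))                   # positional arguments
        feed.update({ph: kwargs[ph.op.name] for ph in inputs   # keyword arguments by placeholder name
                     if ph.op.name in kwargs})
        return tf.get_default_session().run(output, feed_dict=feed)

    return call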
Example No. 4
def test(eval_env,
         agent,
         render_eval=True,
         nb_epochs=1,
         start_ckpt=None,
         **kwargs):
    logger.info('Start testing:', start_ckpt, '\n')
    with tf_util.single_threaded_session() as sess:
        agent.initialize(sess, start_ckpt=start_ckpt)
        sess.graph.finalize()

        for _ in range(nb_epochs):
            combined_stats = {}
            eval_episode(eval_env, render_eval, agent, combined_stats)

            for key in sorted(combined_stats.keys()):
                logger.record_tabular(key, combined_stats[key])
            logger.dump_tabular()
            logger.info('')
Example No. 5
def train(env,
          nb_epochs,
          nb_epoch_cycles,
          render_eval,
          reward_scale,
          render,
          param_noise,
          actor,
          critic,
          normalize_returns,
          normalize_observations,
          critic_l2_reg,
          actor_lr,
          critic_lr,
          action_noise,
          popart,
          gamma,
          clip_norm,
          nb_train_steps,
          nb_rollout_steps,
          nb_eval_steps,
          batch_size,
          memory,
          tau=0.01,
          eval_env=None,
          param_noise_adaption_interval=50):
    rank = MPI.COMM_WORLD.Get_rank()

    assert (np.abs(env.action_space.low) == env.action_space.high
            ).all()  # we assume symmetric actions.
    max_action = env.action_space.high
    logger.info(
        'scaling actions by {} before executing in env'.format(max_action))
    agent = DDPG(actor,
                 critic,
                 memory,
                 env.observation_space.shape,
                 env.action_space.shape,
                 gamma=gamma,
                 tau=tau,
                 normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size,
                 action_noise=action_noise,
                 param_noise=param_noise,
                 critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr,
                 critic_lr=critic_lr,
                 enable_popart=popart,
                 clip_norm=clip_norm,
                 reward_scale=reward_scale)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    # Set up logging stuff only for a single worker.
    if rank == 0:
        saver = tf.train.Saver()
    else:
        saver = None

    step = 0
    episode = 0
    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)
    with U.single_threaded_session() as sess:
        # Prepare everything.
        agent.initialize(sess)
        sess.graph.finalize()

        agent.reset()
        obs = env.reset()
        if eval_env is not None:
            eval_obs = eval_env.reset()
        done = False
        episode_reward = 0.
        episode_step = 0
        episodes = 0
        t = 0

        epoch = 0
        start_time = time.time()

        epoch_episode_rewards = []
        epoch_episode_steps = []
        epoch_episode_eval_rewards = []
        epoch_episode_eval_steps = []
        epoch_start_time = time.time()
        epoch_actions = []
        epoch_qs = []
        epoch_episodes = 0
        for epoch in range(nb_epochs):
            for cycle in range(nb_epoch_cycles):
                # Perform rollouts.
                for t_rollout in range(nb_rollout_steps):
                    # Predict next action.
                    action, q = agent.pi(obs, apply_noise=True, compute_Q=True)
                    assert action.shape == env.action_space.shape

                    # Execute next action.
                    if rank == 0 and render:
                        env.render()
                    assert max_action.shape == action.shape
                    new_obs, r, done, info = env.step(
                        max_action * action
                    )  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                    t += 1
                    if rank == 0 and render:
                        env.render()
                    episode_reward += r
                    episode_step += 1

                    # Book-keeping.
                    epoch_actions.append(action)
                    epoch_qs.append(q)
                    agent.store_transition(obs, action, r, new_obs, done)
                    obs = new_obs

                    if done:
                        # Episode done.
                        epoch_episode_rewards.append(episode_reward)
                        episode_rewards_history.append(episode_reward)
                        epoch_episode_steps.append(episode_step)
                        episode_reward = 0.
                        episode_step = 0
                        epoch_episodes += 1
                        episodes += 1

                        agent.reset()
                        obs = env.reset()

                # Train.
                epoch_actor_losses = []
                epoch_critic_losses = []
                epoch_adaptive_distances = []
                for t_train in range(nb_train_steps):
                    # Adapt param noise, if necessary.
                    if memory.nb_entries >= batch_size and t % param_noise_adaption_interval == 0:
                        distance = agent.adapt_param_noise()
                        epoch_adaptive_distances.append(distance)

                    cl, al = agent.train()
                    epoch_critic_losses.append(cl)
                    epoch_actor_losses.append(al)
                    agent.update_target_net()

                # Evaluate.
                eval_episode_rewards = []
                eval_qs = []
                if eval_env is not None:
                    eval_episode_reward = 0.
                    for t_rollout in range(nb_eval_steps):
                        eval_action, eval_q = agent.pi(eval_obs,
                                                       apply_noise=False,
                                                       compute_Q=True)
                        eval_obs, eval_r, eval_done, eval_info = eval_env.step(
                            max_action * eval_action
                        )  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                        if render_eval:
                            eval_env.render()
                        eval_episode_reward += eval_r

                        eval_qs.append(eval_q)
                        if eval_done:
                            eval_obs = eval_env.reset()
                            eval_episode_rewards.append(eval_episode_reward)
                            eval_episode_rewards_history.append(
                                eval_episode_reward)
                            eval_episode_reward = 0.

            mpi_size = MPI.COMM_WORLD.Get_size()
            # Log stats.
            # XXX shouldn't call np.mean on variable length lists
            duration = time.time() - start_time
            stats = agent.get_stats()
            combined_stats = stats.copy()
            combined_stats['rollout/return'] = np.mean(epoch_episode_rewards)
            combined_stats['rollout/return_history'] = np.mean(
                episode_rewards_history)
            combined_stats['rollout/episode_steps'] = np.mean(
                epoch_episode_steps)
            combined_stats['rollout/actions_mean'] = np.mean(epoch_actions)
            combined_stats['rollout/Q_mean'] = np.mean(epoch_qs)
            combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses)
            combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses)
            combined_stats['train/param_noise_distance'] = np.mean(
                epoch_adaptive_distances)
            combined_stats['total/duration'] = duration
            combined_stats['total/steps_per_second'] = float(t) / float(
                duration)
            combined_stats['total/episodes'] = episodes
            combined_stats['rollout/episodes'] = epoch_episodes
            combined_stats['rollout/actions_std'] = np.std(epoch_actions)
            # Evaluation statistics.
            if eval_env is not None:
                combined_stats['eval/return'] = eval_episode_rewards
                combined_stats['eval/return_history'] = np.mean(
                    eval_episode_rewards_history)
                combined_stats['eval/Q'] = eval_qs
                combined_stats['eval/episodes'] = len(eval_episode_rewards)

            def as_scalar(x):
                if isinstance(x, np.ndarray):
                    assert x.size == 1
                    return x[0]
                elif np.isscalar(x):
                    return x
                else:
                    raise ValueError('expected scalar, got %s' % x)

            combined_stats_sums = MPI.COMM_WORLD.allreduce(
                np.array([as_scalar(x) for x in combined_stats.values()]))
            combined_stats = {
                k: v / mpi_size
                for (k, v) in zip(combined_stats.keys(), combined_stats_sums)
            }

            # Total statistics.
            combined_stats['total/epochs'] = epoch + 1
            combined_stats['total/steps'] = t

            for key in sorted(combined_stats.keys()):
                logger.record_tabular(key, combined_stats[key])
            logger.dump_tabular()
            logger.info('')
            logdir = logger.get_dir()
            if rank == 0 and logdir:
                if hasattr(env, 'get_state'):
                    with open(os.path.join(logdir, 'env_state.pkl'),
                              'wb') as f:
                        pickle.dump(env.get_state(), f)
                if eval_env and hasattr(eval_env, 'get_state'):
                    with open(os.path.join(logdir, 'eval_env_state.pkl'),
                              'wb') as f:
                        pickle.dump(eval_env.get_state(), f)
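
A hedged sketch of how a train() function with this signature is typically wired up. The imports assume the old (pre-TF2) baselines.ddpg package layout, and the constructor arguments shown are assumptions that may differ from the classes actually used here:

import gym
import numpy as np
from baselines.ddpg.models import Actor, Critic
from baselines.ddpg.memory import Memory
from baselines.ddpg.noise import OrnsteinUhlenbeckActionNoise

env = gym.make('Pendulum-v0')
nb_actions = env.action_space.shape[-1]
train(env=env,
      nb_epochs=10, nb_epoch_cycles=20,
      render_eval=False, reward_scale=1.0, render=False,
      param_noise=None,
      actor=Actor(nb_actions), critic=Critic(),
      normalize_returns=False, normalize_observations=True,
      critic_l2_reg=1e-2, actor_lr=1e-4, critic_lr=1e-3,
      action_noise=OrnsteinUhlenbeckActionNoise(
          mu=np.zeros(nb_actions), sigma=0.2 * np.ones(nb_actions)),
      popart=False, gamma=0.99, clip_norm=None,
      nb_train_steps=50, nb_rollout_steps=100, nb_eval_steps=100,
      batch_size=64,
      memory=Memory(limit=int(1e6),
                    action_shape=env.action_space.shape,
                    observation_shape=env.observation_space.shape))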
Example No. 6
def train(env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render, param_noise, actor, critic,
          normalize_returns, normalize_observations, critic_l2_reg, actor_lr, critic_lr, action_noise,
          popart, gamma, clip_norm, nb_train_steps, nb_rollout_steps, nb_eval_steps, batch_size, memory,
          tau=0.01, eval_env=None, param_noise_adaption_interval=50):

    assert (np.abs(env.action_space.low) == env.action_space.high).all()  # we assume symmetric actions.
    max_action = env.action_space.high
    logger.info('scaling actions by {} before executing in env'.format(max_action))
    agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape,
                 gamma=gamma, tau=tau, normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size, action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart, clip_norm=clip_norm,
                 reward_scale=reward_scale)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    # Set up logging stuff only for a single worker.
    saver = tf.train.Saver()

    step = 0
    episode = 0
    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)
    with U.single_threaded_session() as sess:
        # Prepare everything.
        agent.initialize(sess)
        sess.graph.finalize()

        agent.reset()
        obs = env.reset()
        if eval_env is not None:
            eval_obs = eval_env.reset()
        done = False
        episode_reward = 0.
        episode_step = 0
        episodes = 0
        t = 0

        epoch = 0
        start_time = time.time()

        epoch_episode_rewards = []
        epoch_episode_steps = []
        epoch_episode_eval_rewards = []
        epoch_episode_eval_steps = []
        epoch_start_time = time.time()
        epoch_actions = []
        epoch_qs = []
        epoch_episodes = 0
        for epoch in range(nb_epochs):
            for cycle in range(nb_epoch_cycles):
                # Perform rollouts.
                for t_rollout in range(nb_rollout_steps):
                    # Predict next action.
                    action, q = agent.pi(obs, apply_noise=True, compute_Q=True)
                    assert action.shape == env.action_space.shape

                    # Execute next action.
                    if render:
                        env.render()
                    assert max_action.shape == action.shape
                    new_obs, r, done, info = env.step(
                        max_action * action)  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                    t += 1
                    if render:
                        env.render()
                    episode_reward += r
                    episode_step += 1

                    # Book-keeping.
                    epoch_actions.append(action)
                    epoch_qs.append(q)
                    agent.store_transition(obs, action, r, new_obs, done)
                    obs = new_obs

                    if done:
                        # Episode done.
                        epoch_episode_rewards.append(episode_reward)
                        episode_rewards_history.append(episode_reward)
                        epoch_episode_steps.append(episode_step)
                        episode_reward = 0.
                        episode_step = 0
                        epoch_episodes += 1
                        episodes += 1

                        agent.reset()
                        obs = env.reset()

                # Train.
                epoch_actor_losses = []
                epoch_critic_losses = []
                epoch_adaptive_distances = []
                for t_train in range(nb_train_steps):
                    # Adapt param noise, if necessary.
                    if memory.nb_entries >= batch_size and t % param_noise_adaption_interval == 0:
                        distance = agent.adapt_param_noise()
                        epoch_adaptive_distances.append(distance)

                    cl, al = agent.train()
                    epoch_critic_losses.append(cl)
                    epoch_actor_losses.append(al)
                    agent.update_target_net()

                # Evaluate.
                eval_episode_rewards = []
                eval_qs = []
                if eval_env is not None:
                    eval_episode_reward = 0.
                    for t_rollout in range(nb_eval_steps):
                        eval_action, eval_q = agent.pi(eval_obs, apply_noise=False, compute_Q=True)
                        eval_obs, eval_r, eval_done, eval_info = eval_env.step(
                            max_action * eval_action)  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                        if render_eval:
                            eval_env.render()
                        eval_episode_reward += eval_r

                        eval_qs.append(eval_q)
                        if eval_done:
                            eval_obs = eval_env.reset()
                            eval_episode_rewards.append(eval_episode_reward)
                            eval_episode_rewards_history.append(eval_episode_reward)
                            eval_episode_reward = 0.

            # Log stats.
            # XXX shouldn't call np.mean on variable length lists
            duration = time.time() - start_time
            stats = agent.get_stats()
            combined_stats = stats.copy()
            combined_stats['rollout/return'] = np.mean(epoch_episode_rewards)
            combined_stats['rollout/return_history'] = np.mean(episode_rewards_history)
            combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps)
            combined_stats['rollout/actions_mean'] = np.mean(epoch_actions)
            combined_stats['rollout/Q_mean'] = np.mean(epoch_qs)
            combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses)
            combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses)
            combined_stats['train/param_noise_distance'] = np.mean(epoch_adaptive_distances)
            combined_stats['total/duration'] = duration
            combined_stats['total/steps_per_second'] = float(t) / float(duration)
            combined_stats['total/episodes'] = episodes
            combined_stats['rollout/episodes'] = epoch_episodes
            combined_stats['rollout/actions_std'] = np.std(epoch_actions)
            # Evaluation statistics.
            if eval_env is not None:
                combined_stats['eval/return'] = eval_episode_rewards
                combined_stats['eval/return_history'] = np.mean(eval_episode_rewards_history)
                combined_stats['eval/Q'] = eval_qs
                combined_stats['eval/episodes'] = len(eval_episode_rewards)

            def as_scalar(x):
                if isinstance(x, np.ndarray):
                    assert x.size == 1
                    return x[0]
                elif np.isscalar(x):
                    return x
                else:
                    raise ValueError('expected scalar, got %s' % x)

            # Total statistics.
            combined_stats['total/epochs'] = epoch + 1
            combined_stats['total/steps'] = t

            for key in sorted(combined_stats.keys()):
                logger.record_tabular(key, combined_stats[key])
            logger.dump_tabular()
            logger.info('')
            logdir = logger.get_dir()
            if logdir:
                if hasattr(env, 'get_state'):
                    with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f:
                        pickle.dump(env.get_state(), f)
                if eval_env and hasattr(eval_env, 'get_state'):
                    with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f:
                        pickle.dump(eval_env.get_state(), f)
Example No. 7
    parser.add_argument("--max_episode_len", type = int, default = 50, help = "max episode length")
    parser.add_argument("--warm_up_steps", type = int, default = 1000, help = "set the warm up steps")
    parser.add_argument("--lr", type = float, default = 0.001, help = "learning rate")
    parser.add_argument("--gamma", type = float, default = 0.99, help = "discount rate")
   
    return parser.parse_args()


if __name__ == '__main__':
    arglist = parse_args()
    model_path = './tmp/model'
    random.seed(arglist.seed)
    log_path = './logs/{}_{}_{}_{}.csv'.format(arglist.game, arglist.agent, arglist.seed, time.time())
    log = open(log_path, '+w', 1)

    with U.single_threaded_session():
        
        env = PongGame()
        obs_space = env.observation_space
        act_space = env.action_space
        agent1 = DDPGLearner(obs_space, act_space, "agent1", arglist)
        if arglist.adv_agent == "agent":
            agent2 = DDPGLearner(obs_space, act_space, "agent2", arglist)
        else:
            agent2 = agent1
        U.initialize()
        saver = tf.train.Saver()
        for epo in range(arglist.epoch):
            agent1.reset_replay_buffer()
            agent2.reset_replay_buffer()
            agent1_q_loss = []
            agent1_p_loss = []
Example No. 8
def train_maddpg(arglist):
    with U.single_threaded_session():
        # Create environment
        env = make_env(arglist.scenario, arglist, arglist.benchmark)
        # Create agent trainers
        obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
        num_adversaries = min(env.n, arglist.num_adversaries)
        trainers = get_trainers(env,
                                num_adversaries,
                                obs_shape_n,
                                arglist,
                                good_agent_mode=arglist.good_policy,
                                adv_agent_mode=arglist.adv_policy)
        print('Using good policy {} and adv policy {}'.format(
            arglist.good_policy, arglist.adv_policy))

        # Initialize
        U.initialize()

        # Load previous results, if necessary
        if arglist.load_dir == "":
            arglist.load_dir = arglist.save_dir
        if arglist.display or arglist.restore or arglist.benchmark:
            print('Loading previous state...')
            U.load_state(arglist.load_dir)

        episode_rewards = [0.0]  # sum of rewards for all agents
        agent_rewards = [[0.0]
                         for _ in range(env.n)]  # individual agent reward
        final_ep_rewards = []  # sum of rewards for training curve
        final_ep_ag_rewards = []  # agent rewards for training curve
        agent_info = [[[]]]  # placeholder for benchmarking info
        saver = tf.train.Saver(max_to_keep=None)
        obs_n = env.reset()
        episode_step = 0
        train_step = 0
        t_start = time.time()

        if arglist.real_q_log:
            world_state_buffer, action_n_buffer, start_episode_step_buffer, obs_n_buffer = [], [], [], []
            q_means, real_means = [], []

        print('Starting iterations...')
        while True:
            # get action
            action_n = [
                agent.action(obs) for agent, obs in zip(trainers, obs_n)
            ]
            # environment step
            new_obs_n, rew_n, done_n, info_n = env.step(action_n)
            episode_step += 1
            done = all(done_n)  # note: unused, never happens
            terminal = (episode_step >= arglist.max_episode_len)
            done = done or terminal

            if arglist.real_q_log:
                world_state_buffer.append(deepcopy(env.world))
                obs_n_buffer.append(obs_n)
                action_n_buffer.append(action_n)
                start_episode_step_buffer.append(episode_step)

            # collect experience
            for i, agent in enumerate(trainers):
                agent.experience(obs_n[i], action_n[i], rew_n[i], new_obs_n[i],
                                 done, terminal)
            obs_n = new_obs_n

            for i, rew in enumerate(rew_n):
                episode_rewards[-1] += rew
                agent_rewards[i][-1] += rew

            if done or terminal:
                obs_n = env.reset()
                episode_step = 0
                episode_rewards.append(0)  # add element for next episode
                for a in agent_rewards:
                    a.append(0)
                agent_info.append([[]])

            # increment global step counter
            train_step += 1

            # for benchmarking learned policies
            if arglist.benchmark:
                for i, info in enumerate(info_n):
                    agent_info[-1][i].append(info_n['n'])
                if train_step > arglist.benchmark_iters and (done or terminal):
                    file_name = arglist.benchmark_dir + arglist.exp_name + '.pkl'
                    print('Finished benchmarking, now saving...')
                    with open(file_name, 'wb') as fp:
                        pickle.dump(agent_info[:-1], fp)
                    break
                continue

            # for displaying learned policies
            if arglist.display:
                time.sleep(0.1)
                env.render()
                continue

            for agent in trainers:
                loss = agent.update(trainers, train_step)

            # save model, display training output
            if terminal and (len(episode_rewards) % arglist.save_rate == 0):
                if arglist.save_dir != '/tmp/policy/':
                    U.save_state(arglist.save_dir + arglist.exp_name,
                                 saver=saver,
                                 global_step=len(episode_rewards))
                else:
                    U.save_state(
                        arglist.save_dir, saver=saver
                    )  # print statement depends on whether or not there are adversaries
                if num_adversaries == 0:
                    print(
                        "steps: {}, episodes: {}, mean episode reward: {}, time: {}"
                        .format(
                            train_step, len(episode_rewards),
                            np.mean(episode_rewards[-arglist.save_rate:-1]),
                            round(time.time() - t_start, 3)))
                else:
                    print(
                        "steps: {}, episodes: {}, mean episode reward: {}, agent episode reward: {}, time: {}"
                        .format(
                            train_step, len(episode_rewards),
                            np.mean(episode_rewards[-arglist.save_rate:-1]), [
                                np.mean(rew[-arglist.save_rate:])
                                for rew in agent_rewards
                            ], round(time.time() - t_start, 3)))
                t_start = time.time()
                # Keep track of final episode reward
                final_ep_rewards.append(
                    np.mean(episode_rewards[-arglist.save_rate:-1]))
                for rew in agent_rewards:
                    final_ep_ag_rewards.append(
                        np.mean(rew[-arglist.save_rate:-1]))

                if arglist.real_q_log and (len(episode_rewards) %
                                           (5 * arglist.save_rate) == 0):
                    q_mean, real_mean = calculate_real_q_value(
                        deepcopy(env),
                        trainers,
                        world_state_buffer=world_state_buffer,
                        action_n_buffer=action_n_buffer,
                        obs_n_buffer=obs_n_buffer,
                        start_episode_step_buffer=start_episode_step_buffer,
                        num_start_states=200,
                        args=arglist)
                    world_state_buffer, action_n_buffer, start_episode_step_buffer, obs_n_buffer = [], [], [], []
                    q_means.append(q_mean)
                    real_means.append(real_mean)
                    print('Q-mean: ' + str(q_mean) + ' Real mean: ' +
                          str(real_mean))

            # saves final episode reward for plotting training curve later
            if len(episode_rewards) > arglist.num_episodes:
                rew_file_name = arglist.plots_dir + arglist.exp_name + '_rewards.pkl'
                with open(rew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_rewards, fp)
                agrew_file_name = arglist.plots_dir + arglist.exp_name + '_agrewards.pkl'
                with open(agrew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_ag_rewards, fp)
                args_file_name = arglist.plots_dir + arglist.exp_name + '_args.pkl'
                with open(args_file_name, 'wb') as fp:
                    pickle.dump(arglist, fp)
                if arglist.real_q_log:
                    real_q_path = arglist.plots_dir + arglist.exp_name + '_q_values.pkl'
                    with open(real_q_path, 'wb') as fp:
                        pickle.dump(
                            {
                                'q_means': q_means,
                                'real_means': real_means
                            }, fp)
                print('...Finished total of {} episodes.'.format(
                    len(episode_rewards)))
                break
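
One detail of the periodic logging above that is easy to miss: the slice episode_rewards[-arglist.save_rate:-1] deliberately drops the last list entry, which is the episode still being accumulated. A toy illustration:

import numpy as np

save_rate = 4
episode_rewards = [1.0, 2.0, 3.0, 4.0, 0.5]      # last entry is the open episode
print(np.mean(episode_rewards[-save_rate:-1]))   # mean of 2.0, 3.0, 4.0 -> 3.0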
Example No. 9
def train(env,
          eval_env,
          agent,
          render=False,
          render_eval=False,
          sanity_run=False,
          nb_epochs=500,
          nb_epoch_cycles=20,
          nb_rollout_steps=100,
          nb_train_steps=50,
          param_noise_adaption_interval=50,
          hist_files=None,
          start_ckpt=None,
          demo_files=None):

    rank = MPI.COMM_WORLD.Get_rank()
    mpi_size = MPI.COMM_WORLD.Get_size()
    if rank == 0:
        logdir = logger.get_dir()
    else:
        logdir = None

    memory = agent.memory
    batch_size = agent.batch_size

    with tf_util.single_threaded_session() as sess:
        # Prepare everything.
        agent.initialize(sess, start_ckpt=start_ckpt)
        sess.graph.finalize()
        agent.reset()
        dbg_tf_init(sess, agent.dbg_vars)

        total_nb_train = 0
        total_nb_rollout = 0
        total_nb_eval = 0

        # pre-train demo and critic_step
        # train_params: (nb_steps, lr_scale)
        total_nb_train = pretrain_demo(agent,
                                       env,
                                       demo_files,
                                       total_nb_train,
                                       train_params=[(100, 1.0)],
                                       start_ckpt=start_ckpt)
        load_history(agent, env, hist_files)

        # main training
        obs = env.reset()
        reset = False
        episode_step = 0
        last_episode_step = 0

        for i_epoch in range(nb_epochs):
            t_epoch_start = time.time()
            logger.info('\n%s epoch %d starts:' %
                        (datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                         i_epoch))
            for i_cycle in range(nb_epoch_cycles):
                logger.info(
                    '\n%s cycles_%d of epoch_%d' %
                    (datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f'),
                     i_cycle, i_epoch))

                # rollout
                rcd_obs, rcd_action, rcd_r, rcd_new_obs, rcd_done = [], [], [], [], []
                if not sanity_run and mpi_size == 1 and last_episode_step != 0:
                    # todo: use mpi_max(last_episode_step)
                    # dynamically set nb_rollout_steps
                    nb_rollout_steps = max(last_episode_step * 4, batch_size)
                logger.info(
                    '[%d, %d] rollout for %d steps.' %
                    (total_nb_rollout, memory.nb_entries, nb_rollout_steps))
                t_rollout_start = time.time()

                for i_rollout in range(nb_rollout_steps):
                    rollout_log = i_cycle == 0
                    # 50% param_noise, 40% action_noise
                    action, q = agent.pi(obs,
                                         total_nb_rollout,
                                         compute_Q=True,
                                         rollout_log=rollout_log,
                                         apply_param_noise=i_rollout % 10 < 5,
                                         apply_action_noise=i_rollout % 10 > 5)
                    assert action.shape == env.action_space.shape
                    new_obs, r, done, reset, info = env.step(action)

                    if rank == 0 and render:
                        env.render()

                    episode_step += 1
                    total_nb_rollout += 1

                    if rollout_log:
                        summary_list = [('rollout/%s' % tp, info[tp])
                                        for tp in ['rwd_walk', 'rwd_total']]
                        tp = 'rwd_agent'
                        summary_list += [
                            ('rollout/%s_x%d' % (tp, info['rf_agent']),
                             info[tp] * info['rf_agent'])
                        ]
                        summary_list += [('rollout/q', q)]
                        if r != 0:
                            summary_list += [('rollout/q_div_r', q / r)]
                        agent.add_list_summary(summary_list, total_nb_rollout)

                    # store at the end of cycle to speed up MPI rollout
                    # agent.store_transition(obs, action, r, new_obs, done)
                    rcd_obs.append(obs)
                    rcd_action.append(action)
                    rcd_r.append(r)
                    rcd_new_obs.append(new_obs)
                    rcd_done.append(done)

                    obs = new_obs
                    if reset:
                        # Episode done.
                        last_episode_step = episode_step
                        episode_step = 0

                        agent.reset()
                        obs = env.reset()

                agent.store_multrans(memory, rcd_obs, rcd_action, rcd_r,
                                     rcd_new_obs, rcd_done)

                t_train_start = time.time()
                steps_per_second = float(nb_rollout_steps) / (t_train_start -
                                                              t_rollout_start)
                agent.add_list_summary(
                    [('rollout/steps_per_second', steps_per_second)],
                    total_nb_rollout)

                # Train.
                if not sanity_run:
                    # dynamically set nb_train_steps
                    if memory.nb_entries > batch_size * 20:
                        # using 1% of data for training every step?
                        nb_train_steps = max(
                            int(memory.nb_entries * 0.01 / batch_size), 1)
                    else:
                        nb_train_steps = 0
                logger.info('[%d] training for %d steps.' %
                            (total_nb_train, nb_train_steps))
                for _ in range(nb_train_steps):
                    # Adapt param noise, if necessary.
                    if memory.nb_entries >= batch_size and total_nb_train % param_noise_adaption_interval == 0:
                        agent.adapt_param_noise(total_nb_train)

                    agent.train_main(total_nb_train)
                    agent.update_target_net()
                    total_nb_train += 1

                if i_epoch == 0 and i_cycle < 5:
                    rollout_duration = t_train_start - t_rollout_start
                    train_duration = time.time() - t_train_start
                    logger.info(
                        'rollout_time(%d) = %.3fs, train_time(%d) = %.3fs' %
                        (nb_rollout_steps, rollout_duration, nb_train_steps,
                         train_duration))
                    logger.info(
                        'rollout_speed=%.3fs/step, train_speed = %.3fs/step' %
                        (np.divide(rollout_duration, nb_rollout_steps),
                         np.divide(train_duration, nb_train_steps)))

            logger.info('')
            mpi_size = MPI.COMM_WORLD.Get_size()
            # Log stats.
            stats = agent.get_stats(memory)
            combined_stats = stats.copy()

            def as_scalar(x):
                if isinstance(x, np.ndarray):
                    assert x.size == 1
                    return x[0]
                elif np.isscalar(x):
                    return x
                else:
                    raise ValueError('expected scalar, got %s' % x)

            combined_stats_sums = MPI.COMM_WORLD.allreduce(
                np.array([as_scalar(x) for x in combined_stats.values()]))
            combined_stats = {
                k: v / mpi_size
                for (k, v) in zip(combined_stats.keys(), combined_stats_sums)
            }

            # exclude logging zobs_dbg_%d, zobs_dbg_%d_normalized
            summary_list = [(key, combined_stats[key])
                            for key, v in combined_stats.items()
                            if 'dbg' not in key]
            agent.add_list_summary(summary_list, i_epoch)

            # only print out train stats for epoch_0 for sanity check
            if i_epoch > 0:
                combined_stats = {}

            # Evaluation and statistics.
            if eval_env is not None:
                logger.info('[%d, %d] run evaluation' %
                            (i_epoch, total_nb_eval))
                total_nb_eval = eval_episode(eval_env, render_eval, agent,
                                             combined_stats, total_nb_eval)

            logger.info('epoch %d duration: %.2f mins' %
                        (i_epoch, (time.time() - t_epoch_start) / 60))
            for key in sorted(combined_stats.keys()):
                logger.record_tabular(key, combined_stats[key])
            logger.dump_tabular()
            logger.info('')

            if rank == 0:
                agent.store_ckpt(os.path.join(logdir, '%s.ckpt' % 'ddpg'),
                                 i_epoch)
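
The training loop in Example No. 9 sizes its work dynamically: rollouts run for roughly four times the previous episode length (but at least one full batch of steps), and the number of training steps scales with the replay-buffer size. The arithmetic of the nb_train_steps heuristic, in isolation:

batch_size = 64
nb_entries = 50_000                                            # replay-buffer entries at this point
nb_train_steps = max(int(nb_entries * 0.01 / batch_size), 1)   # int(7.8125) -> 7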