def train(env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render, param_noise, actor,
          critic, normalize_returns, normalize_observations, critic_l2_reg, actor_lr, critic_lr,
          action_noise, popart, gamma, clip_norm, nb_train_steps, nb_rollout_steps, nb_eval_steps,
          batch_size, memory, tau=0.01, eval_env=None, param_noise_adaption_interval=50,
          gamma_reward_shaping=0.1, start_reward_shaping=10000):
    """Run the DDPG training loop with optional backward reward shaping.

    Performs ``nb_epochs`` epochs of ``nb_epoch_cycles`` cycles; each cycle
    collects ``nb_rollout_steps`` transitions with the exploratory policy,
    trains for ``nb_train_steps`` gradient steps, and (when ``eval_env`` is
    given) evaluates the noise-free policy for ``nb_eval_steps`` steps.

    Reward shaping: for the first ``start_reward_shaping`` total env steps,
    transitions are stored in the replay memory immediately.  After that,
    transitions are buffered per episode and, when the episode finishes, the
    episode is replayed backwards: each transition's stored reward becomes its
    own reward plus the accumulated later reward discounted by
    ``gamma_reward_shaping``.

    Args:
        env: training environment (gym-like API, symmetric action space).
        eval_env: optional evaluation environment.
        tau: soft target-network update coefficient.
        param_noise_adaption_interval: train steps between param-noise adaption.
        gamma_reward_shaping: discount used when propagating reward backwards.
        start_reward_shaping: total step count after which shaping kicks in.
        (remaining arguments are forwarded to the DDPG constructor)
    """
    logger.info(sys._getframe().f_code.co_name)

    rank = MPI.COMM_WORLD.Get_rank()

    # DDPG emits actions in [-1, 1]; we assume a symmetric action space and
    # rescale by the upper bound before stepping the env.
    assert (np.abs(env.action_space.low) == env.action_space.high).all()
    max_action = env.action_space.high
    logger.info("scale actions by {} before executing in env".format(max_action))
    agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape,
                 gamma=gamma, tau=tau, normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations, batch_size=batch_size, action_noise=action_noise,
                 param_noise=param_noise, critic_l2_reg=critic_l2_reg, actor_lr=actor_lr, critic_lr=critic_lr,
                 enable_popart=popart, clip_norm=clip_norm, reward_scale=reward_scale)
    logger.info("Using agent with the following configuration:")
    logger.info(str(agent.__dict__.items()))

    # Only the rank-0 worker owns a saver (checkpointing on a single rank).
    if rank == 0:
        saver = tf.train.Saver()
    else:
        saver = None

    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)
    with U.single_threaded_session() as sess:
        agent.initialize(sess)
        sess.graph.finalize()

        agent.reset()
        obs = env.reset()
        if eval_env is not None:
            eval_obs = eval_env.reset()
        episode_reward = 0.
        episode_step = 0
        episodes = 0
        t = 0  # total environment steps across all epochs

        start_time = time.time()

        epoch_episode_rewards = []
        epoch_episode_steps = []
        epoch_actions = []
        epoch_qs = []
        epoch_episodes = 0

        episode_sample = []  # transitions of the episode currently in progress
        for epoch in range(nb_epochs):
            for cycle in range(nb_epoch_cycles):
                # Rollout phase.
                # BUGFIX: this loop previously iterated range(nb_eval_steps);
                # rollouts must use nb_rollout_steps (nb_eval_steps is for the
                # evaluation loop below and nb_rollout_steps was unused).
                for t_rollout in range(nb_rollout_steps):
                    action, q = agent.pi(obs, apply_noise=True, compute_Q=True)
                    assert action.shape == env.action_space.shape

                    if rank == 0 and render:
                        env.render()
                    assert max_action.shape == action.shape
                    new_obs, r, done, info = env.step(max_action * action)
                    t += 1
                    if rank == 0 and render:
                        env.render()
                    episode_reward += r
                    episode_step += 1

                    epoch_actions.append(action)
                    epoch_qs.append(q)
                    episode_sample.append((obs, action, r, new_obs, done))
                    if t <= start_reward_shaping:
                        # Before shaping kicks in, store transitions directly.
                        agent.store_transition(obs, action, r, new_obs, done)

                    if done:
                        if t > start_reward_shaping:
                            logger.info("start reward shaping")
                            # Store the terminal transition with its raw
                            # reward, then walk the episode backwards,
                            # discounting the accumulated reward into every
                            # earlier transition.
                            reward = r
                            agent.store_transition(obs, action, reward, new_obs, done)
                            # BUGFIX: iterate indices len-2 .. 0 so the
                            # terminal transition is not stored twice and the
                            # first transition is not skipped; store each
                            # transition with its own done flag (done_tmp),
                            # not the terminal one.
                            for i in range(len(episode_sample) - 1):
                                obs_tmp, action_tmp, rew_tmp, new_obs_tmp, done_tmp = \
                                    episode_sample[len(episode_sample) - i - 2]
                                reward = round(reward * gamma_reward_shaping, 5)
                                reward = reward + rew_tmp
                                agent.store_transition(obs_tmp, action_tmp, reward, new_obs_tmp, done_tmp)
                        # BUGFIX: reset the per-episode buffer; previously it
                        # accumulated transitions across episodes, so shaping
                        # walked back into earlier, unrelated episodes.
                        episode_sample = []

                        epoch_episode_rewards.append(episode_reward)
                        episode_rewards_history.append(episode_reward)
                        epoch_episode_steps.append(episode_step)
                        episode_reward = 0.
                        episode_step = 0
                        epoch_episodes += 1
                        episodes += 1
                        agent.reset()
                        obs = env.reset()
                    else:
                        # BUGFIX: only advance obs when the episode continues;
                        # the original unconditionally overwrote the freshly
                        # reset observation with new_obs after a reset.
                        obs = new_obs

                # Training phase.
                epoch_actor_losses = []
                epoch_critic_losses = []
                epoch_adaptive_distances = []
                for t_train in range(nb_train_steps):
                    # Periodically adapt parameter-space noise to the policy.
                    if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0:
                        distance = agent.adapt_param_noise()
                        epoch_adaptive_distances.append(distance)

                    cl, al = agent.train()
                    epoch_critic_losses.append(cl)
                    epoch_actor_losses.append(al)
                    agent.update_target_net()

                # Evaluation phase (noise-free policy).
                eval_episode_rewards = []
                eval_qs = []
                if eval_env is not None:
                    eval_episode_reward = 0.
                    for t_rollout in range(nb_eval_steps):
                        eval_action, eval_q = agent.pi(eval_obs, apply_noise=False, compute_Q=True)
                        eval_obs, eval_r, eval_done, eval_info = eval_env.step(max_action * eval_action)
                        if render_eval:
                            eval_env.render()
                        eval_episode_reward += eval_r

                        eval_qs.append(eval_q)
                        if eval_done:
                            eval_obs = eval_env.reset()
                            eval_episode_rewards.append(eval_episode_reward)
                            eval_episode_rewards_history.append(eval_episode_reward)
                            eval_episode_reward = 0.

            # Log stats, averaged across MPI workers via allreduce below.
            mpi_size = MPI.COMM_WORLD.Get_size()

            duration = time.time() - start_time
            stats = agent.get_stats()
            combined_stats = stats.copy()
            combined_stats["rollout/return"] = np.mean(epoch_episode_rewards)
            combined_stats["rollout/return_history"] = np.mean(episode_rewards_history)
            combined_stats["rollout/episode_steps"] = np.mean(epoch_episode_steps)
            combined_stats["rollout/actions_mean"] = np.mean(epoch_actions)
            combined_stats["rollout/Q_mean"] = np.mean(epoch_qs)
            combined_stats["train/loss_actor"] = np.mean(epoch_actor_losses)
            combined_stats["train/loss_critic"] = np.mean(epoch_critic_losses)
            combined_stats["train/param_noise_distance"] = np.mean(epoch_adaptive_distances)
            combined_stats["total/duration"] = duration
            combined_stats["total/steps_per_second"] = float(t) / float(duration)
            combined_stats["total/episodes"] = episodes
            combined_stats["rollout/episodes"] = epoch_episodes
            # BUGFIX: key was misspelled "rollour/actions_std".
            combined_stats["rollout/actions_std"] = np.std(epoch_actions)

            if eval_env is not None:
                # BUGFIX: store scalar means, not raw lists -- as_scalar()
                # below raises ValueError on anything that is not a size-1
                # value, so the original crashed whenever eval ran.
                combined_stats["eval/return"] = np.mean(eval_episode_rewards)
                combined_stats["eval/return_history"] = np.mean(eval_episode_rewards_history)
                combined_stats["eval/Q"] = np.mean(eval_qs)
                combined_stats["eval/episodes"] = len(eval_episode_rewards)

            def as_scalar(x):
                """Coerce a size-1 ndarray or a plain scalar to a scalar."""
                if isinstance(x, np.ndarray):
                    assert x.size == 1
                    return x[0]
                elif np.isscalar(x):
                    return x
                else:
                    raise ValueError("expected scalar, got %s" % x)
            combined_stats_sums = MPI.COMM_WORLD.allreduce(np.array([as_scalar(x)
                                                                    for x in combined_stats.values()]))
            combined_stats = {k : v / mpi_size for (k, v) in zip(combined_stats.keys(), combined_stats_sums)}

            combined_stats["total/epochs"] = epoch + 1
            combined_stats["total/steps"] = t

            for key in sorted(combined_stats.keys()):
                logger.record_tabular(key, combined_stats[key])

            logger.dump_tabular()
            logger.info("")
            logdir = logger.get_dir()
            if rank == 0 and logdir:
                # Persist environment state (if supported) for resumability.
                if hasattr(env, "get_state"):
                    with open(os.path.join(logdir, "env_state.pkl"), "wb") as f:
                        pickle.dump(env.get_state(), f)
                if eval_env and hasattr(eval_env, "get_state"):
                    with open(os.path.join(logdir, "eval_env_state.pkl"), "wb") as f:
                        pickle.dump(eval_env.get_state(), f)
def Test(env,
         nb_epochs,
         nb_epoch_cycles,
         render_eval,
         reward_scale,
         render,
         param_noise,
         actor,
         critic,
         normalize_returns,
         normalize_observations,
         critic_l2_reg,
         actor_lr,
         critic_lr,
         action_noise,
         popart,
         gamma,
         clip_norm,
         nb_train_steps,
         nb_rollout_steps,
         nb_eval_steps,
         batch_size,
         memory,
         tau=0.01,
         eval_env=None,
         param_noise_adaption_interval=50,
         restore=False):
    """Evaluate (no training) a DDPG agent on the peg-in-hole assembly task.

    Runs the deterministic policy (no action or parameter noise) for
    ``nb_epochs`` epochs of ``nb_epoch_cycles`` episodes.  Per step it logs
    the measured forces/moments, optionally plots them, and stores the
    transition.  After success, excessive force, or running out of steps, the
    pegs are pulled back up step by step, and per-epoch timing / reward /
    force statistics are written to CSV files under ``data/``.

    Args:
        restore: when True, restore the model graph and weights from
            ``model_directory`` instead of initializing fresh variables.
        (remaining arguments mirror train(); the noise arguments are ignored
        here -- the agent is built with action_noise=None, param_noise=None.)
    """
    rank = MPI.COMM_WORLD.Get_rank()
    # Fixed per-axis action bound for the real robot (not taken from env).
    max_action = np.array([0.2, 0.2, 0.2, 0.2, 0.2, 0.2])

    logger.info(
        'scaling actions by {} before executing in env'.format(max_action))
    model_directory = '/home/rvsa/RL_project/Peg_in_Hole/1-baselines/baselines/ddpg/result/'

    # Evaluation agent: exploration noise explicitly disabled.
    agent = DDPG(actor,
                 critic,
                 memory,
                 env.state_dim,
                 env.action_dim,
                 gamma=gamma,
                 tau=tau,
                 normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size,
                 action_noise=None,
                 param_noise=None,
                 critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr,
                 critic_lr=critic_lr,
                 enable_popart=popart,
                 clip_norm=clip_norm,
                 reward_scale=reward_scale)

    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))
    """Set up logging stuff only for a single worker"""
    saver = tf.train.Saver()

    episode_rewards_history = deque(maxlen=100)

    with U.single_threaded_session() as sess:
        """Prepare everything"""
        if restore:
            # Restore a previously trained model instead of starting fresh.
            saver = tf.train.import_meta_graph(model_directory +
                                               'model_fuzzy_new_3.meta')
            agent.restore_model(model_directory, saver, sess)
        else:
            agent.initialize(sess)
            sess.graph.finalize()
        """Agent Reset"""
        agent.reset()
        """Force calibration"""
        # env.robot_control.CalibFCforce()

        # Learning-rate decay factors kept for parity with train(); the
        # training step itself is commented out below.
        learning_epochs = 15
        delay_rate = np.power(10, 1 / learning_epochs)

        start_time = time.time()
        epoch_episode_rewards = []
        epoch_episode_steps = []
        mean_rollout_time = []
        mean_epoch_rewards = []
        mean_epoch_steps = []
        mean_epoch_time = []
        epoch_adaptive_distances = []

        epoch_actions = []
        epoch_qs = []

        epoch_episodes = 0
        total_episodes = 0
        successful_rate = []
        # Row 0 is a zero placeholder; per-episode force arrays are
        # concatenated below it after every cycle.
        Force_moments = np.zeros((1, 6))
        for epoch in range(nb_epochs):
            """Show the result for cycle 20 times and Save the model"""
            epoch_start_time = time.time()

            force_array = np.zeros((150, 6))
            for cycle in range(nb_epoch_cycles):
                """environment reset """
                agent.reset()
                obs = env.reset()
                episode_reward = 0.
                done = False
                rollout_start_time = time.time()
                forcemoments = []
                for t_rollout in range(nb_rollout_steps):
                    """Predict next action"""
                    action, q = agent.pi(obs,
                                         apply_noise=False,
                                         compute_Q=True)
                    assert action.shape[0] == env.action_dim
                    """scale for execution in env"""
                    new_obs, r, done, info = env.step(action, t_rollout)
                    # obs[0:3] are forces, obs[3:6] are moments --
                    # TODO confirm against the env implementation.
                    logger.info("The maximum force:" +
                                str(max(abs(new_obs[0:3]))) +
                                " The maximum moments:" +
                                str(max(abs(new_obs[3:6]))))
                    episode_reward += r

                    force_array[t_rollout, :] = new_obs[0:6]
                    """Plot the force and moments"""
                    if render:
                        forcemoments.append(new_obs[0:6])
                        # BUGFIX: the original also called
                        # Force_moments.append(new_obs[0:6]) here, which raises
                        # AttributeError (ndarray has no append).  force_array
                        # already records every step and is concatenated into
                        # Force_moments after the episode, so the call is
                        # simply dropped.
                        env.plot_force(forcemoments, t_rollout + 1)

                    epoch_actions.append(action)
                    epoch_qs.append(q)
                    agent.store_transition(obs, action, r, new_obs, done)
                    obs = new_obs
                    """Episode done and start pull the pegs step by step"""
                    if done:
                        logger.info('Peg-in-hole assembly done!!!')
                        epoch_episode_rewards.append(episode_reward)
                        episode_rewards_history.append(episode_reward)
                        epoch_episode_steps.append(t_rollout)
                        epoch_episodes += 1
                        # Retract the pegs until fully out or force limit hit.
                        pull_done = False
                        pull_safe = True
                        while pull_done is False and pull_safe:

                            pull_done, pull_safe = env.pull_up()  # True env

                        break
                    """Episode failed and start pull the pegs step by step"""
                    # NOTE(review): env.step appears to return info as a bool
                    # safety flag here, not a dict -- confirm against the env.
                    if info is False:
                        logger.info(
                            'Peg-in-hole assembly failed for the exceed force!!!'
                        )
                        pull_done = False
                        pull_safe = True
                        while pull_done is False and pull_safe:

                            pull_done, pull_safe = env.pull_up()  # True env

                        break

                total_episodes += 1
                rollout_time = time.time() - rollout_start_time
                mean_rollout_time.append(rollout_time)

                Force_moments = np.concatenate((Force_moments, force_array),
                                               axis=0)

                if t_rollout == nb_rollout_steps - 1:
                    logger.info(
                        'Peg-in-hole assembly failed for exceed steps!!!')
                    # BUGFIX: the original message had no {} placeholder, so
                    # the depth value was never interpolated into the log.
                    logger.info('The deepest position {}'.format(obs[8]))
                """train model for nb_train_steps times"""
                # for t_train in range(nb_train_steps):
                #     cl, al = agent.train(epoch_actor_lr, epoch_critic_lr)
                #     epoch_critic_losses.append(cl)
                #     epoch_actor_losses.append(al)
                #     agent.update_target_net()
                """Save the force figure"""
                env.save_figure(model_directory + str(cycle) +
                                'Force_figure.eps')
            """Save the memory data"""
            # agent.save_data()
            """Log stats."""
            epoch_train_duration = time.time() - epoch_start_time
            mean_epoch_time.append(epoch_train_duration)
            """Successful rate"""
            successful_rate.append(epoch_episodes / total_episodes)
            stats = agent.get_stats()
            combined_stats = {}

            for key in sorted(stats.keys()):
                combined_stats[key] = mpi_mean(stats[key])
            """Rollout statistics. compute the mean of the total nb_epoch_cycles"""
            combined_stats['rollout/rewards'] = mpi_mean(epoch_episode_rewards)
            mean_epoch_rewards.append(mpi_mean(epoch_episode_rewards))
            combined_stats['rollout/episode_steps'] = mpi_mean(
                epoch_episode_steps)
            mean_epoch_steps.append(mpi_mean(epoch_episode_steps))
            combined_stats['rollout/actions_mean'] = mpi_mean(epoch_actions)
            combined_stats['rollout/actions_std'] = mpi_std(epoch_actions)
            combined_stats['rollout/Q_mean'] = mpi_mean(epoch_qs)
            """Total statistics"""
            combined_stats['total/episodes'] = mpi_sum(epoch_episodes)
            combined_stats['total/epochs'] = epoch + 1
            """Save data"""
            pd_epoch_train_duration = pd.DataFrame(mean_epoch_time)
            pd_epoch_train_duration.to_csv(
                'data/large_duration_evaluation_before',
                sep=',',
                header=False,
                index=False)

            pd_rollout_time = pd.DataFrame(mean_rollout_time)
            pd_rollout_time.to_csv('data/large_rollout_time_evaluation_before',
                                   sep=',',
                                   header=False,
                                   index=False)

            pd_Force_and_moments = pd.DataFrame(Force_moments)
            pd_Force_and_moments.to_csv(
                "data/large_force_moments_evaluation_before",
                sep=',',
                header=False,
                index=False)

            re_rewards = pd.DataFrame(epoch_episode_rewards)
            re_rewards.to_csv("data/large_re_true_rewards_evaluation_before",
                              sep=',',
                              header=False,
                              index=False)

            re_steps = pd.DataFrame(epoch_episode_steps)
            re_steps.to_csv("data/large_re_true_steps_evaluation_before",
                            sep=',',
                            header=False,
                            index=False)

            for key in sorted(combined_stats.keys()):
                logger.record_tabular(key, combined_stats[key])

            logger.dump_tabular()
            logger.info('')
            logdir = logger.get_dir()
            if rank == 0 and logdir:
                # Persist environment state (if supported) for resumability.
                if hasattr(env, 'get_state'):
                    with open(os.path.join(logdir, 'env_state.pkl'),
                              'wb') as f:
                        pickle.dump(env.get_state(), f)
                if eval_env and hasattr(eval_env, 'get_state'):
                    with open(os.path.join(logdir, 'eval_env_state.pkl'),
                              'wb') as f:
                        pickle.dump(eval_env.get_state(), f)
# 예제 #3 (Example #3) — scraped page separator; vote count: 0
def train(env,
          nb_epochs,
          nb_epoch_cycles,
          reward_scale,
          param_noise,
          actor,
          critic,
          normalize_returns,
          normalize_observations,
          critic_l2_reg,
          actor_lr,
          critic_lr,
          action_noise,
          popart,
          gamma,
          clip_norm,
          nb_train_steps,
          nb_rollout_steps,
          nb_eval_steps,
          batch_size,
          memory,
          tau=0.01,
          eval_env=None,
          param_noise_adaption_interval=50,
          render=False,
          render_eval=False):
    """DDPG training loop for a robot-path task, with a scripted warm-start.

    Structure follows the standard baselines DDPG loop (rollout -> train ->
    evaluate -> log), with two task-specific twists visible below:
    * for early epochs the policy's action is replaced by a hand-coded
      proportional move toward a target pose (plus noise) unless the env
      reports a near-collision / near-limit, and
    * evaluation is disabled by overwriting ``eval_env`` with None inside
      the cycle loop (NOTE(review): looks like a debugging hack -- the
      ``eval_env`` parameter is effectively ignored).

    A checkpoint is saved under ``./model/`` after every epoch.
    """
    rank = MPI.COMM_WORLD.Get_rank()

    # DDPG emits actions in [-1, 1]; assume a symmetric action space and
    # rescale by the upper bound before stepping the env.
    assert (np.abs(env.action_space.low) == env.action_space.high
            ).all()  # we assume symmetric actions.
    max_action = env.action_space.high
    logger.info(
        'scaling actions by {} before executing in env'.format(max_action))
    agent = DDPG(actor,
                 critic,
                 memory,
                 env.observation_space.shape,
                 env.action_space.shape,
                 gamma=gamma,
                 tau=tau,
                 normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size,
                 action_noise=action_noise,
                 param_noise=param_noise,
                 critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr,
                 critic_lr=critic_lr,
                 enable_popart=popart,
                 clip_norm=clip_norm,
                 reward_scale=reward_scale)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    # Set up logging stuff only for a single worker.
    if rank == 0:
        saver = tf.train.Saver()
    else:
        saver = None

    step = 0
    episode = 0
    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)
    with U.single_threaded_session() as sess:

        #saver.restore(sess, "C:/Users/AN95540/Desktop/ICRA2019/Codes/RobotPath-ddpg/model/epoch_33.ckpt")
        # Prepare everything.
        agent.initialize(sess)
        #agent.continue_sess(sess)
        sess.graph.finalize()

        agent.reset()
        obs = env.reset()
        # Seed info so the warm-start check below works on the very first
        # step, before env.step has produced a real info dict.
        info = {'near_collision': False, 'near_limits': False}
        if eval_env is not None:
            eval_obs = eval_env.reset()
        done = False
        episode_reward = 0.
        episode_step = 0
        episodes = 0
        t = 0  # total environment steps across all epochs

        epoch = 0
        start_time = time.time()

        epoch_episode_rewards = []
        epoch_episode_steps = []
        epoch_episode_eval_rewards = []
        epoch_episode_eval_steps = []
        epoch_start_time = time.time()
        epoch_actions = []
        epoch_qs = []
        epoch_episodes = 0
        for epoch in range(nb_epochs):

            for cycle in range(nb_epoch_cycles):

                # Perform rollouts.
                for t_rollout in range(nb_rollout_steps):
                    print('epoch:', epoch)
                    print('cycle:', cycle)
                    print('rollout:', t_rollout)
                    if epoch > 5:
                        # NOTE(review): throttles stepping once the learned
                        # policy takes over -- presumably to slow rendering or
                        # real-robot motion; confirm with the env owner.
                        time.sleep(0.1)
                    # Predict next action.
                    action, q = agent.pi(obs, apply_noise=True, compute_Q=True)
                    assert action.shape == env.action_space.shape

                    # Execute next action.
                    #if rank == 0 and render:
                    #env.render()
                    assert max_action.shape == action.shape
                    #new_obs, r, done, info = env.step(max_action * action)  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                    #print(obs)
                    # Warm start: for the first 5 epochs, unless the env flags
                    # a near-collision or joint limit, override the policy
                    # with a scripted proportional move plus uniform noise.
                    # Assumes obs[0:6] is the current pose and obs[6:12] the
                    # goal pose -- TODO confirm against the env definition.
                    if info['near_collision'] or info[
                            'near_limits'] or epoch > 5:
                        new_obs, r, done, info = env.step(max_action * action)
                    else:
                        action = (obs[0:6] -
                                  obs[6:12]) * 0.1 + np.random.rand(6) * 0.1
                        new_obs, r, done, info = env.step(action)
                    t += 1
                    # Hard cap on episode length.
                    if episode_step > 1000:
                        done = True
                    #if rank == 0 and render:
                    #    env.render()
                    episode_reward += r
                    print('episode_reward:', episode_reward)
                    episode_step += 1

                    # Book-keeping.
                    epoch_actions.append(action)
                    epoch_qs.append(q)
                    agent.store_transition(obs, action, r, new_obs, done)
                    obs = new_obs

                    if done:
                        # Episode done.
                        print('**********END OF EPISODE**********')
                        epoch_episode_rewards.append(episode_reward)
                        episode_rewards_history.append(episode_reward)
                        epoch_episode_steps.append(episode_step)
                        episode_reward = 0.
                        episode_step = 0
                        epoch_episodes += 1
                        episodes += 1

                        agent.reset()
                        obs = env.reset()

                # Train.
                epoch_actor_losses = []
                epoch_critic_losses = []
                epoch_adaptive_distances = []
                for t_train in range(nb_train_steps):
                    #print('t_train:',t_train)
                    # Adapt param noise, if necessary.
                    if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0:
                        distance = agent.adapt_param_noise()
                        epoch_adaptive_distances.append(distance)

                    cl, al = agent.train()
                    epoch_critic_losses.append(cl)
                    epoch_actor_losses.append(al)
                    agent.update_target_net()

                # Evaluate.
                eval_episode_rewards = []
                eval_qs = []
                # NOTE(review): this clobbers the eval_env parameter, making
                # the evaluation block below (and the eval_env stats/pickling
                # further down) unreachable -- presumably a debug hack.
                eval_env = None
                if eval_env is not None:
                    eval_episode_reward = 0.
                    for t_rollout in range(nb_eval_steps):
                        eval_action, eval_q = agent.pi(eval_obs,
                                                       apply_noise=False,
                                                       compute_Q=True)
                        eval_obs, eval_r, eval_done, eval_info = eval_env.step(
                            max_action * eval_action
                        )  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                        if render_eval:
                            eval_env.render()
                        eval_episode_reward += eval_r

                        eval_qs.append(eval_q)
                        if eval_done:
                            eval_obs = eval_env.reset()
                            eval_episode_rewards.append(eval_episode_reward)
                            eval_episode_rewards_history.append(
                                eval_episode_reward)
                            eval_episode_reward = 0.

            mpi_size = MPI.COMM_WORLD.Get_size()
            # Log stats.
            # XXX shouldn't call np.mean on variable length lists
            duration = time.time() - start_time
            stats = agent.get_stats()
            combined_stats = stats.copy()
            combined_stats['epoch'] = epoch
            combined_stats['rollout/return'] = np.mean(epoch_episode_rewards)
            combined_stats['rollout/return_history'] = np.mean(
                episode_rewards_history)
            combined_stats['rollout/episode_steps'] = np.mean(
                epoch_episode_steps)
            combined_stats['rollout/actions_mean'] = np.mean(epoch_actions)
            combined_stats['rollout/Q_mean'] = np.mean(epoch_qs)
            combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses)
            combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses)
            combined_stats['train/param_noise_distance'] = np.mean(
                epoch_adaptive_distances)
            combined_stats['total/duration'] = duration
            combined_stats['total/steps_per_second'] = float(t) / float(
                duration)
            combined_stats['total/episodes'] = episodes
            combined_stats['rollout/episodes'] = epoch_episodes
            combined_stats['rollout/actions_std'] = np.std(epoch_actions)
            # Evaluation statistics.
            # NOTE(review): dead code while eval_env is forced to None above;
            # if re-enabled, the raw lists stored here would make as_scalar()
            # below raise.
            if eval_env is not None:
                combined_stats['eval/return'] = eval_episode_rewards
                combined_stats['eval/return_history'] = np.mean(
                    eval_episode_rewards_history)
                combined_stats['eval/Q'] = eval_qs
                combined_stats['eval/episodes'] = len(eval_episode_rewards)

            def as_scalar(x):
                # Coerce a size-1 ndarray or plain scalar to a scalar so the
                # stats can be packed into one array for the MPI allreduce.
                if isinstance(x, np.ndarray):
                    assert x.size == 1
                    return x[0]
                elif np.isscalar(x):
                    return x
                else:
                    raise ValueError('expected scalar, got %s' % x)

            # Sum stats across workers, then divide by world size -> mean.
            combined_stats_sums = MPI.COMM_WORLD.allreduce(
                np.array([as_scalar(x) for x in combined_stats.values()]))
            combined_stats = {
                k: v / mpi_size
                for (k, v) in zip(combined_stats.keys(), combined_stats_sums)
            }

            # Total statistics.
            combined_stats['total/epochs'] = epoch + 1
            combined_stats['total/steps'] = t

            # Checkpoint every epoch (rank check omitted in the original --
            # every worker writes; saver is None on non-zero ranks, so this
            # only works single-process).
            save_path = saver.save(sess,
                                   "./model/epoch_" + str(epoch) + ".ckpt")
            print(save_path)

            for key in sorted(combined_stats.keys()):
                logger.record_tabular(key, combined_stats[key])
            logger.dump_tabular()
            logger.info('')
            logdir = logger.get_dir()
            if rank == 0 and logdir:
                # Persist environment state (if supported) for resumability.
                if hasattr(env, 'get_state'):
                    with open(os.path.join(logdir, 'env_state.pkl'),
                              'wb') as f:
                        pickle.dump(env.get_state(), f)
                if eval_env and hasattr(eval_env, 'get_state'):
                    with open(os.path.join(logdir, 'eval_env_state.pkl'),
                              'wb') as f:
                        pickle.dump(eval_env.get_state(), f)
# 예제 #4 (Example #4) — scraped page separator; vote count: 0
def train(env,
          nb_epochs,
          nb_epoch_cycles,
          render_eval,
          reward_scale,
          render,
          param_noise,
          actor,
          critic,
          normalize_returns,
          normalize_observations,
          critic_l2_reg,
          actor_lr,
          critic_lr,
          action_noise,
          popart,
          gamma,
          clip_norm,
          nb_train_steps,
          nb_rollout_steps,
          nb_eval_steps,
          batch_size,
          memory,
          tau=0.01,
          eval_env=None,
          controller=None,
          param_noise_adaption_interval=50,
          restore=True):
    """Train a DDPG agent whose actions parameterize an external controller.

    Unlike vanilla DDPG, the policy output is not executed directly: each
    action is scaled by ``max_action``, handed to ``controller`` via
    ``assign_param``, and the controller then computes the low-level
    command (``act``) that is actually stepped through the environment.

    Args:
        env: training environment; a symmetric action space is assumed.
        eval_env: optional evaluation environment.
        controller: object exposing ``assign_param(params)`` and
            ``control(obs)``; receives the scaled policy output.
        restore: if True, restore weights from ``savingModelPath`` (a
            name defined elsewhere in this module — presumably a global
            checkpoint directory; verify) instead of starting fresh.
        All remaining arguments are forwarded to the ``DDPG`` constructor
        or control the rollout / training / evaluation loop lengths.
    """
    rank = MPI.COMM_WORLD.Get_rank()

    # We assume symmetric actions, so scaling by `high` alone is enough.
    assert (np.abs(env.action_space.low) == env.action_space.high).all()
    max_action = env.action_space.high
    logger.info(
        'scaling actions by {} before executing in env'.format(max_action))
    agent = DDPG(actor,
                 critic,
                 memory,
                 env.observation_space.shape,
                 env.action_space.shape,
                 gamma=gamma,
                 tau=tau,
                 normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size,
                 action_noise=action_noise,
                 param_noise=param_noise,
                 critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr,
                 critic_lr=critic_lr,
                 enable_popart=popart,
                 clip_norm=clip_norm,
                 reward_scale=reward_scale,
                 action_range=(0., 1.))

    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    # Set up checkpointing only for a single worker.
    if rank == 0:
        saver = tf.train.Saver(max_to_keep=600)
    else:
        saver = None

    step = 0
    episode = 0
    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)
    with U.single_threaded_session() as sess:
        # Prepare everything.
        if restore:
            # NOTE(review): this rebinds `saver` on *every* rank, so with
            # restore=True all workers — not just rank 0 — will also save
            # checkpoints below.  Confirm that is intended.
            logger.info("Restoring from saved model")
            saver = tf.train.import_meta_graph(savingModelPath +
                                               "ddpg_test_model.meta")
            saver.restore(sess, tf.train.latest_checkpoint(savingModelPath))
        else:
            logger.info("Starting from scratch!")
            sess.run(tf.global_variables_initializer())
        agent.initialize(sess)
        sess.graph.finalize()

        agent.reset()
        obs = env.reset()
        if eval_env is not None:
            eval_obs = eval_env.reset()
        done = False
        episode_reward = 0.
        episode_step = 0
        episodes = 0
        t = 0  # total environment steps across all epochs

        epoch = 0
        start_time = time.time()

        epoch_episode_rewards = []
        epoch_episode_steps = []
        epoch_episode_eval_rewards = []
        epoch_episode_eval_steps = []
        epoch_start_time = time.time()
        epoch_actions = []  # raw policy outputs (controller params, unscaled)
        epoch_act = []      # low-level commands produced by the controller
        epoch_qs = []
        epoch_episodes = 0
        for epoch in range(nb_epochs):
            for cycle in range(nb_epoch_cycles):
                # Perform rollouts.
                for t_rollout in range(nb_rollout_steps):
                    # Predict next action (controller parameters).
                    action, q = agent.pi(obs, apply_noise=True, compute_Q=True)
                    assert action.shape == env.action_space.shape

                    # Execute next action.
                    if rank == 0 and render:
                        env.render()
                    assert max_action.shape == action.shape
                    # Scale for execution in env (as far as DDPG is
                    # concerned, every action is in [-1, 1]); the controller
                    # turns the scaled parameters into a low-level command.
                    controller.assign_param(max_action * action)
                    act = controller.control(obs)
                    new_obs, r, done, info = env.step(act)
                    t += 1
                    if rank == 0 and render:
                        env.render()
                    episode_reward += r
                    episode_step += 1

                    # Book-keeping.
                    epoch_actions.append(action)
                    epoch_act.append(act)
                    epoch_qs.append(q)
                    agent.store_transition(obs, action, r, new_obs, done)
                    obs = new_obs

                    if done:
                        # Episode done.
                        epoch_episode_rewards.append(episode_reward)
                        episode_rewards_history.append(episode_reward)
                        epoch_episode_steps.append(episode_step)
                        episode_reward = 0.
                        episode_step = 0
                        epoch_episodes += 1
                        episodes += 1

                        agent.reset()
                        obs = env.reset()

                # Train.
                epoch_actor_losses = []
                epoch_critic_losses = []
                epoch_adaptive_distances = []
                for t_train in range(nb_train_steps):
                    # Adapt param noise, if necessary.
                    if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0:
                        distance = agent.adapt_param_noise()
                        epoch_adaptive_distances.append(distance)

                    cl, al = agent.train()
                    epoch_critic_losses.append(cl)
                    epoch_actor_losses.append(al)
                    agent.update_target_net()

                # Evaluate (noise-free policy on the eval environment).
                eval_episode_rewards = []
                eval_qs = []
                if eval_env is not None:
                    eval_episode_reward = 0.
                    for t_rollout in range(nb_eval_steps):
                        eval_action, eval_q = agent.pi(eval_obs,
                                                       apply_noise=False,
                                                       compute_Q=True)
                        controller.assign_param(max_action * eval_action)
                        eval_act = controller.control(eval_obs)
                        eval_obs, eval_r, eval_done, eval_info = eval_env.step(
                            eval_act)
                        if render_eval:
                            eval_env.render()
                        eval_episode_reward += eval_r
                        eval_qs.append(eval_q)
                        if eval_done:
                            eval_obs = eval_env.reset()
                            eval_episode_rewards.append(eval_episode_reward)
                            eval_episode_rewards_history.append(
                                eval_episode_reward)
                            eval_episode_reward = 0.
                if saver is not None:
                    logger.info("saving the trained model")
                    start_time_save = time.time()
                    # BUGFIX: the previous directory index
                    # (epoch+1)*(cycle+1) was not unique (e.g. epoch 0 /
                    # cycle 1 and epoch 1 / cycle 0 both map to 2), so later
                    # checkpoints silently overwrote earlier ones.  Use a
                    # strictly increasing global cycle index instead.
                    saver.save(
                        sess, savingModelPath +
                        str(epoch * nb_epoch_cycles + cycle + 1) + '/' +
                        "ddpg_test_model")
                    logger.info('runtime saving: {}s'.format(time.time() -
                                                             start_time_save))
            mpi_size = MPI.COMM_WORLD.Get_size()
            # Log stats.
            duration = time.time() - start_time
            stats = agent.get_stats()
            combined_stats = stats.copy()
            combined_stats['rollout/return'] = np.mean(epoch_episode_rewards)
            combined_stats['rollout/return_history'] = np.mean(
                episode_rewards_history)
            combined_stats['rollout/episode_steps'] = np.mean(
                epoch_episode_steps)
            # Mean controller parameters over the epoch, reported in env
            # units (scaled by max_action).  Assumes at least two action
            # dimensions (KTH, KE) — TODO confirm against the controller.
            # Guarded against an empty rollout (nb_rollout_steps == 0),
            # which previously raised ZeroDivisionError.
            if epoch_actions:
                mean_params = max_action * np.mean(epoch_actions, axis=0)
                KTH_means = mean_params[0]
                KE_means = mean_params[1]
            else:
                KTH_means = np.nan
                KE_means = np.nan
            combined_stats['rollout/KTH_mean'] = KTH_means
            combined_stats['rollout/KE_mean'] = KE_means
            combined_stats['rollout/Q_mean'] = np.mean(epoch_qs)
            combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses)
            combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses)
            combined_stats['train/param_noise_distance'] = np.mean(
                epoch_adaptive_distances)
            combined_stats['total/duration'] = duration
            combined_stats['total/steps_per_second'] = float(t) / float(
                duration)
            combined_stats['total/episodes'] = episodes
            combined_stats['rollout/episodes'] = epoch_episodes
            combined_stats['rollout/actions_std'] = np.std(epoch_actions)
            combined_stats['rollout/act_mean'] = np.mean(epoch_act)
            combined_stats['rollout/act_std'] = np.std(epoch_act)
            # Evaluation statistics.
            if eval_env is not None:
                combined_stats['eval/return'] = np.mean(eval_episode_rewards)
                combined_stats['eval/return_history'] = np.mean(
                    eval_episode_rewards_history)
                combined_stats['eval/Q_mean'] = np.mean(eval_qs)
                combined_stats['eval/episodes'] = len(eval_episode_rewards)

            def as_scalar(x):
                """Coerce a size-1 ndarray or plain scalar to a scalar."""
                if isinstance(x, np.ndarray):
                    assert x.size == 1
                    return x[0]
                elif np.isscalar(x):
                    return x
                # BUGFIX: re-enable the error (it had been commented out);
                # falling through returned None, which would poison the MPI
                # allreduce below with a non-numeric array element.
                raise ValueError('expected scalar, got %s' % x)

            # Average every stat across MPI workers.
            combined_stats_sums = MPI.COMM_WORLD.allreduce(
                np.array([as_scalar(x) for x in combined_stats.values()]))
            combined_stats = {
                k: v / mpi_size
                for (k, v) in zip(combined_stats.keys(), combined_stats_sums)
            }

            # Total statistics.
            combined_stats['total/epochs'] = epoch + 1
            combined_stats['total/steps'] = t

            for key in sorted(combined_stats.keys()):
                logger.record_tabular(key, combined_stats[key])
            logger.dump_tabular()
            logger.info('')
            logdir = logger.get_dir()
            if rank == 0 and logdir:
                if hasattr(env, 'get_state'):
                    with open(os.path.join(logdir, 'env_state.pkl'),
                              'wb') as f:
                        pickle.dump(env.get_state(), f)
                if eval_env and hasattr(eval_env, 'get_state'):
                    with open(os.path.join(logdir, 'eval_env_state.pkl'),
                              'wb') as f:
                        pickle.dump(eval_env.get_state(), f)