Example #1
File: TD3.py    Project: phymucs/HTRPO
import numpy as np
import torch
from collections import deque

# Note: explained_variance and adjust_learning_rate are helper functions defined elsewhere in the project.


def run_td3_train(env, agent, max_timesteps, logger, log_interval):

    timestep_counter = 0
    total_updates = max_timesteps // env.num_envs
    epinfobuf = deque(maxlen=100)
    observations = env.reset()

    loss_a = 0
    loss_c = 0
    explained_var = 0

    while True:

        # collection of training data
        mb_obs, mb_as, mb_dones, mb_rs, mb_obs_ = [], [], [], [], []
        epinfos = []
        for i in range(0, agent.nsteps, env.num_envs):
            observations = torch.Tensor(observations)
            if timestep_counter > agent.learn_start_step:
                actions = agent.choose_action(observations)
                actions = actions.cpu().numpy().clip(env.action_space.low,
                                                     env.action_space.high)
            else:
                actions = []
                for _ in range(env.num_envs):
                    actions.append(env.action_space.sample())
                actions = np.asarray(actions, dtype=np.float32)

            observations = observations.cpu().numpy()
            observations_, rewards, dones, infos = env.step(actions)
            for info in infos:
                maybeepinfo = info.get('episode')
                if maybeepinfo: epinfos.append(maybeepinfo)

            mb_obs.append(observations)
            mb_as.append(actions)
            mb_rs.append(rewards)
            mb_obs_.append(observations_)
            mb_dones.append(dones)

            observations = observations_

        epinfobuf.extend(epinfos)

        def reshape_data(arr):
            s = arr.shape
            return arr.reshape(s[0] * s[1], *s[2:])

        mb_obs = reshape_data(np.asarray(mb_obs, dtype=np.float32))
        mb_rs = reshape_data(np.asarray(mb_rs, dtype=np.float32))
        mb_as = reshape_data(np.asarray(mb_as))
        mb_dones = reshape_data(np.asarray(mb_dones, dtype=np.uint8))
        mb_obs_ = reshape_data(np.asarray(mb_obs_, dtype=np.float32))

        # store transition
        transition = {
            'state': mb_obs if mb_obs.ndim == 2 else np.expand_dims(mb_obs, 1),
            'action': mb_as if mb_as.ndim == 2 else np.expand_dims(mb_as, 1),
            'reward': mb_rs if mb_rs.ndim == 2 else np.expand_dims(mb_rs, 1),
            'next_state': mb_obs_ if mb_obs_.ndim == 2 else np.expand_dims(mb_obs_, 1),
            'done': mb_dones if mb_dones.ndim == 2 else np.expand_dims(mb_dones, 1),
        }
        agent.store_transition(transition)

        # training controller
        timestep_counter += agent.nsteps
        if timestep_counter >= max_timesteps:
            break

        if timestep_counter > agent.batch_size:
            # Update observation and reward mean and var.
            if agent.norm_ob:
                agent.ob_mean, agent.ob_var = env.ob_rms.mean, env.ob_rms.var
            if agent.norm_rw:
                agent.rw_mean, agent.rw_var = env.ret_rms.mean, env.ret_rms.var
            for _ in range(agent.nsteps):
                agent.learn()

                # adjust learning rate for policy and value function
                # decay_coef = 1 - agent.learn_step_counter / total_updates
                # adjust_learning_rate(agent.optimizer_a, original_lr=agent.lr, decay_coef=decay_coef)
                # adjust_learning_rate(agent.optimizer_c, original_lr=agent.lrv, decay_coef=decay_coef)

                explained_var += 0.5 * explained_variance(agent.Qe1, agent.Qt)
                explained_var += 0.5 * explained_variance(agent.Qe2, agent.Qt)
                loss_a += agent.loss_a.item()
                loss_c += agent.loss_c.item()
                if agent.learn_step_counter % log_interval == 0:
                    print(
                        "------------------log information------------------")
                    print("total_timesteps:".ljust(20) + str(timestep_counter))
                    print("iterations:".ljust(20) +
                          str(agent.learn_step_counter) + " / " +
                          str(int(total_updates)))
                    print("explained_var:".ljust(20) +
                          str(explained_var / log_interval))
                    logger.add_scalar("explained_var/train",
                                      explained_var / log_interval,
                                      timestep_counter)
                    print("episode_len:".ljust(20) + "{:.1f}".format(
                        np.mean([epinfo['l'] for epinfo in epinfobuf])))
                    print("episode_rew:".ljust(20) +
                          str(np.mean([epinfo['r'] for epinfo in epinfobuf])))
                    logger.add_scalar(
                        "episode_reward/train",
                        np.mean([epinfo['r'] for epinfo in epinfobuf]),
                        timestep_counter)
                    print("max_episode_rew:".ljust(20) +
                          str(np.max([epinfo['r'] for epinfo in epinfobuf])))
                    print("min_episode_rew:".ljust(20) +
                          str(np.min([epinfo['r'] for epinfo in epinfobuf])))
                    print("loss_a:".ljust(20) + str(loss_a / log_interval))
                    logger.add_scalar("actor_loss/train",
                                      loss_a / log_interval, timestep_counter)
                    print("loss_c:".ljust(20) + str(loss_c / log_interval))
                    logger.add_scalar("critic_loss/train",
                                      loss_c / log_interval, timestep_counter)
                    print("action_noise_std:".ljust(20) + str(agent.noise))

                    explained_var = 0
                    loss_a = 0
                    loss_c = 0

    return agent
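
The reshape_data helper above flattens the (collection_steps, num_envs, ...) rollout arrays into one flat batch of transitions before they are stored in the replay buffer; the on-policy loops below additionally swap the step and environment axes first so that each environment's trajectory stays contiguous. A minimal standalone sketch of the off-policy flattening (shapes here are illustrative, not taken from the project):

import numpy as np

# Illustration only: 4 collection steps across 2 parallel envs, 3-dimensional observations.
nsteps, num_envs, obs_dim = 4, 2, 3
mb_obs = np.random.randn(nsteps, num_envs, obs_dim).astype(np.float32)

def reshape_data(arr):
    # Merge the step and env axes so the buffer sees one flat batch of transitions.
    s = arr.shape
    return arr.reshape(s[0] * s[1], *s[2:])

print(reshape_data(mb_obs).shape)  # (8, 3): nsteps * num_envs transitions
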
Example #2
def run_pg_train(env, agent, max_timesteps, logger):
    timestep_counter = 0
    total_updates = max_timesteps // agent.nsteps
    epinfobuf = deque(maxlen=100)

    while True:
        mb_obs, mb_rewards, mb_actions, mb_dones, mb_logpacs, mb_obs_, \
            mb_mus, mb_sigmas, mb_distris = [], [], [], [], [], [], [], [], []
        epinfos = []
        observations = env.reset()
        for i in range(0, agent.nsteps, env.num_envs):
            observations = torch.Tensor(observations)
            if not agent.dicrete_action:
                actions, mus, logsigmas, sigmas = agent.choose_action(
                    observations)
                logp = agent.compute_logp(mus, logsigmas, sigmas, actions)
                mus = mus.cpu().numpy()
                sigmas = sigmas.cpu().numpy()
                mb_mus.append(mus)
                mb_sigmas.append(sigmas)
            else:
                actions, distris = agent.choose_action(observations)
                logp = agent.compute_logp(distris, actions)
                distris = distris.cpu().numpy()
                mb_distris.append(distris)
            observations = observations.cpu().numpy()
            actions = actions.cpu().numpy()
            logp = logp.cpu().numpy()
            observations_, rewards, dones, infos = env.step(actions)
            for info in infos:
                maybeepinfo = info.get('episode')
                if maybeepinfo: epinfos.append(maybeepinfo)
            mb_obs.append(observations)
            mb_actions.append(actions)
            mb_logpacs.append(logp)
            mb_dones.append(dones.astype(np.uint8))
            mb_rewards.append(rewards)
            mb_obs_.append(observations_)
            observations = observations_

        epinfobuf.extend(epinfos)
        # Mark all final states as done to prevent incorrect estimation of returns and advantages.
        # done flag:
        #      0: not done and not the final state
        #      1: really done (episode terminated)
        #      2: not done but the final state of the rollout
        mb_dones[-1][np.where(mb_dones[-1] == 0)] = 2

        def reshape_data(arr):
            s = arr.shape
            return arr.swapaxes(0, 1).reshape(s[0] * s[1], *s[2:])

        mb_obs = reshape_data(np.asarray(mb_obs, dtype=np.float32))
        mb_rewards = reshape_data(np.asarray(mb_rewards, dtype=np.float32))
        mb_actions = reshape_data(np.asarray(mb_actions))
        mb_logpacs = reshape_data(np.asarray(mb_logpacs, dtype=np.float32))
        mb_dones = reshape_data(np.asarray(mb_dones, dtype=np.uint8))
        mb_obs_ = reshape_data(np.asarray(mb_obs_, dtype=np.float32))

        assert mb_obs.ndim <= 2 and mb_rewards.ndim <= 2 and mb_actions.ndim <= 2 and \
               mb_logpacs.ndim <= 2 and mb_dones.ndim <= 2 and mb_obs_.ndim <= 2, \
            "databuffer only supports 1-D data's batch."

        if not agent.dicrete_action:
            mb_mus = reshape_data(np.asarray(mb_mus, dtype=np.float32))
            mb_sigmas = reshape_data(np.asarray(mb_sigmas, dtype=np.float32))
            assert mb_mus.ndim <= 2 and mb_sigmas.ndim <= 2, "databuffer only supports 1-D data's batch."
        else:
            mb_distris = reshape_data(np.asarray(mb_distris, dtype=np.float32))
            assert mb_distris.ndim <= 2, "databuffer only supports 1-D data's batch."

        # store transition
        transition = {
            'state': mb_obs if mb_obs.ndim == 2 else np.expand_dims(mb_obs, 1),
            'action': mb_actions if mb_actions.ndim == 2 else np.expand_dims(mb_actions, 1),
            'reward': mb_rewards if mb_rewards.ndim == 2 else np.expand_dims(mb_rewards, 1),
            'next_state': mb_obs_ if mb_obs_.ndim == 2 else np.expand_dims(mb_obs_, 1),
            'done': mb_dones if mb_dones.ndim == 2 else np.expand_dims(mb_dones, 1),
            'logpac': mb_logpacs if mb_logpacs.ndim == 2 else np.expand_dims(mb_logpacs, 1),
        }
        if not agent.dicrete_action:
            transition['mu'] = mb_mus if mb_mus.ndim == 2 else np.expand_dims(mb_mus, 1)
            transition['sigma'] = mb_sigmas if mb_sigmas.ndim == 2 else np.expand_dims(mb_sigmas, 1)
        else:
            transition['distri'] = mb_distris if mb_distris.ndim == 2 else np.expand_dims(mb_distris, 1)
        agent.store_transition(transition)

        # agent learning step
        agent.learn()

        # training controller
        timestep_counter += agent.nsteps
        if timestep_counter >= max_timesteps:
            break

        # adjust learning rate for policy and value function
        decay_coef = 1 - agent.learn_step_counter / total_updates
        adjust_learning_rate(agent.optimizer,
                             original_lr=agent.lr,
                             decay_coef=decay_coef)
        if agent.value_type is not None:
            adjust_learning_rate(agent.v_optimizer,
                                 original_lr=agent.lr_v,
                                 decay_coef=decay_coef)

        print("------------------log information------------------")
        print("total_timesteps:".ljust(20) + str(timestep_counter))
        print("iterations:".ljust(20) + str(agent.learn_step_counter) + " / " +
              str(int(total_updates)))
        if agent.value_type is not None:
            explained_var = explained_variance(agent.V.cpu().numpy(),
                                               agent.esti_R.cpu().numpy())
            print("explained_var:".ljust(20) + str(explained_var))
            logger.add_scalar("explained_var/train", explained_var,
                              timestep_counter)
        print("episode_len:".ljust(20) +
              "{:.1f}".format(np.mean([epinfo['l'] for epinfo in epinfobuf])))
        print("episode_rew:".ljust(20) +
              str(np.mean([epinfo['r'] for epinfo in epinfobuf])))
        logger.add_scalar("episode_reward/train",
                          np.mean([epinfo['r'] for epinfo in epinfobuf]),
                          timestep_counter)
        print("mean_kl:".ljust(20) + str(agent.cur_kl))
        logger.add_scalar("mean_kl/train", agent.cur_kl, timestep_counter)
        print("policy_ent:".ljust(20) + str(agent.policy_ent))
        logger.add_scalar("policy_ent/train", agent.policy_ent,
                          timestep_counter)
        print("policy_loss:".ljust(20) + str(agent.policy_loss))
        logger.add_scalar("policy_loss/train", agent.policy_loss,
                          timestep_counter)
        print("value_loss:".ljust(20) + str(agent.value_loss))
        logger.add_scalar("value_loss/train", agent.value_loss,
                          timestep_counter)
    return agent
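
adjust_learning_rate is a project helper that is not shown on this page; from its call sites it rescales the optimizer's learning rate by decay_coef so that the policy and value-function learning rates decay linearly over training. A minimal sketch of such a helper, assuming it simply overwrites each parameter group's lr (an assumption, not the project's actual implementation):

def adjust_learning_rate(optimizer, original_lr, decay_coef):
    # Assumed behaviour: set every parameter group of a torch.optim optimizer
    # to original_lr * decay_coef, i.e. a linear decay schedule.
    for param_group in optimizer.param_groups:
        param_group['lr'] = original_lr * decay_coef
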
Example #3
def run_htrpo_train(env, agent, max_timesteps, logger, eval_interval=None, num_evals=5, render=False):
    timestep_counter = 0
    total_updates = max_timesteps // agent.nsteps
    epinfobuf = deque(maxlen=100)
    success_history = deque(maxlen=100)
    ep_num = 0

    if eval_interval:
        eval_ret, eval_success = agent.eval_brain(env, render=render, eval_num=num_evals)
        print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
        print("eval_ep_rew:".ljust(20) + str(np.mean(eval_ret)))
        print("eval_suc_rate:".ljust(20) + str(np.mean(eval_success)))
        print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
        logger.add_scalar("episode_reward/eval", np.mean(eval_ret), timestep_counter)
        logger.add_scalar("success_rate/eval", np.mean(eval_success), timestep_counter)

    while True:
        mb_obs, mb_rewards, mb_actions, mb_dones, mb_logpacs, mb_obs_, \
            mb_mus, mb_sigmas, mb_distris = [], [], [], [], [], [], [], [], []
        mb_dg, mb_ag = [], []
        epinfos = []
        successes = []
        obs_dict = env.reset()
        # env.render()

        for i in range(0, agent.nsteps, env.num_envs):
            for key in obs_dict.keys():
                obs_dict[key] = torch.Tensor(obs_dict[key])

            if not agent.dicrete_action:
                actions, mus, logsigmas, sigmas = agent.choose_action(obs_dict["observation"],
                                                                      other_data=obs_dict["desired_goal"])
                logp = agent.compute_logp(mus, logsigmas, sigmas, actions)
                mus = mus.cpu().numpy()
                sigmas = sigmas.cpu().numpy()
                mb_mus.append(mus)
                mb_sigmas.append(sigmas)
            else:
                actions, distris = agent.choose_action(obs_dict["observation"],
                                                       other_data=obs_dict["desired_goal"])
                logp = agent.compute_logp(distris, actions)
                distris = distris.cpu().numpy()
                mb_distris.append(distris)
            observations = obs_dict['observation'].cpu().numpy()
            actions = actions.cpu().numpy()
            logp = logp.cpu().numpy()

            # Random-action exploration branch; currently disabled (threshold is 0.0).
            if np.random.rand() < 0.0:
                actions = np.concatenate([np.expand_dims(env.action_space.sample(), axis=0)
                                          for i in range(env.num_envs)], axis=0)
                obs_dict_, rewards, dones, infos = env.step(actions)
            else:
                obs_dict_, rewards, dones, infos = env.step(actions)

            # if timestep_counter > 350000:
            # env.render()

            mb_obs.append(observations)
            mb_actions.append(actions)
            mb_logpacs.append(logp)
            mb_dones.append(dones.astype(np.uint8))
            mb_rewards.append(rewards)
            mb_obs_.append(obs_dict_['observation'].copy())
            mb_dg.append(obs_dict_['desired_goal'].copy())
            mb_ag.append(obs_dict_['achieved_goal'].copy())

            for e, info in enumerate(infos):
                if dones[e]:
                    epinfos.append(info.get('episode'))
                    successes.append(info.get('is_success'))
                    for k in obs_dict_.keys():
                        obs_dict_[k][e] = info.get('new_obs')[k]
                    ep_num += 1

            obs_dict = obs_dict_

        epinfobuf.extend(epinfos)
        success_history.extend(successes)

        # Mark all final states as done to prevent incorrect estimation of returns and advantages.
        # done flag:
        #      0: not done and not the final state
        #      1: really done (episode terminated)
        #      2: not done but the final state of the rollout
        ep_num += (mb_dones[-1] == 0).sum()
        mb_dones[-1][np.where(mb_dones[-1] == 0)] = 2

        def reshape_data(arr):
            s = arr.shape
            return arr.swapaxes(0, 1).reshape(s[0] * s[1], *s[2:])

        mb_obs = reshape_data(np.asarray(mb_obs, dtype=np.float32))
        mb_rewards = reshape_data(np.asarray(mb_rewards, dtype=np.float32))
        mb_actions = reshape_data(np.asarray(mb_actions))
        mb_logpacs = reshape_data(np.asarray(mb_logpacs, dtype=np.float32))
        mb_dones = reshape_data(np.asarray(mb_dones, dtype=np.uint8))
        mb_obs_ = reshape_data(np.asarray(mb_obs_, dtype=np.float32))
        mb_ag = reshape_data(np.asarray(mb_ag, dtype=np.float32))
        mb_dg = reshape_data(np.asarray(mb_dg, dtype=np.float32))

        assert mb_rewards.ndim <= 2 and mb_actions.ndim <= 2 and \
               mb_logpacs.ndim <= 2 and mb_dones.ndim <= 2, \
            "databuffer only supports 1-D data's batch."

        if not agent.dicrete_action:
            mb_mus = reshape_data(np.asarray(mb_mus, dtype=np.float32))
            mb_sigmas = reshape_data(np.asarray(mb_sigmas, dtype=np.float32))
            assert mb_mus.ndim <= 2 and mb_sigmas.ndim <= 2, "databuffer only supports 1-D data's batch."
        else:
            mb_distris = reshape_data(np.asarray(mb_distris, dtype=np.float32))
            assert mb_distris.ndim <= 2, "databuffer only supports 1-D data's batch."

        # store transition
        transition = {
            'state': mb_obs if mb_obs.ndim == 2 or mb_obs.ndim == 4 else np.expand_dims(mb_obs, 1),
            'action': mb_actions if mb_actions.ndim == 2 else np.expand_dims(mb_actions, 1),
            'reward': mb_rewards if mb_rewards.ndim == 2 else np.expand_dims(mb_rewards, 1),
            'next_state': mb_obs_ if mb_obs_.ndim == 2 or mb_obs_.ndim == 4 else np.expand_dims(mb_obs_, 1),
            'done': mb_dones if mb_dones.ndim == 2 else np.expand_dims(mb_dones, 1),
            'logpac': mb_logpacs if mb_logpacs.ndim == 2 else np.expand_dims(mb_logpacs, 1),
            'other_data': {
                'desired_goal': mb_dg if mb_dg.ndim == 2 else np.expand_dims(mb_dg, 1),
                'achieved_goal': mb_ag if mb_ag.ndim == 2 else np.expand_dims(mb_ag, 1),
            }
        }
        if not agent.dicrete_action:
            transition['mu'] = mb_mus if mb_mus.ndim == 2 else np.expand_dims(mb_mus, 1)
            transition['sigma'] = mb_sigmas if mb_sigmas.ndim == 2 else np.expand_dims(mb_sigmas, 1)
        else:
            transition['distri'] = mb_distris if mb_distris.ndim == 2 else np.expand_dims(mb_distris, 1)
        agent.store_transition(transition)

        # agent learning step
        agent.learn()

        # training controller
        timestep_counter += agent.nsteps
        if timestep_counter > max_timesteps:
            break

        print("------------------log information------------------")
        print("total_timesteps:".ljust(20) + str(timestep_counter))
        print("valid_ep_ratio:".ljust(20) + "{:.3f}".format(agent.n_valid_ep / ep_num))
        logger.add_scalar("valid_ep_ratio/train", agent.n_valid_ep / ep_num, timestep_counter)
        if agent.n_valid_ep > 0:
            print("iterations:".ljust(20) + str(agent.learn_step_counter) + " / " + str(int(total_updates)))
            if agent.value_type is not None:
                explained_var = explained_variance(agent.V.cpu().numpy(), agent.esti_R.cpu().numpy())
                print("explained_var:".ljust(20) + str(explained_var))
                logger.add_scalar("explained_var/train", explained_var, timestep_counter)
            print("episode_len:".ljust(20) + "{:.1f}".format(np.mean([epinfo['l'] for epinfo in epinfobuf])))
            rew = np.mean([epinfo['r'] for epinfo in epinfobuf]) + agent.max_steps
            print("episode_rew:".ljust(20) + str(rew))
            logger.add_scalar("episode_reward/train", rew, timestep_counter)
            print("success_rate:".ljust(20) + "{:.3f}".format(100 * np.mean(success_history)) + "%")
            logger.add_scalar("success_rate/train", np.mean(success_history), timestep_counter)
            print("mean_kl:".ljust(20) + str(agent.cur_kl))
            logger.add_scalar("mean_kl/train", agent.cur_kl, timestep_counter)
            print("policy_ent:".ljust(20) + str(agent.policy_ent))
            logger.add_scalar("policy_ent/train", agent.policy_ent, timestep_counter)
            print("value_loss:".ljust(20) + str(agent.value_loss))
            logger.add_scalar("value_loss/train", agent.value_loss, timestep_counter)
            print("actual_imprv:".ljust(20) + "{:.5f}".format(agent.improvement))
            logger.add_scalar("actual_imprv/train", agent.improvement, timestep_counter)
            print("exp_imprv:".ljust(20) + "{:.5f}".format(agent.expected_improvement))
            logger.add_scalar("exp_imprv/train", agent.expected_improvement, timestep_counter)
            ep_num = 0
        else:
            print("No valid episode was collected. Policy has not been updated.")

        if eval_interval and timestep_counter % eval_interval == 0:
            agent.save_model("output/models/HTRPO")
            eval_ret, eval_success = agent.eval_brain(env, render=render, eval_num=num_evals)
            print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
            print("eval_ep_rew:".ljust(20) + str(np.mean(eval_ret)))
            print("eval_suc_rate:".ljust(20) + str(np.mean(eval_success)))
            print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
            logger.add_scalar("episode_reward/eval", np.mean(eval_ret), timestep_counter)
            logger.add_scalar("success_rate/eval", np.mean(eval_success), timestep_counter)

    return agent
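
explained_variance, used for the critic diagnostics in all three training loops, is also a project helper. It is commonly defined, as in OpenAI Baselines, as 1 - Var(y - y_pred) / Var(y); a hedged sketch under that assumption (not necessarily the project's exact implementation):

import numpy as np

def explained_variance(ypred, y):
    # Assumed Baselines-style definition:
    #   1 -> perfect prediction, 0 -> no better than predicting the mean,
    #   negative -> worse than predicting the mean.
    ypred, y = np.asarray(ypred).ravel(), np.asarray(y).ravel()
    vary = np.var(y)
    return np.nan if vary == 0 else 1.0 - np.var(y - ypred) / vary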