def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)
        batch_size: number of episodes per policy training batch
    """

    killer = GracefulKiller()
    env, obs_dim, act_dim = init_gym(env_name, False)
    if time_state:
        obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    now = datetime.utcnow().strftime(
        "%b-%d_%H-%M-%S")  # create unique directories
    logger = Logger(logname=env_name, now=now)

    scaler = Scaler(obs_dim, env_name)
    val_func = NNValueFunction(obs_dim, env_name, True)
    arg = [obs_dim, act_dim, kl_targ, time_state, env_name]
    policy = Policy(obs_dim, act_dim, kl_targ, env_name, True)

    episode = 0

    # to create new file at beginning of trial
    #f= open("coor_state.txt","w")
    #f.close

    while episode < num_episodes:
        trajectories = run_policy(env,
                                  policy,
                                  scaler,
                                  logger,
                                  arg,
                                  episodes=batch_size)
        episode += len(trajectories)
        add_value(trajectories, val_func)  # add estimated values to episodes
        add_disc_sum_rew(trajectories,
                         gamma)  # calculated discounted sum of Rs
        add_gae(trajectories, gamma, lam)  # calculate advantage
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(
            trajectories)
        # add various stats to training log:
        log_batch_stats(observes, actions, advantages, disc_sum_rew, logger,
                        episode)
        policy.update(observes, actions, advantages, logger)  # update policy
        val_func.fit(observes, disc_sum_rew, logger)  # update value function
        logger.write(display=True)  # write logger results to file and stdout
        scaler.save()
        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False
    logger.close()
    policy.close_sess()
    val_func.close_sess()
示例#2
0
def main(env_name, num_episodes, render, gamma, lam, kl_targ, batch_size):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)
        batch_size: number of episodes per policy training batch
    """
    killer = GracefulKiller()
    env, obs_dim, act_dim = init_gym(env_name, render)
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    now = datetime.utcnow().strftime("%b-%d_%H-%M-%S")  # create unique directories
    logger = Logger(logname=env_name, now=now)

    scaler = Scaler(obs_dim, env_name)
    val_func = NNValueFunction(obs_dim, env_name)
    policy = Policy(obs_dim, act_dim, kl_targ, env_name)
    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, policy, scaler, logger, episodes=5)
    episode = 0
    #capture = False
    while episode < num_episodes:
        trajectories = run_policy(env, policy, scaler, logger, episodes=batch_size)
        episode += len(trajectories)
        """if episode > 600 and not capture:
               env.ScreenCapture(5)
               capture = True"""
        add_value(trajectories, val_func)  # add estimated values to episodes
        add_disc_sum_rew(trajectories, gamma)  # calculated discounted sum of Rs
        add_gae(trajectories, gamma, lam)  # calculate advantage
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)
        # add various stats to training log:
        log_batch_stats(observes, actions, advantages, disc_sum_rew, logger, episode)
        policy.update(observes, actions, advantages, logger)  # update policy
        val_func.fit(observes, disc_sum_rew, logger)  # update value function
        
        logger.write(display=True)  # write logger results to file and stdout
        scaler.save()
        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False
    logger.close()
    policy.close_sess()
    val_func.close_sess()
class Experiment:

    def __init__(self, env_name, discount, num_iterations, lamb, animate, kl_target, show):
        self.env_name = env_name
        self.env = gym.make(env_name)
        if env_name == "FetchReach-v0":
            self.env = gym.wrappers.FlattenDictWrapper(self.env, ['observation', 'desired_goal', 'achieved_goal'])
        gym.spaces.seed(1234)
        self.obs_dim = self.env.observation_space.shape[0] + 1 # adding time step as feature
        self.act_dim = self.env.action_space.shape[0]
        self.discount = discount
        self.num_iterations = num_iterations
        self.lamb = lamb
        self.animate = animate
        self.episodes = 20
        self.killer = GracefulKiller()
        # self.policy = ProximalPolicy(self.obs_dim, self.act_dim, self.env.action_space, kl_target, discount=discount,
        #                              lamb=lamb)
        self.policy = NoTracePolicy(self.obs_dim, self.act_dim, self.env.action_space, kl_target, epochs=20)
        # using MC return would be more helpful
        self.value_func = l2TargetValueFunc(self.obs_dim, epochs=10)
        # self.value_func = ValueFunc(self.obs_dim, discount=discount, lamb=1)

        if not show:
            # save copies of file
            shutil.copy(inspect.getfile(self.policy.__class__), OUTPATH)
            shutil.copy(inspect.getfile(self.value_func.__class__), OUTPATH)
            shutil.copy(inspect.getfile(self.__class__), OUTPATH)

            self.log_file = open(OUTPATH + 'log.csv', 'w')
            self.write_header = True

        print('observation dimension:', self.obs_dim)
        print('action dimension:', self.act_dim)

        # Use of a scaler is crucial
        self.scaler = Scaler(self.obs_dim)
        self.init_scaler()

    def init_scaler(self):
        print('fitting scaler')
        observation_samples = []
        for i in range(5):
            observation = []
            obs = self.env.reset()
            observation.append(obs)
            obs = obs.astype(np.float64).reshape((1, -1))
            done = False
            step = 0
            while not done:
                obs = np.append(obs, [[step]], axis=1)  # add time step feature
                action = self.policy.get_sample(obs).reshape((1, -1)).astype(np.float64)
                if self.env_name == "FetchReach-v0":
                    obs_new, reward, done, _ = self.env.step(action.reshape(-1))
                else:
                    obs_new, reward, done, _ = self.env.step(action)
                observation.append(obs_new)
                obs = obs_new.astype(np.float64).reshape((1, -1))
                step += 1e-3
            observation_samples.append(observation)
        observation_samples = np.concatenate(observation_samples, axis=0)
        # print(observation_samples.shape)
        self.scaler.update(observation_samples)

    def normalize_obs(self, obs):
        scale, offset = self.scaler.get()
        obs_scaled = (obs-offset)*scale
        self.scaler.update(obs.astype(np.float64).reshape((1, -1)))
        return obs_scaled

    def run_one_episode(self):
        """
        collect data only
        :param save:
        :param train_policy:
        :param train_value_func:
        :param animate:
        :return:
        """
        obs = self.env.reset()
        observes, actions, rewards = [],[],[]
        done = False
        step = 0
        while not done:
            if self.animate:
                self.env.render()
            obs = obs.astype(np.float64).reshape((1, -1))
            obs = self.normalize_obs(obs)
            obs = np.append(obs, [[step]], axis=1)  # add time step feature
            observes.append(obs)
            action = self.policy.get_sample(obs).reshape((1, -1)).astype(np.float64)
            actions.append(action)
            if self.env_name == "FetchReach-v0":
                obs_new, reward, done, _ = self.env.step(action.reshape(-1))
            else:
                obs_new, reward, done, _ = self.env.step(action)
            if not isinstance(reward, float):
                reward = np.asscalar(reward)
            rewards.append(reward)

            obs = obs_new
            step += 0.003

        return np.concatenate(observes), np.concatenate(actions), np.array(rewards)

    def discounted_sum(self, l, factor):
        discounted = []
        sum = 0
        for i in reversed(l):
            discounted.append(factor*sum+i)
            sum = factor*sum+i
        return np.array(list(reversed(discounted)))

    def run_policy(self, episodes):
        trajectories = []
        for e in range(episodes):
            observes, actions, rewards = self.run_one_episode()
            trajectory = {'observes': observes,
                          'actions': actions,
                          'rewards': rewards}
            # scale rewards
            if self.discount < 0.999:
                rewards = rewards*(1-self.discount)

            trajectory['values'] = self.value_func.predict(observes)
            trajectory['mc_return'] = self.discounted_sum(rewards, self.discount)

            trajectory['td_residual'] = rewards + self.discount*np.append(trajectory['values'][1:],0) - trajectory['values']
            trajectory['gae'] = self.discounted_sum(trajectory['td_residual'], self.discount*self.lamb)

            trajectories.append(trajectory)

        return trajectories

    def run_expr(self):
        ep_steps = []
        ep_rewards = []
        ep_entropy = []
        i = 0
        while i < self.num_iterations:
            trajectories = self.run_policy(20)
            i += len(trajectories)
            observes = np.concatenate([t['observes'] for t in trajectories])
            actions = np.concatenate([t['actions'] for t in trajectories])
            mc_returns = np.concatenate([t['mc_return'] for t in trajectories])
            advantages = np.concatenate([t['td_residual'] for t in trajectories])
            # advantages = np.concatenate([t['gae'] for t in trajectories])

            # normalize advantage estimates
            advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-6)

            value_func_loss = self.value_func.update(observes, mc_returns)
            policy_loss, kl, entropy, beta = self.policy.update(observes, actions, advantages)

            avg_rewards = np.sum(np.concatenate([t['rewards'] for t in trajectories])) / self.episodes
            avg_timesteps = np.average([len(t['rewards']) for t in trajectories])
            log = {}

            # compute statistics such as mean and std
            log['steps'] = avg_timesteps
            log['rewards'] = avg_rewards
            log['policy_loss'] = policy_loss
            log['kl'] = kl
            log['entropy'] = entropy
            log['value_func_loss'] = value_func_loss
            log['beta'] = beta

            # display
            print('episode: ', i)
            print('average steps: {0}, average rewards: {1}'.format(log['steps'], log['rewards']))
            for key in ['policy_loss', 'kl', 'entropy', 'beta', 'value_func_loss']:
                print('{:s}: {:.2g}'.format(key, log[key]))
            print('\n')
            ep_steps.append(log['steps'])
            ep_rewards.append(log['rewards'])
            ep_entropy.append(log['entropy'])


            # write to log.csv
            if self.write_header:
                fieldnames = [x for x in log.keys()]
                self.writer = csv.DictWriter(self.log_file, fieldnames=fieldnames)
                self.writer.writeheader()
                self.write_header = False
            self.writer.writerow(log)
            # we want the csv file to preserve information even if the program terminates earlier than scheduled.
            self.log_file.flush()

            # save model weights if stopped manually
            if self.killer.kill_now:
                if input('Terminate training (y/[n])? ') == 'y':
                    break
                self.killer.kill_now = False

            # if (i+1)%20 == 0:
            #     print('episode: ', i+1)
            #     print('average steps', np.average(steps))
            #     print('average rewards', np.average(rewards))

        self.policy.save(OUTPATH)
        self.value_func.save(OUTPATH)
        self.scaler.save(OUTPATH)


        plt.figure(figsize=(12,9))

        if self.env_name.startswith('Fetch'):
            ax1 = plt.subplot(121)
            plt.xlabel('episodes')
            plt.ylabel('policy entropy')
            plt.plot(ep_entropy)
            scale_x = self.episodes
            ticks_x = ticker.FuncFormatter(lambda x, pos: '{0:g}'.format(x * scale_x))
            ax1.xaxis.set_major_formatter(ticks_x)
        else:
            ax1 = plt.subplot(121)
            plt.xlabel('episodes')
            plt.ylabel('steps')
            plt.plot(ep_steps)
            scale_x = self.episodes
            ticks_x = ticker.FuncFormatter(lambda x, pos: '{0:g}'.format(x * scale_x))
            ax1.xaxis.set_major_formatter(ticks_x)

        ax2 = plt.subplot(122)
        plt.xlabel('episodes')
        plt.ylabel('episodic rewards')
        plt.plot(ep_rewards)
        scale_x = self.episodes
        ticks_x = ticker.FuncFormatter(lambda x, pos: '{0:g}'.format(x * scale_x))
        ax2.xaxis.set_major_formatter(ticks_x)

        plt.savefig(OUTPATH + 'train.png')

    def load_model(self, load_from):
        from tensorflow.python.tools import inspect_checkpoint as chkp

        # # print all tensors in checkpoint file
        # chkp.print_tensors_in_checkpoint_file(load_from+'policy/policy.pl', tensor_name='', all_tensors=True, all_tensor_names=True)
        self.policy.load(load_from + 'policy/policy.pl')
        self.value_func.load(load_from + 'value_func/value_func.pl')

    def demonstrate_agent(self, load_from):
        self.load_model(load_from)
        with open(load_from + "scaler.pkl", 'rb') as file:
            self.scaler = pickle.load(file)
        self.animate = True
        for i in range(10):
            observes, actons, rewards = self.run_one_episode()
            ep_rewards = np.sum(rewards)
            ep_steps = len(rewards)
            print("Total steps: {0}, total rewards: {1}\n".format(ep_steps, ep_rewards))
示例#4
0
class Experiment:

    def __init__(self, env_name, discount, num_iterations, lamb, animate, kl_target, **kwargs):
        self.env_name = env_name
        self.env = gym.make(env_name)
        if env_name.startswith('Fetch'): # FetchReach env is a little bit different
            self.env = gym.wrappers.FlattenDictWrapper(self.env, ['observation', 'desired_goal', 'achieved_goal'])
        gym.spaces.seed(1234) # for reproducibility
        self.obs_dim = self.env.observation_space.shape[0] + 1 # adding time step as feature
        self.act_dim = self.env.action_space.shape[0]
        self.discount = discount
        self.num_iterations = num_iterations
        self.lamb = lamb
        self.animate = animate

        self.buffer = Buffer(1000000, self.obs_dim, self.act_dim) # 1000000 is the size they have used in paper
        self.episodes = 20 # larger episodes can reduce variance
        self.killer = GracefulKiller()

        self.policy = QPropPolicy(self.obs_dim, self.act_dim, self.env.action_space, kl_target, epochs=20)
        self.critic = DeterministicCritic(self.obs_dim, self.act_dim, self.discount, OUTPATH)
        self.value_func = l2TargetValueFunc(self.obs_dim, epochs=10)

        if 'show' in kwargs and not kwargs['show']:
            # save copies of file
            shutil.copy(inspect.getfile(self.policy.__class__), OUTPATH)
            shutil.copy(inspect.getfile(self.value_func.__class__), OUTPATH)
            shutil.copy(inspect.getfile(self.critic.__class__), OUTPATH)
            shutil.copy(inspect.getfile(self.__class__), OUTPATH)

            self.log_file = open(OUTPATH + 'log.csv', 'w')
            self.write_header = True

        print('Observation dimension:', self.obs_dim)
        print('Action dimension:', self.act_dim)

        # The use of a scaler is crucial
        self.scaler = Scaler(self.obs_dim)
        self.init_scaler()

    def init_scaler(self):
        """
        Collection observations from 5 episodes to initialize Scaler.
        :return: a properly initialized scaler
        """
        print('Fitting scaler')
        observation_samples = []
        for i in range(5):
            observation = []
            obs = self.env.reset()
            observation.append(obs)
            obs = obs.astype(np.float64).reshape((1, -1))
            done = False
            step = 0
            while not done:
                obs = np.append(obs, [[step]], axis=1)  # add time step feature
                action = self.policy.get_sample(obs).reshape((1, -1)).astype(np.float64)
                if self.env_name.startswith('Fetch'):
                    obs_new, reward, done, _ = self.env.step(action.reshape(-1))
                else:
                    obs_new, reward, done, _ = self.env.step(action)
                observation.append(obs_new)
                obs = obs_new.astype(np.float64).reshape((1, -1))
                step += 1e-3
            observation_samples.append(observation)
        observation_samples = np.concatenate(observation_samples, axis=0)
        self.scaler.update(observation_samples)

    def normalize_obs(self, obs):
        """
        Transform and update the scaler on the fly.
        :param obs: Raw observation
        :return: normalized observation
        """
        scale, offset = self.scaler.get()
        obs_scaled = (obs-offset)*scale
        self.scaler.update(obs.astype(np.float64).reshape((1, -1)))
        return obs_scaled

    def run_one_episode(self):
        """
        collect a trajectory of (obs, act, reward, obs_next)
        """
        obs = self.env.reset()
        observes, actions, rewards = [],[],[]
        done = False
        step = 0
        while not done:
            if self.animate:
                self.env.render()

            obs = obs.astype(np.float64).reshape((1, -1))
            obs = self.normalize_obs(obs)
            obs = np.append(obs, [[step]], axis=1)  # add time step feature at normalized observation
            observes.append(obs)

            action = self.policy.get_sample(obs).reshape((1, -1)).astype(np.float64)
            actions.append(action)
            if self.env_name.startswith('Fetch'):
                obs_new, reward, done, _ = self.env.step(action.reshape(-1))
            else:
                obs_new, reward, done, _ = self.env.step(action)

            if not isinstance(reward, float):
                reward = np.asscalar(reward)
            rewards.append(reward)

            obs = obs_new
            step += 0.003

        return np.concatenate(observes), np.concatenate(actions), np.array(rewards)

    def discounted_sum(self, l, factor):
        """
        Discounted sum of return or advantage estimates along a trajectory.
        :param l: a list containing the values of discounted summed interest.
        :param factor: discount factor in the disc_sum case or discount*lambda for GAE
        :return: discounted sum of l with regard to factor
        """
        discounted = []
        sum = 0
        for i in reversed(l):
            discounted.append(factor*sum+i)
            sum = factor*sum+i
        return np.array(list(reversed(discounted)))

    def run_policy(self, episodes):
        """
        Gather a batch of trajectory samples.
        :param episodes: size of batch.
        :return: a batch of samples
        """
        trajectories = []
        for e in range(episodes):
            observes, actions, rewards = self.run_one_episode()
            trajectory = {'observes': observes,
                          'actions': actions,
                          'rewards': rewards,
                          'scaled_rewards': rewards*(1-self.discount)}
            trajectories.append(trajectory)

        return trajectories

    def run_expr(self):
        ep_steps = []
        ep_rewards = []
        ep_entropy = []
        i = 0
        while i < self.num_iterations:
            trajectories = self.run_policy(20)
            # add to experience replay buffer
            self.buffer.append(trajectories)
            print('buffer size:', self.buffer.size())

            i += len(trajectories)

            # for E=20, T=50, the total number of samples would be 1000
            # In future needs to account for not uniform time steps per episode.
            # e.g. in Hopper-v2 environment not every episode has same time steps
            # E = len(trajectories)
            # num_samples = np.sum([len(t['rewards']) for t in trajectories])
            gradient_steps = np.sum([len(t['rewards']) for t in trajectories])
            if self.env_name.startswith('Fetch'):
                assert (gradient_steps == 20*50)

            """train critic"""
            # train all samples in the buffer, to the extreme
            # self.critic.fit(self.policy, self.buffer, epochs=20, num_samples=self.buffer.size())
            # train some samples minibatches only
            critic_loss_mean, critic_loss_std = self.critic.another_fit_func(self.policy, self.buffer, gradient_steps)

            """calculation of episodic discounted return only needs rewards"""
            mc_returns = np.concatenate([self.discounted_sum(t['scaled_rewards'], self.discount) for t in trajectories])

            """using current batch of samples to update baseline"""
            observes = np.concatenate([t['observes'] for t in trajectories])
            actions = np.concatenate([t['actions'] for t in trajectories])
            value_func_loss = self.value_func.update(observes, mc_returns)

            """compute GAE"""
            for t in trajectories:
                t['values'] = self.value_func.predict(t['observes'])
                # IS it really legitimate to insert 0 at the last obs?
                t['td_residual'] = t['scaled_rewards'] + self.discount * np.append(t['values'][1:], 0) - t['values']
                t['gae'] = self.discounted_sum(t['td_residual'], self.discount * self.lamb)
            advantages = np.concatenate([t['gae'] for t in trajectories])
            """normalize advantage estimates, Crucial step"""
            advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-6)

            """compute control variate"""""
            cv = self.critic.get_contorl_variate(self.policy, observes, actions)
            # cv must not be centered
            # cv = (cv - cv.mean()) / (cv.std() + 1e-6)

            """conservative control variate"""
            eta = [1 if i > 0 else 0 for i in advantages*cv]

            """center learning signal"""
            # check that advantages and CV should be of size E*T
            # eta controls the on-off of control variate
            learning_signal = advantages - eta*cv
            # learning_signal = (learning_signal - learning_signal.mean()) / (learning_signal.std() + 1e-6)

            """controlled taylor eval term"""
            ctrl_taylor = np.concatenate([ [eta[i]*act] for i, act in enumerate(self.critic.get_taylor_eval(self.policy, observes))])

            """policy update"""
            ppo_loss, ddpg_loss, kl, entropy, beta = self.policy.update(observes, actions, learning_signal, ctrl_taylor)

            avg_rewards = np.sum(np.concatenate([t['rewards'] for t in trajectories])) / self.episodes
            avg_timesteps = np.average([len(t['rewards']) for t in trajectories])
            log = {}

            # save training statistics
            log['steps'] = avg_timesteps
            log['rewards'] = avg_rewards
            log['critic_loss'] = critic_loss_mean
            log['policy_ppo_loss'] = ppo_loss
            log['policy_ddpg_loss'] = ddpg_loss
            log['kl'] = kl
            log['entropy'] = entropy
            log['value_func_loss'] = value_func_loss
            log['beta'] = beta

            # display
            print('episode: ', i)
            print('average steps: {0}, average rewards: {1}'.format(log['steps'], log['rewards']))
            for key in ['critic_loss', 'policy_ppo_loss', 'policy_ddpg_loss', 'value_func_loss', 'kl', 'entropy', 'beta']:
                print('{:s}: {:.2g}'.format(key, log[key]))
            print('\n')
            ep_steps.append(log['steps'])
            ep_rewards.append(log['rewards'])
            ep_entropy.append(log['entropy'])

            # write to log.csv
            if self.write_header:
                fieldnames = [x for x in log.keys()]
                self.writer = csv.DictWriter(self.log_file, fieldnames=fieldnames)
                self.writer.writeheader()
                self.write_header = False
            self.writer.writerow(log)
            # we want the csv file to preserve information even if the program terminates earlier than scheduled.
            self.log_file.flush()

            # save model weights if stopped early
            if self.killer.kill_now:
                if input('Terminate training (y/[n])? ') == 'y':
                    break
                self.killer.kill_now = False

        self.policy.save(OUTPATH)
        self.value_func.save(OUTPATH)
        self.scaler.save(OUTPATH)

        plt.figure(figsize=(12,9))

        if self.env_name.startswith('Fetch'):
            ax1 = plt.subplot(121)
            plt.xlabel('episodes')
            plt.ylabel('policy entropy')
            plt.plot(ep_entropy)
            scale_x = self.episodes
            ticks_x = ticker.FuncFormatter(lambda x, pos: '{0:g}'.format(x * scale_x))
            ax1.xaxis.set_major_formatter(ticks_x)
        else:
            ax1 = plt.subplot(121)
            plt.xlabel('episodes')
            plt.ylabel('steps')
            plt.plot(ep_steps)
            scale_x = self.episodes
            ticks_x = ticker.FuncFormatter(lambda x, pos: '{0:g}'.format(x * scale_x))
            ax1.xaxis.set_major_formatter(ticks_x)

        ax2 = plt.subplot(122)
        plt.xlabel('episodes')
        plt.ylabel('episodic rewards')
        plt.plot(ep_rewards)
        scale_x = self.episodes
        ticks_x = ticker.FuncFormatter(lambda x, pos: '{0:g}'.format(x * scale_x))
        ax2.xaxis.set_major_formatter(ticks_x)

        plt.savefig(OUTPATH + 'train.png')

    def load_model(self, load_from):
        """
        Load all Function Approximators plus a Scaler.
        Replaybuffer is not restored though.
        :param load_from: Dir containing saved weights.
        """
        from tensorflow.python.tools import inspect_checkpoint as chkp
        # # print all tensors in checkpoint file
        # chkp.print_tensors_in_checkpoint_file(load_from+'policy/policy.pl', tensor_name='', all_tensors=True, all_tensor_names=True)
        self.policy.load(load_from + 'policy/')
        self.value_func.load(load_from + 'value_func/')
        self.critic.load(load_from+'critic/')
        with open(load_from + "scaler.pkl", 'rb') as file:
            self.scaler = pickle.load(file)

    def demonstrate_agent(self, load_from):
        """
        Simply run the policy without training.
        :param load_from:
        :return:
        """
        self.load_model(load_from)
        while True:
            observes, actons, rewards = self.run_one_episode()
            ep_rewards = np.sum(rewards)
            ep_steps = len(rewards)
            print("Total steps: {0}, total rewards: {1}\n".format(ep_steps, ep_rewards))
示例#5
0
class Experiment:
    def __init__(self, env_name, discount, num_iterations, lamb, animate,
                 kl_target, show):
        self.env_name = env_name
        self.env = gym.make(env_name)
        if env_name == "FetchReach-v0":
            self.env = gym.wrappers.FlattenDictWrapper(
                self.env, ['observation', 'desired_goal', 'achieved_goal'])
        gym.spaces.seed(1234)
        self.obs_dim = self.env.observation_space.shape[
            0] + 1  # adding time step as feature
        self.act_dim = self.env.action_space.shape[0]
        self.discount = discount
        self.num_iterations = num_iterations
        self.lamb = lamb
        self.animate = animate

        self.buffer = Buffer(50000, self.obs_dim, self.act_dim)
        self.episodes = 20
        self.killer = GracefulKiller()

        self.policy = QPropPolicy(self.obs_dim,
                                  self.act_dim,
                                  self.env.action_space,
                                  kl_target,
                                  epochs=5)
        self.critic = DeterministicCritic(self.obs_dim, self.act_dim,
                                          self.discount, OUTPATH)
        # using MC return would be more helpful
        self.value_func = l2TargetValueFunc(self.obs_dim, epochs=10)

        if not show:
            # save copies of file
            shutil.copy(inspect.getfile(self.policy.__class__), OUTPATH)
            shutil.copy(inspect.getfile(self.value_func.__class__), OUTPATH)
            shutil.copy(inspect.getfile(self.__class__), OUTPATH)

            self.log_file = open(OUTPATH + 'log.csv', 'w')
            self.write_header = True

        print('observation dimension:', self.obs_dim)
        print('action dimension:', self.act_dim)

        # Use of a scaler is crucial
        self.scaler = Scaler(self.obs_dim)
        self.init_scaler()

    def init_scaler(self):
        """
        5 episodes empirically determined.
        :return:
        """
        print('Fitting scaler')
        observation_samples = []
        for i in range(5):
            observation = []
            obs = self.env.reset()
            observation.append(obs)
            obs = obs.astype(np.float64).reshape((1, -1))
            done = False
            step = 0
            while not done:
                obs = np.append(obs, [[step]], axis=1)  # add time step feature
                action = self.policy.get_sample(obs).reshape(
                    (1, -1)).astype(np.float64)
                if self.env_name == "FetchReach-v0":
                    obs_new, reward, done, _ = self.env.step(
                        action.reshape(-1))
                else:
                    obs_new, reward, done, _ = self.env.step(action)
                observation.append(obs_new)
                obs = obs_new.astype(np.float64).reshape((1, -1))
                step += 1e-3
            observation_samples.append(observation)
        observation_samples = np.concatenate(observation_samples, axis=0)
        self.scaler.update(observation_samples)

    def normalize_obs(self, obs):
        """
        transform and update on the fly.
        :param obs:
        :return:
        """
        scale, offset = self.scaler.get()
        obs_scaled = (obs - offset) * scale
        self.scaler.update(obs.astype(np.float64).reshape((1, -1)))
        return obs_scaled

    def run_one_episode(self):
        """
        collect data only
        :param save:
        :param train_policy:
        :param train_value_func:
        :param animate:
        :return:
        """
        obs = self.env.reset()
        observes, actions, rewards = [], [], []
        done = False
        step = 0
        while not done:
            if self.animate:
                self.env.render()
            obs = obs.astype(np.float64).reshape((1, -1))
            obs = self.normalize_obs(obs)
            obs = np.append(obs, [[step]], axis=1)  # add time step feature
            observes.append(obs)
            action = self.policy.get_sample(obs).reshape(
                (1, -1)).astype(np.float64)
            actions.append(action)
            if self.env_name == "FetchReach-v0":
                obs_new, reward, done, _ = self.env.step(action.reshape(-1))
            else:
                obs_new, reward, done, _ = self.env.step(action)
            if not isinstance(reward, float):
                reward = np.asscalar(reward)
            rewards.append(reward)

            obs = obs_new
            step += 0.003

        return np.concatenate(observes), np.concatenate(actions), np.array(
            rewards)

    def discounted_sum(self, l, factor):
        discounted = []
        sum = 0
        for i in reversed(l):
            discounted.append(factor * sum + i)
            sum = factor * sum + i
        return np.array(list(reversed(discounted)))

    def run_policy(self, episodes):
        """
        gather a batch of samples.
        :param episodes:
        :return:
        """
        trajectories = []
        for e in range(episodes):
            observes, actions, rewards = self.run_one_episode()
            trajectory = {
                'observes': observes,
                'actions': actions,
                'rewards': rewards
            }
            trajectories.append(trajectory)

        return trajectories

    def run_expr(self):
        ep_steps = []
        ep_rewards = []
        ep_entropy = []
        i = 0
        while i < self.num_iterations:
            trajectories = self.run_policy(20)
            # add to experience replay buffer
            self.buffer.append(trajectories)
            i += len(trajectories)

            # for E=20, T=50, the total number of samples would be 1000
            # In future needs to account for not uniform time steps per episode.
            # e.g. in Hopper-v2 environment not every episode has same time steps
            E = len(trajectories)
            T = trajectories[0]['observes'].shape[0]
            """train critic"""
            self.critic.fit(
                self.policy, self.buffer, epochs=1, num_samples=E *
                T)  # take E*T samples, so in total E*T gradient steps
            """calculation of episodic discounted return only needs rewards"""
            mc_returns = np.concatenate([
                self.discounted_sum(t['rewards'], self.discount)
                for t in trajectories
            ])
            """using current batch of samples to update baseline"""
            observes = np.concatenate([t['observes'] for t in trajectories])
            actions = np.concatenate([t['actions'] for t in trajectories])
            value_func_loss = self.value_func.update(observes, mc_returns)
            """compute GAE"""
            for t in trajectories:
                t['values'] = self.value_func.predict(t['observes'])
                # IS it really legitimate to insert 0 at the last obs?
                t['td_residual'] = t['rewards'] + self.discount * np.append(
                    t['values'][1:], 0) - t['values']
                t['gae'] = self.discounted_sum(t['td_residual'],
                                               self.discount * self.lamb)
            advantages = np.concatenate([t['gae'] for t in trajectories])
            """compute control variate""" ""
            cv = self.critic.get_contorl_variate(self.policy, observes,
                                                 actions)
            """conservative control variate"""
            eta = [1 if i > 0 else 0 for i in advantages * cv]
            """center learning signal"""
            # check that advantages and CV should be of size E*T
            # eta controls the on-off of control variate
            learning_signal = advantages - eta * cv
            """controlled taylor eval term"""
            ctrl_taylor = np.concatenate(
                [[eta[i] * act] for i, act in enumerate(
                    self.critic.get_taylor_eval(self.policy, observes))])

            policy_loss, kl, entropy, beta = self.policy.update(
                observes, actions, learning_signal, ctrl_taylor)

            # normalize advantage estimates
            # advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-6)

            avg_rewards = np.sum(
                np.concatenate([t['rewards']
                                for t in trajectories])) / self.episodes
            avg_timesteps = np.average(
                [len(t['rewards']) for t in trajectories])
            log = {}

            # compute statistics such as mean and std
            log['steps'] = avg_timesteps
            log['rewards'] = avg_rewards
            log['policy_loss'] = policy_loss
            log['kl'] = kl
            log['entropy'] = entropy
            log['value_func_loss'] = value_func_loss
            log['beta'] = beta

            # display
            print('episode: ', i)
            print('average steps: {0}, average rewards: {1}'.format(
                log['steps'], log['rewards']))
            for key in [
                    'policy_loss', 'kl', 'entropy', 'beta', 'value_func_loss'
            ]:
                print('{:s}: {:.2g}'.format(key, log[key]))
            print('\n')
            ep_steps.append(log['steps'])
            ep_rewards.append(log['rewards'])
            ep_entropy.append(log['entropy'])

            # write to log.csv
            if self.write_header:
                fieldnames = [x for x in log.keys()]
                self.writer = csv.DictWriter(self.log_file,
                                             fieldnames=fieldnames)
                self.writer.writeheader()
                self.write_header = False
            self.writer.writerow(log)
            # we want the csv file to preserve information even if the program terminates earlier than scheduled.
            self.log_file.flush()

            # save model weights if stopped manually
            if self.killer.kill_now:
                if input('Terminate training (y/[n])? ') == 'y':
                    break
                self.killer.kill_now = False

            # if (i+1)%20 == 0:
            #     print('episode: ', i+1)
            #     print('average steps', np.average(steps))
            #     print('average rewards', np.average(rewards))

        self.policy.save(OUTPATH)
        self.value_func.save(OUTPATH)
        self.scaler.save(OUTPATH)

        plt.figure(figsize=(12, 9))

        if self.env_name.startswith('Fetch'):
            ax1 = plt.subplot(121)
            plt.xlabel('episodes')
            plt.ylabel('policy entropy')
            plt.plot(ep_entropy)
            scale_x = self.episodes
            ticks_x = ticker.FuncFormatter(
                lambda x, pos: '{0:g}'.format(x * scale_x))
            ax1.xaxis.set_major_formatter(ticks_x)
        else:
            ax1 = plt.subplot(121)
            plt.xlabel('episodes')
            plt.ylabel('steps')
            plt.plot(ep_steps)
            scale_x = self.episodes
            ticks_x = ticker.FuncFormatter(
                lambda x, pos: '{0:g}'.format(x * scale_x))
            ax1.xaxis.set_major_formatter(ticks_x)

        ax2 = plt.subplot(122)
        plt.xlabel('episodes')
        plt.ylabel('episodic rewards')
        plt.plot(ep_rewards)
        scale_x = self.episodes
        ticks_x = ticker.FuncFormatter(
            lambda x, pos: '{0:g}'.format(x * scale_x))
        ax2.xaxis.set_major_formatter(ticks_x)

        plt.savefig(OUTPATH + 'train.png')

    def load_model(self, load_from):
        from tensorflow.python.tools import inspect_checkpoint as chkp

        # # print all tensors in checkpoint file
        # chkp.print_tensors_in_checkpoint_file(load_from+'policy/policy.pl', tensor_name='', all_tensors=True, all_tensor_names=True)
        self.policy.load(load_from + 'policy/policy.pl')
        self.value_func.load(load_from + 'value_func/value_func.pl')

    def demonstrate_agent(self, load_from):
        self.load_model(load_from)
        with open(load_from + "scaler.pkl", 'rb') as file:
            self.scaler = pickle.load(file)
        self.animate = True
        for i in range(10):
            observes, actons, rewards = self.run_one_episode()
            ep_rewards = np.sum(rewards)
            ep_steps = len(rewards)
            print("Total steps: {0}, total rewards: {1}\n".format(
                ep_steps, ep_rewards))
示例#6
0
class Experiment:

    def __init__(self, env_name, discount, num_iterations, lamb, animate, kl_target):
        self.env = gym.make(env_name)
        gym.spaces.seed(1234)
        self.obs_dim = self.env.observation_space.shape[0] + 1 # the use of time steps is beneficial
        self.act_dim = self.env.action_space.shape[0]
        self.discount = discount
        self.num_iterations = num_iterations
        self.lamb = lamb
        self.animate = animate
        self.killer = GracefulKiller()
        self.policy = LinearPolicy(self.obs_dim, self.act_dim, self.env.action_space, kl_target, discount=discount)
        self.value_func = LinearValueFunc(self.obs_dim, discount=discount)

        # save copies of file
        shutil.copy(inspect.getfile(self.policy.__class__), OUTPATH)
        shutil.copy(inspect.getfile(self.value_func.__class__), OUTPATH)
        shutil.copy(inspect.getfile(self.__class__), OUTPATH)

        self.log_file = open(OUTPATH + 'log.csv', 'w')
        self.write_header = True
        print('observation dimension:', self.obs_dim)
        print('action dimension:', self.act_dim)
        self.scaler = Scaler(self.obs_dim)
        self.init_scaler()

    def init_scaler(self):
        print('fitting scaler')
        observation_samples = []
        for i in range(5):
            observation = []
            obs = self.env.reset()
            observation.append(obs)
            obs = obs.astype(np.float64).reshape((1, -1))
            done = False
            step = 0
            while not done:
                obs = np.append(obs, [[step]], axis=1)  # add time step feature
                action = self.policy.get_sample(obs).reshape((1, -1)).astype(np.float64)
                obs_new, reward, done, _ = self.env.step(action)
                observation.append(obs_new)
                obs = obs_new.astype(np.float64).reshape((1, -1))
                step += 1e-3
            observation_samples.append(observation)
        observation_samples = np.concatenate(observation_samples, axis=0)
        # print(observation_samples.shape)
        self.scaler.update(observation_samples)

    def normalize_obs(self, obs):
        scale, offset = self.scaler.get()
        obs_scaled = (obs-offset)*scale
        self.scaler.update(obs.astype(np.float64).reshape((1, -1)))
        return obs_scaled

    def run_one_epsisode(self, train_policy=True, train_value_func=True, animate=False):
        obs = self.env.reset()
        obs = obs.astype(np.float64).reshape((1, -1))
        obs = self.normalize_obs(obs)
        obs = np.append(obs, [[0]], axis=1)  # add time step feature
        log = {
            'rewards': [],
            'policy_loss': [],
            'value_func_loss': [],
            'entropy': [],
            'beta': [],
            'kl': [],
            'advantage':[]
        }

        done = False
        step = 0
        while not done:
            if animate:
                self.env.render()
            action = self.policy.get_sample(obs).reshape((1, -1)).astype(np.float64)
            step += 1e-3
            # print(action)
            obs_new, reward, done, _ = self.env.step(action)
            obs_new = obs_new.astype(np.float64).reshape((1, -1))
            obs_new = self.normalize_obs(obs_new)
            obs_new = np.append(obs_new, [[step]], axis=1)  # add time step feature

            if not isinstance(reward, float):
                reward = np.asscalar(reward)
            log['rewards'].append(reward)

            # scale reward
            if self.discount < 0.999:
                reward *= (1-self.discount)

            # TD residual
            advantage = reward + self.discount * self.value_func.predict(obs_new) - self.value_func.predict(obs)
            advantage = advantage.astype(np.float64).reshape((1,))

            if train_value_func:
                value_func_loss = self.value_func.update(obs, advantage)
            if train_policy:
                policy_loss, kl, entropy, beta = self.policy.update(obs, action, advantage)

            if train_value_func and train_policy:
                log['policy_loss'].append(policy_loss)
                log['kl'].append(kl)
                log['entropy'].append(entropy)
                log['beta'].append(beta)
                log['value_func_loss'].append(value_func_loss)
                log['advantage'].append(advantage)

            obs = obs_new

        return log

    def run_expr(self):
        ep_steps = []
        ep_rewards = []
        for i in range(self.num_iterations):
            # trace vectors are emptied at the beginning of each episode

            # get more accurate value_func estimator
            for _ in range(5):
                self.value_func.init_trace()
                self.run_one_epsisode(train_value_func=True, train_policy=False, animate=False)

            self.policy.init_trace()
            self.value_func.init_trace()

            # run (and train) one trajectory
            log = self.run_one_epsisode(animate=self.animate)

            # compute statistics such as mean and std
            log['steps'] = len(log['rewards'])
            log['rewards'] = np.sum(log['rewards'])
            for key in ['policy_loss', 'kl', 'entropy', 'beta', 'value_func_loss', 'advantage']:
                log[key + '_mean'] = np.mean(log[key])
                log[key + '_std'] = np.std(log[key])
                del log[key]

            # display
            print('episode: ', i)
            print('total steps: {0}, episodic rewards: {1}'.format(log['steps'], log['rewards']))
            for key in ['policy_loss', 'kl', 'entropy', 'beta', 'value_func_loss', 'advantage']:
                print('{:s}: {:.2g}({:.2g})'.format(key, log[key + '_mean'], log[key + '_std']))
            print('\n')
            ep_steps.append(log['steps'])
            ep_rewards.append(log['rewards'])

            # write to log.csv
            if self.write_header:
                fieldnames = [x for x in log.keys()]
                self.writer = csv.DictWriter(self.log_file, fieldnames=fieldnames)
                self.writer.writeheader()
                self.write_header = False
            self.writer.writerow(log)
            # we want the csv file to preserve information even if the program terminates earlier than scheduled.
            self.log_file.flush()

            # save model weights if stopped manually
            if self.killer.kill_now:
                if input('Terminate training (y/[n])? ') == 'y':
                    break
                self.killer.kill_now = False

            # if (i+1)%20 == 0:
            #     print('episode: ', i+1)
            #     print('average steps', np.average(steps))
            #     print('average rewards', np.average(rewards))

        # save weights
        self.policy.save(OUTPATH)
        self.value_func.save(OUTPATH)
        self.scaler.save(OUTPATH)

        plt.subplot(121)
        plt.xlabel('episodes')
        plt.ylabel('steps')
        plt.plot(ep_steps)

        plt.subplot(122)
        plt.xlabel('episodes')
        plt.ylabel('episodic rewards')
        plt.plot(ep_rewards)

        plt.savefig(OUTPATH + 'train.png')
示例#7
0
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size, restore_path,
         out_path, thread_count, animation_mode, gait_name, gait_length,
         gaits_config_path, reward_mask, log_rewards, gait_reward_weight,
         g_colab, progress_reward_weight, phase_time_limit):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)
        batch_size: number of episodes per policy training batch
    """
    killer = GracefulKiller()
    # restore_path = os.path.abspath(restore_path)
    env, obs_dim, act_dim = init_gym(env_name)
    log_rewards = log_rewards or (num_episodes == 0)
    env_list = []
    if thread_count > 1:
        env_list, obs_dim, act_dim = init_gyms(env_name, batch_size)
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    start_time = datetime.now()  # create unique directories
    start_time_str = start_time.strftime("%b-%d/%H.%M.%S")
    logger = Logger(logname=env_name, now=start_time_str, out_path=out_path)
    env.env.set_params(gaits_config_path=gaits_config_path,
                       gait_name=gait_name,
                       gait_cycle_len=gait_length,
                       out_path=logger.path,
                       log_rewards=log_rewards,
                       render_mode=animation_mode,
                       reward_mask=reward_mask,
                       contact_reward=gait_reward_weight,
                       g_colab=g_colab,
                       progress_weight=progress_reward_weight,
                       phase_time_limit=phase_time_limit)
    scaler = Scaler(obs_dim)

    val_func = NNValueFunction(obs_dim, logger, restore_path)
    policy = Policy(obs_dim, act_dim, kl_targ, logger, restore_path)

    log_train_info(logger, num_episodes, start_time_str, gait_name,
                   gait_length, batch_size, restore_path, reward_mask,
                   gait_reward_weight, progress_reward_weight,
                   phase_time_limit)

    # run a few episodes of untrained policy to initialize scaler:
    episode = 0
    try:
        if restore_path is None:
            print("\nInitializing scaler (may take some time)... ")
            run_policy(env, policy, scaler, logger, episodes=5)
            print("Done\n")
        else:
            scaler.load(restore_path, obs_dim)

        while episode < num_episodes:
            sim_time = datetime.now()
            if thread_count > 1:
                trajectories = run_policy_parallel(env_list,
                                                   policy,
                                                   scaler,
                                                   logger,
                                                   episodes=batch_size,
                                                   thread_num=thread_count)
            else:
                trajectories = run_policy(env,
                                          policy,
                                          scaler,
                                          logger,
                                          episodes=batch_size)
            sim_time = datetime.now() - sim_time

            episode += len(trajectories)
            add_value(trajectories,
                      val_func)  # add estimated values to episodes
            add_disc_sum_rew(trajectories,
                             gamma)  # calculated discounted sum of Rs
            add_gae(trajectories, gamma, lam)  # calculate advantage
            # concatenate all episodes into single NumPy arrays
            observes, actions, advantages, disc_sum_rew = build_train_set(
                trajectories)
            # add various stats to training log:
            train_time = datetime.now() - start_time
            policy_time = datetime.now()
            policy.update(observes, actions, advantages,
                          logger)  # update policy
            policy_time = datetime.now() - policy_time
            val_time = datetime.now()
            val_func.fit(observes, disc_sum_rew,
                         logger)  # update value function
            val_time = datetime.now() - val_time

            log_batch_stats(observes, actions, advantages, disc_sum_rew,
                            logger, episode, train_time, sim_time, policy_time,
                            val_time)
            logger.write(
                display=True)  # write logger results to file and stdout
            print("Estimated time left: {}\n".format(
                estimate_time_left(episode, num_episodes, train_time)))

            if episode % 1000 == 0:
                policy.save()
                val_func.save()
                scaler.save(logger.path)
                print("Data saved at {}\n".format(logger.path))
                update_train_info(logger, episode)
                if animation_mode > 0:
                    run_policy(env,
                               policy,
                               scaler,
                               logger,
                               episodes=1,
                               animate=True,
                               anim_name='epizode_{}'.format(episode))
            if episode % 5000 == 0:
                os.rename(
                    os.path.join(logger.path, 'value_dump'),
                    os.path.join(logger.path, 'value_dump_' + str(episode)))
                os.rename(
                    os.path.join(logger.path, 'policy_dump'),
                    os.path.join(logger.path, 'policy_dump_' + str(episode)))
                # if episode == 20000:
                #     reward_mask = 63
                #     env.env.set_params(gaits_config_path=gaits_config_path, gait_name=gait_name, gait_cycle_len=gait_length,
                #                        out_path=logger.path, log_rewards=log_rewards, render_mode=animation_mode,
                #                        reward_mask=reward_mask, contact_reward=gait_reward_weight, g_colab=g_colab)
                print("Progress Enabled")
            if killer.kill_now:
                # if input('Terminate training (y/[n])? ') == 'y':
                #     break
                # killer.kill_now = False
                break
    finally:
        if animation_mode > 0 or num_episodes == 0:
            print("Rendering result video")
            try:
                trajectories = run_policy(
                    env,
                    policy,
                    scaler,
                    logger,
                    episodes=1,
                    animate=True,
                    anim_name='final_epizode_{}'.format(episode))
                # for walk analysis
                for t in trajectories:
                    logger.log_trajectory(t)
            except Exception as e:
                print("Failed to animate results, error: {}".format(e))
                raise e

        scaler.save(logger.path)
        policy.close_sess()
        val_func.close_sess()
        update_train_info(logger, episode)
        logger.close()