Example #1
def run_rolloutScaler(env, scaler, LengthOfRollout):
    """ Run  rollout with random state

			Args:
				env: ai gym environment
				LengthOfRollout: length of the rollout

			Returns: 4-tuple of NumPy arrays
				observes_1: shape = (episode len, obs_dim)
				actions: shape = (episode len, act_dim)
				rewards: shape = (episode len,)
				observes_2: shape = (episode len, obs_dim)
	"""
    scaler_action = Scaler(env.action_space.shape[0])
    scaler_act, scaler_offset = scaler_action.get()
    scale, offset = scaler.get()
    scale[-1] = 1.0  # don't scale time step feature
    offset[-1] = 0.0  # don't offset time step feature
    obs = env.reset()
    observes_1, actions, rewards, observes_2 = [], [], [], []
    for _ in range(LengthOfRollout):
        obs = obs.astype(np.float64).reshape((1, -1))
        observes_1.append(obs)
        action = env.action_space.sample()
        action = action.astype(np.float64).reshape((1, -1))

        actions.append(action)
        obs, reward, done, _ = env.step(action)
        obs = obs.astype(np.float64).reshape((1, -1))
        observes_2.append(obs)

        if not isinstance(reward, float):
            reward = np.asscalar(reward)
        rewards.append(reward)

    return (np.concatenate(observes_1), np.concatenate(actions),
            np.array(rewards, dtype=np.float64), np.concatenate(observes_2))
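
# A minimal usage sketch (an assumption, not part of the original file): seed a Scaler's
# running statistics with raw observations from one random-action rollout. The environment
# name and rollout length are hypothetical.
#
#     env = gym.make('Hopper-v2')
#     scaler = Scaler(env.observation_space.shape[0])
#     obs1, acts, rews, obs2 = run_rolloutScaler(env, scaler, LengthOfRollout=1000)
#     scaler.update(obs1)  # warm up the running mean/variance before training
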
class Experiment:

    def __init__(self, env_name, discount, num_iterations, lamb, animate, kl_target, show):
        self.env_name = env_name
        self.env = gym.make(env_name)
        if env_name == "FetchReach-v0":
            self.env = gym.wrappers.FlattenDictWrapper(self.env, ['observation', 'desired_goal', 'achieved_goal'])
        gym.spaces.seed(1234)
        self.obs_dim = self.env.observation_space.shape[0] + 1 # adding time step as feature
        self.act_dim = self.env.action_space.shape[0]
        self.discount = discount
        self.num_iterations = num_iterations
        self.lamb = lamb
        self.animate = animate
        self.episodes = 20
        self.killer = GracefulKiller()
        # self.policy = ProximalPolicy(self.obs_dim, self.act_dim, self.env.action_space, kl_target, discount=discount,
        #                              lamb=lamb)
        self.policy = NoTracePolicy(self.obs_dim, self.act_dim, self.env.action_space, kl_target, epochs=20)
        # using MC return would be more helpful
        self.value_func = l2TargetValueFunc(self.obs_dim, epochs=10)
        # self.value_func = ValueFunc(self.obs_dim, discount=discount, lamb=1)

        if not show:
            # save copies of file
            shutil.copy(inspect.getfile(self.policy.__class__), OUTPATH)
            shutil.copy(inspect.getfile(self.value_func.__class__), OUTPATH)
            shutil.copy(inspect.getfile(self.__class__), OUTPATH)

            self.log_file = open(OUTPATH + 'log.csv', 'w')
            self.write_header = True

        print('observation dimension:', self.obs_dim)
        print('action dimension:', self.act_dim)

        # Use of a scaler is crucial
        self.scaler = Scaler(self.obs_dim)
        self.init_scaler()

    def init_scaler(self):
        print('fitting scaler')
        observation_samples = []
        for i in range(5):
            observation = []
            obs = self.env.reset()
            observation.append(obs)
            obs = obs.astype(np.float64).reshape((1, -1))
            done = False
            step = 0
            while not done:
                obs = np.append(obs, [[step]], axis=1)  # add time step feature
                action = self.policy.get_sample(obs).reshape((1, -1)).astype(np.float64)
                if self.env_name == "FetchReach-v0":
                    obs_new, reward, done, _ = self.env.step(action.reshape(-1))
                else:
                    obs_new, reward, done, _ = self.env.step(action)
                observation.append(obs_new)
                obs = obs_new.astype(np.float64).reshape((1, -1))
                step += 1e-3
            observation_samples.append(observation)
        observation_samples = np.concatenate(observation_samples, axis=0)
        # print(observation_samples.shape)
        self.scaler.update(observation_samples)

    def normalize_obs(self, obs):
        scale, offset = self.scaler.get()
        obs_scaled = (obs-offset)*scale
        self.scaler.update(obs.astype(np.float64).reshape((1, -1)))
        return obs_scaled

    def run_one_episode(self):
        """
        Collect one trajectory of (observation, action, reward) data; no training is performed.
        :return: NumPy arrays of observations, actions, and rewards for the episode
        """
        obs = self.env.reset()
        observes, actions, rewards = [],[],[]
        done = False
        step = 0
        while not done:
            if self.animate:
                self.env.render()
            obs = obs.astype(np.float64).reshape((1, -1))
            obs = self.normalize_obs(obs)
            obs = np.append(obs, [[step]], axis=1)  # add time step feature
            observes.append(obs)
            action = self.policy.get_sample(obs).reshape((1, -1)).astype(np.float64)
            actions.append(action)
            if self.env_name == "FetchReach-v0":
                obs_new, reward, done, _ = self.env.step(action.reshape(-1))
            else:
                obs_new, reward, done, _ = self.env.step(action)
            if not isinstance(reward, float):
                reward = np.asscalar(reward)
            rewards.append(reward)

            obs = obs_new
            step += 0.003

        return np.concatenate(observes), np.concatenate(actions), np.array(rewards)

    def discounted_sum(self, l, factor):
        discounted = []
        sum = 0
        for i in reversed(l):
            discounted.append(factor*sum+i)
            sum = factor*sum+i
        return np.array(list(reversed(discounted)))
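    # Worked example (rounded): discounted_sum([1, 1, 1], 0.9) -> [2.71, 1.9, 1.0],
    # i.e. each entry is r_t + factor * r_{t+1} + factor**2 * r_{t+2} + ...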

    def run_policy(self, episodes):
        trajectories = []
        for e in range(episodes):
            observes, actions, rewards = self.run_one_episode()
            trajectory = {'observes': observes,
                          'actions': actions,
                          'rewards': rewards}
            # scale rewards
            if self.discount < 0.999:
                rewards = rewards*(1-self.discount)

            trajectory['values'] = self.value_func.predict(observes)
            trajectory['mc_return'] = self.discounted_sum(rewards, self.discount)

            trajectory['td_residual'] = rewards + self.discount*np.append(trajectory['values'][1:],0) - trajectory['values']
            trajectory['gae'] = self.discounted_sum(trajectory['td_residual'], self.discount*self.lamb)

            trajectories.append(trajectory)

        return trajectories

    def run_expr(self):
        ep_steps = []
        ep_rewards = []
        ep_entropy = []
        i = 0
        while i < self.num_iterations:
            trajectories = self.run_policy(20)
            i += len(trajectories)
            observes = np.concatenate([t['observes'] for t in trajectories])
            actions = np.concatenate([t['actions'] for t in trajectories])
            mc_returns = np.concatenate([t['mc_return'] for t in trajectories])
            advantages = np.concatenate([t['td_residual'] for t in trajectories])
            # advantages = np.concatenate([t['gae'] for t in trajectories])

            # normalize advantage estimates
            advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-6)

            value_func_loss = self.value_func.update(observes, mc_returns)
            policy_loss, kl, entropy, beta = self.policy.update(observes, actions, advantages)

            avg_rewards = np.sum(np.concatenate([t['rewards'] for t in trajectories])) / self.episodes
            avg_timesteps = np.average([len(t['rewards']) for t in trajectories])
            log = {}

            # compute statistics such as mean and std
            log['steps'] = avg_timesteps
            log['rewards'] = avg_rewards
            log['policy_loss'] = policy_loss
            log['kl'] = kl
            log['entropy'] = entropy
            log['value_func_loss'] = value_func_loss
            log['beta'] = beta

            # display
            print('episode: ', i)
            print('average steps: {0}, average rewards: {1}'.format(log['steps'], log['rewards']))
            for key in ['policy_loss', 'kl', 'entropy', 'beta', 'value_func_loss']:
                print('{:s}: {:.2g}'.format(key, log[key]))
            print('\n')
            ep_steps.append(log['steps'])
            ep_rewards.append(log['rewards'])
            ep_entropy.append(log['entropy'])


            # write to log.csv
            if self.write_header:
                fieldnames = [x for x in log.keys()]
                self.writer = csv.DictWriter(self.log_file, fieldnames=fieldnames)
                self.writer.writeheader()
                self.write_header = False
            self.writer.writerow(log)
            # we want the csv file to preserve information even if the program terminates earlier than scheduled.
            self.log_file.flush()

            # save model weights if stopped manually
            if self.killer.kill_now:
                if input('Terminate training (y/[n])? ') == 'y':
                    break
                self.killer.kill_now = False

            # if (i+1)%20 == 0:
            #     print('episode: ', i+1)
            #     print('average steps', np.average(steps))
            #     print('average rewards', np.average(rewards))

        self.policy.save(OUTPATH)
        self.value_func.save(OUTPATH)
        self.scaler.save(OUTPATH)


        plt.figure(figsize=(12,9))

        if self.env_name.startswith('Fetch'):
            ax1 = plt.subplot(121)
            plt.xlabel('episodes')
            plt.ylabel('policy entropy')
            plt.plot(ep_entropy)
            scale_x = self.episodes
            ticks_x = ticker.FuncFormatter(lambda x, pos: '{0:g}'.format(x * scale_x))
            ax1.xaxis.set_major_formatter(ticks_x)
        else:
            ax1 = plt.subplot(121)
            plt.xlabel('episodes')
            plt.ylabel('steps')
            plt.plot(ep_steps)
            scale_x = self.episodes
            ticks_x = ticker.FuncFormatter(lambda x, pos: '{0:g}'.format(x * scale_x))
            ax1.xaxis.set_major_formatter(ticks_x)

        ax2 = plt.subplot(122)
        plt.xlabel('episodes')
        plt.ylabel('episodic rewards')
        plt.plot(ep_rewards)
        scale_x = self.episodes
        ticks_x = ticker.FuncFormatter(lambda x, pos: '{0:g}'.format(x * scale_x))
        ax2.xaxis.set_major_formatter(ticks_x)

        plt.savefig(OUTPATH + 'train.png')

    def load_model(self, load_from):
        from tensorflow.python.tools import inspect_checkpoint as chkp

        # # print all tensors in checkpoint file
        # chkp.print_tensors_in_checkpoint_file(load_from+'policy/policy.pl', tensor_name='', all_tensors=True, all_tensor_names=True)
        self.policy.load(load_from + 'policy/policy.pl')
        self.value_func.load(load_from + 'value_func/value_func.pl')

    def demonstrate_agent(self, load_from):
        self.load_model(load_from)
        with open(load_from + "scaler.pkl", 'rb') as file:
            self.scaler = pickle.load(file)
        self.animate = True
        for i in range(10):
            observes, actions, rewards = self.run_one_episode()
            ep_rewards = np.sum(rewards)
            ep_steps = len(rewards)
            print("Total steps: {0}, total rewards: {1}\n".format(ep_steps, ep_rewards))
Example #3
class Experiment:

    def __init__(self, env_name, discount, num_iterations, lamb, animate, kl_target, **kwargs):
        self.env_name = env_name
        self.env = gym.make(env_name)
        if env_name.startswith('Fetch'): # FetchReach env is a little bit different
            self.env = gym.wrappers.FlattenDictWrapper(self.env, ['observation', 'desired_goal', 'achieved_goal'])
        gym.spaces.seed(1234) # for reproducibility
        self.obs_dim = self.env.observation_space.shape[0] + 1 # adding time step as feature
        self.act_dim = self.env.action_space.shape[0]
        self.discount = discount
        self.num_iterations = num_iterations
        self.lamb = lamb
        self.animate = animate

        self.buffer = Buffer(1000000, self.obs_dim, self.act_dim)  # 1,000,000 is the replay buffer size used in the paper
        self.episodes = 20  # a larger batch of episodes reduces variance
        self.killer = GracefulKiller()

        self.policy = QPropPolicy(self.obs_dim, self.act_dim, self.env.action_space, kl_target, epochs=20)
        self.critic = DeterministicCritic(self.obs_dim, self.act_dim, self.discount, OUTPATH)
        self.value_func = l2TargetValueFunc(self.obs_dim, epochs=10)

        if 'show' in kwargs and not kwargs['show']:
            # save copies of file
            shutil.copy(inspect.getfile(self.policy.__class__), OUTPATH)
            shutil.copy(inspect.getfile(self.value_func.__class__), OUTPATH)
            shutil.copy(inspect.getfile(self.critic.__class__), OUTPATH)
            shutil.copy(inspect.getfile(self.__class__), OUTPATH)

            self.log_file = open(OUTPATH + 'log.csv', 'w')
            self.write_header = True

        print('Observation dimension:', self.obs_dim)
        print('Action dimension:', self.act_dim)

        # The use of a scaler is crucial
        self.scaler = Scaler(self.obs_dim)
        self.init_scaler()

    def init_scaler(self):
        """
        Collect observations from 5 episodes to initialize the Scaler.
        :return: None; updates self.scaler in place
        """
        print('Fitting scaler')
        observation_samples = []
        for i in range(5):
            observation = []
            obs = self.env.reset()
            observation.append(obs)
            obs = obs.astype(np.float64).reshape((1, -1))
            done = False
            step = 0
            while not done:
                obs = np.append(obs, [[step]], axis=1)  # add time step feature
                action = self.policy.get_sample(obs).reshape((1, -1)).astype(np.float64)
                if self.env_name.startswith('Fetch'):
                    obs_new, reward, done, _ = self.env.step(action.reshape(-1))
                else:
                    obs_new, reward, done, _ = self.env.step(action)
                observation.append(obs_new)
                obs = obs_new.astype(np.float64).reshape((1, -1))
                step += 1e-3
            observation_samples.append(observation)
        observation_samples = np.concatenate(observation_samples, axis=0)
        self.scaler.update(observation_samples)

    def normalize_obs(self, obs):
        """
        Transform and update the scaler on the fly.
        :param obs: Raw observation
        :return: normalized observation
        """
        scale, offset = self.scaler.get()
        obs_scaled = (obs-offset)*scale
        self.scaler.update(obs.astype(np.float64).reshape((1, -1)))
        return obs_scaled

    def run_one_episode(self):
        """
        collect a trajectory of (obs, act, reward, obs_next)
        """
        obs = self.env.reset()
        observes, actions, rewards = [],[],[]
        done = False
        step = 0
        while not done:
            if self.animate:
                self.env.render()

            obs = obs.astype(np.float64).reshape((1, -1))
            obs = self.normalize_obs(obs)
            obs = np.append(obs, [[step]], axis=1)  # append time step feature to the normalized observation
            observes.append(obs)

            action = self.policy.get_sample(obs).reshape((1, -1)).astype(np.float64)
            actions.append(action)
            if self.env_name.startswith('Fetch'):
                obs_new, reward, done, _ = self.env.step(action.reshape(-1))
            else:
                obs_new, reward, done, _ = self.env.step(action)

            if not isinstance(reward, float):
                reward = np.asscalar(reward)
            rewards.append(reward)

            obs = obs_new
            step += 0.003

        return np.concatenate(observes), np.concatenate(actions), np.array(rewards)

    def discounted_sum(self, l, factor):
        """
        Discounted sum of return or advantage estimates along a trajectory.
        :param l: list or array of per-step values (rewards, or TD residuals for GAE)
        :param factor: discount factor, or discount * lambda in the GAE case
        :return: discounted sum of l with regard to factor
        """
        discounted = []
        sum = 0
        for i in reversed(l):
            discounted.append(factor*sum+i)
            sum = factor*sum+i
        return np.array(list(reversed(discounted)))

    def run_policy(self, episodes):
        """
        Gather a batch of trajectory samples.
        :param episodes: size of batch.
        :return: a batch of samples
        """
        trajectories = []
        for e in range(episodes):
            observes, actions, rewards = self.run_one_episode()
            trajectory = {'observes': observes,
                          'actions': actions,
                          'rewards': rewards,
                          'scaled_rewards': rewards*(1-self.discount)}
            trajectories.append(trajectory)

        return trajectories

    def run_expr(self):
        ep_steps = []
        ep_rewards = []
        ep_entropy = []
        i = 0
        while i < self.num_iterations:
            trajectories = self.run_policy(20)
            # add to experience replay buffer
            self.buffer.append(trajectories)
            print('buffer size:', self.buffer.size())

            i += len(trajectories)

            # for E=20, T=50, the total number of samples would be 1000
            # In the future this needs to account for non-uniform episode lengths,
            # e.g. in the Hopper-v2 environment not every episode has the same number of time steps.
            # E = len(trajectories)
            # num_samples = np.sum([len(t['rewards']) for t in trajectories])
            gradient_steps = np.sum([len(t['rewards']) for t in trajectories])
            if self.env_name.startswith('Fetch'):
                assert (gradient_steps == 20*50)

            """train critic"""
            # train all samples in the buffer, to the extreme
            # self.critic.fit(self.policy, self.buffer, epochs=20, num_samples=self.buffer.size())
            # train some samples minibatches only
            critic_loss_mean, critic_loss_std = self.critic.another_fit_func(self.policy, self.buffer, gradient_steps)

            """calculation of episodic discounted return only needs rewards"""
            mc_returns = np.concatenate([self.discounted_sum(t['scaled_rewards'], self.discount) for t in trajectories])

            """using current batch of samples to update baseline"""
            observes = np.concatenate([t['observes'] for t in trajectories])
            actions = np.concatenate([t['actions'] for t in trajectories])
            value_func_loss = self.value_func.update(observes, mc_returns)

            """compute GAE"""
            for t in trajectories:
                t['values'] = self.value_func.predict(t['observes'])
                # Is it really legitimate to use 0 as the value estimate after the last obs?
                t['td_residual'] = t['scaled_rewards'] + self.discount * np.append(t['values'][1:], 0) - t['values']
                t['gae'] = self.discounted_sum(t['td_residual'], self.discount * self.lamb)
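            # For reference: delta_t = scaled_r_t + gamma * V(s_{t+1}) - V(s_t), and the GAE
            # advantage A_t = sum_k (gamma * lambda)^k * delta_{t+k} is exactly what
            # discounted_sum(td_residual, gamma * lambda) computes.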
            advantages = np.concatenate([t['gae'] for t in trajectories])
            """normalize advantage estimates, Crucial step"""
            advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-6)

            """compute control variate"""""
            cv = self.critic.get_contorl_variate(self.policy, observes, actions)
            # cv must not be centered
            # cv = (cv - cv.mean()) / (cv.std() + 1e-6)

            """conservative control variate"""
            eta = [1 if i > 0 else 0 for i in advantages*cv]

            """center learning signal"""
            # advantages and cv should both have size E*T
            # eta controls the on-off of control variate
            learning_signal = advantages - eta*cv
            # learning_signal = (learning_signal - learning_signal.mean()) / (learning_signal.std() + 1e-6)
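            # For reference (matching the lines above): with eta_t = 1 if A_t * cv_t > 0 else 0,
            # the Q-Prop learning signal is l_t = A_t - eta_t * cv_t; the subtracted control
            # variate is presumably re-added analytically through the ctrl_taylor term that is
            # passed to the policy update below.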

            """controlled taylor eval term"""
            ctrl_taylor = np.concatenate([ [eta[i]*act] for i, act in enumerate(self.critic.get_taylor_eval(self.policy, observes))])

            """policy update"""
            ppo_loss, ddpg_loss, kl, entropy, beta = self.policy.update(observes, actions, learning_signal, ctrl_taylor)

            avg_rewards = np.sum(np.concatenate([t['rewards'] for t in trajectories])) / self.episodes
            avg_timesteps = np.average([len(t['rewards']) for t in trajectories])
            log = {}

            # save training statistics
            log['steps'] = avg_timesteps
            log['rewards'] = avg_rewards
            log['critic_loss'] = critic_loss_mean
            log['policy_ppo_loss'] = ppo_loss
            log['policy_ddpg_loss'] = ddpg_loss
            log['kl'] = kl
            log['entropy'] = entropy
            log['value_func_loss'] = value_func_loss
            log['beta'] = beta

            # display
            print('episode: ', i)
            print('average steps: {0}, average rewards: {1}'.format(log['steps'], log['rewards']))
            for key in ['critic_loss', 'policy_ppo_loss', 'policy_ddpg_loss', 'value_func_loss', 'kl', 'entropy', 'beta']:
                print('{:s}: {:.2g}'.format(key, log[key]))
            print('\n')
            ep_steps.append(log['steps'])
            ep_rewards.append(log['rewards'])
            ep_entropy.append(log['entropy'])

            # write to log.csv
            if self.write_header:
                fieldnames = [x for x in log.keys()]
                self.writer = csv.DictWriter(self.log_file, fieldnames=fieldnames)
                self.writer.writeheader()
                self.write_header = False
            self.writer.writerow(log)
            # we want the csv file to preserve information even if the program terminates earlier than scheduled.
            self.log_file.flush()

            # save model weights if stopped early
            if self.killer.kill_now:
                if input('Terminate training (y/[n])? ') == 'y':
                    break
                self.killer.kill_now = False

        self.policy.save(OUTPATH)
        self.value_func.save(OUTPATH)
        self.scaler.save(OUTPATH)

        plt.figure(figsize=(12,9))

        if self.env_name.startswith('Fetch'):
            ax1 = plt.subplot(121)
            plt.xlabel('episodes')
            plt.ylabel('policy entropy')
            plt.plot(ep_entropy)
            scale_x = self.episodes
            ticks_x = ticker.FuncFormatter(lambda x, pos: '{0:g}'.format(x * scale_x))
            ax1.xaxis.set_major_formatter(ticks_x)
        else:
            ax1 = plt.subplot(121)
            plt.xlabel('episodes')
            plt.ylabel('steps')
            plt.plot(ep_steps)
            scale_x = self.episodes
            ticks_x = ticker.FuncFormatter(lambda x, pos: '{0:g}'.format(x * scale_x))
            ax1.xaxis.set_major_formatter(ticks_x)

        ax2 = plt.subplot(122)
        plt.xlabel('episodes')
        plt.ylabel('episodic rewards')
        plt.plot(ep_rewards)
        scale_x = self.episodes
        ticks_x = ticker.FuncFormatter(lambda x, pos: '{0:g}'.format(x * scale_x))
        ax2.xaxis.set_major_formatter(ticks_x)

        plt.savefig(OUTPATH + 'train.png')

    def load_model(self, load_from):
        """
        Load all function approximators plus the Scaler.
        The replay buffer is not restored, though.
        :param load_from: Dir containing saved weights.
        """
        from tensorflow.python.tools import inspect_checkpoint as chkp
        # # print all tensors in checkpoint file
        # chkp.print_tensors_in_checkpoint_file(load_from+'policy/policy.pl', tensor_name='', all_tensors=True, all_tensor_names=True)
        self.policy.load(load_from + 'policy/')
        self.value_func.load(load_from + 'value_func/')
        self.critic.load(load_from+'critic/')
        with open(load_from + "scaler.pkl", 'rb') as file:
            self.scaler = pickle.load(file)

    def demonstrate_agent(self, load_from):
        """
        Simply run the policy without training.
        :param load_from: directory containing saved model weights and the scaler
        """
        self.load_model(load_from)
        while True:
            observes, actions, rewards = self.run_one_episode()
            ep_rewards = np.sum(rewards)
            ep_steps = len(rewards)
            print("Total steps: {0}, total rewards: {1}\n".format(ep_steps, ep_rewards))
Example #4
class Experiment:
    def __init__(self, env_name, discount, num_iterations, lamb, animate,
                 kl_target, show):
        self.env_name = env_name
        self.env = gym.make(env_name)
        if env_name == "FetchReach-v0":
            self.env = gym.wrappers.FlattenDictWrapper(
                self.env, ['observation', 'desired_goal', 'achieved_goal'])
        gym.spaces.seed(1234)
        self.obs_dim = self.env.observation_space.shape[
            0] + 1  # adding time step as feature
        self.act_dim = self.env.action_space.shape[0]
        self.discount = discount
        self.num_iterations = num_iterations
        self.lamb = lamb
        self.animate = animate

        self.buffer = Buffer(50000, self.obs_dim, self.act_dim)
        self.episodes = 20
        self.killer = GracefulKiller()

        self.policy = QPropPolicy(self.obs_dim,
                                  self.act_dim,
                                  self.env.action_space,
                                  kl_target,
                                  epochs=5)
        self.critic = DeterministicCritic(self.obs_dim, self.act_dim,
                                          self.discount, OUTPATH)
        # using MC return would be more helpful
        self.value_func = l2TargetValueFunc(self.obs_dim, epochs=10)

        if not show:
            # save copies of file
            shutil.copy(inspect.getfile(self.policy.__class__), OUTPATH)
            shutil.copy(inspect.getfile(self.value_func.__class__), OUTPATH)
            shutil.copy(inspect.getfile(self.__class__), OUTPATH)

            self.log_file = open(OUTPATH + 'log.csv', 'w')
            self.write_header = True

        print('observation dimension:', self.obs_dim)
        print('action dimension:', self.act_dim)

        # Use of a scaler is crucial
        self.scaler = Scaler(self.obs_dim)
        self.init_scaler()

    def init_scaler(self):
        """
        Initialize the scaler from observations collected over 5 episodes (empirically determined).
        :return: None; updates self.scaler in place
        """
        print('Fitting scaler')
        observation_samples = []
        for i in range(5):
            observation = []
            obs = self.env.reset()
            observation.append(obs)
            obs = obs.astype(np.float64).reshape((1, -1))
            done = False
            step = 0
            while not done:
                obs = np.append(obs, [[step]], axis=1)  # add time step feature
                action = self.policy.get_sample(obs).reshape(
                    (1, -1)).astype(np.float64)
                if self.env_name == "FetchReach-v0":
                    obs_new, reward, done, _ = self.env.step(
                        action.reshape(-1))
                else:
                    obs_new, reward, done, _ = self.env.step(action)
                observation.append(obs_new)
                obs = obs_new.astype(np.float64).reshape((1, -1))
                step += 1e-3
            observation_samples.append(observation)
        observation_samples = np.concatenate(observation_samples, axis=0)
        self.scaler.update(observation_samples)

    def normalize_obs(self, obs):
        """
        Transform the observation with the current scale/offset and update the scaler on the fly.
        :param obs: raw observation
        :return: normalized observation
        """
        scale, offset = self.scaler.get()
        obs_scaled = (obs - offset) * scale
        self.scaler.update(obs.astype(np.float64).reshape((1, -1)))
        return obs_scaled

    def run_one_episode(self):
        """
        Collect one trajectory of (observation, action, reward) data; no training is performed.
        :return: NumPy arrays of observations, actions, and rewards for the episode
        """
        obs = self.env.reset()
        observes, actions, rewards = [], [], []
        done = False
        step = 0
        while not done:
            if self.animate:
                self.env.render()
            obs = obs.astype(np.float64).reshape((1, -1))
            obs = self.normalize_obs(obs)
            obs = np.append(obs, [[step]], axis=1)  # add time step feature
            observes.append(obs)
            action = self.policy.get_sample(obs).reshape(
                (1, -1)).astype(np.float64)
            actions.append(action)
            if self.env_name == "FetchReach-v0":
                obs_new, reward, done, _ = self.env.step(action.reshape(-1))
            else:
                obs_new, reward, done, _ = self.env.step(action)
            if not isinstance(reward, float):
                reward = np.asscalar(reward)
            rewards.append(reward)

            obs = obs_new
            step += 0.003

        return np.concatenate(observes), np.concatenate(actions), np.array(
            rewards)

    def discounted_sum(self, l, factor):
        discounted = []
        sum = 0
        for i in reversed(l):
            discounted.append(factor * sum + i)
            sum = factor * sum + i
        return np.array(list(reversed(discounted)))

    def run_policy(self, episodes):
        """
        Gather a batch of trajectory samples.
        :param episodes: number of episodes to collect
        :return: list of trajectory dicts
        """
        trajectories = []
        for e in range(episodes):
            observes, actions, rewards = self.run_one_episode()
            trajectory = {
                'observes': observes,
                'actions': actions,
                'rewards': rewards
            }
            trajectories.append(trajectory)

        return trajectories

    def run_expr(self):
        ep_steps = []
        ep_rewards = []
        ep_entropy = []
        i = 0
        while i < self.num_iterations:
            trajectories = self.run_policy(20)
            # add to experience replay buffer
            self.buffer.append(trajectories)
            i += len(trajectories)

            # for E=20, T=50, the total number of samples would be 1000
            # In the future this needs to account for non-uniform episode lengths,
            # e.g. in the Hopper-v2 environment not every episode has the same number of time steps.
            E = len(trajectories)
            T = trajectories[0]['observes'].shape[0]
            """train critic"""
            self.critic.fit(
                self.policy, self.buffer, epochs=1, num_samples=E *
                T)  # take E*T samples, so in total E*T gradient steps
            """calculation of episodic discounted return only needs rewards"""
            mc_returns = np.concatenate([
                self.discounted_sum(t['rewards'], self.discount)
                for t in trajectories
            ])
            """using current batch of samples to update baseline"""
            observes = np.concatenate([t['observes'] for t in trajectories])
            actions = np.concatenate([t['actions'] for t in trajectories])
            value_func_loss = self.value_func.update(observes, mc_returns)
            """compute GAE"""
            for t in trajectories:
                t['values'] = self.value_func.predict(t['observes'])
                # Is it really legitimate to use 0 as the value estimate after the last obs?
                t['td_residual'] = t['rewards'] + self.discount * np.append(
                    t['values'][1:], 0) - t['values']
                t['gae'] = self.discounted_sum(t['td_residual'],
                                               self.discount * self.lamb)
            advantages = np.concatenate([t['gae'] for t in trajectories])
            """compute control variate""" ""
            cv = self.critic.get_contorl_variate(self.policy, observes,
                                                 actions)
            """conservative control variate"""
            eta = [1 if i > 0 else 0 for i in advantages * cv]
            """center learning signal"""
            # advantages and cv should both have size E*T
            # eta controls the on-off of control variate
            learning_signal = advantages - eta * cv
            """controlled taylor eval term"""
            ctrl_taylor = np.concatenate(
                [[eta[i] * act] for i, act in enumerate(
                    self.critic.get_taylor_eval(self.policy, observes))])

            policy_loss, kl, entropy, beta = self.policy.update(
                observes, actions, learning_signal, ctrl_taylor)

            # normalize advantage estimates
            # advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-6)

            avg_rewards = np.sum(
                np.concatenate([t['rewards']
                                for t in trajectories])) / self.episodes
            avg_timesteps = np.average(
                [len(t['rewards']) for t in trajectories])
            log = {}

            # compute statistics such as mean and std
            log['steps'] = avg_timesteps
            log['rewards'] = avg_rewards
            log['policy_loss'] = policy_loss
            log['kl'] = kl
            log['entropy'] = entropy
            log['value_func_loss'] = value_func_loss
            log['beta'] = beta

            # display
            print('episode: ', i)
            print('average steps: {0}, average rewards: {1}'.format(
                log['steps'], log['rewards']))
            for key in [
                    'policy_loss', 'kl', 'entropy', 'beta', 'value_func_loss'
            ]:
                print('{:s}: {:.2g}'.format(key, log[key]))
            print('\n')
            ep_steps.append(log['steps'])
            ep_rewards.append(log['rewards'])
            ep_entropy.append(log['entropy'])

            # write to log.csv
            if self.write_header:
                fieldnames = [x for x in log.keys()]
                self.writer = csv.DictWriter(self.log_file,
                                             fieldnames=fieldnames)
                self.writer.writeheader()
                self.write_header = False
            self.writer.writerow(log)
            # we want the csv file to preserve information even if the program terminates earlier than scheduled.
            self.log_file.flush()

            # save model weights if stopped manually
            if self.killer.kill_now:
                if input('Terminate training (y/[n])? ') == 'y':
                    break
                self.killer.kill_now = False

            # if (i+1)%20 == 0:
            #     print('episode: ', i+1)
            #     print('average steps', np.average(steps))
            #     print('average rewards', np.average(rewards))

        self.policy.save(OUTPATH)
        self.value_func.save(OUTPATH)
        self.scaler.save(OUTPATH)

        plt.figure(figsize=(12, 9))

        if self.env_name.startswith('Fetch'):
            ax1 = plt.subplot(121)
            plt.xlabel('episodes')
            plt.ylabel('policy entropy')
            plt.plot(ep_entropy)
            scale_x = self.episodes
            ticks_x = ticker.FuncFormatter(
                lambda x, pos: '{0:g}'.format(x * scale_x))
            ax1.xaxis.set_major_formatter(ticks_x)
        else:
            ax1 = plt.subplot(121)
            plt.xlabel('episodes')
            plt.ylabel('steps')
            plt.plot(ep_steps)
            scale_x = self.episodes
            ticks_x = ticker.FuncFormatter(
                lambda x, pos: '{0:g}'.format(x * scale_x))
            ax1.xaxis.set_major_formatter(ticks_x)

        ax2 = plt.subplot(122)
        plt.xlabel('episodes')
        plt.ylabel('episodic rewards')
        plt.plot(ep_rewards)
        scale_x = self.episodes
        ticks_x = ticker.FuncFormatter(
            lambda x, pos: '{0:g}'.format(x * scale_x))
        ax2.xaxis.set_major_formatter(ticks_x)

        plt.savefig(OUTPATH + 'train.png')

    def load_model(self, load_from):
        from tensorflow.python.tools import inspect_checkpoint as chkp

        # # print all tensors in checkpoint file
        # chkp.print_tensors_in_checkpoint_file(load_from+'policy/policy.pl', tensor_name='', all_tensors=True, all_tensor_names=True)
        self.policy.load(load_from + 'policy/policy.pl')
        self.value_func.load(load_from + 'value_func/value_func.pl')

    def demonstrate_agent(self, load_from):
        self.load_model(load_from)
        with open(load_from + "scaler.pkl", 'rb') as file:
            self.scaler = pickle.load(file)
        self.animate = True
        for i in range(10):
            observes, actions, rewards = self.run_one_episode()
            ep_rewards = np.sum(rewards)
            ep_steps = len(rewards)
            print("Total steps: {0}, total rewards: {1}\n".format(
                ep_steps, ep_rewards))
class Discriminator(object):
    def __init__(self, obs_dim, act_dim, ent_reg_weight, epochs, input_type,
                 loss_type, logger):
        self.obs_dim = obs_dim
        self.act_dim = act_dim

        self.input_type = input_type
        self.loss_type = loss_type
        if self.input_type == 'states_actions':
            self.input_dim = obs_dim + act_dim
        elif self.input_type == 'states':
            self.input_dim = obs_dim

        self.epochs = epochs

        # we are only NORMALIZING states for now
        self.scaler = Scaler(self.obs_dim)

        # learning rate multiplier and entropy regularization weight
        self.lr_mult = 1.0
        self.ent_reg_weight = ent_reg_weight

        # logger
        self.logger = logger

        # creating graph
        self.g = tf.Graph()
        with self.g.as_default():
            self._placeholders()
            self._nn_disc()
            self._loss_train_op()
            self.init = tf.global_variables_initializer()

        # session
        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.49,
                                    allow_growth=True)
        self.sess = tf.Session(graph=self.g,
                               config=tf.ConfigProto(
                                   gpu_options=gpu_options,
                                   allow_soft_placement=True))
        self.sess.run(self.init)

    def _placeholders(self):
        self.input_ph = tf.placeholder(tf.float32, (None, self.input_dim),
                                       name='inputs')
        self.labels_ph = tf.placeholder(tf.float32, (None, ), name='labels')
        self.weights_ph = tf.placeholder(tf.float32, (None, ), name='weights')
        self.lr_ph = tf.placeholder(tf.float32, (), name='learning_rate')

    def _nn_disc(self):
        hid1_size = 300
        hid2_size = 200
        self.lr = 1e-4
        '''
		hid1_size = self.obs_dim * 10
		hid3_size = self.act_dim * 10
		hid2_size = int(np.sqrt(hid1_size * hid3_size))
		self.lr = 9e-4 / np.sqrt(hid2_size)
		'''
        out = tf.layers.dense(self.input_ph,
                              hid1_size,
                              tf.tanh,
                              kernel_initializer=tf.random_normal_initializer(
                                  stddev=np.sqrt(1 / self.obs_dim)),
                              name="h1")
        out = tf.layers.dense(out,
                              hid2_size,
                              tf.tanh,
                              kernel_initializer=tf.random_normal_initializer(
                                  stddev=np.sqrt(1 / hid1_size)),
                              name="h2")
        '''
		out = tf.layers.dense(out, hid3_size, tf.tanh,
		                      kernel_initializer=tf.random_normal_initializer(
		                          stddev=np.sqrt(1 / hid2_size)), name="h3")
		'''

        scores = tf.layers.dense(
            out,
            1,
            tf.identity,
            kernel_initializer=tf.random_normal_initializer(
                stddev=np.sqrt(1 / hid2_size)),
            name="scores")

        self.scores = tf.squeeze(scores)

        # rewards could be clipped
        self.reward_op = -tf.log(1 - tf.nn.sigmoid(self.scores))
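        # i.e. the GAIL-style surrogate reward r(s, a) = -log(1 - D(s, a)) with D = sigmoid(scores):
        # the more expert-like the discriminator judges the input, the larger the reward.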

    def _loss_train_op(self):
        if self.loss_type == 'pure_gail':
            cross_loss = tf.nn.sigmoid_cross_entropy_with_logits(
                logits=self.scores, labels=self.labels_ph)
            # this extra entropy penalty is NOT included in the paper
            # taken from the example provided by the authors
            # in the paper there is an entropy term for TRPO update???
            ent_loss = (1.0 - tf.nn.sigmoid(
                self.scores)) * self.scores + tf.nn.softplus(-self.scores)
            self.loss = tf.reduce_mean(
                (cross_loss - self.ent_reg_weight * ent_loss) *
                self.weights_ph)

            train_op = tf.train.AdamOptimizer(learning_rate=self.lr_ph)
            self.train_min = train_op.minimize(self.loss)
        elif self.loss_type == 'wasserstein':
            self.loss = tf.reduce_mean(
                (self.labels_ph * self.scores +
                 (1.0 - self.labels_ph) * self.scores) * self.weights_ph)
            train_op = tf.train.AdamOptimizer(learning_rate=self.lr_ph)
            self.train_min = train_op.minimize(self.loss)
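            # Note: labels * scores + (1 - labels) * scores reduces algebraically to scores;
            # a Wasserstein-style critic loss would normally give the real and generated
            # terms opposite signs.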

    def normalize_input(self, inpt):
        # update the running statistics with this batch, then normalize it
        self.scaler.update(inpt)
        # scaler.get() returns (scale, offset) in that order
        scale, offset = self.scaler.get()
        inpt = (inpt - offset) * scale
        return inpt

    def get_rewards(self, gen_obs, gen_acts=None):
        # normalize generator observations with the same scaler statistics used in update()
        scale, offset = self.scaler.get()
        gen_obs = (gen_obs - offset) * scale
        gen_input = gen_obs
        if self.input_type == 'states_actions':
            gen_input = np.concatenate([gen_obs, gen_acts], axis=1)
        return self.sess.run(self.reward_op,
                             feed_dict={self.input_ph: gen_input})

    def update(self, exp_obs, gen_obs):
        # shuffle generator observations
        gen_obs = shuffle(gen_obs)

        obs = np.concatenate([gen_obs, exp_obs], axis=0)
        obs = self.normalize_input(obs)

        # number of generator examples
        gen_num = gen_obs.shape[0]
        exp_num = exp_obs.shape[0]

        # create labels and mark real/fake
        labels = np.zeros((gen_num + exp_num))
        labels[gen_num:] = 1.0

        # calc loss weight
        weights = np.zeros((gen_num + exp_num))
        weights[:gen_num] = gen_num / (gen_num + exp_num)
        weights[gen_num:] = exp_num / (gen_num + exp_num)

        for i in range(self.epochs):
            inpt, labels, weights = shuffle(obs, labels, weights)
            # split the shuffled data into mini-batches; one gradient step per batch
            bobs = np.array_split(inpt, self.epochs, axis=0)
            blabs = np.array_split(labels, self.epochs)
            bweg = np.array_split(weights, self.epochs)
            for j in range(self.epochs):
                loss, _ = self.sess.run(
                    [self.loss, self.train_min],
                    feed_dict={
                        self.input_ph: bobs[j],
                        self.labels_ph: blabs[j],
                        self.weights_ph: bweg[j],
                        self.lr_ph: self.lr * self.lr_mult
                    })

        # evaluate the discriminator
        scores = self.sess.run(self.scores, feed_dict={self.input_ph: obs})

        def sigmoid(x):
            return 1 / (1 + np.exp(-x))

        gen_corr = np.sum((sigmoid(scores[:gen_num]) < 0.5))
        exp_corr = np.sum((sigmoid(scores[gen_num:]) > 0.5))
        gen_acc = gen_corr / gen_num
        exp_acc = exp_corr / exp_num
        total_acc = (gen_corr + exp_corr) / (gen_num + exp_num)

        # log necessary info
        #self.logger.log('gen_acc', gen_acc)
        #self.logger.log('exp_acc', exp_acc)
        #self.logger.log('total_acc', total_acc)
        return gen_acc, exp_acc, total_acc

    def close_session(self):
        self.sess.close()
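
# A minimal usage sketch for the Discriminator (an assumption, not from the original file);
# the dimensions and the expert_obs / generator_obs arrays of shape (N, obs_dim) are hypothetical.
#
#     disc = Discriminator(obs_dim=11, act_dim=3, ent_reg_weight=1e-3, epochs=5,
#                          input_type='states', loss_type='pure_gail', logger=None)
#     gen_acc, exp_acc, total_acc = disc.update(expert_obs, generator_obs)
#     rewards = disc.get_rewards(generator_obs)  # surrogate rewards for the policy update
#     disc.close_session()
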
Example #6
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size, nprocs,
         policy_hid_list, valfunc_hid_list, gpu_pct, restore_path, animate,
         submit):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        batch_size: number of episodes per policy training batch
    """
    # killer = GracefulKiller()

    env, obs_dim, act_dim = init_osim(animate)
    env.seed(111 + mpi_util.rank)
    mpi_util.set_global_seeds(111 + mpi_util.rank)

    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    now = datetime.utcnow().strftime(
        "%b-%d_%H:%M:%S")  # create unique directories
    if mpi_util.rank == 0:
        #aigym_path = os.path.join('/tmp', env_name, now)
        #env = wrappers.Monitor(env, aigym_path, force=True)
        logger = Logger(logname=env_name, now=now)

    episode = 0

    checkpoint = Checkpoint("saves", now)
    # restore from checkpoint?
    if restore_path:
        (policy, val_func, scaler, episode, obs_dim, act_dim,
         kl_targ) = checkpoint.restore(restore_path)
    else:
        policy = Policy(obs_dim, act_dim, kl_targ)
        val_func = NNValueFunction(obs_dim)
        scaler = Scaler(obs_dim)

        if mpi_util.rank == 0:
            # run a few episodes (on node 0) of untrained policy to initialize scaler:
            trajectories = run_policy(env, policy, scaler, episodes=5)

            unscaled = np.concatenate(
                [t['unscaled_obs'] for t in trajectories])
            scaler.update(
                unscaled)  # update running statistics for scaling observations

        # broadcast policy weights, scaler, val_func
        (policy, scaler, val_func) = mpi_util.broadcast_policy_scaler_val(
            policy, scaler, val_func)

        if mpi_util.rank == 0:
            checkpoint.save(policy, val_func, scaler, episode)

    if animate:
        observes, actions, rewards, unscaled_obs = run_episode(env,
                                                               policy,
                                                               scaler,
                                                               animate=animate)
        exit(0)

    if submit:
        # Settings
        #remote_base = 'http://grader.crowdai.org:1729'
        remote_base = 'http://grader.crowdai.org:1730'
        token = 'a83412a94593cae3a491f3ee28ff44e1'

        client = Client(remote_base)

        # Create environment
        observation = client.env_create(token)
        step = 0.0
        observes, actions, rewards, unscaled_obs = [], [], [], []
        scale, offset = scaler.get()
        scale[-1] = 1.0  # don't scale time step feature
        offset[-1] = 0.0  # don't offset time step feature

        # Run the grader episodes.
        # The grader runs 3 simulations of at most 1000 steps each; we stop after the last one.
        while True:
            obs = np.array(observation).astype(np.float32).reshape((1, -1))
            print("OBSERVATION TYPE:", type(obs), obs.shape)
            print(obs)
            obs = np.append(obs, [[step]], axis=1)  # add time step feature
            unscaled_obs.append(obs)
            obs = (obs - offset) * scale  # center and scale observations
            observes.append(obs)

            action = policy.sample(obs).astype(np.float32).reshape((-1, 1))
            print("ACTION TYPE:", type(action), action.shape)
            print(action)
            actions.append(action)

            [observation, reward, done,
             info] = client.env_step(action.tolist())
            print("step:", step, "reward:", reward)

            if not isinstance(reward, float):
                reward = np.asscalar(reward)
            rewards.append(reward)
            step += 1e-3  # increment time step feature

            if done:
                print(
                    "================================== RESTARTING ================================="
                )
                observation = client.env_reset()
                step = 0.0
                observes, actions, rewards, unscaled_obs = [], [], [], []
                scale, offset = scaler.get()
                scale[-1] = 1.0  # don't scale time step feature
                offset[-1] = 0.0  # don't offset time step feature
                if not observation:
                    break

        client.submit()
        exit(0)

    ######

    worker_batch_size = int(batch_size / mpi_util.nworkers)  # HACK
    if (worker_batch_size * mpi_util.nworkers != batch_size):
        print("batch_size:", batch_size, " is not divisible by nworkers:",
              mpi_util.nworkers)
        exit(1)
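    # e.g. batch_size=20 with mpi_util.nworkers=4 gives worker_batch_size=5 episodes per worker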

    batch = 0
    while episode < num_episodes:
        if mpi_util.rank == 0 and batch > 0 and batch % 10 == 0:
            checkpoint.save(policy, val_func, scaler, episode)
        batch = batch + 1

        trajectories = run_policy(env,
                                  policy,
                                  scaler,
                                  episodes=worker_batch_size)
        trajectories = mpi_util.gather_trajectories(trajectories)

        if mpi_util.rank == 0:
            # concatenate trajectories into one list
            trajectories = list(itertools.chain.from_iterable(trajectories))
            print("did a batch of ", len(trajectories), " trajectories")
            print([t['rewards'].sum() for t in trajectories])

            episode += len(trajectories)
            add_value(trajectories,
                      val_func)  # add estimated values to episodes
            add_disc_sum_rew(trajectories,
                             gamma)  # calculated discounted sum of Rs
            add_gae(trajectories, gamma, lam)  # calculate advantage

            # concatenate all episodes into single NumPy arrays
            observes, actions, advantages, disc_sum_rew = build_train_set(
                trajectories)

            # add various stats to training log:
            logger.log({
                '_MeanReward':
                np.mean([t['rewards'].sum() for t in trajectories]),
                'Steps':
                np.sum([t['observes'].shape[0] for t in trajectories])
            })
            log_batch_stats(observes, actions, advantages, disc_sum_rew,
                            logger, episode)

            policy.update(observes, actions, advantages,
                          logger)  # update policy
            val_func.fit(observes, disc_sum_rew,
                         logger)  # update value function

            unscaled = np.concatenate(
                [t['unscaled_obs'] for t in trajectories])
            scaler.update(
                unscaled)  # update running statistics for scaling observations

            logger.write(
                display=True)  # write logger results to file and stdout

        # if mpi_util.rank == 0 and killer.kill_now:
        #     if input('Terminate training (y/[n])? ') == 'y':
        #         break
        #     killer.kill_now = False

        # broadcast policy weights, scaler, val_func
        (policy, scaler, val_func) = mpi_util.broadcast_policy_scaler_val(
            policy, scaler, val_func)

    if mpi_util.rank == 0: logger.close()
    policy.close_sess()
    if mpi_util.rank == 0: val_func.close_sess()
Example #7
class Experiment:

    def __init__(self, env_name, discount, num_iterations, lamb, animate, kl_target):
        self.env = gym.make(env_name)
        gym.spaces.seed(1234)
        self.obs_dim = self.env.observation_space.shape[0] + 1 # the use of time steps is beneficial
        self.act_dim = self.env.action_space.shape[0]
        self.discount = discount
        self.num_iterations = num_iterations
        self.lamb = lamb
        self.animate = animate
        self.killer = GracefulKiller()
        self.policy = LinearPolicy(self.obs_dim, self.act_dim, self.env.action_space, kl_target, discount=discount)
        self.value_func = LinearValueFunc(self.obs_dim, discount=discount)

        # save copies of file
        shutil.copy(inspect.getfile(self.policy.__class__), OUTPATH)
        shutil.copy(inspect.getfile(self.value_func.__class__), OUTPATH)
        shutil.copy(inspect.getfile(self.__class__), OUTPATH)

        self.log_file = open(OUTPATH + 'log.csv', 'w')
        self.write_header = True
        print('observation dimension:', self.obs_dim)
        print('action dimension:', self.act_dim)
        self.scaler = Scaler(self.obs_dim)
        self.init_scaler()

    def init_scaler(self):
        print('fitting scaler')
        observation_samples = []
        for i in range(5):
            observation = []
            obs = self.env.reset()
            observation.append(obs)
            obs = obs.astype(np.float64).reshape((1, -1))
            done = False
            step = 0
            while not done:
                obs = np.append(obs, [[step]], axis=1)  # add time step feature
                action = self.policy.get_sample(obs).reshape((1, -1)).astype(np.float64)
                obs_new, reward, done, _ = self.env.step(action)
                observation.append(obs_new)
                obs = obs_new.astype(np.float64).reshape((1, -1))
                step += 1e-3
            observation_samples.append(observation)
        observation_samples = np.concatenate(observation_samples, axis=0)
        # print(observation_samples.shape)
        self.scaler.update(observation_samples)

    def normalize_obs(self, obs):
        scale, offset = self.scaler.get()
        obs_scaled = (obs-offset)*scale
        self.scaler.update(obs.astype(np.float64).reshape((1, -1)))
        return obs_scaled

    def run_one_episode(self, train_policy=True, train_value_func=True, animate=False):
        obs = self.env.reset()
        obs = obs.astype(np.float64).reshape((1, -1))
        obs = self.normalize_obs(obs)
        obs = np.append(obs, [[0]], axis=1)  # add time step feature
        log = {
            'rewards': [],
            'policy_loss': [],
            'value_func_loss': [],
            'entropy': [],
            'beta': [],
            'kl': [],
            'advantage':[]
        }

        done = False
        step = 0
        while not done:
            if animate:
                self.env.render()
            action = self.policy.get_sample(obs).reshape((1, -1)).astype(np.float64)
            step += 1e-3
            # print(action)
            obs_new, reward, done, _ = self.env.step(action)
            obs_new = obs_new.astype(np.float64).reshape((1, -1))
            obs_new = self.normalize_obs(obs_new)
            obs_new = np.append(obs_new, [[step]], axis=1)  # add time step feature

            if not isinstance(reward, float):
                reward = np.asscalar(reward)
            log['rewards'].append(reward)

            # scale reward
            if self.discount < 0.999:
                reward *= (1-self.discount)

            # TD residual
            advantage = reward + self.discount * self.value_func.predict(obs_new) - self.value_func.predict(obs)
            advantage = advantage.astype(np.float64).reshape((1,))
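            # one-step TD residual used directly as the advantage: delta = r + gamma * V(s') - V(s)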

            if train_value_func:
                value_func_loss = self.value_func.update(obs, advantage)
            if train_policy:
                policy_loss, kl, entropy, beta = self.policy.update(obs, action, advantage)

            if train_value_func and train_policy:
                log['policy_loss'].append(policy_loss)
                log['kl'].append(kl)
                log['entropy'].append(entropy)
                log['beta'].append(beta)
                log['value_func_loss'].append(value_func_loss)
                log['advantage'].append(advantage)

            obs = obs_new

        return log

    def run_expr(self):
        ep_steps = []
        ep_rewards = []
        for i in range(self.num_iterations):
            # trace vectors are emptied at the beginning of each episode

            # get more accurate value_func estimator
            for _ in range(5):
                self.value_func.init_trace()
                self.run_one_episode(train_value_func=True, train_policy=False, animate=False)

            self.policy.init_trace()
            self.value_func.init_trace()

            # run (and train) one trajectory
            log = self.run_one_episode(animate=self.animate)

            # compute statistics such as mean and std
            log['steps'] = len(log['rewards'])
            log['rewards'] = np.sum(log['rewards'])
            for key in ['policy_loss', 'kl', 'entropy', 'beta', 'value_func_loss', 'advantage']:
                log[key + '_mean'] = np.mean(log[key])
                log[key + '_std'] = np.std(log[key])
                del log[key]

            # display
            print('episode: ', i)
            print('total steps: {0}, episodic rewards: {1}'.format(log['steps'], log['rewards']))
            for key in ['policy_loss', 'kl', 'entropy', 'beta', 'value_func_loss', 'advantage']:
                print('{:s}: {:.2g}({:.2g})'.format(key, log[key + '_mean'], log[key + '_std']))
            print('\n')
            ep_steps.append(log['steps'])
            ep_rewards.append(log['rewards'])

            # write to log.csv
            if self.write_header:
                fieldnames = [x for x in log.keys()]
                self.writer = csv.DictWriter(self.log_file, fieldnames=fieldnames)
                self.writer.writeheader()
                self.write_header = False
            self.writer.writerow(log)
            # we want the csv file to preserve information even if the program terminates earlier than scheduled.
            self.log_file.flush()

            # allow manual early termination (weights are saved after the loop)
            if self.killer.kill_now:
                if input('Terminate training (y/[n])? ') == 'y':
                    break
                self.killer.kill_now = False

            # if (i+1)%20 == 0:
            #     print('episode: ', i+1)
            #     print('average steps', np.average(steps))
            #     print('average rewards', np.average(rewards))

        # save weights
        self.policy.save(OUTPATH)
        self.value_func.save(OUTPATH)
        self.scaler.save(OUTPATH)

        plt.subplot(121)
        plt.xlabel('episodes')
        plt.ylabel('steps')
        plt.plot(ep_steps)

        plt.subplot(122)
        plt.xlabel('episodes')
        plt.ylabel('episodic rewards')
        plt.plot(ep_rewards)

        plt.savefig(OUTPATH + 'train.png')
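The Experiment above leans on an external Scaler for observation normalization (see init_scaler and normalize_obs, and the scale/offset handling throughout these examples). As a rough illustration only, a running-statistics tracker compatible with the get()/update() calls used here might look like the sketch below; the inverse-stddev style scale and the 0.1 stabilizer are assumptions, not the original implementation.

import numpy as np

class RunningScaler:
    """Illustrative running mean/variance tracker with the get()/update()
    interface assumed by the examples above (a sketch, not the original Scaler)."""

    def __init__(self, obs_dim):
        self.means = np.zeros(obs_dim)
        self.vars = np.zeros(obs_dim)
        self.count = 0

    def update(self, x):
        """x: (batch, obs_dim) array of unscaled observations."""
        n = x.shape[0]
        if self.count == 0:
            self.means = np.mean(x, axis=0)
            self.vars = np.var(x, axis=0)
        else:
            new_means = (self.means * self.count + x.sum(axis=0)) / (self.count + n)
            # combine second moments: Var[x] = E[x^2] - E[x]^2
            old_sq = self.vars + self.means ** 2
            new_sq = (old_sq * self.count + (x ** 2).sum(axis=0)) / (self.count + n)
            self.means = new_means
            self.vars = np.maximum(0.0, new_sq - new_means ** 2)
        self.count += n

    def get(self):
        # offset is the running mean; the scale (1 / (std + 0.1)) is an assumed form
        return 1.0 / (np.sqrt(self.vars) + 0.1), self.means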
Example #8
class Policy():
    def __init__(self,
                 name,
                 obs_dim,
                 act_dim,
                 n_ways,
                 batch_size,
                 log_path,
                 gamma=0.995,
                 lam=0.98,
                 kl_targ=0.003,
                 hid1_mult=10,
                 policy_logvar=1.0):
        self.name = name
        self.obs_dim, self.act_dim = obs_dim, act_dim
        self.n_ways = n_ways
        self.batch_size = batch_size
        self.gamma = gamma
        self.lam = lam
        self.kl_targ = kl_targ
        self.hid1_mult = hid1_mult
        self.policy_logvar = policy_logvar
        self.logger = Logger(logname=os.path.join(log_path, name),
                             now=datetime.utcnow().strftime("%b_%d_%H_%M_%S"))

        self.scaler = Scaler(self.obs_dim)
        self.val_func = NNValueFunction(self.obs_dim, hid1_mult=hid1_mult)
        self.trpo_net = TrpoNet(name,
                                self.obs_dim,
                                self.act_dim,
                                n_ways=n_ways,
                                kl_targ=kl_targ,
                                hid1_mult=hid1_mult,
                                policy_logvar=policy_logvar)

        self.trajectories = []
        self.episode = 0

    def update_scaler(self, unscaled):
        self.scaler.update(
            unscaled)  # update running statistics for scaling observations

    def update(self,
               unscaled_obs,
               actions,
               rewards,
               env_idx=-1,
               trainWeight=False):
        scale, offset = self.scaler.get()
        scale[-1] = 1.0
        offset[-1] = 0.0
        observes = (unscaled_obs - offset) * scale
        trajectory = {
            'observes': observes,
            'actions': actions,
            'rewards': rewards,
            'unscaled_obs': unscaled_obs
        }
        self.trajectories.append(trajectory)
        if len(self.trajectories) > self.batch_size:
            unscaled = np.concatenate(
                [t['unscaled_obs'] for t in self.trajectories])
            self.scaler.update(
                unscaled)  # update running statistics for scaling observations
            self.logger.log({
                '_{}_MeanReward'.format(self.name):
                np.mean([t['rewards'].sum() for t in self.trajectories]),
                '_{}_steps'.format(self.name):
                unscaled.shape[0] / self.batch_size
            })
            trajs = copy.deepcopy(self.trajectories)
            self.trajectories = []

            self.episode += len(trajs)
            self._add_value(trajs,
                            self.val_func)  # add estimated values to episodes
            self._add_disc_sum_rew(trajs, self.gamma)  # calculate discounted sum of rewards
            self._add_gae(trajs, self.gamma, self.lam)  # calculate advantage
            # concatenate all episodes into single NumPy arrays
            observes, actions, advantages, disc_sum_rew = self._build_train_set(
                trajs)
            self._log_batch_stats(observes, actions, advantages, disc_sum_rew,
                                  self.logger, self.episode)
            self.trpo_net.update(observes,
                                 actions,
                                 advantages,
                                 env_idx,
                                 self.logger,
                                 trainWeight=trainWeight)  # update policy
            self.val_func.fit(observes, disc_sum_rew,
                              self.logger)  # update value function

            self.logger.write(display=False)

    def act(self, unscaled_obs):
        scale, offset = self.scaler.get()
        scale[-1] = 1.0  # don't scale time step feature
        offset[-1] = 0.0  # don't offset time step feature
        #print(self.name,unscaled_obs.shape,len(offset))
        obs = (unscaled_obs - offset) * scale
        action = self.trpo_net.sample(obs).reshape((1, -1)).astype(np.float32)
        return action

    def addway(self):
        self.n_ways += 1

        var_dict = self.trpo_net.get_vars()
        new_pi = TrpoNet(self.name, self.obs_dim, self.act_dim, self.n_ways,
                         self.kl_targ, self.hid1_mult, self.policy_logvar)
        new_pi.set_vars(var_dict)
        self.trpo_net.close_sess()
        self.trpo_net = new_pi
        gc.collect()

    def close_session(self):
        self.val_func.close_sess()
        self.trpo_net.close_sess()

    def _discount(self, x, gamma):
        """ Calculate discounted forward sum of a sequence at each point """
        return scipy.signal.lfilter([1.0], [1.0, -gamma], x[::-1])[::-1]
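    # Illustrative check: with gamma = 0.5 and x = [1., 2., 3.], _discount returns
    # [2.75, 3.5, 3.0], since 3.5 = 2 + 0.5 * 3 and 2.75 = 1 + 0.5 * 3.5.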

    def _add_value(self, trajectories, val_func):
        """ Adds estimated value to all time steps of all trajectories

        Args:
            trajectories: as returned by run_policy()
            val_func: object with predict() method, takes observations
                and returns predicted state value

        Returns:
            None (mutates trajectories dictionary to add 'values')
        """
        for trajectory in trajectories:
            observes = trajectory['observes']
            values = val_func.predict(observes)
            trajectory['values'] = values

    def _add_disc_sum_rew(self, trajectories, gamma):
        """ Adds discounted sum of rewards to all time steps of all trajectories

        Args:
            trajectories: as returned by run_policy()
            gamma: discount

        Returns:
            None (mutates trajectories dictionary to add 'disc_sum_rew')
        """
        for trajectory in trajectories:
            if gamma < 0.999:  # don't scale for gamma ~= 1
                rewards = trajectory['rewards'] * (1 - gamma)
            else:
                rewards = trajectory['rewards']
            disc_sum_rew = self._discount(rewards, gamma)
            trajectory['disc_sum_rew'] = disc_sum_rew

    def _add_gae(self, trajectories, gamma, lam):
        """ Add generalized advantage estimator.
        https://arxiv.org/pdf/1506.02438.pdf

        Args:
            trajectories: as returned by run_policy(), must include 'values'
                key from add_value().
            gamma: reward discount
            lam: lambda (see paper).
                lam=0 : use TD residuals
                lam=1 : A =  Sum Discounted Rewards - V_hat(s)

        Returns:
            None (mutates trajectories dictionary to add 'advantages')
        """
        for trajectory in trajectories:
            if gamma < 0.999:  # don't scale for gamma ~= 1
                rewards = trajectory['rewards'] * (1 - gamma)
            else:
                rewards = trajectory['rewards']
            values = trajectory['values']
            # temporal differences: delta_t = r_t + gamma * V(s_{t+1}) - V(s_t),
            # with the value after the final step treated as 0
            tds = rewards - values + np.append(values[1:] * gamma, 0)
            advantages = self._discount(tds, gamma * lam)
            trajectory['advantages'] = advantages
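    # Note: _discount(tds, gamma * lam) above implements the GAE recursion
    # A_t = delta_t + gamma * lam * A_{t+1}, with delta_t = r_t + gamma * V_{t+1} - V_t
    # and the value after the final step treated as 0.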

    def _build_train_set(self, trajectories):
        """

        Args:
            trajectories: trajectories after processing by add_disc_sum_rew(),
                add_value(), and add_gae()

        Returns: 4-tuple of NumPy arrays
            observes: shape = (N, obs_dim)
            actions: shape = (N, act_dim)
            advantages: shape = (N,)
            disc_sum_rew: shape = (N,)
        """
        observes = np.concatenate([t['observes'] for t in trajectories])
        actions = np.concatenate([t['actions'] for t in trajectories])
        disc_sum_rew = np.concatenate(
            [t['disc_sum_rew'] for t in trajectories])
        advantages = np.concatenate([t['advantages'] for t in trajectories])
        # normalize advantages
        advantages = (advantages - advantages.mean()) / (advantages.std() +
                                                         1e-6)

        return observes, actions, advantages, disc_sum_rew

    def _log_batch_stats(self, observes, actions, advantages, disc_sum_rew,
                         logger, episode):
        """ Log various batch statistics """
        logger.log({
            '_mean_obs': np.mean(observes),
            '_min_obs': np.min(observes),
            '_max_obs': np.max(observes),
            '_std_obs': np.mean(np.var(observes, axis=0)),
            '_mean_act': np.mean(actions),
            '_min_act': np.min(actions),
            '_max_act': np.max(actions),
            '_std_act': np.mean(np.var(actions, axis=0)),
            '_mean_adv': np.mean(advantages),
            '_min_adv': np.min(advantages),
            '_max_adv': np.max(advantages),
            '_std_adv': np.var(advantages),
            '_mean_discrew': np.mean(disc_sum_rew),
            '_min_discrew': np.min(disc_sum_rew),
            '_max_discrew': np.max(disc_sum_rew),
            '_std_discrew': np.var(disc_sum_rew),
            '_Episode': episode
        })
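A hedged usage sketch of the wrapper above: it is meant to be fed one trajectory at a time, buffering them until more than batch_size have accumulated, at which point update() refreshes the scaler, value function, and TRPO policy. The environment name, log path, episode count, and time-step increment below are assumptions for illustration, not code from the original project.

import gym
import numpy as np

env = gym.make('Hopper-v2')                      # assumed environment
obs_dim = env.observation_space.shape[0] + 1     # +1 for the time-step feature
act_dim = env.action_space.shape[0]
agent = Policy('hopper', obs_dim, act_dim, n_ways=1, batch_size=20, log_path='/tmp/logs')

for episode in range(1000):
    obs_list, act_list, rew_list = [], [], []
    obs, step, done = env.reset(), 0.0, False
    while not done:
        obs = np.append(obs.astype(np.float64), [step]).reshape((1, -1))
        obs_list.append(obs)
        action = agent.act(obs)
        obs, reward, done, _ = env.step(np.squeeze(action, axis=0))
        act_list.append(action)
        rew_list.append(reward)
        step += 1e-3
    # one trajectory per call; training fires once more than batch_size trajectories accumulate
    agent.update(np.concatenate(obs_list), np.concatenate(act_list),
                 np.array(rew_list, dtype=np.float64))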
Example #9
class Agent:
    #Warning! policy.py and critic.py are still work in progress and contain many global variables that should be converted to
    #class member variables. Before that is done, all instances of Agent must use the same values for the following:
    #PPOepsilon,nHidden,nUnitsPerLayer,activation,H,entropyLossWeight,sdLowLimit
    def __init__(self,
                 stateDim: int,
                 actionDim: int,
                 actionMin: np.array,
                 actionMax: np.array,
                 learningRate=0.0005,
                 gamma=0.99,
                 GAElambda=0.95,
                 PPOepsilon=0.2,
                 PPOentropyLossWeight=0,
                 nHidden: int = 2,
                 nUnitsPerLayer: int = 128,
                 mode="PPO-CMA-m",
                 activation="lrelu",
                 H: int = 9,
                 entropyLossWeight: float = 0,
                 sdLowLimit=0.01,
                 useScaler: bool = True,
                 criticTimestepScale=0.001):
        #Create policy network
        print("Creating policy")
        self.actionMin = actionMin.copy()
        self.actionMax = actionMax.copy()
        self.actionDim = actionDim
        self.stateDim = stateDim
        self.useScaler = useScaler
        if useScaler:
            self.scaler = Scaler(stateDim)
        self.scalerInitialized = False
        self.normalizeAdvantages = True
        self.gamma = gamma
        self.GAElambda = GAElambda
        self.criticTimestepScale = 0 if gamma == 0 else criticTimestepScale  #with gamma==0, no need for this
        piEpsilon = None
        nHistory = 1
        negativeAdvantageAvoidanceSigma = 0
        if mode == "PPO-CMA" or mode == "PPO-CMA-m":
            usePPOLoss = False  #if True, we use PPO's clipped surrogate loss function instead of the standard -A_i * log(pi(a_i | s_i))
            separateVarAdapt = True
            self.reluAdvantages = True if mode == "PPO-CMA" else False
            nHistory = H  #policy mean adapts immediately, policy covariance as an aggregate of this many past iterations
            useSigmaSoftClip = True
            negativeAdvantageAvoidanceSigma = 1 if mode == "PPO-CMA-m" else 0
        elif mode == "PPO":
            usePPOLoss = True  #if True, we use PPO's clipped surrogate loss function instead of the standard -A_i * log(pi(a_i | s_i))
            separateVarAdapt = False
            # separateSigmaAdapt=False
            self.reluAdvantages = False
            useSigmaSoftClip = True
            piEpsilon = 0
        else:
            raise ("Unknown mode {}".format(mode))
        self.policy = Policy(
            stateDim,
            actionDim,
            actionMin,
            actionMax,
            entropyLossWeight=PPOentropyLossWeight,
            networkActivation=activation,
            networkDepth=nHidden,
            networkUnits=nUnitsPerLayer,
            networkSkips=False,
            learningRate=learningRate,
            minSigma=sdLowLimit,
            PPOepsilon=PPOepsilon,
            usePPOLoss=usePPOLoss,
            separateVarAdapt=separateVarAdapt,
            nHistory=nHistory,
            useSigmaSoftClip=useSigmaSoftClip,
            piEpsilon=piEpsilon,
            negativeAdvantageAvoidanceSigma=negativeAdvantageAvoidanceSigma)

        #Create critic network, +1 stateDim because at least in OpenAI gym, episodes are time-limited and the value estimates thus depend on simulation time.
        #Thus, we use time step as an additional feature for the critic.
        #Note that this does not mess up generalization, as the feature is not used for the policy during training or at runtime
        print("Creating critic network")
        self.critic = Critic(stateDim=stateDim + 1,
                             learningRate=learningRate,
                             nHidden=nHidden,
                             networkUnits=nUnitsPerLayer,
                             networkActivation=activation,
                             useSkips=False,
                             lossType="L1")

        #Experience trajectory buffers for the memorize() and updateWithMemorized() methods
        self.experienceTrajectories = []
        self.currentTrajectory = []

    #call this after tensorflow's global variables initializer
    def init(self, sess: tf.Session, verbose=False):
        #Pretrain the policy to output the initial exploration Gaussian for all states
        #(mean at the midpoint of the action range, sd of half the range)
        self.policy.init(
            sess, 0, 1,
            0.5 * (self.actionMin + self.actionMax) * np.ones(self.actionDim),
            0.5 * (self.actionMax - self.actionMin) * np.ones(self.actionDim),
            256, 2000, verbose)

    #stateObs is an n-by-m tensor, where n = number of observations, m = number of observation variables
    def act(self,
            sess: tf.Session,
            stateObs: np.array,
            deterministic=False,
            clipActionToLimits=True):
        #Expand a single 1d-observation into a batch of 1 vectors
        if len(stateObs.shape) == 1:
            stateObs = np.reshape(stateObs, [1, stateObs.shape[0]])
        #Query the policy for the action, except for the first iteration where we sample directly from the initial exploration Gaussian
        #that covers the whole action space.
        #This is done because we don't know the scale of state observations a priori; thus, we can only init the state scaler in update(),
        #after we have collected some experience.
        if self.useScaler and (not self.scalerInitialized):
            actions = np.random.normal(
                0.5 * (self.actionMin + self.actionMax) *
                np.ones(self.actionDim),
                0.5 * (self.actionMax - self.actionMin) *
                np.ones(self.actionDim),
                size=[stateObs.shape[0], self.actionDim])
            if clipActionToLimits:
                actions = np.clip(
                    actions, np.reshape(self.actionMin, [1, self.actionDim]),
                    np.reshape(self.actionMax, [1, self.actionDim]))
            return actions
        else:
            if self.useScaler:
                scaledObs = self.scaler.process(stateObs)
            else:
                scaledObs = stateObs
            if deterministic:
                actions = self.policy.getExpectation(sess, scaledObs)
            else:
                actions = self.policy.sample(sess, scaledObs)
            if clipActionToLimits:
                actions = np.clip(actions, self.actionMin, self.actionMax)
            return actions

    def memorize(self, observation: np.array, action: np.array, reward: float,
                 nextObservation: np.array, done: bool):
        e = Experience(observation, action, reward, nextObservation, done)
        self.currentTrajectory.append(e)
        if done:
            self.experienceTrajectories.append(self.currentTrajectory)
            self.currentTrajectory = []

    def getAverageActionStdev(self):
        if self.useScaler and (not self.scalerInitialized):
            return np.mean(0.5 * (self.actionMax - self.actionMin))
        else:
            return self.policy.usedSigmaSum / (1e-20 +
                                               self.policy.usedSigmaSumCounter)

    #If you call memorize() after each action, you can update the agent with this method.
    #If you handle the experience buffers yourself, e.g., due to a multithreaded implementation, use the update() method instead.
    def updateWithMemorized(self,
                            sess: tf.Session,
                            batchSize: int = 512,
                            nBatches: int = 100,
                            verbose=True,
                            valuesValid=False,
                            timestepsValid=False):
        self.update(sess,
                    experienceTrajectories=self.experienceTrajectories,
                    batchSize=batchSize,
                    nBatches=nBatches,
                    verbose=verbose,
                    valuesValid=valuesValid,
                    timestepsValid=timestepsValid)
        averageEpisodeReturn = 0
        for t in self.experienceTrajectories:
            episodeReturn = 0
            for e in t:
                episodeReturn += e.r
            averageEpisodeReturn += episodeReturn
        averageEpisodeReturn /= len(self.experienceTrajectories)
        self.experienceTrajectories = []
        self.currentTrajectory = []
        return averageEpisodeReturn

    #experienceTrajectories is a list of lists of Experience instances such that each of the contained lists corresponds to an episode simulation trajectory
    def update(self,
               sess: tf.Session,
               experienceTrajectories,
               batchSize: int = 512,
               nBatches: int = 100,
               verbose=True,
               valuesValid=False,
               timestepsValid=False):
        trajectories = experienceTrajectories  #shorthand

        #Collect all data into linear arrays for training.
        nTrajectories = len(trajectories)
        nData = 0
        for trajectory in trajectories:
            nData += len(trajectory)
            #propagate values backwards along trajectory if not already done
            if not valuesValid:
                for i in reversed(range(len(trajectory) - 1)):
                    #value estimates, used for training the critic and estimating advantages
                    trajectory[i].V = trajectory[i].r + self.gamma * trajectory[i + 1].V
            #update time steps if not updated
            if not timestepsValid:
                for i in range(len(trajectory)):
                    trajectory[i].timeStep = i
        allStates = np.zeros([nData, self.stateDim])
        allActions = np.zeros([nData, self.actionDim])
        allValues = np.zeros([nData])
        allTimes = np.zeros([nData, 1])
        k = 0
        for trajectory in trajectories:
            for e in trajectory:
                allStates[k, :] = e.s
                allValues[k] = e.V
                allActions[k, :] = e.a
                allTimes[k, 0] = e.timeStep * self.criticTimestepScale
                k += 1

        #Update scalers
        if self.useScaler:
            self.scaler.update(allStates)
            scale, offset = self.scaler.get()
            self.scalerInitialized = True
        else:
            offset = 0
            scale = 1

        #Scale the observations for training the critic
        scaledStates = self.scaler.process(allStates)

        #Train critic
        def augmentCriticObs(obs: np.array, timeSteps: np.array):
            return np.concatenate([obs, timeSteps], axis=1)

        self.critic.train(sess,
                          augmentCriticObs(scaledStates, allTimes),
                          allValues,
                          batchSize,
                          nEpochs=0,
                          nBatches=nBatches,
                          verbose=verbose)

        #Policy training needs advantages, which depend on the critic we just trained.
        #We use Generalized Advantage Estimation by Schulman et al.
        if verbose:
            print("Estimating advantages...".format(len(trajectories)))
        for t in trajectories:
            #query the critic values of all states of this trajectory in one big batch
            nSteps = len(t)
            states = np.zeros([nSteps + 1, self.stateDim])
            timeSteps = np.zeros([nSteps + 1, 1])
            for i in range(nSteps):
                states[i, :] = t[i].s
                timeSteps[i, 0] = t[i].timeStep * self.criticTimestepScale
            states[nSteps, :] = t[nSteps - 1].s_next
            states = (states - offset) * scale
            values = self.critic.predict(sess,
                                         augmentCriticObs(states, timeSteps))

            #GAE loop, i.e., take the instantaneous advantage (how much value a single action brings, assuming that the
            #values given by the critic are unbiased), and smooth those along the trajectory using 1st-order IIR filter.
            for step in reversed(range(nSteps - 1)):
                delta_t = t[step].r + self.gamma * values[step + 1] - values[step]
                t[step].advantage = delta_t + self.GAElambda * self.gamma * t[step + 1].advantage

        #Gather the advantages to linear array and apply ReLU and normalization if needed
        allAdvantages = np.zeros([nData])
        k = 0
        for trajectory in trajectories:
            for e in trajectory:
                allAdvantages[k] = e.advantage
                k += 1

        if self.reluAdvantages:
            allAdvantages = np.clip(allAdvantages, 0, np.inf)
        if self.normalizeAdvantages:
            aMean = np.mean(allAdvantages)
            aSd = np.std(allAdvantages)
            if verbose:
                print("Advantage mean {}, sd{}".format(aMean, aSd))
            allAdvantages /= 1e-10 + aSd

        #Train policy. Note that this uses original unscaled states, because the PPO-CMA variance training needs a history of
        #states in the same scale
        self.policy.train(sess,
                          allStates,
                          allActions,
                          allAdvantages,
                          batchSize,
                          nEpochs=0,
                          nBatches=nBatches,
                          stateOffset=offset,
                          stateScale=scale,
                          verbose=verbose)
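A hedged sketch of the memorize()/updateWithMemorized() workflow described in the comments above, using the TF1-style session the type hints imply. The environment, episode counts, and iteration counts are assumptions for illustration only.

import gym
import numpy as np
import tensorflow as tf

env = gym.make('Pendulum-v0')                    # assumed environment
agent = Agent(stateDim=env.observation_space.shape[0],
              actionDim=env.action_space.shape[0],
              actionMin=env.action_space.low,
              actionMax=env.action_space.high)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    agent.init(sess)   # pretrain the policy to the initial exploration Gaussian
    for iteration in range(100):
        # collect a handful of episodes via memorize(), then train from the buffers
        for episode in range(8):
            obs, done = env.reset(), False
            while not done:
                action = agent.act(sess, obs)
                next_obs, reward, done, _ = env.step(action[0])
                agent.memorize(obs, action[0], reward, next_obs, done)
                obs = next_obs
        avg_return = agent.updateWithMemorized(sess, verbose=False)
        print('iteration {}: average episode return {:.2f}'.format(iteration, avg_return))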
Example #10
class GeneratorAgentPure(object):
    def __init__(self,
                 env,
                 policy_function,
                 value_function,
                 discriminator,
                 gamma,
                 lam,
                 init_qpos,
                 init_qvel,
                 logger=None):
        self.env = env
        self.obs_dim = env.observation_space.shape[0]
        self.act_dim = env.action_space.shape[0]

        self.policy = policy_function
        self.value = value_function
        self.discriminator = discriminator
        self.gamma = gamma
        self.lam = lam

        self.init_qpos = init_qpos
        self.init_qvel = init_qvel

        self.scaler = Scaler(self.obs_dim)

        # logger
        self.logger = logger

        # initialize the scaler's scale and offset by collecting an initial batch of timesteps
        self.collect(timesteps=2048)

    def discount(self, x, gamma):
        return scipy.signal.lfilter([1.0], [1.0, -gamma], x[::-1])[::-1]

    def get_random(self):
        idx = np.random.randint(low=0, high=self.init_qpos.shape[1], size=1)
        return np.squeeze(self.init_qpos[:, idx]), np.squeeze(self.init_qvel[:, idx])

    def collect(self, timesteps):
        trajectories = []
        trew_stat = []

        scale, offset = self.scaler.get()

        self.logger.log('scale_offset', [scale, offset])

        buffer_time = 0
        while buffer_time < timesteps:
            unscaled_obs, scaled_obs, actions, rewards = [], [], [], []
            egocentric = []
            done = False
            obs = self.env.reset()
            qpos, qvel = self.get_random()
            # we are setting initial qpos and qvel from expert
            self.env.set_state(qpos, qvel)
            timestep = 0
            while not done and timestep < 1000:
                obs = obs.astype(np.float32).reshape(1, -1)
                unscaled_obs.append(obs)
                obs = (obs - offset) * scale
                scaled_obs.append(obs)
                acts = self.policy.sample(obs)
                actions.append(acts.astype(np.float32).reshape(1, -1))
                obs, rew, done, _ = self.env.step(acts)
                rewards.append(rew)
                timestep += 1
                buffer_time += 1

            # statistics
            trew_stat.append(np.sum(rewards))

            # episode info
            traj_obs = np.concatenate(scaled_obs)
            traj_unscaled_obs = np.concatenate(unscaled_obs)
            traj_acts = np.concatenate(actions)
            #traj_rews = np.array(rewards, dtype=np.float64)
            traj_rews = np.squeeze(
                self.discriminator.get_rewards(traj_unscaled_obs, traj_acts))

            # scale rewards using running std of the experiment
            # traj_scaled_rews = traj_rews * np.squeeze(rew_scale)
            traj_scaled_rews = traj_rews

            # calculate discount sum of rewards
            traj_disc_rews = self.discount(traj_scaled_rews, self.gamma)

            # calculate advantages
            traj_values = self.value.predict(traj_obs)

            # one-step TD residuals: delta_t = r_t + gamma * V(s_{t+1}) - V(s_t),
            # with the value after the final step treated as 0; discounting them by
            # gamma * lam below gives the GAE advantage estimates
            deltas = traj_scaled_rews - traj_values + np.append(traj_values[1:] * self.gamma, 0)
            traj_advantages = self.discount(deltas, self.gamma * self.lam)

            trajectory = {
                'observations': traj_obs,
                'actions': traj_acts,
                'tdlam': traj_disc_rews,
                'advantages': traj_advantages,
                'unscaled_obs': traj_unscaled_obs
            }
            trajectories.append(trajectory)

        # update observation scaler
        uns_obs = np.concatenate([t['unscaled_obs'] for t in trajectories])
        self.scaler.update(uns_obs)

        # update rewards scaler
        #uns_rews = np.concatenate([t['unscaled_rews'] for t in trajectories])
        #self.rew_scaler.update(uns_rews)
        observations = np.concatenate(
            [t['observations'] for t in trajectories])
        actions = np.concatenate([t['actions'] for t in trajectories])
        tdlam = np.concatenate([t['tdlam'] for t in trajectories])
        advantages = np.concatenate([t['advantages'] for t in trajectories])
        advantages = (advantages - np.mean(advantages)) / np.std(advantages)

        # check stats
        print('mean_trew: %f' % np.mean(trew_stat))
        self.logger.log('trew_stat', np.mean(trew_stat))

        return observations, uns_obs, actions, tdlam, advantages
Example #11
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size, hid1_mult,
         policy_logvar):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update, D_KL(pi_old || pi_new)
        batch_size: number of episodes per policy training batch
        hid1_mult: hid1 size for policy and value_f (multiplier of obs dimension)
        policy_logvar: natural log of initial policy variance
    """
    killer = GracefulKiller()
    env, obs_dim, act_dim = init_gym(env_name)
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    now = datetime.utcnow().strftime(
        "%b-%d_%H:%M:%S")  # create unique directories
    logger = Logger(logname=env_name, now=now)
    aigym_path = os.path.join('/tmp', env_name, now)
    #env = wrappers.Monitor(env, aigym_path, force=True)
    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim, hid1_mult)
    policy = Policy(env_name, obs_dim, act_dim, kl_targ, hid1_mult,
                    policy_logvar)
    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, policy, scaler, logger, episodes=5)
    episode = 0
    while episode < num_episodes:
        trajectories = run_policy(env,
                                  policy,
                                  scaler,
                                  logger,
                                  episodes=batch_size)
        episode += len(trajectories)
        add_value(trajectories, val_func)  # add estimated values to episodes
        add_disc_sum_rew(trajectories, gamma)  # calculate discounted sum of rewards
        add_gae(trajectories, gamma, lam)  # calculate advantage
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(
            trajectories)
        # add various stats to training log:
        log_batch_stats(observes, actions, advantages, disc_sum_rew, logger,
                        episode)
        policy.update(observes, actions, advantages, logger)  # update policy
        val_func.fit(observes, disc_sum_rew, logger)  # update value function
        logger.write(display=True)  # write logger results to file and stdout
        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False

    scale, offset = scaler.get()
    data = {'SCALE': scale, 'OFFSET': offset}
    directory_to_store_data = '../saved_models/' + env_name + '/'
    if not os.path.exists(directory_to_store_data):
        os.makedirs(directory_to_store_data)
    file_name = directory_to_store_data + 'scale_and_offset.pkl'
    with open(file_name, 'wb') as f:
        pickle.dump(data, f)

    logger.close()
    policy.close_sess()
    val_func.close_sess()
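The loop above pickles the final scaler statistics under the 'SCALE' and 'OFFSET' keys. A minimal sketch of reusing them at evaluation time is shown below; the restored policy object and its sample() method are assumptions based on the sibling examples, not code from this project.

import pickle
import numpy as np

def run_with_saved_scaler(env, policy, env_name):
    """Hypothetical evaluation rollout reusing the pickled scale/offset."""
    with open('../saved_models/' + env_name + '/scale_and_offset.pkl', 'rb') as f:
        data = pickle.load(f)
    scale, offset = data['SCALE'], data['OFFSET']
    scale[-1] = 1.0   # leave the time-step feature unscaled, as during training
    offset[-1] = 0.0
    obs, step, done, total_reward = env.reset(), 0.0, False, 0.0
    while not done:
        obs = np.append(obs.astype(np.float64), [step]).reshape((1, -1))
        obs = (obs - offset) * scale
        action = policy.sample(obs).reshape((1, -1)).astype(np.float32)   # assumed API
        obs, reward, done, _ = env.step(np.squeeze(action, axis=0))
        total_reward += reward
        step += 1e-3
    return total_reward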
Example #12
def main(num_episodes, gamma, lam, kl_targ, batch_size, hid1_mult,
         policy_logvar):
    """ Main training loop
    Args:
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update, D_KL(pi_old || pi_new)
        batch_size: number of episodes per policy training batch
        hid1_mult: hid1 size for policy and value_f (multiplier of obs dimension)
        policy_logvar: natural log of initial policy variance
    """
    killer = GracefulKiller()
    env, obs_dim, act_dim = init_gym()
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())

    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim, hid1_mult)
    policy = Policy(obs_dim, act_dim, kl_targ, hid1_mult, policy_logvar)
    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, policy, scaler, episodes=5)
    episode = 0
    #Initialize reward list (to keep track of improvements)
    avg_rew_list = []
    while episode < num_episodes:
        print(episode)
        trajectories = run_policy(env, policy, scaler, episodes=batch_size)
        episode += len(trajectories)
        add_value(trajectories, val_func)  # add estimated values to episodes
        add_disc_sum_rew(trajectories, gamma)  # calculate discounted sum of rewards
        add_gae(trajectories, gamma, lam)  # calculate advantage
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(
            trajectories)
        # add various stats to training log:
        policy.update(observes, actions, advantages)  # update policy
        val_func.fit(observes, disc_sum_rew)  # update value function
        avg_rew_list.append(avg_rewards(trajectories))
        #Every 20000 episodes, save the models (value_func, policy, scaler) and the average rewards
        if not episode % 20000:
            print("Saving models")
            policy.save(episode)
            val_func.save(episode)
            f = open("models/scaler-" + str(episode) + ".pkl", 'wb')
            pickle.dump(scaler, f, pickle.HIGHEST_PROTOCOL)
            f.close()
            f2 = open("models/rewards-" + str(episode) + ".pkl", 'wb')
            pickle.dump(deepcopy(avg_rew_list), f2, pickle.HIGHEST_PROTOCOL)
            f2.close()
        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False
    #Show animation at the end of training (this loop runs until the process is interrupted)
    while True:
        obs = env.reset()
        step = 0.0
        scale, offset = scaler.get()
        scale[-1] = 1.0
        offset[-1] = 0.0
        done = False
        while not done:
            obs = obs.astype(np.float32).reshape((1, -1))
            obs = np.append(obs, [[step]], axis=1)
            obs = (obs - offset) * scale
            action = policy.sample(obs).reshape((1, -1)).astype(np.float32)
            obs, reward, done, _ = env.step(np.squeeze(action, axis=0))
            env.render1()
            env.render2()
            step += 1e-3
    policy.close_sess()
    val_func.close_sess()
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size, hid1_mult,
         policy_logvar, task_identity):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update, D_KL(pi_old || pi_new)
        batch_size: number of episodes per policy training batch
        hid1_mult: hid1 size for policy and value_f (multiplier of obs dimension)
        policy_logvar: natural log of initial policy variance
    """
    print('Training started for ' + env_name + ' and task_identity ' +
          str(task_identity))

    killer = GracefulKiller()
    env, obs_dim, act_dim = init_gym(env_name=env_name,
                                     task_identity=task_identity)
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    now = datetime.utcnow().strftime(
        "%b-%d_%H:%M:%S")  # create unique directories
    logger = Logger(logname=env_name, now=now)
    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim, hid1_mult)
    policy = Policy(obs_dim, act_dim, kl_targ, hid1_mult, policy_logvar,
                    env_name, task_identity)
    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, policy, scaler, logger, episodes=5)
    episode = 0
    while episode < num_episodes:
        trajectories = run_policy(env,
                                  policy,
                                  scaler,
                                  logger,
                                  episodes=batch_size)
        episode += len(trajectories)
        add_value(trajectories, val_func)  # add estimated values to episodes
        add_disc_sum_rew(trajectories, gamma)  # calculate discounted sum of rewards
        add_gae(trajectories, gamma, lam)  # calculate advantage
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(
            trajectories)
        # add various stats to training log:
        log_batch_stats(observes, actions, advantages, disc_sum_rew, logger,
                        episode)
        policy.update(observes, actions, advantages, logger)  # update policy
        val_func.fit(observes, disc_sum_rew, logger)  # update value function
        logger.write(display=True)  # write logger results to file and stdout
        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False

    scale, offset = scaler.get()
    #scale_and_offset_data = {'scale': scale, 'offset': offset}
    #scale_and_offset_file = 'scale_and_offset_file_' + env_name + '_' + task_identity + '.pkl'
    #with open(scale_and_offset_file, 'wb') as f:
    #    pickle.dump(scale_and_offset_data, f)
    #### Save expert trajectories once sufficient training has been done
    ## Visualization
    #aigym_path = os.path.join(VIDEO_LOGS_DIRECTORY, env_name, now)
    #env = wrappers.Monitor(env, aigym_path, force=True)
    trajectories = run_policy(env,
                              policy,
                              scaler,
                              logger,
                              episodes=DEMONSTRATOR_EPISODES_TO_LOG)
    data_to_store = {
        DEMONSTRATOR_TRAJECTORY_KEY: trajectories,
        SCALE_KEY: scale,
        OFFSET_KEY: offset
    }
    directory_to_store_trajectories = './../' + DEMONSTRATOR_TRAJECTORIES_DIRECTORY
    if not os.path.exists(directory_to_store_trajectories):
        os.makedirs(directory_to_store_trajectories)
    file_to_store_trajectories = directory_to_store_trajectories + env_name + '_' + task_identity + '.pkl'
    with open(file_to_store_trajectories, "wb") as f:
        pickle.dump(data_to_store, f)

    logger.close()
    policy.close_sess()
    val_func.close_sess()
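For completeness, a hedged sketch of reading back the demonstrator file written above. The constants DEMONSTRATOR_TRAJECTORIES_DIRECTORY, DEMONSTRATOR_TRAJECTORY_KEY, SCALE_KEY, and OFFSET_KEY are assumed to come from the same module as this main(), and the trajectories are assumed to carry the same 'observes' and 'actions' keys used by build_train_set().

import pickle
import numpy as np

def load_demonstrations(env_name, task_identity):
    """Hypothetical loader for the pickle written at the end of main() above."""
    path = './../' + DEMONSTRATOR_TRAJECTORIES_DIRECTORY + env_name + '_' + task_identity + '.pkl'
    with open(path, 'rb') as f:
        data = pickle.load(f)
    trajectories = data[DEMONSTRATOR_TRAJECTORY_KEY]
    scale, offset = data[SCALE_KEY], data[OFFSET_KEY]
    # assumed trajectory layout: dicts with 'observes' and 'actions' arrays
    observes = np.concatenate([t['observes'] for t in trajectories])
    actions = np.concatenate([t['actions'] for t in trajectories])
    return observes, actions, scale, offset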