class Experiment: def __init__(self, env_name, discount, num_iterations, lamb, animate, kl_target, **kwargs): self.env_name = env_name self.env = gym.make(env_name) if env_name.startswith('Fetch'): # FetchReach env is a little bit different self.env = gym.wrappers.FlattenDictWrapper(self.env, ['observation', 'desired_goal', 'achieved_goal']) gym.spaces.seed(1234) # for reproducibility self.obs_dim = self.env.observation_space.shape[0] + 1 # adding time step as feature self.act_dim = self.env.action_space.shape[0] self.discount = discount self.num_iterations = num_iterations self.lamb = lamb self.animate = animate self.buffer = Buffer(1000000, self.obs_dim, self.act_dim) # 1000000 is the size they have used in paper self.episodes = 20 # larger episodes can reduce variance self.killer = GracefulKiller() self.policy = QPropPolicy(self.obs_dim, self.act_dim, self.env.action_space, kl_target, epochs=20) self.critic = DeterministicCritic(self.obs_dim, self.act_dim, self.discount, OUTPATH) self.value_func = l2TargetValueFunc(self.obs_dim, epochs=10) if 'show' in kwargs and not kwargs['show']: # save copies of file shutil.copy(inspect.getfile(self.policy.__class__), OUTPATH) shutil.copy(inspect.getfile(self.value_func.__class__), OUTPATH) shutil.copy(inspect.getfile(self.critic.__class__), OUTPATH) shutil.copy(inspect.getfile(self.__class__), OUTPATH) self.log_file = open(OUTPATH + 'log.csv', 'w') self.write_header = True print('Observation dimension:', self.obs_dim) print('Action dimension:', self.act_dim) # The use of a scaler is crucial self.scaler = Scaler(self.obs_dim) self.init_scaler() def init_scaler(self): """ Collection observations from 5 episodes to initialize Scaler. :return: a properly initialized scaler """ print('Fitting scaler') observation_samples = [] for i in range(5): observation = [] obs = self.env.reset() observation.append(obs) obs = obs.astype(np.float64).reshape((1, -1)) done = False step = 0 while not done: obs = np.append(obs, [[step]], axis=1) # add time step feature action = self.policy.get_sample(obs).reshape((1, -1)).astype(np.float64) if self.env_name.startswith('Fetch'): obs_new, reward, done, _ = self.env.step(action.reshape(-1)) else: obs_new, reward, done, _ = self.env.step(action) observation.append(obs_new) obs = obs_new.astype(np.float64).reshape((1, -1)) step += 1e-3 observation_samples.append(observation) observation_samples = np.concatenate(observation_samples, axis=0) self.scaler.update(observation_samples) def normalize_obs(self, obs): """ Transform and update the scaler on the fly. :param obs: Raw observation :return: normalized observation """ scale, offset = self.scaler.get() obs_scaled = (obs-offset)*scale self.scaler.update(obs.astype(np.float64).reshape((1, -1))) return obs_scaled def run_one_episode(self): """ collect a trajectory of (obs, act, reward, obs_next) """ obs = self.env.reset() observes, actions, rewards = [],[],[] done = False step = 0 while not done: if self.animate: self.env.render() obs = obs.astype(np.float64).reshape((1, -1)) obs = self.normalize_obs(obs) obs = np.append(obs, [[step]], axis=1) # add time step feature at normalized observation observes.append(obs) action = self.policy.get_sample(obs).reshape((1, -1)).astype(np.float64) actions.append(action) if self.env_name.startswith('Fetch'): obs_new, reward, done, _ = self.env.step(action.reshape(-1)) else: obs_new, reward, done, _ = self.env.step(action) if not isinstance(reward, float): reward = np.asscalar(reward) rewards.append(reward) obs = obs_new step += 0.003 return np.concatenate(observes), np.concatenate(actions), np.array(rewards) def discounted_sum(self, l, factor): """ Discounted sum of return or advantage estimates along a trajectory. :param l: a list containing the values of discounted summed interest. :param factor: discount factor in the disc_sum case or discount*lambda for GAE :return: discounted sum of l with regard to factor """ discounted = [] sum = 0 for i in reversed(l): discounted.append(factor*sum+i) sum = factor*sum+i return np.array(list(reversed(discounted))) def run_policy(self, episodes): """ Gather a batch of trajectory samples. :param episodes: size of batch. :return: a batch of samples """ trajectories = [] for e in range(episodes): observes, actions, rewards = self.run_one_episode() trajectory = {'observes': observes, 'actions': actions, 'rewards': rewards, 'scaled_rewards': rewards*(1-self.discount)} trajectories.append(trajectory) return trajectories def run_expr(self): ep_steps = [] ep_rewards = [] ep_entropy = [] i = 0 while i < self.num_iterations: trajectories = self.run_policy(20) # add to experience replay buffer self.buffer.append(trajectories) print('buffer size:', self.buffer.size()) i += len(trajectories) # for E=20, T=50, the total number of samples would be 1000 # In future needs to account for not uniform time steps per episode. # e.g. in Hopper-v2 environment not every episode has same time steps # E = len(trajectories) # num_samples = np.sum([len(t['rewards']) for t in trajectories]) gradient_steps = np.sum([len(t['rewards']) for t in trajectories]) if self.env_name.startswith('Fetch'): assert (gradient_steps == 20*50) """train critic""" # train all samples in the buffer, to the extreme # self.critic.fit(self.policy, self.buffer, epochs=20, num_samples=self.buffer.size()) # train some samples minibatches only critic_loss_mean, critic_loss_std = self.critic.another_fit_func(self.policy, self.buffer, gradient_steps) """calculation of episodic discounted return only needs rewards""" mc_returns = np.concatenate([self.discounted_sum(t['scaled_rewards'], self.discount) for t in trajectories]) """using current batch of samples to update baseline""" observes = np.concatenate([t['observes'] for t in trajectories]) actions = np.concatenate([t['actions'] for t in trajectories]) value_func_loss = self.value_func.update(observes, mc_returns) """compute GAE""" for t in trajectories: t['values'] = self.value_func.predict(t['observes']) # IS it really legitimate to insert 0 at the last obs? t['td_residual'] = t['scaled_rewards'] + self.discount * np.append(t['values'][1:], 0) - t['values'] t['gae'] = self.discounted_sum(t['td_residual'], self.discount * self.lamb) advantages = np.concatenate([t['gae'] for t in trajectories]) """normalize advantage estimates, Crucial step""" advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-6) """compute control variate""""" cv = self.critic.get_contorl_variate(self.policy, observes, actions) # cv must not be centered # cv = (cv - cv.mean()) / (cv.std() + 1e-6) """conservative control variate""" eta = [1 if i > 0 else 0 for i in advantages*cv] """center learning signal""" # check that advantages and CV should be of size E*T # eta controls the on-off of control variate learning_signal = advantages - eta*cv # learning_signal = (learning_signal - learning_signal.mean()) / (learning_signal.std() + 1e-6) """controlled taylor eval term""" ctrl_taylor = np.concatenate([ [eta[i]*act] for i, act in enumerate(self.critic.get_taylor_eval(self.policy, observes))]) """policy update""" ppo_loss, ddpg_loss, kl, entropy, beta = self.policy.update(observes, actions, learning_signal, ctrl_taylor) avg_rewards = np.sum(np.concatenate([t['rewards'] for t in trajectories])) / self.episodes avg_timesteps = np.average([len(t['rewards']) for t in trajectories]) log = {} # save training statistics log['steps'] = avg_timesteps log['rewards'] = avg_rewards log['critic_loss'] = critic_loss_mean log['policy_ppo_loss'] = ppo_loss log['policy_ddpg_loss'] = ddpg_loss log['kl'] = kl log['entropy'] = entropy log['value_func_loss'] = value_func_loss log['beta'] = beta # display print('episode: ', i) print('average steps: {0}, average rewards: {1}'.format(log['steps'], log['rewards'])) for key in ['critic_loss', 'policy_ppo_loss', 'policy_ddpg_loss', 'value_func_loss', 'kl', 'entropy', 'beta']: print('{:s}: {:.2g}'.format(key, log[key])) print('\n') ep_steps.append(log['steps']) ep_rewards.append(log['rewards']) ep_entropy.append(log['entropy']) # write to log.csv if self.write_header: fieldnames = [x for x in log.keys()] self.writer = csv.DictWriter(self.log_file, fieldnames=fieldnames) self.writer.writeheader() self.write_header = False self.writer.writerow(log) # we want the csv file to preserve information even if the program terminates earlier than scheduled. self.log_file.flush() # save model weights if stopped early if self.killer.kill_now: if input('Terminate training (y/[n])? ') == 'y': break self.killer.kill_now = False self.policy.save(OUTPATH) self.value_func.save(OUTPATH) self.scaler.save(OUTPATH) plt.figure(figsize=(12,9)) if self.env_name.startswith('Fetch'): ax1 = plt.subplot(121) plt.xlabel('episodes') plt.ylabel('policy entropy') plt.plot(ep_entropy) scale_x = self.episodes ticks_x = ticker.FuncFormatter(lambda x, pos: '{0:g}'.format(x * scale_x)) ax1.xaxis.set_major_formatter(ticks_x) else: ax1 = plt.subplot(121) plt.xlabel('episodes') plt.ylabel('steps') plt.plot(ep_steps) scale_x = self.episodes ticks_x = ticker.FuncFormatter(lambda x, pos: '{0:g}'.format(x * scale_x)) ax1.xaxis.set_major_formatter(ticks_x) ax2 = plt.subplot(122) plt.xlabel('episodes') plt.ylabel('episodic rewards') plt.plot(ep_rewards) scale_x = self.episodes ticks_x = ticker.FuncFormatter(lambda x, pos: '{0:g}'.format(x * scale_x)) ax2.xaxis.set_major_formatter(ticks_x) plt.savefig(OUTPATH + 'train.png') def load_model(self, load_from): """ Load all Function Approximators plus a Scaler. Replaybuffer is not restored though. :param load_from: Dir containing saved weights. """ from tensorflow.python.tools import inspect_checkpoint as chkp # # print all tensors in checkpoint file # chkp.print_tensors_in_checkpoint_file(load_from+'policy/policy.pl', tensor_name='', all_tensors=True, all_tensor_names=True) self.policy.load(load_from + 'policy/') self.value_func.load(load_from + 'value_func/') self.critic.load(load_from+'critic/') with open(load_from + "scaler.pkl", 'rb') as file: self.scaler = pickle.load(file) def demonstrate_agent(self, load_from): """ Simply run the policy without training. :param load_from: :return: """ self.load_model(load_from) while True: observes, actons, rewards = self.run_one_episode() ep_rewards = np.sum(rewards) ep_steps = len(rewards) print("Total steps: {0}, total rewards: {1}\n".format(ep_steps, ep_rewards))
class Experiment: def __init__(self, env_name, discount, num_iterations, lamb, animate, kl_target, show): self.env_name = env_name self.env = gym.make(env_name) if env_name == "FetchReach-v0": self.env = gym.wrappers.FlattenDictWrapper( self.env, ['observation', 'desired_goal', 'achieved_goal']) gym.spaces.seed(1234) self.obs_dim = self.env.observation_space.shape[ 0] + 1 # adding time step as feature self.act_dim = self.env.action_space.shape[0] self.discount = discount self.num_iterations = num_iterations self.lamb = lamb self.animate = animate self.buffer = Buffer(50000, self.obs_dim, self.act_dim) self.episodes = 20 self.killer = GracefulKiller() self.policy = QPropPolicy(self.obs_dim, self.act_dim, self.env.action_space, kl_target, epochs=5) self.critic = DeterministicCritic(self.obs_dim, self.act_dim, self.discount, OUTPATH) # using MC return would be more helpful self.value_func = l2TargetValueFunc(self.obs_dim, epochs=10) if not show: # save copies of file shutil.copy(inspect.getfile(self.policy.__class__), OUTPATH) shutil.copy(inspect.getfile(self.value_func.__class__), OUTPATH) shutil.copy(inspect.getfile(self.__class__), OUTPATH) self.log_file = open(OUTPATH + 'log.csv', 'w') self.write_header = True print('observation dimension:', self.obs_dim) print('action dimension:', self.act_dim) # Use of a scaler is crucial self.scaler = Scaler(self.obs_dim) self.init_scaler() def init_scaler(self): """ 5 episodes empirically determined. :return: """ print('Fitting scaler') observation_samples = [] for i in range(5): observation = [] obs = self.env.reset() observation.append(obs) obs = obs.astype(np.float64).reshape((1, -1)) done = False step = 0 while not done: obs = np.append(obs, [[step]], axis=1) # add time step feature action = self.policy.get_sample(obs).reshape( (1, -1)).astype(np.float64) if self.env_name == "FetchReach-v0": obs_new, reward, done, _ = self.env.step( action.reshape(-1)) else: obs_new, reward, done, _ = self.env.step(action) observation.append(obs_new) obs = obs_new.astype(np.float64).reshape((1, -1)) step += 1e-3 observation_samples.append(observation) observation_samples = np.concatenate(observation_samples, axis=0) self.scaler.update(observation_samples) def normalize_obs(self, obs): """ transform and update on the fly. :param obs: :return: """ scale, offset = self.scaler.get() obs_scaled = (obs - offset) * scale self.scaler.update(obs.astype(np.float64).reshape((1, -1))) return obs_scaled def run_one_episode(self): """ collect data only :param save: :param train_policy: :param train_value_func: :param animate: :return: """ obs = self.env.reset() observes, actions, rewards = [], [], [] done = False step = 0 while not done: if self.animate: self.env.render() obs = obs.astype(np.float64).reshape((1, -1)) obs = self.normalize_obs(obs) obs = np.append(obs, [[step]], axis=1) # add time step feature observes.append(obs) action = self.policy.get_sample(obs).reshape( (1, -1)).astype(np.float64) actions.append(action) if self.env_name == "FetchReach-v0": obs_new, reward, done, _ = self.env.step(action.reshape(-1)) else: obs_new, reward, done, _ = self.env.step(action) if not isinstance(reward, float): reward = np.asscalar(reward) rewards.append(reward) obs = obs_new step += 0.003 return np.concatenate(observes), np.concatenate(actions), np.array( rewards) def discounted_sum(self, l, factor): discounted = [] sum = 0 for i in reversed(l): discounted.append(factor * sum + i) sum = factor * sum + i return np.array(list(reversed(discounted))) def run_policy(self, episodes): """ gather a batch of samples. :param episodes: :return: """ trajectories = [] for e in range(episodes): observes, actions, rewards = self.run_one_episode() trajectory = { 'observes': observes, 'actions': actions, 'rewards': rewards } trajectories.append(trajectory) return trajectories def run_expr(self): ep_steps = [] ep_rewards = [] ep_entropy = [] i = 0 while i < self.num_iterations: trajectories = self.run_policy(20) # add to experience replay buffer self.buffer.append(trajectories) i += len(trajectories) # for E=20, T=50, the total number of samples would be 1000 # In future needs to account for not uniform time steps per episode. # e.g. in Hopper-v2 environment not every episode has same time steps E = len(trajectories) T = trajectories[0]['observes'].shape[0] """train critic""" self.critic.fit( self.policy, self.buffer, epochs=1, num_samples=E * T) # take E*T samples, so in total E*T gradient steps """calculation of episodic discounted return only needs rewards""" mc_returns = np.concatenate([ self.discounted_sum(t['rewards'], self.discount) for t in trajectories ]) """using current batch of samples to update baseline""" observes = np.concatenate([t['observes'] for t in trajectories]) actions = np.concatenate([t['actions'] for t in trajectories]) value_func_loss = self.value_func.update(observes, mc_returns) """compute GAE""" for t in trajectories: t['values'] = self.value_func.predict(t['observes']) # IS it really legitimate to insert 0 at the last obs? t['td_residual'] = t['rewards'] + self.discount * np.append( t['values'][1:], 0) - t['values'] t['gae'] = self.discounted_sum(t['td_residual'], self.discount * self.lamb) advantages = np.concatenate([t['gae'] for t in trajectories]) """compute control variate""" "" cv = self.critic.get_contorl_variate(self.policy, observes, actions) """conservative control variate""" eta = [1 if i > 0 else 0 for i in advantages * cv] """center learning signal""" # check that advantages and CV should be of size E*T # eta controls the on-off of control variate learning_signal = advantages - eta * cv """controlled taylor eval term""" ctrl_taylor = np.concatenate( [[eta[i] * act] for i, act in enumerate( self.critic.get_taylor_eval(self.policy, observes))]) policy_loss, kl, entropy, beta = self.policy.update( observes, actions, learning_signal, ctrl_taylor) # normalize advantage estimates # advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-6) avg_rewards = np.sum( np.concatenate([t['rewards'] for t in trajectories])) / self.episodes avg_timesteps = np.average( [len(t['rewards']) for t in trajectories]) log = {} # compute statistics such as mean and std log['steps'] = avg_timesteps log['rewards'] = avg_rewards log['policy_loss'] = policy_loss log['kl'] = kl log['entropy'] = entropy log['value_func_loss'] = value_func_loss log['beta'] = beta # display print('episode: ', i) print('average steps: {0}, average rewards: {1}'.format( log['steps'], log['rewards'])) for key in [ 'policy_loss', 'kl', 'entropy', 'beta', 'value_func_loss' ]: print('{:s}: {:.2g}'.format(key, log[key])) print('\n') ep_steps.append(log['steps']) ep_rewards.append(log['rewards']) ep_entropy.append(log['entropy']) # write to log.csv if self.write_header: fieldnames = [x for x in log.keys()] self.writer = csv.DictWriter(self.log_file, fieldnames=fieldnames) self.writer.writeheader() self.write_header = False self.writer.writerow(log) # we want the csv file to preserve information even if the program terminates earlier than scheduled. self.log_file.flush() # save model weights if stopped manually if self.killer.kill_now: if input('Terminate training (y/[n])? ') == 'y': break self.killer.kill_now = False # if (i+1)%20 == 0: # print('episode: ', i+1) # print('average steps', np.average(steps)) # print('average rewards', np.average(rewards)) self.policy.save(OUTPATH) self.value_func.save(OUTPATH) self.scaler.save(OUTPATH) plt.figure(figsize=(12, 9)) if self.env_name.startswith('Fetch'): ax1 = plt.subplot(121) plt.xlabel('episodes') plt.ylabel('policy entropy') plt.plot(ep_entropy) scale_x = self.episodes ticks_x = ticker.FuncFormatter( lambda x, pos: '{0:g}'.format(x * scale_x)) ax1.xaxis.set_major_formatter(ticks_x) else: ax1 = plt.subplot(121) plt.xlabel('episodes') plt.ylabel('steps') plt.plot(ep_steps) scale_x = self.episodes ticks_x = ticker.FuncFormatter( lambda x, pos: '{0:g}'.format(x * scale_x)) ax1.xaxis.set_major_formatter(ticks_x) ax2 = plt.subplot(122) plt.xlabel('episodes') plt.ylabel('episodic rewards') plt.plot(ep_rewards) scale_x = self.episodes ticks_x = ticker.FuncFormatter( lambda x, pos: '{0:g}'.format(x * scale_x)) ax2.xaxis.set_major_formatter(ticks_x) plt.savefig(OUTPATH + 'train.png') def load_model(self, load_from): from tensorflow.python.tools import inspect_checkpoint as chkp # # print all tensors in checkpoint file # chkp.print_tensors_in_checkpoint_file(load_from+'policy/policy.pl', tensor_name='', all_tensors=True, all_tensor_names=True) self.policy.load(load_from + 'policy/policy.pl') self.value_func.load(load_from + 'value_func/value_func.pl') def demonstrate_agent(self, load_from): self.load_model(load_from) with open(load_from + "scaler.pkl", 'rb') as file: self.scaler = pickle.load(file) self.animate = True for i in range(10): observes, actons, rewards = self.run_one_episode() ep_rewards = np.sum(rewards) ep_steps = len(rewards) print("Total steps: {0}, total rewards: {1}\n".format( ep_steps, ep_rewards))