# NOTE: these excerpts come from separate modules; they assume the usual imports
# (collections.deque, numpy as np, torch / torch.optim as optim / torch.nn.functional as F,
# tensorflow as tf, os, time, random, datetime) plus the project's own helpers
# (VecEnv, RunningMeanStd, logger, Policy, ValueFunction, make_env, ...).

def __init__(self, env, learning_rate, buffer_size, batch_size, n_epochs,
             gamma, gae_lam, clip_range, ent_coef, vf_coef, max_grad_norm):
    self.env = env
    self.lr = learning_rate
    self.buffer_size = buffer_size
    self.batch_size = batch_size
    self.n_epochs = n_epochs
    self.gamma = gamma
    self.gae_lam = gae_lam
    self.clip_range = clip_range
    self.ent_coef = ent_coef
    self.vf_coef = vf_coef
    self.max_grad_norm = max_grad_norm
    self.num_timesteps = 0
    self.ep_info_buffer = deque(maxlen=100)
    self._n_updates = 0
    self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    if isinstance(env, VecEnv):
        self.num_envs = env.num_envs
    self.rms_obs = RunningMeanStd(shape=(1, 1, 84, 84))
    self.rms_rew = RunningMeanStd()
    logger.configure('./logs')
def __init__(self, env_id, lr, nstep, batch_size, n_epochs, gamma, gae_lam,
             clip_range, ent_coef, vf_coef, max_grad_norm):
    self.env_id = env_id
    self.env = make_env(env_id, n_envs=4)
    self.num_envs = self.env.num_envs if isinstance(self.env, VecEnv) else 1
    self.state_dim = self.env.observation_space.shape[0]
    self.action_converter = ActionConverter(self.env.action_space)
    self.lr = lr
    self.nstep = nstep
    self.batch_size = batch_size
    self.n_epochs = n_epochs
    self.gamma = gamma
    self.gae_lam = gae_lam
    self.clip_range = clip_range
    self.ent_coef = ent_coef
    self.vf_coef = vf_coef
    self.max_grad_norm = max_grad_norm
    self.ep_info_buffer = deque(maxlen=50)
    self._n_updates = 0
    self.num_timesteps = 0
    self.num_episodes = 0
    self.obs_rms = RunningMeanStd()
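# The fragments above and the classes below all rely on a RunningMeanStd helper for
# observation / reward normalization, but its definition is not part of this excerpt.
# The sketch below is only a reference implementation of the usual parallel mean/variance
# update (as in the OpenAI Baselines version); the project's actual class may differ,
# e.g. the TensorFlow code path also exposes apply() and std.
class RunningMeanStd:
    def __init__(self, shape=(), epsilon=1e-4):
        self.mean = np.zeros(shape, dtype=np.float64)
        self.var = np.ones(shape, dtype=np.float64)
        self.count = epsilon

    def update(self, x):
        # Welford/Chan-style merge of the batch statistics into the running ones.
        batch_mean = np.mean(x, axis=0)
        batch_var = np.var(x, axis=0)
        batch_count = x.shape[0]

        delta = batch_mean - self.mean
        tot_count = self.count + batch_count

        new_mean = self.mean + delta * batch_count / tot_count
        m_a = self.var * self.count
        m_b = batch_var * batch_count
        m_2 = m_a + m_b + np.square(delta) * self.count * batch_count / tot_count

        self.mean, self.var, self.count = new_mean, m_2 / tot_count, tot_count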
class TrackingController:
    def __init__(self):
        random.seed(int(time.time()))
        np.random.seed(int(time.time()))
        self._startTime = time.time()
        self._summary_sim_time = 0
        self._summary_train_time = 0
        self._timeChecker = util.TimeChecker()

    def initialize(self, session_name="default_session", num_slaves=8, tps=10000,
                   use_evaluation=False):
        # get parameters from config
        self._numSlaves = num_slaves
        self._gamma = 0.99
        self._lambd = 0.95
        self._clipRange = 0.2
        self._learningRatePolicy = 1e-4
        self._learningRatePolicyDecay = 0.9993
        self._learningRateValueFunction = 1e-3
        self._batchSize = 1024
        self._transitionsPerIteration = 20000
        # if useEvaluation is true, evaluation of training progress is performed by the
        # evaluation function, else it is done by transitions collected in the training session.
        self._useEvaluation = use_evaluation
        self._sessionName = session_name

        # initialize environment
        # TODO
        agents = [
            holodeck.agents.AgentDefinition(
                agent_name="android" + str(i),
                agent_type=holodeck.agents.AndroidAgent,
                sensors=[holodeck.sensors.CustomSensor],
                starting_loc=(-1, 0, .3),
                starting_rot=(0, 0, 0),
                is_main_agent=True) for i in range(self._numSlaves)
        ]
        self._env = HolodeckEnvironment(agent_definitions=agents,
                                        start_world=False,
                                        ticks_per_sec=tps)
        # self._env = holodeck.make("PPO")
        # self._env.should_render_viewport(False)
        self._stateSize = 18 * 3 + 18 * 3 + 5 * 3 + 5 * 3 + 1
        self._rewardSize = 5
        self._eoeSize = 2
        self._actionSize = 18 * 3

        # initialize networks
        self._policy = Policy(self._actionSize)
        self._policy.build(self._stateSize)
        self._valueFunction = ValueFunction()
        self._valueFunction.build(self._stateSize)

        # initialize RunningMeanStd
        self._rms = RunningMeanStd(shape=(self._stateSize))

        # initialize replay buffer
        self._replayBuffer = ReplayBuffer()

        self._policyOptimizer = tf.keras.optimizers.Adam(
            learning_rate=self.decayedLearningRatePolicy)
        self._valueFunctionOptimizer = tf.keras.optimizers.Adam(
            learning_rate=self._learningRateValueFunction)

        # initialize saver
        # self._saver = tf.train.Saver(var_list=tf.trainable_variables(), max_to_keep=1)
        # save maximum step network
        self._smax = 0
        # save maximum reward network
        self._rmax = 0

        # initialize statistics variables
        # TODO
        self._summary_num_log = 0
        self._summary_num_episodes_total = 0
        self._summary_num_transitions_total = 0
        self._summary_max_episode_length = 0

        self._summary_total_rewards = []
        self._summary_total_rewards_by_parts = np.array([[]] * 5)
        self._summary_mean_rewards = []
        self._summary_transition_per_episodes = []
        self._summary_noise_records = []

        self._summary_evaluation_total_rewards = []
        self._summary_evaluation_total_rewards_by_parts = np.array([[]] * 5)
        self._summary_evaluation_mean_rewards = []
        self._summary_evaluation_transition_per_episodes = []

        # initialize checkpoint
        self._ckpt = tf.train.Checkpoint(
            policy_mean=self._policy.mean,
            policy_logstd=self._policy.logstd,
            valueFunction=self._valueFunction.value
            # policyOptimizer=self._policyOptimizer,
            # valueFunctionOptimizer=self._valueFunctionOptimizer
        )
        self._isNetworkLoaded = False
        self._loadedNetwork = ""

    def decayedLearningRatePolicy(self):
        return self._learningRatePolicy

    # load trained networks & rms
    def loadNetworks(self, directory, network_type=None):
        # load rms
        rms_dir = "{}/rms/".format(directory)
        if (network_type is None) or (network_type == ""):
            mean_dir = rms_dir + "mean.npy"
            var_dir = rms_dir + "var.npy"
        else:
            mean_dir = rms_dir + "mean_{}.npy".format(network_type)
            var_dir = rms_dir + "var_{}.npy".format(network_type)
        if os.path.exists(mean_dir):
            print("Loading RMS parameters")
            self._rms.mean = np.load(mean_dir)
            self._rms.var = np.load(var_dir)
            self._rms.count = 200000000

        # load network
        if network_type is not None:
            network_dir = "{}/network-{}".format(directory, network_type)
        else:
            network_dir = "{}/network".format(directory)
        print("Loading networks from {}".format(network_dir))
        self.restore(network_dir)
        self._isNetworkLoaded = True
        self._loadedNetwork = "{}".format(network_dir)

    def computeTDAndGAE(self):
        self._collectedStates = [None] * self._summary_num_transitions_per_iteration
        self._collectedActions = [None] * self._summary_num_transitions_per_iteration
        self._collectedNeglogprobs = [None] * self._summary_num_transitions_per_iteration
        self._collectedTDs = [None] * self._summary_num_transitions_per_iteration
        self._collectedGAEs = [None] * self._summary_num_transitions_per_iteration
        startIdx = 0
        for epi in self._collectedEpisodes:
            data = epi.data
            size = len(data)
            # update max episode length
            if size > self._summary_max_episode_length:
                self._summary_max_episode_length = size

            states, actions, rewards, values, neglogprobs, TDs, GAEs = zip(*data)
            values = tf.convert_to_tensor(values).numpy()
            values = np.concatenate((values, [0]), axis=0)

            advantages = np.zeros(size)
            ad_t = 0
            for i in reversed(range(size)):
                delta = rewards[i] + values[i + 1] * self._gamma - values[i]
                ad_t = delta + self._gamma * self._lambd * ad_t
                advantages[i] = ad_t
            TD = values[:size] + advantages

            self._collectedStates[startIdx:startIdx + size] = list(states)
            self._collectedActions[startIdx:startIdx + size] = list(actions)
            self._collectedNeglogprobs[startIdx:startIdx + size] = list(neglogprobs)
            self._collectedTDs[startIdx:startIdx + size] = list(TD)
            self._collectedGAEs[startIdx:startIdx + size] = list(advantages)

            startIdx += size
        self._collectedStates = np.array(self._collectedStates, dtype=np.float32)
        self._collectedActions = tf.convert_to_tensor(self._collectedActions).numpy()
        self._collectedNeglogprobs = tf.convert_to_tensor(self._collectedNeglogprobs).numpy()
        self._collectedTDs = np.array(self._collectedTDs, dtype=np.float32)
        self._collectedGAEs = np.array(self._collectedGAEs, dtype=np.float32)

    def optimize(self):
        self.computeTDAndGAE()
        if len(self._collectedStates) < self._batchSize:
            return

        GAE = np.array(self._collectedGAEs)
        GAE = (GAE - GAE.mean()) / (GAE.std() + 1e-5)
        ind = np.arange(len(GAE))
        np.random.shuffle(ind)
        for s in range(int(len(ind) // self._batchSize)):
            selectedIndex = ind[s * self._batchSize:(s + 1) * self._batchSize]
            selectedStates = tf.convert_to_tensor(self._collectedStates[selectedIndex])
            selectedActions = tf.convert_to_tensor(self._collectedActions[selectedIndex])
            selectedNeglogprobs = tf.convert_to_tensor(self._collectedNeglogprobs[selectedIndex])
            selectedTDs = tf.convert_to_tensor(self._collectedTDs[selectedIndex])
            selectedGAEs = tf.convert_to_tensor(GAE[selectedIndex])
            self.optimizeStep(selectedActions, selectedStates, selectedNeglogprobs,
                              selectedTDs, selectedGAEs)

    def optimizeStep(self, a, s, nl, td, gae):
        with tf.GradientTape() as tape:
            curNeglogprob = self._policy.neglogprob(a, s)
            ratio = tf.exp(nl - curNeglogprob)
            clippedRatio = tf.clip_by_value(ratio, 1.0 - self._clipRange,
                                            1.0 + self._clipRange)
            policyLoss = -tf.reduce_mean(tf.minimum(ratio * gae, clippedRatio * gae))

        gradients = tape.gradient(policyLoss, self._policy.trainable_variables())
        gradients, _grad_norm = tf.clip_by_global_norm(gradients, 0.5)
        self._policyOptimizer.apply_gradients(
            zip(gradients, self._policy.trainable_variables()))

        # optimize value function
        with tf.GradientTape() as tape:
            valueLoss = tf.reduce_mean(
                tf.square(self._valueFunction.getValue(s) - td))
        gradients = tape.gradient(valueLoss,
                                  self._valueFunction._value.trainable_variables)
        gradients, _grad_norm = tf.clip_by_global_norm(gradients, 0.5)
        self._valueFunctionOptimizer.apply_gradients(
            zip(gradients, self._valueFunction._value.trainable_variables))

    def reset(self):
        return

    def act(self, index, action):
        self._env.act("android" + str(index), action)

    def step(self, actions):
        for _ in range(20):
            for i in range(self._numSlaves):
                self.act(i, actions[i])
            res = self._env.tick()

        states = []
        rewards = []
        eoes = []
        for i in range(self._numSlaves):
            s = res["android" + str(i)]["CustomSensor"]
            states.append(s[:self._stateSize])
            rewards.append(s[self._stateSize:self._stateSize + self._rewardSize])
            eoes.append(s[self._stateSize + self._rewardSize:])
        return states, rewards, eoes

    def runTraining(self, num_iteration=1):
        # create logging directory
        if not os.path.exists("output/"):
            os.mkdir("output/")
        self._directory = 'output/' + self._sessionName + '/'
        if not os.path.exists(self._directory):
            os.mkdir(self._directory)
        directory = self._directory + "rms/"
        if not os.path.exists(directory):
            os.mkdir(directory)
        directory = directory + "cur/"
        if not os.path.exists(directory):
            os.mkdir(directory)

        self.printParameters()

        while True:
            print("\nTraining start")
            self._summary_num_episodes_per_epoch = 0
            self._summary_num_transitions_per_epoch = 0
            self._summary_reward_per_epoch = 0
            self._summary_reward_by_part_per_epoch = []
            self._summary_max_episode_length = 0

            for it in range(num_iteration):
                self._summary_sim_time -= time.time()
                self._collectedEpisodes = []
                nan_count = 0

                # TODO : implement reset
                actions = [None] * self._numSlaves
                for i in range(self._numSlaves):
                    actions[i] = [1, random.random()]
                next_states, _, _ = self.step(actions)

                rewards = [None] * self._numSlaves
                episodes = [None] * self._numSlaves
                terminated = [False] * self._numSlaves
                resetRequired = [False] * self._numSlaves
                for j in range(self._numSlaves):
                    episodes[j] = Episode()

                self._summary_num_transitions_per_iteration = 0
                last_print = 0
                while True:
                    # get states
                    states = np.array(next_states)
                    states_for_update = states[~np.array(terminated)]
                    states_for_update = self._rms.apply(states_for_update)
                    states[~np.array(terminated)] = states_for_update

                    # set action
                    actions, logprobs = self._policy.getActionAndNeglogprob(states)
                    values = self._valueFunction.getValue(states)

                    action_with_reset_signal = [None] * self._numSlaves
                    for j in range(self._numSlaves):
                        action_with_reset_signal[j] = [0, 0] + actions[j].numpy().tolist()
                        if resetRequired[j]:
                            action_with_reset_signal[j][0] = 1
                            action_with_reset_signal[j][1] = random.random()

                    # run one step
                    next_states, r, e = self.step(action_with_reset_signal)

                    for j in range(self._numSlaves):
                        if terminated[j]:
                            continue
                        is_terminal = e[j][0] > 0.5
                        nan_occur = e[j][1] > 0.5
                        # push tuples only if nan did not occur
                        if not nan_occur:
                            if resetRequired[j]:
                                resetRequired[j] = False
                            else:
                                rewards[j] = r[j][0]
                                self._summary_reward_per_epoch += rewards[j]
                                self._summary_reward_by_part_per_epoch.append(r[j])
                                episodes[j].push(states[j], actions[j], rewards[j],
                                                 values[j], logprobs[j])
                                self._summary_num_transitions_per_iteration += 1
                        else:
                            nan_count += 1

                        # if episode is terminated
                        if is_terminal:
                            # push episodes
                            if len(episodes[j].data) != 0:
                                self._collectedEpisodes.append(episodes[j])

                            if self._summary_num_transitions_per_iteration < self._transitionsPerIteration:
                                episodes[j] = Episode()
                                resetRequired[j] = True
                            else:
                                terminated[j] = True

                    # if local step exceeds t_p_i: wait for others to terminate
                    if self._summary_num_transitions_per_iteration >= self._transitionsPerIteration:
                        if all(t is True for t in terminated):
                            print('\r{}/{} : {}/{}'.format(
                                it + 1, num_iteration,
                                self._summary_num_transitions_per_iteration,
                                self._transitionsPerIteration), end='')
                            break

                    # print progress per 100 steps
                    if last_print + 100 < self._summary_num_transitions_per_iteration:
                        print('\r{}/{} : {}/{}'.format(
                            it + 1, num_iteration,
                            self._summary_num_transitions_per_iteration,
                            self._transitionsPerIteration), end='')
                        last_print = self._summary_num_transitions_per_iteration

                self._summary_sim_time += time.time()
                self._summary_train_time -= time.time()

                # optimization
                print('')
                if nan_count > 0:
                    print("nan_count : {}".format(nan_count))
                self._summary_num_episodes_per_epoch += len(self._collectedEpisodes)
                self._summary_num_transitions_per_epoch += self._summary_num_transitions_per_iteration

                self.optimize()  # SM) after getting all tuples, optimize once

                self._summary_train_time += time.time()

            # decay learning rate
            if self._learningRatePolicy > 1e-5:
                self._learningRatePolicy = self._learningRatePolicy * self._learningRatePolicyDecay

            print('Training end\n')

            self._summary_total_rewards.append(
                self._summary_reward_per_epoch / self._summary_num_episodes_per_epoch)
            self._summary_total_rewards_by_parts = np.insert(
                self._summary_total_rewards_by_parts,
                self._summary_total_rewards_by_parts.shape[1],
                np.asarray(self._summary_reward_by_part_per_epoch).sum(axis=0) /
                self._summary_num_episodes_per_epoch,
                axis=1)
            self._summary_mean_rewards.append(
                np.asarray(self._summary_total_rewards)[-10:].mean())
            self._summary_noise_records.append(self._policy.std().numpy().mean())

            self._summary_num_episodes_total += self._summary_num_episodes_per_epoch
            self._summary_num_transitions_total += self._summary_num_transitions_per_epoch
            t_per_e = 0
            if self._summary_num_episodes_per_epoch != 0:
                t_per_e = self._summary_num_transitions_per_epoch / self._summary_num_episodes_per_epoch
            self._summary_transition_per_episodes.append(t_per_e)

            # print summary
            self.printSummary()

    def play(self):
        # create logging directory
        if not os.path.exists("output/"):
            os.mkdir("output/")
        self._directory = 'output/' + self._sessionName + '/'
        if not os.path.exists(self._directory):
            os.mkdir(self._directory)
        directory = self._directory + "rms/"
        if not os.path.exists(directory):
            os.mkdir(directory)
        directory = directory + "cur/"
        if not os.path.exists(directory):
            os.mkdir(directory)

        self.printParameters()

        actions = [None] * self._numSlaves
        for i in range(self._numSlaves):
            actions[i] = [1, 0.0]
        next_states, _, _ = self.step(actions)

        rewards = [None] * self._numSlaves
        episodes = [None] * self._numSlaves
        terminated = [False] * self._numSlaves
        resetRequired = [False] * self._numSlaves

        last_print = 0
        while True:
            # get states
            states = np.array(next_states)
            states_for_update = states[~np.array(terminated)]
            states_for_update = self._rms.apply(states_for_update)
            states[~np.array(terminated)] = states_for_update

            # set action
            if self._isNetworkLoaded:
                # actions, _ = self._policy.getActionAndNeglogprob(states)
                actions = self._policy.getMeanAction(states)
            else:
                actions = np.zeros(shape=(self._numSlaves, self._actionSize))

            action_with_reset_signal = [None] * self._numSlaves
            for j in range(self._numSlaves):
                action_with_reset_signal[j] = [0, 0] + np.array(actions[j]).tolist()
                if resetRequired[j]:
                    action_with_reset_signal[j][0] = 1
                    action_with_reset_signal[j][1] = random.random()

            # run one step
            next_states, r, e = self.step(action_with_reset_signal)

            for j in range(self._numSlaves):
                is_terminal = e[j][0] > 0.5
                nan_occur = e[j][1] > 0.5
                # push tuples only if nan did not occur
                if not nan_occur:
                    if resetRequired[j]:
                        resetRequired[j] = False
                # if episode is terminated
                if is_terminal:
                    resetRequired[j] = True

            # optimization
            print('')

    def printParameters(self):
        # print on shell
        print("===============================================================")
        print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
        print("Elapsed time         : {:.2f}s".format(time.time() - self._startTime))
        print("Session Name         : {}".format(self._sessionName))
        print("Slaves number        : {}".format(self._numSlaves))
        print("State size           : {}".format(self._stateSize))
        print("Action size          : {}".format(self._actionSize))
        print("Learning rate        : {:.6f}".format(self._learningRatePolicy))
        print("Gamma                : {}".format(self._gamma))
        print("Lambda               : {}".format(self._lambd))
        print("Batch size           : {}".format(self._batchSize))
        print("Transitions per iter : {}".format(self._transitionsPerIteration))
        print("PPO clip range       : {}".format(self._clipRange))
        print("Loaded networks      : {}".format(self._loadedNetwork))
        print("===============================================================")

        # print to file
        out = open(self._directory + "parameters", "w")
        out.write(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S\n"))
        out.write("Session Name         : {}\n".format(self._sessionName))
        out.write("Slaves number        : {}\n".format(self._numSlaves))
        out.write("State size           : {}\n".format(self._stateSize))
        out.write("Action size          : {}\n".format(self._actionSize))
        out.write("Learning rate        : {:.6f}\n".format(self._learningRatePolicy))
        out.write("Gamma                : {}\n".format(self._gamma))
        out.write("Lambda               : {}\n".format(self._lambd))
        out.write("Batch size           : {}\n".format(self._batchSize))
        out.write("Transitions per iter : {}\n".format(self._transitionsPerIteration))
        out.write("PPO clip range       : {}\n".format(self._clipRange))
        out.write("Loaded networks      : {}\n".format(self._loadedNetwork))
        out.close()

        # pre make results file
        out = open(self._directory + "results", "w")
        out.close()

    def printSummary(self):
        np.save(self._directory + "rms/mean.npy", self._rms.mean)
        np.save(self._directory + "rms/var.npy", self._rms.var)

        print('===============================================================')
        print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
        print("Elapsed time         : {:.2f}s".format(time.time() - self._startTime))
        print("Simulation time      : {}s".format(self._summary_sim_time))
        print("Training time        : {}s".format(self._summary_train_time))
        print("Session Name         : {}".format(self._sessionName))
        print("Logging Count        : {}".format(self._summary_num_log))
        print('Noise                : {:.3f}'.format(self._summary_noise_records[-1]))
        print('Learning rate        : {:.6f}'.format(self._learningRatePolicy))
        print('Total episode        : {}'.format(self._summary_num_episodes_total))
        print('Total trans          : {}'.format(self._summary_num_transitions_total))
        total_t_per_e = 0
        if self._summary_num_episodes_total != 0:
            total_t_per_e = self._summary_num_transitions_total / self._summary_num_episodes_total
        print('Total trans per epi  : {:.2f}'.format(total_t_per_e))
        print('Episode              : {}'.format(self._summary_num_episodes_per_epoch))
        print('Transition           : {}'.format(self._summary_num_transitions_per_epoch))
        print('Trans per epi        : {:.2f}'.format(self._summary_transition_per_episodes[-1]))
        print('Max episode length   : {}'.format(self._summary_max_episode_length))
        print('Rewards per episodes : {:.2f}'.format(self._summary_total_rewards[-1]))
        print('===============================================================')

        # print plot
        y_list = [[np.asarray(self._summary_total_rewards_by_parts[0]), 'r'],
                  [np.asarray(self._summary_mean_rewards), 'r_mean'],
                  [np.asarray(self._summary_transition_per_episodes), 'steps'],
                  [np.asarray(self._summary_total_rewards_by_parts[1]), 'p'],
                  [np.asarray(self._summary_total_rewards_by_parts[2]), 'v'],
                  [np.asarray(self._summary_total_rewards_by_parts[3]), 'com'],
                  [np.asarray(self._summary_total_rewards_by_parts[4]), 'ee']]
        Plot(y_list, self._sessionName, 1, path=self._directory + "result.png")

        for i in range(len(y_list)):
            y_list[i][0] = np.array(y_list[i][0]) / np.array(
                self._summary_transition_per_episodes)
        y_list[1][0] = np.asarray(self._summary_noise_records)
        y_list[1][1] = 'noise'
        Plot(y_list, self._sessionName + "_per_step", 2,
             path=self._directory + "result_per_step.png")

        # log to file
        out = open(self._directory + "results", "a")
        out.write('===============================================================\n')
        out.write(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S\n"))
        out.write("Elapsed time         : {:.2f}s\n".format(time.time() - self._startTime))
        out.write("Simulation time      : {}s\n".format(self._summary_sim_time))
        out.write("Training time        : {}s\n".format(self._summary_train_time))
        out.write("Session Name         : {}\n".format(self._sessionName))
        out.write("Logging Count        : {}\n".format(self._summary_num_log))
        out.write('Noise                : {:.3f}\n'.format(self._summary_noise_records[-1]))
        out.write('Learning rate        : {:.6f}\n'.format(self._learningRatePolicy))
        out.write('Total episode        : {}\n'.format(self._summary_num_episodes_total))
        out.write('Total trans          : {}\n'.format(self._summary_num_transitions_total))
        out.write('Total trans per epi  : {:.2f}\n'.format(total_t_per_e))
        out.write('Episode              : {}\n'.format(self._summary_num_episodes_per_epoch))
        out.write('Transition           : {}\n'.format(self._summary_num_transitions_per_epoch))
        out.write('Trans per epi        : {:.2f}\n'.format(self._summary_transition_per_episodes[-1]))
        out.write('Max episode length   : {}\n'.format(self._summary_max_episode_length))
        out.write('Rewards per episodes : {:.2f}\n'.format(self._summary_total_rewards[-1]))
        out.write('===============================================================\n')
        out.close()

        # save network
        self.save(self._directory + "network")
        t_per_e = self._summary_transition_per_episodes[-1]
        tr = self._summary_total_rewards[-1]

        if t_per_e > self._smax:
            self._smax = t_per_e
            np.save(self._directory + "rms/mean_smax.npy", self._rms.mean)
            np.save(self._directory + "rms/var_smax.npy", self._rms.var)
            os.system(
                str(Path("copy {}/network.data-00000-of-00001 {}/network-smax.data-00000-of-00001"
                         .format(self._directory, self._directory))))
            os.system(
                str(Path("copy {}/network.data-00000-of-00002 {}/network-smax.data-00000-of-00002"
                         .format(self._directory, self._directory))))
            os.system(
                str(Path("copy {}/network.data-00001-of-00002 {}/network-smax.data-00001-of-00002"
                         .format(self._directory, self._directory))))
            os.system(
                str(Path("copy {}/network.index {}/network-smax.index"
                         .format(self._directory, self._directory))))

        if tr > self._rmax:
            self._rmax = tr
            np.save(self._directory + "rms/mean_rmax.npy", self._rms.mean)
            np.save(self._directory + "rms/var_rmax.npy", self._rms.var)
            os.system(
                str(Path("copy {}/network.data-00000-of-00001 {}/network-rmax.data-00000-of-00001"
                         .format(self._directory, self._directory))))
            os.system(
                str(Path("copy {}/network.data-00000-of-00002 {}/network-rmax.data-00000-of-00002"
                         .format(self._directory, self._directory))))
            os.system(
                str(Path("copy {}/network.data-00001-of-00002 {}/network-rmax.data-00001-of-00002"
                         .format(self._directory, self._directory))))
            os.system(
                str(Path("copy {}/network.index {}/network-rmax.index"
                         .format(self._directory, self._directory))))

        self._summary_num_log = self._summary_num_log + 1
        return

    def save(self, path):
        self._ckpt.write(path)

    def restore(self, path):
        self._ckpt.restore(path)
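# Hypothetical driver (not part of the original sources): a minimal sketch of how
# TrackingController is presumably wired together, using only the methods defined above.
# The session name and iteration count are placeholder values.
if __name__ == "__main__":
    controller = TrackingController()
    controller.initialize(session_name="android_tracking", num_slaves=8, tps=10000)
    # Optionally resume from a saved run, e.g. the best-reward checkpoint:
    # controller.loadNetworks("output/android_tracking", network_type="rmax")
    controller.runTraining(num_iteration=10)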
class MultiLayerPolicy:
    def __init__(self, name, ob, ac_shape, hid_size=128, num_hid_layers=3, reuse=False):
        # NOTE: the second positional argument of tf.variable_scope is default_name,
        # so reuse must be passed by keyword.
        with tf.variable_scope(name, reuse=reuse):
            self.scope = tf.get_variable_scope().name
            self.build_net(ob, ac_shape, hid_size, num_hid_layers)

    def build_net(self, ob, ac_shape, hid_size, num_hid_layers):
        self.ob = ob
        self.ob_shape = ob.shape.as_list()[1:]
        with tf.variable_scope("ob_filter"):
            self.ob_rms = RunningMeanStd(ob.shape.as_list()[1:])

        # normalized observation
        obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)

        # net to fit value function
        net = obz
        for i in range(num_hid_layers):
            net = tf.layers.dense(
                inputs=net,
                units=hid_size,
                activation=tf.nn.tanh,
                kernel_initializer=tf.random_normal_initializer(mean=0, stddev=1),
                name="vffc%i" % (i + 1))
        self.vpred = tf.layers.dense(
            inputs=net,
            units=1,
            activation=None,
            kernel_initializer=tf.random_normal_initializer(mean=0, stddev=1),
            name="vffinal")

        # train value function
        self.vreal = tf.placeholder(dtype=tf.float32, shape=(None, ), name="vreal")
        vloss = tf.reduce_mean(tf.square(self.vreal - self.vpred))
        valueFunctionVars = [
            v for v in self.get_trainable_variables()
            if v.name.startswith("%s/vff" % self.scope)
        ]
        self.vadam = tf.train.AdamOptimizer().minimize(vloss, var_list=valueFunctionVars)

        # net to predict mean and standard deviation of action
        net = obz
        for i in range(num_hid_layers):
            net = tf.layers.dense(
                inputs=net,
                units=hid_size,
                activation=tf.nn.tanh,
                kernel_initializer=tf.random_normal_initializer(mean=0, stddev=1),
                name="polc%i" % (i + 1))
        mean = tf.layers.dense(
            inputs=net,
            units=ac_shape[0],
            activation=None,
            kernel_initializer=tf.random_normal_initializer(mean=0, stddev=0.01))
        logstd = mean * 0.0 + tf.get_variable(
            name="logstd",
            shape=[1, ac_shape[0]],
            initializer=tf.zeros_initializer(),
            dtype=tf.float32)  # std not related to observation

        # action is normally distributed
        self.pd = DiagGaussianPd(mean, logstd)
        self.stochastic = tf.placeholder(dtype=tf.bool, shape=(), name="stochastic")
        self.action = tf.cond(self.stochastic, lambda: self.pd.sample(),
                              lambda: self.pd.mode())

    def act(self, stochastic, ob):
        action, vpred = tf.get_default_session().run(
            [self.action, self.vpred],
            {self.ob: ob[None], self.stochastic: stochastic})
        return action[0], vpred[0]

    def train_value_function(self, obs, vreals):
        self.ob_rms.update(obs)
        tf.get_default_session().run([self.vadam], {self.ob: obs, self.vreal: vreals})

    def get_variables(self):
        return tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, self.scope)

    def get_trainable_variables(self):
        return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope)
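# Hypothetical usage sketch (assumed, not from the original sources): build the policy
# graph for a 10-dimensional observation and a 3-dimensional action space, then query one
# stochastic action. TF1 graph mode is assumed, matching the tf.placeholder /
# tf.variable_scope calls above; observation and action sizes are placeholders.
ob_ph = tf.placeholder(dtype=tf.float32, shape=(None, 10), name="ob")
pi = MultiLayerPolicy("pi", ob_ph, ac_shape=(3,), hid_size=128, num_hid_layers=3)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    action, vpred = pi.act(stochastic=True, ob=np.zeros(10, dtype=np.float32))
    print("sampled action:", action, "value estimate:", vpred)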
class PPO_RND(BaseAlgorithm):
    """
    Proximal Policy Optimization agent augmented with Random Network Distillation (RND)
    intrinsic rewards.

    :param env_id: (str) name of environment to perform training on
    :param lr: (float) learning rate
    :param int_lr: (float) learning rate for the RND predictor network
    :param nstep: (int) storage rollout steps
    :param batch_size: (int) batch size for training
    :param n_epochs: (int) number of training epochs
    :param gamma: (float) discount factor
    :param int_gamma: (float) discount factor for intrinsic rewards
    :param gae_lam: (float) lambda for generalized advantage estimation
    :param clip_range: (float) clip range for surrogate loss
    :param ent_coef: (float) entropy loss coefficient
    :param vf_coef: (float) value loss coefficient
    :param int_vf_coef: (float) intrinsic value loss coefficient
    :param max_grad_norm: (float) max grad norm for optimizer
    :param hidden_size: (int) size of the hidden layers of policy
    :param int_hidden_size: (int) size of the hidden layers for the RND target and predictor networks
    :param rnd_start: (int) number of initial rollout steps used only to update the
        observation normalization statistics; intrinsic rewards are computed after this warm-up
    """

    def __init__(self, *, env_id, lr=3e-4, nstep=128, batch_size=128, n_epochs=10,
                 gamma=0.99, int_gamma=0.99, gae_lam=0.95, clip_range=0.2,
                 ent_coef=.01, vf_coef=0.5, int_vf_coef=0.5, max_grad_norm=0.2,
                 hidden_size=128, int_hidden_size=128, int_lr=3e-4, rnd_start=1e+3):
        super(PPO_RND, self).__init__(env_id, lr, nstep, batch_size, n_epochs,
                                      gamma, gae_lam, clip_range, ent_coef,
                                      vf_coef, max_grad_norm)
        self.policy = Policy(self.env, hidden_size, intrinsic_model=True)
        self.rnd = RndNetwork(self.state_dim, hidden_size=int_hidden_size)
        self.rollout = IntrinsicStorage(nstep,
                                        self.num_envs,
                                        self.env.observation_space,
                                        self.env.action_space,
                                        gae_lam=gae_lam,
                                        gamma=gamma,
                                        int_gamma=int_gamma)
        self.optimizer = optim.Adam(self.policy.net.parameters(), lr=lr)
        self.rnd_optimizer = optim.Adam(self.rnd.parameters(), lr=int_lr)
        self.rnd_start = rnd_start
        self.int_vf_coef = int_vf_coef
        self.last_obs = self.env.reset()
        self.int_rew_rms = RunningMeanStd()
        self.normalize = True
        self.last_dones = np.array([0 for _ in range(self.num_envs)])

    def collect_samples(self):
        """
        Collect one full rollout, as determined by the nstep parameter,
        and add it to the buffer
        """
        assert self.last_obs is not None
        rollout_step = 0
        self.rollout.reset()
        while rollout_step < self.nstep:
            with torch.no_grad():
                actions, values, int_values, log_probs = self.policy.act(self.last_obs)
            actions = actions.numpy()
            obs, rewards, dones, infos = self.env.step(actions)
            if any(dones):
                self.num_episodes += sum(dones)
            self.num_timesteps += self.num_envs
            self.update_info_buffer(infos)
            actions = actions.reshape(self.num_envs, self.action_converter.action_output)
            log_probs = log_probs.reshape(self.num_envs, self.action_converter.action_output)

            if (self.num_timesteps / self.num_envs) < self.rnd_start:
                # warm-up: only update observation statistics, no intrinsic reward yet
                int_rewards = np.zeros_like(rewards)
                self.obs_rms.update(self.env.unnormalize_obs(self.last_obs))
            else:
                normalized_obs = self.normalize_obs(obs)
                int_rewards = self.rnd.int_reward(normalized_obs).detach().numpy()
                self.int_rew_rms.update(int_rewards)
                int_rewards /= (np.sqrt(self.int_rew_rms.var) + 1e-08)

            self.rollout.add(self.last_obs, actions, rewards, int_rewards, values,
                             int_values, dones, log_probs)
            self.last_obs = obs
            self.last_dones = dones
            rollout_step += 1
        self.rollout.compute_returns_and_advantages(values, int_values, dones)
        return True

    def train(self):
        """
        Use the collected data from the buffer to train the policy network
        """
        total_losses, policy_losses, value_losses, entropy_losses, intrinsic_losses = [], [], [], [], []
        rnd_trained = False
        for epoch in range(self.n_epochs):
            for batch in self.rollout.get(self.batch_size):
                observations = batch.observations
                actions = batch.actions
                old_log_probs = batch.old_log_probs
                old_values = batch.old_values
                old_int_values = batch.int_values
                advantages = batch.advantages
                int_advantages = batch.int_advantages
                returns = batch.returns
                int_returns = batch.int_returns

                # Get values and action probabilities using the updated policy on gathered observations
                state_values, int_values, action_log_probs, entropy = self.policy.evaluate(
                    observations, actions)

                # Normalize batch advantages
                advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)
                int_advantages = (int_advantages - int_advantages.mean()) / (
                    int_advantages.std() + 1e-8)
                advantages = advantages + int_advantages

                # Compute policy gradient ratio of current actions probs over previous
                ratio = torch.exp(action_log_probs - old_log_probs)

                # Compute surrogate loss
                surr_loss_1 = advantages * ratio
                surr_loss_2 = advantages * torch.clamp(ratio, 1 - self.clip_range,
                                                       1 + self.clip_range)
                policy_loss = -torch.min(surr_loss_1, surr_loss_2).mean()

                # Clip state values for stability
                state_values_clipped = old_values + (state_values - old_values).clamp(
                    -self.clip_range, self.clip_range)
                value_loss = F.mse_loss(returns, state_values).mean()
                value_loss_clipped = F.mse_loss(returns, state_values_clipped).mean()
                value_loss = torch.max(value_loss, value_loss_clipped).mean()

                # Clip intrinsic state values for stability
                int_values_clipped = old_int_values + (int_values - old_int_values).clamp(
                    -self.clip_range, self.clip_range)
                int_value_loss = F.mse_loss(int_returns, int_values).mean()
                int_value_loss_clipped = F.mse_loss(int_returns, int_values_clipped).mean()
                int_value_loss = torch.max(int_value_loss, int_value_loss_clipped).mean()

                # Compute entropy loss
                entropy_loss = -torch.mean(entropy)

                # Total loss
                loss = (policy_loss + self.ent_coef * entropy_loss +
                        self.vf_coef * value_loss + self.int_vf_coef * int_value_loss)

                # Perform optimization
                self.optimizer.zero_grad()
                loss.backward()
                torch.nn.utils.clip_grad_norm_(self.policy.net.parameters(),
                                               self.max_grad_norm)
                self.optimizer.step()

                # Update the RND predictor on a random ~25% subset of minibatches.
                # (np.random.rand assumed here; the original compared np.random.randn() < 0.25.)
                if np.random.rand() < 0.25:
                    self.train_rnd(batch)

                total_losses.append(loss.item())
                policy_losses.append(policy_loss.item())
                value_losses.append(value_loss.item())
                entropy_losses.append(entropy_loss.item())
                intrinsic_losses.append(int_value_loss.item())
                rnd_trained = True

        logger.record("train/intrinsic_loss", np.mean(intrinsic_losses))
        logger.record("train/entropy_loss", np.mean(entropy_losses))
        logger.record("train/policy_gradient_loss", np.mean(policy_losses))
        logger.record("train/value_loss", np.mean(value_losses))
        logger.record("train/total_loss", np.mean(total_losses))

        self._n_updates += self.n_epochs

    def train_rnd(self, batch):
        """
        Train the predictor RND network

        :param batch: batch sampled from the current experience buffer
        """
        obs = batch.observations  # self.rew_norm_and_clip(batch.observations.numpy())
        obs = self.normalize_obs(obs.numpy())
        pred, target = self.rnd(torch.from_numpy(obs).float())
        loss = F.mse_loss(pred, target)
        self.rnd_optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.rnd.parameters(), self.max_grad_norm)
        self.rnd_optimizer.step()

    def learn(self, total_timesteps, log_interval, reward_target=None, log_to_file=False):
        """
        Initiate the training of the algorithm.
        :param total_timesteps: (int) total number of timesteps the agent is to run for
        :param log_interval: (int) how often to perform logging
        :param reward_target: (int) reaching the reward target stops training early
        :param log_to_file: (bool) specify whether output ought to be logged
        """
        logger.configure("RND", self.env_id, log_to_file)
        start_time = time.time()
        iteration = 0

        while self.num_timesteps < total_timesteps:
            self.collect_samples()
            iteration += 1
            if log_interval is not None and iteration % log_interval == 0:
                logger.record("time/total timesteps", self.num_timesteps)
                if len(self.ep_info_buffer) > 0 and len(self.ep_info_buffer[0]) > 0:
                    logger.record(
                        "rollout/ep_rew_mean",
                        np.mean([ep_info["r"] for ep_info in self.ep_info_buffer]))
                    logger.record("rollout/num_episodes", self.num_episodes)
                fps = int(self.num_timesteps / (time.time() - start_time))
                logger.record("time/fps", fps)
                logger.record("time/total_time", (time.time() - start_time))
                logger.dump(step=self.num_timesteps)
            self.train()
            if reward_target is not None and np.mean(
                    [ep_info["r"] for ep_info in self.ep_info_buffer]) > reward_target:
                logger.record("time/total timesteps", self.num_timesteps)
                if len(self.ep_info_buffer) > 0 and len(self.ep_info_buffer[0]) > 0:
                    logger.record(
                        "rollout/ep_rew_mean",
                        np.mean([ep_info["r"] for ep_info in self.ep_info_buffer]))
                    logger.record("rollout/num_episodes", self.num_episodes)
                fps = int(self.num_timesteps / (time.time() - start_time))
                logger.record("time/fps", fps)
                logger.record("time/total_time", (time.time() - start_time))
                logger.dump(step=self.num_timesteps)
                break

        return self
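# Hypothetical entry point (not part of the original file): a minimal sketch of running
# the agent above. The environment id and timestep budget are placeholder values; the
# remaining keyword arguments fall back to the constructor defaults documented in the
# class docstring.
if __name__ == "__main__":
    agent = PPO_RND(env_id="MountainCar-v0", nstep=128, batch_size=128, rnd_start=1e+3)
    agent.learn(total_timesteps=1_000_000, log_interval=10, reward_target=None)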