def _set_prioritized_buffer(self):
    buffer_kw = {"size": self.buffer_size, "alpha": 0.7}
    if self.buffer_type.__name__ == "RankPrioritizedReplayBuffer":
        buffer_kw.update({
            "learning_starts": self.prioritization_starts,
            "batch_size": self.batch_size
        })
    r_buf = self.buffer_type(**buffer_kw)
    # Migrate existing transitions into the prioritized buffer and seed each
    # priority with the current Q-value discrepancy of its observation.
    for i, transition in enumerate(self.replay_buffer._storage):
        r_buf.add(*transition)
        r_buf.update_priorities(
            [i], self.policy_tf.get_q_discrepancy(transition[0])[0])
    if type(r_buf).__name__ == "RankPrioritizedReplayBuffer":
        r_buf.rebalance()
    if isinstance(self.replay_buffer, HindsightExperienceReplayWrapper):
        self.replay_buffer.replay_buffer = r_buf
    else:
        self.replay_buffer = r_buf
    self.learning_rate = get_schedule_fn(
        self.learning_rate(1) / 4)  # TODO: will not work with non-constant schedules
    self.beta_schedule = get_schedule_fn(self.beta_schedule)
    print("Enabled prioritized replay buffer")
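# The setup above wraps self.beta_schedule with get_schedule_fn. A minimal sketch
# (our assumption, not part of the original code) of a compatible beta schedule:
# the schedule receives the remaining-progress fraction (1.0 at the start of
# training, 0.0 at the end) and anneals the importance-sampling exponent from
# beta0 towards 1.0.
def make_linear_beta_schedule(beta0=0.4):
    def beta_schedule(progress_remaining):
        return beta0 + (1.0 - beta0) * (1.0 - progress_remaining)
    return beta_schedule

# e.g. self.beta_schedule = make_linear_beta_schedule(0.4) before calling
# _set_prioritized_buffer().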
def learn(self, total_timesteps, callback=None, seed=None, log_interval=1, tb_log_name="PPO2", reset_num_timesteps=True): self.pretrained_weight = self.load_weight() self.learning_rate = get_schedule_fn(self.learning_rate) self.cliprange = get_schedule_fn(self.cliprange) new_tb_log = self._init_num_timesteps(reset_num_timesteps) with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: self._setup_learn(seed) runner = Runner(env=self.env, model=self, n_steps=self.n_steps, gamma=self.gamma, lam=self.lam) obs, returns, masks, actions, values, neglogpacs, states, ep_infos, true_reward = runner.run( ) self.set_ewc_model(runner) restores = [] for param, loaded_p in zip(self.params, self.pretrained_weight): restores.append(param.assign(loaded_p)) self.sess.run(restores) self.episode_reward = np.zeros((self.n_envs, )) ep_info_buf = deque(maxlen=100) t_first_start = time.time() nupdates = total_timesteps // self.n_batch flag_ewc = False for update in range(1, nupdates + 1): assert self.n_batch % self.nminibatches == 0 if (update > 8.e5 // self.n_batch): flag_ewc = True batch_size = self.n_batch // self.nminibatches t_start = time.time() frac = 1.0 - (update - 1.0) / nupdates lr_now = self.learning_rate(frac) cliprangenow = self.cliprange(frac) # true_reward is the reward without discount obs, returns, masks, actions, values, neglogpacs, states, ep_infos, true_reward = runner.run( ) ep_info_buf.extend(ep_infos) mb_loss_vals = [] if states is None: # nonrecurrent version update_fac = self.n_batch // self.nminibatches // self.noptepochs + 1 inds = np.arange(self.n_batch) for epoch_num in range(self.noptepochs): np.random.shuffle(inds) for start in range(0, self.n_batch, batch_size): timestep = self.num_timesteps // update_fac + ( (self.noptepochs * self.n_batch + epoch_num * self.n_batch + start) // batch_size) end = start + batch_size mbinds = inds[start:end] slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mb_loss_vals.append( self._train_step(lr_now, cliprangenow, *slices, writer=writer, update=timestep, ewc=flag_ewc)) self.num_timesteps += (self.n_batch * self.noptepochs ) // batch_size * update_fac else: # recurrent version update_fac = self.n_batch // self.nminibatches // self.noptepochs // self.n_steps + 1 assert self.n_envs % self.nminibatches == 0 env_indices = np.arange(self.n_envs) flat_indices = np.arange(self.n_envs * self.n_steps).reshape( self.n_envs, self.n_steps) envs_per_batch = batch_size // self.n_steps for epoch_num in range(self.noptepochs): np.random.shuffle(env_indices) for start in range(0, self.n_envs, envs_per_batch): timestep = self.num_timesteps // update_fac + ( (self.noptepochs * self.n_envs + epoch_num * self.n_envs + start) // envs_per_batch) end = start + envs_per_batch mb_env_inds = env_indices[start:end] mb_flat_inds = flat_indices[mb_env_inds].ravel() slices = (arr[mb_flat_inds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mb_states = states[mb_env_inds] mb_loss_vals.append( self._train_step(lr_now, cliprangenow, *slices, update=timestep, writer=writer, states=mb_states, ewc=flag_ewc)) self.num_timesteps += (self.n_envs * self.noptepochs ) // envs_per_batch * update_fac loss_vals = np.mean(mb_loss_vals, axis=0) t_now = time.time() fps = int(self.n_batch / (t_now - t_start)) if writer is not None: self.episode_reward = total_episode_reward_logger( self.episode_reward, true_reward.reshape((self.n_envs, self.n_steps)), masks.reshape((self.n_envs, 
                    self.n_steps)), writer, self.num_timesteps)

                if self.verbose >= 1 and (update % log_interval == 0 or update == 1):
                    explained_var = explained_variance(values, returns)
                    logger.logkv("serial_timesteps", update * self.n_steps)
                    logger.logkv("nupdates", update)
                    logger.logkv("total_timesteps", self.num_timesteps)
                    logger.logkv("fps", fps)
                    logger.logkv("explained_variance", float(explained_var))
                    if len(ep_info_buf) > 0 and len(ep_info_buf[0]) > 0:
                        logger.logkv('ep_reward_mean',
                                     safe_mean([ep_info['r'] for ep_info in ep_info_buf]))
                        logger.logkv('ep_len_mean',
                                     safe_mean([ep_info['l'] for ep_info in ep_info_buf]))
                    logger.logkv('time_elapsed', t_start - t_first_start)
                    for (loss_val, loss_name) in zip(loss_vals, self.loss_names):
                        logger.logkv(loss_name, loss_val)
                    logger.dumpkvs()

                if callback is not None:
                    # Only stop training if return value is False, not when it is None.
                    # This is for backwards compatibility with callbacks that have no
                    # return statement.
                    if callback(locals(), globals()) is False:
                        break

        return self
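# The training loop above switches the EWC penalty on once a hard-coded timestep
# budget has passed (update > 8.e5 // self.n_batch). A minimal sketch of that
# switch; the helper name is ours:
def ewc_enabled(update, n_batch, start_timesteps=8.e5):
    # flag_ewc flips to True once roughly `start_timesteps` environment steps
    # have been consumed by previous updates.
    return update > start_timesteps // n_batch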
def learn(self, total_timesteps, callback=None, seed=None, log_interval=1, tb_log_name="SAC", print_freq=100): with TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name) as writer: self._setup_learn(seed) # Transform to callable if needed self.learning_rate = get_schedule_fn(self.learning_rate) start_time = time.time() episode_rewards = [0.0] is_teleop_env = hasattr(self.env, "wait_for_teleop_reset") # TeleopEnv if is_teleop_env: print("Waiting for teleop") obs = self.env.wait_for_teleop_reset() info = {"cte": 0.0} else: obs = self.env.reset() info = {"cte": 0.0} self.episode_reward = np.zeros((1, )) ep_info_buf = deque(maxlen=100) ep_len = 0 self.n_updates = 0 infos_values = [] mb_infos_vals = [] # ---------------------load the trained NN for safety signal tf_obs = tf.placeholder(tf.float32, shape=(1, 104)) hidden1 = tf.layers.dense(tf_obs, 64, tf.nn.relu) hidden2 = tf.layers.dense(hidden1, 16, tf.nn.relu) output1 = tf.layers.dense(hidden2, 2) hidden3 = tf.layers.dense(tf_obs, 64, tf.nn.relu) hidden4 = tf.layers.dense(hidden3, 16, tf.nn.relu) output2 = tf.layers.dense(hidden4, 3) sess = tf.Session() saver = tf.train.Saver() saver.restore(sess, "./saved_params/param03-level1-quad/safe_layer") # -------------------------------------------------------- fr = open("dump_reward.txt", "w") fv = open("dump_violation.txt", "w") cum_reward = [] num_vio = 0 for step in range(total_timesteps): # Compute current learning_rate frac = 1.0 - step / total_timesteps current_lr = self.learning_rate(frac) if callback is not None: # Only stop training if return value is False, not when it is None. This is for backwards # compatibility with callbacks that have no return statement. if callback(locals(), globals()) is False: break # Before training starts, randomly sample actions # from a uniform distribution for better exploration. # Afterwards, use the learned policy. 
if step < self.learning_starts: action = self.env.action_space.sample() # No need to rescale when sampling random action rescaled_action = action else: action = self.policy_tf.step( obs[None], deterministic=False).flatten() # Rescale from [-1, 1] to the correct bounds rescaled_action = action * np.abs(self.action_space.low) assert action.shape == self.env.action_space.shape # ---------- use trained NN to revise the action if action[1] < 0: action[1] *= -1 print("h1, action ", action) proposed_action = action.copy() action_take = action.copy() proposed_action = np.asarray(proposed_action).reshape((1, 2)) #print ("h2, proposed_action", proposed_action) #print("obs shape", obs.shape) v1 = sess.run(output1, {tf_obs: obs.reshape((1, 104))}) v2 = sess.run(output2, {tf_obs: obs.reshape((1, 104))}) q = [v2[0][0], 0.5 * v2[0][1], 0.5 * v2[0][1], v2[0][2]] q = np.reshape(q, (2, 2)) x = cvx.Variable(1, 2) obj = cvx.sum_squares(x - proposed_action) cons = [info["cte"] + v1 * x.T + x * q * x.T <= 4.8, x[1] > 0] prob = cvx.Problem(cvx.Minimize(obj), cons) try: qcqp = QCQP(prob) qcqp.suggest(SDR) f_cd, v_cd = qcqp.improve(COORD_DESCENT) print( "Coordinate descent: objective %.3f, violation %.3f" % (f_cd, v_cd)) if v_cd == 0: new_action = x.value new_action = np.asarray(new_action).reshape((1, 2)) print("h5, action ", new_action) action_take[0] = new_action[0][0] action_take[1] = new_action[0][1] new_obs, reward, done, new_info = self.env.step( action_take) action = action_take else: new_obs, reward, done, new_info = self.env.step(action) except: new_obs, reward, done, new_info = self.env.step(action) # ----------------------------------------- ep_len += 1 if (len(cum_reward) == 10): cum_reward.pop(0) cum_reward.append(reward) curr = 0.0 for i in range(len(cum_reward)): idx = len(cum_reward) - i - 1 curr += cum_reward[idx] * (0.99**i) fr.write("%f \n" % (curr)) fv.write("%d \n" % (num_vio)) if print_freq > 0 and ep_len % print_freq == 0 and ep_len > 0: print("{} steps".format(ep_len)) # Store transition in the replay buffer. self.replay_buffer.add(obs, action, reward, new_obs, float(done)) obs = new_obs info = new_info # Retrieve reward and episode length if using Monitor wrapper maybe_ep_info = info.get('episode') if maybe_ep_info is not None: ep_info_buf.extend([maybe_ep_info]) if writer is not None: # Write reward per episode to tensorboard ep_reward = np.array([reward]).reshape((1, -1)) ep_done = np.array([done]).reshape((1, -1)) self.episode_reward = total_episode_reward_logger( self.episode_reward, ep_reward, ep_done, writer, step) if ep_len > self.train_freq: print("Additional training") self.env.reset() mb_infos_vals = self.optimize(step, writer, current_lr) done = True episode_rewards[-1] += reward if done: num_vio += 1 if not (isinstance(self.env, VecEnv) or is_teleop_env): obs = self.env.reset() print("Episode finished. 
Reward: {:.2f} {} Steps".format( episode_rewards[-1], ep_len)) episode_rewards.append(0.0) ep_len = 0 mb_infos_vals = self.optimize(step, writer, current_lr) # Refresh obs when using TeleopEnv if is_teleop_env: print("Waiting for teleop") obs = self.env.wait_for_teleop_reset() # Log losses and entropy, useful for monitor training if len(mb_infos_vals) > 0: infos_values = np.mean(mb_infos_vals, axis=0) if len(episode_rewards[-101:-1]) == 0: mean_reward = -np.inf else: mean_reward = round( float(np.mean(episode_rewards[-101:-1])), 1) num_episodes = len(episode_rewards) if self.verbose >= 1 and done and log_interval is not None and len( episode_rewards) % log_interval == 0: fps = int(step / (time.time() - start_time)) logger.logkv("episodes", num_episodes) logger.logkv("mean 100 episode reward", mean_reward) logger.logkv( 'ep_rewmean', safe_mean([ep_info['r'] for ep_info in ep_info_buf])) logger.logkv( 'eplenmean', safe_mean([ep_info['l'] for ep_info in ep_info_buf])) logger.logkv("n_updates", self.n_updates) logger.logkv("current_lr", current_lr) logger.logkv("fps", fps) logger.logkv('time_elapsed', "{:.2f}".format(time.time() - start_time)) if len(infos_values) > 0: for (name, val) in zip(self.infos_names, infos_values): logger.logkv(name, val) logger.logkv("total timesteps", step) logger.dumpkvs() # Reset infos: infos_values = [] if is_teleop_env: self.env.is_training = False # Use last batch print("Final optimization before saving") self.env.reset() mb_infos_vals = self.optimize(step, writer, current_lr) return self
def learn(self, total_timesteps, callback=None, seed=None, log_interval=4, tb_log_name="SAC", reset_num_timesteps=True, replay_wrapper=None): new_tb_log = self._init_num_timesteps(reset_num_timesteps) if replay_wrapper is not None: self.replay_buffer = replay_wrapper(self.replay_buffer) with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: self._setup_learn(seed) # Transform to callable if needed self.learning_rate = get_schedule_fn(self.learning_rate) # Initial learning rate current_lr = self.learning_rate(1) start_time = time.time() episode_rewards = [0.0] episode_successes = [] if self.action_noise is not None: self.action_noise.reset() obs = self.env.reset() self.episode_reward = np.zeros((1, )) ep_info_buf = deque(maxlen=100) n_updates = 0 infos_values = [] tra_obs = [] ep_count = 0 selected_goal = None tra_count = 0 for step in range(total_timesteps): if callback is not None: # Only stop training if return value is False, not when it is None. This is for backwards # compatibility with callbacks that have no return statement. if callback(locals(), globals()) is False: break # Before training starts, randomly sample actions # from a uniform distribution for better exploration. # Afterwards, use the learned policy # if random_exploration is set to 0 (normal setting) if (self.num_timesteps < self.learning_starts or np.random.rand() < self.random_exploration): # No need to rescale when sampling random action rescaled_action = action = self.env.action_space.sample() else: action = self.policy_tf.step( obs[None], deterministic=False).flatten() # Add noise to the action (improve exploration, # not needed in general) if self.action_noise is not None: action = np.clip(action + self.action_noise(), -1, 1) # Rescale from [-1, 1] to the correct bounds rescaled_action = action * np.abs(self.action_space.low) assert action.shape == self.env.action_space.shape new_obs, reward, done, info = self.env.step(rescaled_action) ################################################################# # fit density model and update goal proposing model skew_explore_obs = obs.copy() if isinstance(self.env, HERGoalEnvWrapper): skew_explore_obs_dict = self.env.convert_obs_to_dict( skew_explore_obs) skew_explore_obs = np.array( [skew_explore_obs_dict['observation']]) tra_obs.append(skew_explore_obs[0]) if selected_goal is None: selected_goal = np.array( skew_explore_obs_dict['desired_goal']) else: tra_obs.append(skew_explore_obs) self.skew_explore.update_history(skew_explore_obs, [done]) if (step % self.goal_update_frequency == 0 and step != 0) or step == 2000: logging.info('update buffer') self.skew_explore.activate_buffer() ################################################################# # Store transition in the replay buffer. 
self.replay_buffer.add(obs, action, reward, new_obs, float(done)) obs = new_obs # Retrieve reward and episode length if using Monitor wrapper maybe_ep_info = info.get('episode') if maybe_ep_info is not None: ep_info_buf.extend([maybe_ep_info]) if writer is not None: # Write reward per episode to tensorboard ep_reward = np.array([reward]).reshape((1, -1)) ep_done = np.array([done]).reshape((1, -1)) self.episode_reward = total_episode_reward_logger( self.episode_reward, ep_reward, ep_done, writer, self.num_timesteps) if step % self.train_freq == 0: mb_infos_vals = [] # Update policy, critics and target networks for grad_step in range(self.gradient_steps): if self.num_timesteps < self.batch_size or self.num_timesteps < self.learning_starts: break n_updates += 1 # Compute current learning_rate frac = 1.0 - step / total_timesteps current_lr = self.learning_rate(frac) # Update policy and critics (q functions) mb_infos_vals.append( self._train_step(step, writer, current_lr)) # Update target network if (step + grad_step) % self.target_update_interval == 0: # Update target network self.sess.run(self.target_update_op) # Log losses and entropy, useful for monitor training if len(mb_infos_vals) > 0: infos_values = np.mean(mb_infos_vals, axis=0) episode_rewards[-1] += reward if done: self.plot_tra(tra_count, tra_obs, selected_goal) tra_obs = [] selected_goal = None if self.action_noise is not None: self.action_noise.reset() if not isinstance(self.env, VecEnv): obs = self.env.reset() ep_count += 1 episode_rewards.append(0.0) maybe_is_success = info.get('is_success') if maybe_is_success is not None: episode_successes.append(float(maybe_is_success)) tra_count += 1 self.save(self.args.save_path + '/model') if len(episode_rewards[-101:-1]) == 0: mean_reward = -np.inf else: mean_reward = round( float(np.mean(episode_rewards[-101:-1])), 1) num_episodes = len(episode_rewards) self.num_timesteps += 1 # Display training infos if self.verbose >= 1 and done and log_interval is not None and len( episode_rewards) % log_interval == 0: fps = int(step / (time.time() - start_time)) logger.logkv("episodes", num_episodes) logger.logkv("mean 100 episode reward", mean_reward) if len(ep_info_buf) > 0 and len(ep_info_buf[0]) > 0: logger.logkv( 'ep_rewmean', safe_mean( [ep_info['r'] for ep_info in ep_info_buf])) logger.logkv( 'eplenmean', safe_mean( [ep_info['l'] for ep_info in ep_info_buf])) logger.logkv("n_updates", n_updates) logger.logkv("current_lr", current_lr) logger.logkv("fps", fps) logger.logkv('time_elapsed', int(time.time() - start_time)) if len(episode_successes) > 0: logger.logkv("success rate", np.mean(episode_successes[-100:])) if len(infos_values) > 0: for (name, val) in zip(self.infos_names, infos_values): logger.logkv(name, val) logger.logkv("total timesteps", self.num_timesteps) logger.dumpkvs() # Reset infos: infos_values = [] return self
def learn(self, total_timesteps, callback=None, seed=None, log_interval=4, tb_log_name="CLAC", reset_num_timesteps=True, randomization=0): new_tb_log = self._init_num_timesteps(reset_num_timesteps) with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: self._setup_learn() # Transform to callable if needed self.learning_rate = get_schedule_fn(self.learning_rate) # Initial learning rate current_lr = self.learning_rate(1) start_time = time.time() episode_rewards = [0.0] learning_results = pd.DataFrame() obs = self.env.reset() self.episode_reward = np.zeros((1, )) ep_info_buf = deque(maxlen=100) n_updates = 0 infos_values = [] reward_data = pd.DataFrame() for step in range(total_timesteps): if callback is not None: # Only stop training if return value is False, not when it is None. This is for backwards # compatibility with callbacks that have no return statement. if callback(locals(), globals()) is False: break # Before training starts, randomly sample actions # from a uniform distribution for better exploration. # Afterwards, use the learned policy. if self.num_timesteps < self.learning_starts: if (isinstance(self.env.action_space, Discrete)): action = [] for _ in range(self.env.action_space.n): action.append(1 / self.env.action_space.n) rescaled_action = self.env.action_space.sample() else: action = self.env.action_space.sample() # No need to rescale when sampling random action rescaled_action = action else: if (isinstance(self.env.action_space, Discrete)): actions = list(range(self.env.action_space.n)) action = self.policy_tf.step( obs[None], deterministic=False).flatten() rescaled_action = np.random.choice(actions, 1, p=action)[0] else: action = self.policy_tf.step( obs[None], deterministic=False).flatten() # Rescale from [-1, 1] to the correct bounds rescaled_action = action * np.abs( self.action_space.low) if (not isinstance(self.env.action_space, Discrete)): assert action.shape == self.env.action_space.shape # If coinrunner environment # rescaled_action = np.array(rescaled_action, ndmin=1) new_obs, reward, done, info = self.env.step(rescaled_action) act_mu, act_std = self.policy_tf.proba_step(obs[None]) if (len(act_std) == 1): act_std = act_std[0] #print("ACT MU FROM PROBA STEP", act_mu) #print("ACT STD FROM PROBA STEP", act_std) if self.num_timesteps > self.learning_starts: # Only update marginal approximation after learning starts is completed if (self.multivariate_mean is None): self.multivariate_mean = act_mu else: previous_mean = self.multivariate_mean self.multivariate_mean = ( (1 - self.learning_rate_phi) * self.multivariate_mean) + (self.learning_rate_phi * act_mu) if (self.multivariate_cov is None): self.multivariate_cov = np.diag(act_std) else: cov = (self.learning_rate_phi * np.diag(act_std) + (1 - self.learning_rate_phi) * self.multivariate_cov) mom_1 = (self.learning_rate_phi * np.square(np.diag(act_mu))) + ( (1 - self.learning_rate_phi) * np.square(np.diag(previous_mean))) mom_2 = np.square((self.learning_rate_phi * np.diag(act_mu)) + (1 - self.learning_rate_phi) * np.diag(previous_mean)) self.multivariate_cov = cov + mom_1 - mom_2 # Update Beta parameter if coef_schedule is set if (self.coef_schedule is not None and self.mut_inf_coef > 1e-12): # (1 - a) B + a(1/L()) # Loss based update schdule, for later # Currently using linear schedule: self.mut_inf_coef *= (1 - self.coef_schedule) """if(self.num_timesteps % 1000 == 0): print("updated mut_inf_coef: ", self.mut_inf_coef, " at time step ", 
self.num_timesteps)""" # Store transition in the replay buffer. #print("adding action to replay buffer: ", action) self.replay_buffer.add(obs, action, reward, new_obs, float(done)) obs = new_obs # Retrieve reward and episode length if using Monitor wrapper # info = info[0] maybe_ep_info = info.get('episode') if maybe_ep_info is not None: ep_info_buf.extend([maybe_ep_info]) if writer is not None: # Write reward per episode to tensorboard ep_reward = np.array([reward]).reshape((1, -1)) ep_done = np.array([done]).reshape((1, -1)) self.episode_reward = total_episode_reward_logger( self.episode_reward, ep_reward, ep_done, writer, self.num_timesteps) if step % self.train_freq == 0: mb_infos_vals = [] # Update policy, critics and target networks for grad_step in range(self.gradient_steps): if self.num_timesteps < self.batch_size or self.num_timesteps < self.learning_starts: break n_updates += 1 # Compute current learning_rate frac = 1.0 - step / total_timesteps current_lr = self.learning_rate(frac) # Update policy and critics (q functions) mb_infos_vals.append( self._train_step(step, writer, current_lr)) # Update target network if (step + grad_step) % self.target_update_interval == 0: # Update target network self.sess.run(self.target_update_op) # Log losses and entropy, useful for monitor training if len(mb_infos_vals) > 0: for mb_info_val in mb_infos_vals: for mb_info in mb_info_val: if mb_info is not None: infos_values.append(np.mean(mb_info)) #infos_values = np.mean(mb_infos_vals, axis=0) episode_rewards[-1] += reward if done: if not isinstance(self.env, VecEnv): obs = self.env.reset() if (randomization == 1): try: for env in self.env.unwrapped.envs: env.randomize() except: print( "Trying to randomize an environment that is not set up for randomization, check environment file" ) assert (False) if (randomization == 2): try: for env in self.env.unwrapped.envs: env.randomize_extreme() except: print( "Trying to extremely randomize an environment that is not set up for randomization, check environment file" ) assert (False) Model_String = "CLAC" if not self.auto_mut_inf_coef: Model_String = "CLAC " + str(self.mut_inf_coef) env_name = self.env.unwrapped.envs[0].spec.id mut_inf_coef = self.init_mut_inf_coef if (type(self.mut_inf_coef) == tf.Tensor or np.isnan(mut_inf_coef)): mut_inf_coef = "auto" Model_String = "CLAC" + str(mut_inf_coef) d = { 'Episode Reward': episode_rewards[-1], 'Coefficient': mut_inf_coef, 'Timestep': self.num_timesteps, 'Episode Number': len(episode_rewards) - 1, 'Env': env_name, 'Randomization': randomization, 'Model': "CLAC" } learning_results = learning_results.append( d, ignore_index=True) self.tf_logged_reward = episode_rewards[-1] episode_rewards.append(0.0) if len(episode_rewards[-101:-1]) == 0: mean_reward = -np.inf else: mean_reward = round( float(np.mean(episode_rewards[-101:-1])), 1) num_episodes = len(episode_rewards) self.num_timesteps += 1 # Display training infos if self.verbose >= 1 and done and log_interval is not None and len( episode_rewards) % log_interval == 0: fps = int(step / (time.time() - start_time)) logger.logkv("episodes", num_episodes) logger.logkv("mean 100 episode reward", mean_reward) logger.logkv( 'ep_rewmean', safe_mean([ep_info['r'] for ep_info in ep_info_buf])) logger.logkv( 'eplenmean', safe_mean([ep_info['l'] for ep_info in ep_info_buf])) logger.logkv("n_updates", n_updates) logger.logkv("current_lr", current_lr) logger.logkv("fps", fps) logger.logkv('time_elapsed', int(time.time() - start_time)) if len(infos_values) > 0: for (name, val) in 
                            zip(self.infos_names, infos_values):
                        logger.logkv(name, val)
                    logger.logkv("total timesteps", self.num_timesteps)
                    logger.dumpkvs()
                    # Reset infos:
                    infos_values = []

        return (self, learning_results)
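# The CLAC loop above maintains an exponential moving average of the marginal
# action distribution (mean and diagonal covariance). A minimal sketch of that
# update, mirroring the moment correction used in the loop (the function name
# is ours):
import numpy as np

def update_marginal(prev_mean, prev_cov, act_mu, act_std, lr_phi):
    new_mean = (1.0 - lr_phi) * prev_mean + lr_phi * act_mu
    cov = lr_phi * np.diag(act_std) + (1.0 - lr_phi) * prev_cov
    mom_1 = lr_phi * np.square(np.diag(act_mu)) + (1.0 - lr_phi) * np.square(np.diag(prev_mean))
    mom_2 = np.square(lr_phi * np.diag(act_mu) + (1.0 - lr_phi) * np.diag(prev_mean))
    return new_mean, cov + mom_1 - mom_2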
def learn(self, total_timesteps, callback=None, log_interval=4, tb_log_name="SAC", reset_num_timesteps=True, replay_wrapper=None,use_action_repeat = False,poisson=False): new_tb_log = self._init_num_timesteps(reset_num_timesteps) self.use_action_repeat=use_action_repeat # self.action_repetition = 0.8 self.running_action_repetition = self.action_repetition self.poisson=poisson self.poisson_action = 4 self.poisson_mean = 4 prev_action = None # self.prob_past = 0.6 #self.env.act_rep-=(21-4)/float(total_timesteps) if replay_wrapper is not None: self.replay_buffer = replay_wrapper(self.replay_buffer) with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: self._setup_learn() # Transform to callable if needed self.learning_rate = get_schedule_fn(self.learning_rate) # Initial learning rate current_lr = self.learning_rate(1) start_time = time.time() episode_rewards = [0.0] episode_successes = [] if self.action_noise is not None: self.action_noise.reset() obs = self.env.reset() # if(poisson): # np.concatenate((obs,)) # print(obs) self.episode_reward = np.zeros((1,)) ep_info_buf = deque(maxlen=100) n_updates = 0 infos_values = [] self.num_timesteps=0 for step in range(total_timesteps): if poisson: if(self.poisson_mean<1): self.poisson_mean=1 self.poisson_action = int(np.random.poisson(self.poisson_mean)) self.poisson_mean-=((5)/float(total_timesteps)) if(self.poisson_action<1): self.poisson_action=1 if use_action_repeat: # self.action_repetition-=((0.9)/float(total_timesteps)) amount = ((4)/float(total_timesteps)) self.running_action_repetition -= amount # print("Action repetition is :{}".format(self.action_repetition)) if(self.running_action_repetition<=2 and self.running_action_repetition>1): # if(self.action_repetition==4): # print("Flushing replay buffer 4, {}".format(self.action_repetition)) # self.replay_buffer = ReplayBuffer(self.buffer_size) self.action_repetition=2 if(self.running_action_repetition<=1): # if(self.action_repetition==2): # print("Flushing replay buffer 2, {}".format(self.action_repetition)) # self.replay_buffer = ReplayBuffer(self.buffer_size) self.action_repetition=1 # self.action_repetition = (self.action_repetition*amount +self.action_repetition-amount)/(1-amount+amount*self.action_repetition) # if(self.action_repetition<0): # self.action_repetition=0 # self.env.dec_act_rep((21-4)/float(total_timesteps)) # self.running_action_repetition -= ((6-1)/float(total_timesteps)) # self.action_repetition = int(self.running_action_repetition) # if(self.action_repetition<1): # self.action_repetition=1 if callback is not None: # Only stop training if return value is False, not when it is None. This is for backwards # compatibility with callbacks that have no return statement. if callback(locals(), globals()) is False: break # Before training starts, randomly sample actions # from a uniform distribution for better exploration. 
# Afterwards, use the learned policy # if random_exploration is set to 0 (normal setting) if (self.num_timesteps < self.learning_starts or np.random.rand() < self.random_exploration): # No need to rescale when sampling random action rescaled_action = action = self.env.action_space.sample() else: if poisson: action = self.policy_tf.step(np.concatenate((obs,np.array([self.poisson_action])))[None], deterministic=False).flatten() else: action = self.policy_tf.step(obs[None], deterministic=False).flatten() # Add noise to the action (improve exploration, # not needed in general) if self.action_noise is not None: action = np.clip(action + self.action_noise(), -1, 1) # Rescale from [-1, 1] to the correct bounds rescaled_action = action * np.abs(self.action_space.low) # if use_action_repeat and prev_action is not None: # if(np.random.uniform(0,1)<self.action_repetition): # rescaled_action=prev_action assert action.shape == self.env.action_space.shape # Add action repetition # print("Action repetition is {}".format(self.action_repetition)) if self.use_action_repeat: repeated_reward = 0 # print("Repeating actions for: {}".format(int(rescaled_action[-1])+4)) for repeat_step in range(int(rescaled_action[-1])+4): prev_action = rescaled_action new_obs, reward, done, info = self.env.step(rescaled_action[:len(rescaled_action)-1]) repeated_reward+=reward buffer_action = action.copy() buffer_action[-1] = (rescaled_action[-1]+4-int(rescaled_action[-1]+4))+repeat_step+1 - 4 # print("Sub actions for: {}".format(buffer_action[-1])) # Add extra supervision # self.replay_buffer.add(obs, action, repeated_reward, new_obs, float(done)) if done: break reward = repeated_reward elif poisson: repeated_reward = 0 # print("Poisson repetition is {}".format(self.poisson_action)) for _ in range(self.poisson_action): # print("Repeating actions for: {}".format(self.action_repetition)) prev_action = rescaled_action new_obs, reward, done, info = self.env.step(rescaled_action) repeated_reward+=reward if done: break reward = repeated_reward else: new_obs, reward, done, info = self.env.step(rescaled_action) # Store transition in the replay buffer. 
if poisson: self.replay_buffer.add(np.concatenate((obs,np.array([self.poisson_action]))), action, reward, np.concatenate((new_obs,np.array([self.poisson_action]))), float(done)) else: self.replay_buffer.add(obs, action, reward, new_obs, float(done)) obs = new_obs # Retrieve reward and episode length if using Monitor wrapper maybe_ep_info = info.get('episode') if maybe_ep_info is not None: ep_info_buf.extend([maybe_ep_info]) if writer is not None: # Write reward per episode to tensorboard ep_reward = np.array([reward]).reshape((1, -1)) ep_done = np.array([done]).reshape((1, -1)) self.episode_reward = total_episode_reward_logger(self.episode_reward, ep_reward, ep_done, writer, self.num_timesteps) if step % self.train_freq == 0: mb_infos_vals = [] # Update policy, critics and target networks for grad_step in range(self.gradient_steps): # Break if the warmup phase is not over # or if there are not enough samples in the replay buffer if not self.replay_buffer.can_sample(self.batch_size) \ or self.num_timesteps < self.learning_starts: break n_updates += 1 # Compute current learning_rate frac = 1.0 - step / total_timesteps current_lr = self.learning_rate(frac) # Update policy and critics (q functions) mb_infos_vals.append(self._train_step(step, writer, current_lr)) # Update target network if (step + grad_step) % self.target_update_interval == 0: # Update target network self.sess.run(self.target_update_op) # Log losses and entropy, useful for monitor training if len(mb_infos_vals) > 0: infos_values = np.mean(mb_infos_vals, axis=0) episode_rewards[-1] += reward if done: prev_action=None if self.action_noise is not None: self.action_noise.reset() if not isinstance(self.env, VecEnv): obs = self.env.reset() episode_rewards.append(0.0) maybe_is_success = info.get('is_success') if maybe_is_success is not None: episode_successes.append(float(maybe_is_success)) if len(episode_rewards[-101:-1]) == 0: mean_reward = -np.inf else: mean_reward = round(float(np.mean(episode_rewards[-101:-1])), 1) num_episodes = len(episode_rewards) self.num_timesteps += 1 # Display training infos if self.verbose >= 1 and done and log_interval is not None and len(episode_rewards) % log_interval == 0: fps = int(step / (time.time() - start_time)) logger.logkv("episodes", num_episodes) logger.logkv("mean 100 episode reward", mean_reward) if len(ep_info_buf) > 0 and len(ep_info_buf[0]) > 0: logger.logkv('ep_rewmean', safe_mean([ep_info['r'] for ep_info in ep_info_buf])) logger.logkv('eplenmean', safe_mean([ep_info['l'] for ep_info in ep_info_buf])) logger.logkv("n_updates", n_updates) logger.logkv("current_lr", current_lr) logger.logkv("fps", fps) logger.logkv('time_elapsed', int(time.time() - start_time)) if len(episode_successes) > 0: logger.logkv("success rate", np.mean(episode_successes[-100:])) if len(infos_values) > 0: for (name, val) in zip(self.infos_names, infos_values): logger.logkv(name, val) logger.logkv("total timesteps", self.num_timesteps) logger.dumpkvs() # Reset infos: infos_values = [] return self
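# The Poisson variant above samples how many times each action is repeated from a
# Poisson distribution whose mean decays linearly over training. A minimal sketch
# (helper name is ours) of that sampler:
import numpy as np

def sample_poisson_repeat(poisson_mean, total_timesteps, decay=5.0):
    repeat = max(int(np.random.poisson(max(poisson_mean, 1.0))), 1)
    # Anneal the mean towards shorter repeats, as in the loop above.
    poisson_mean -= decay / float(total_timesteps)
    return repeat, poisson_mean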
def learn(self, total_timesteps, callback=None, seed=None, log_interval=2000, tb_log_name="MDPO", reset_num_timesteps=True, replay_wrapper=None): new_tb_log = self._init_num_timesteps(reset_num_timesteps) if replay_wrapper is not None: self.replay_buffer = replay_wrapper(self.replay_buffer) with SetVerbosity(self.verbose): # Transform to callable if needed self.learning_rate = get_schedule_fn(self.learning_rate) # Initial learning rate current_lr = self.learning_rate(1) frac = 0 t_k = 0 start_time = time.time() episode_rewards = [0.0] episode_successes = [] if self.action_noise is not None: self.action_noise.reset() obs = self.env.reset() self.episode_reward = np.zeros((1, )) ep_info_buf = deque(maxlen=100) n_updates = 0 infos_values = [] for step in range(total_timesteps): if callback is not None: # Only stop training if return value is False, not when it is None. This is for backwards # compatibility with callbacks that have no return statement. if callback(locals(), globals()) is False: break # Before training starts, randomly sample actions # from a uniform distribution for better exploration. # Afterwards, use the learned policy # if random_exploration is set to 0 (normal setting) if (self.num_timesteps < self.learning_starts or np.random.rand() < self.random_exploration): # No need to rescale when sampling random action rescaled_action = action = self.env.action_space.sample() else: action = self.policy_tf.step(obs[None], deterministic=True).flatten() #action = self.policy_tf.step(obs[None], deterministic=True).flatten() # Add noise to the action (improve exploration, # not needed in general) if self.action_noise is not None: action = np.clip(action + self.action_noise(), -1, 1) # Rescale from [-1, 1] to the correct bounds rescaled_action = action * np.abs(self.action_space.low) assert action.shape == self.env.action_space.shape #print("action", action) new_obs, reward, done, info = self.env.step(rescaled_action) #print("new obs", new_obs) # Store transition in the replay buffer. 
self.replay_buffer.add(obs, action, reward, new_obs, float(done), info) obs = new_obs # Retrieve reward and episode length if using Monitor wrapper maybe_ep_info = info.get('episode') if maybe_ep_info is not None: ep_info_buf.extend([maybe_ep_info]) if step % self.train_freq == 0: mb_infos_vals = [] # Update policy, critics and target networks for grad_step in range(1): #int(self.gradient_steps) # Break if the warmup phase is not over # or if there are not enough samples in the replay buffer if not self.replay_buffer.can_sample(self.batch_size) \ or self.num_timesteps < self.learning_starts: break n_updates += 1 # Compute current learning_rate t_k = self.klconst # step / total_timesteps frac = 1.0 - step / total_timesteps current_lr = self.learning_rate(frac) # Update policy and critics (q functions) mb_infos_vals.append( self._train_step(step, current_lr, t_k)) # Update target network if (step + grad_step) % self.target_update_interval == 0: # Update target network self.sess.run(self.target_update_op) # Log losses and entropy, useful for monitor training if len(mb_infos_vals) > 0: infos_values = np.mean(mb_infos_vals, axis=0) if step % self.gradient_steps == 0: self.sess.run(self.assign_policy_op) episode_rewards[-1] += reward if done: if self.action_noise is not None: self.action_noise.reset() if not isinstance(self.env, VecEnv): obs = self.env.reset() episode_rewards.append(0.0) maybe_is_success = info.get('is_success') if maybe_is_success is not None: episode_successes.append(float(maybe_is_success)) if len(episode_rewards[-101:-1]) == 0: mean_reward = -np.inf else: mean_reward = round( float(np.mean(episode_rewards[-101:-1])), 1) num_episodes = len(episode_rewards) self.num_timesteps += 1 # Display training infos if self.verbose >= 1 and step % log_interval == 0: fps = int(step / (time.time() - start_time)) logger.logkv("episodes", num_episodes) logger.logkv("mean 100 episode reward", mean_reward) if len(ep_info_buf) > 0 and len(ep_info_buf[0]) > 0: logger.logkv( 'ep_rewmean', safe_mean( [ep_info['r'] for ep_info in ep_info_buf])) logger.logkv( 'eplenmean', safe_mean( [ep_info['l'] for ep_info in ep_info_buf])) logger.logkv("n_updates", n_updates) logger.logkv("current_lr", current_lr) logger.logkv("lamda", self.lamda) logger.logkv("fps", fps) logger.logkv('time_elapsed', int(time.time() - start_time)) if len(episode_successes) > 0: logger.logkv("success rate", np.mean(episode_successes[-100:])) if len(infos_values) > 0: for (name, val) in zip(self.infos_names, infos_values): logger.logkv(name, val) logger.logkv("total timesteps", self.num_timesteps) logger.logkv("t_k", t_k) logger.logkv("steps", step) logger.dumpkvs() # Reset infos: infos_values = [] return self
def learn(self, total_timesteps, callback=None, seed=None, log_interval=4, tb_log_name="SAC", reset_num_timesteps=True): new_tb_log = self._init_num_timesteps(reset_num_timesteps) with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: self._setup_learn(seed) # Transform to callable if needed self.learning_rate = get_schedule_fn(self.learning_rate) # Initial learning rate current_lr = self.learning_rate(1) start_time = time.time() episode_rewards = [0.0] obs = self.env.reset() self.episode_reward = np.zeros((1,)) ep_info_buf = deque(maxlen=100) n_updates = 0 infos_values = [] obs = self.env.reset() for i in range(128): action = self.env.action_space.sample() new_obs, reward, done, info = self.env.step(action) # print(new_obs) # self.env.render() self.iiayn.update_history([obs]) obs = new_obs for step in range(total_timesteps): if callback is not None: # Only stop training if return value is False, not when it is None. This is for backwards # compatibility with callbacks that have no return statement. if callback(locals(), globals()) is False: break # Before training starts, randomly sample actions # from a uniform distribution for better exploration. # Afterwards, use the learned policy. if self.num_timesteps < self.learning_starts: action = self.env.action_space.sample() # No need to rescale when sampling random action rescaled_action = action else: action = self.policy_tf.step(obs[None], deterministic=False).flatten() # Rescale from [-1, 1] to the correct bounds rescaled_action = action #* np.abs(self.action_space.low) assert action.shape == self.env.action_space.shape new_obs, reward, done, info = self.env.step(rescaled_action) print(step, action) # self.env.render() self.iiayn.update_history([obs]) if step % 2048 == 0: self.iiayn.activate_buffer() # Store transition in the replay buffer. 
self.replay_buffer.add(obs, action, reward, new_obs, float(done)) obs = new_obs # Retrieve reward and episode length if using Monitor wrapper maybe_ep_info = info.get('episode') if maybe_ep_info is not None: ep_info_buf.extend([maybe_ep_info]) if writer is not None: # Write reward per episode to tensorboard ep_reward = np.array([reward]).reshape((1, -1)) ep_done = np.array([done]).reshape((1, -1)) self.episode_reward = total_episode_reward_logger(self.episode_reward, ep_reward, ep_done, writer, self.num_timesteps) if step % self.train_freq == 0: mb_infos_vals = [] # Update policy, critics and target networks for grad_step in range(self.gradient_steps): if self.num_timesteps < self.batch_size or self.num_timesteps < self.learning_starts: break n_updates += 1 # Compute current learning_rate frac = 1.0 - step / total_timesteps current_lr = self.learning_rate(frac) # Update policy and critics (q functions) mb_infos_vals.append(self._train_step(step, writer, current_lr)) # Update target network if (step + grad_step) % self.target_update_interval == 0: # Update target network self.sess.run(self.target_update_op) # Log losses and entropy, useful for monitor training if len(mb_infos_vals) > 0: infos_values = np.mean(mb_infos_vals, axis=0) episode_rewards[-1] += reward if done or step%1024 == 0: obs = self.env.reset() # if not isinstance(self.env, VecEnv): # obs = self.env.reset() episode_rewards.append(0.0) if len(episode_rewards[-101:-1]) == 0: mean_reward = -np.inf else: mean_reward = round(float(np.mean(episode_rewards[-101:-1])), 1) num_episodes = len(episode_rewards) self.num_timesteps += 1 # Display training infos if self.verbose >= 1 and done and log_interval is not None and len(episode_rewards) % log_interval == 0: fps = int(step / (time.time() - start_time)) logger.logkv("episodes", num_episodes) logger.logkv("mean 100 episode reward", mean_reward) logger.logkv('ep_rewmean', safe_mean([ep_info['r'] for ep_info in ep_info_buf])) logger.logkv('eplenmean', safe_mean([ep_info['l'] for ep_info in ep_info_buf])) logger.logkv("n_updates", n_updates) logger.logkv("current_lr", current_lr) logger.logkv("fps", fps) logger.logkv('time_elapsed', int(time.time() - start_time)) if len(infos_values) > 0: for (name, val) in zip(self.infos_names, infos_values): logger.logkv(name, val) logger.logkv("total timesteps", self.num_timesteps) logger.dumpkvs() # Reset infos: infos_values = [] return self
def learn(self, total_timesteps, callback=None, seed=None, log_interval=1, tb_log_name="SAC", print_freq=100): with TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name) as writer: self._setup_learn(seed) # Transform to callable if needed self.learning_rate = get_schedule_fn(self.learning_rate) start_time = time.time() episode_rewards = [0.0] is_teleop_env = hasattr(self.env, "wait_for_teleop_reset") # TeleopEnv if is_teleop_env: print("Waiting for teleop") obs = self.env.wait_for_teleop_reset() info = {"cte": 0.0} else: obs = self.env.reset() info = {"cte": 0.0} self.episode_reward = np.zeros((1,)) ep_info_buf = deque(maxlen=100) ep_len = 0 self.n_updates = 0 infos_values = [] mb_infos_vals = [] # ---------------------load the trained NN for safety signal tf_obs = tf.placeholder(tf.float32, obs.shape) hidden1 = tf.layers.dense(tf_obs, 64, tf.nn.relu) hidden2 = tf.layers.dense(hidden1, 16, tf.nn.relu) output = tf.layers.dense(hidden2, 2) sess = tf.Session() saver = tf.train.Saver() saver.restore(sess, "./saved_params/param02-level1-linear/safe_layer") # -------------------------------------------------------- fr = open("dump_reward.txt", "w") fv = open("dump_violation.txt", "w") fl = open("dump_lambda.txt", "w") cum_reward = [] num_vio = 0 for step in range(total_timesteps): # Compute current learning_rate frac = 1.0 - step / total_timesteps current_lr = self.learning_rate(frac) if callback is not None: # Only stop training if return value is False, not when it is None. This is for backwards # compatibility with callbacks that have no return statement. if callback(locals(), globals()) is False: break # Before training starts, randomly sample actions # from a uniform distribution for better exploration. # Afterwards, use the learned policy. 
if step < self.learning_starts: action = self.env.action_space.sample() # No need to rescale when sampling random action rescaled_action = action else: action = self.policy_tf.step(obs[None], deterministic=False).flatten() # Rescale from [-1, 1] to the correct bounds rescaled_action = action * np.abs(self.action_space.low) assert action.shape == self.env.action_space.shape # ---------- use trained NN to revise the action #print("-------------------------------") print("h1, rescaled_action ", rescaled_action) proposed_action = rescaled_action.copy() proposed_action = np.asarray(proposed_action).reshape((1, 2)) #print ("h2, proposed_action", proposed_action) print("obs shape: ", obs.shape) corr_v = sess.run(output, {tf_obs: obs}) lambda_v = (info["cte"] + np.dot(corr_v, proposed_action.T) - 1.3) / np.dot(corr_v, corr_v.T) #print (info["cte"], info["cte"] + np.asscalar(np.dot(corr_v, proposed_action.T)) ) print("lambda: ", lambda_v) if lambda_v < 0: lambda_v = 0.0 proposed_action -= lambda_v * corr_v proposed_action *= np.abs(self.action_space.low) #print ("h3 proposed_action: ", proposed_action) #print("h4 rescaled_action: ", rescaled_action) rescaled_action[0] = proposed_action[0][0] rescaled_action[1] = proposed_action[0][1] # ----------------------------------------- print("h5 rescaled_action: ", rescaled_action) new_obs, reward, done, new_info = self.env.step(rescaled_action) ep_len += 1 if (len(cum_reward) == 10): cum_reward.pop(0) cum_reward.append(reward) curr = 0.0 for i in range(len(cum_reward)): idx = len(cum_reward) - i - 1 curr += cum_reward[idx] * (0.99**i) fr.write("%f \n" %(curr)) fv.write("%d \n" %(num_vio)) fl.write("%f \n" %(lambda_v)) if print_freq > 0 and ep_len % print_freq == 0 and ep_len > 0: print("{} steps".format(ep_len)) # Store transition in the replay buffer. self.replay_buffer.add(obs, rescaled_action, reward, new_obs, float(done)) obs = new_obs info = new_info # Retrieve reward and episode length if using Monitor wrapper maybe_ep_info = info.get('episode') if maybe_ep_info is not None: ep_info_buf.extend([maybe_ep_info]) if writer is not None: # Write reward per episode to tensorboard ep_reward = np.array([reward]).reshape((1, -1)) ep_done = np.array([done]).reshape((1, -1)) self.episode_reward = total_episode_reward_logger(self.episode_reward, ep_reward, ep_done, writer, step) if ep_len > self.train_freq: print("Additional training") self.env.reset() mb_infos_vals = self.optimize(step, writer, current_lr) done = True episode_rewards[-1] += reward if done: num_vio += 1 if not (isinstance(self.env, VecEnv) or is_teleop_env): obs = self.env.reset() print("Episode finished. 
Reward: {:.2f} {} Steps".format(episode_rewards[-1], ep_len)) episode_rewards.append(0.0) ep_len = 0 mb_infos_vals = self.optimize(step, writer, current_lr) # Refresh obs when using TeleopEnv if is_teleop_env: print("Waiting for teleop") obs = self.env.wait_for_teleop_reset() # Log losses and entropy, useful for monitor training if len(mb_infos_vals) > 0: infos_values = np.mean(mb_infos_vals, axis=0) if len(episode_rewards[-101:-1]) == 0: mean_reward = -np.inf else: mean_reward = round(float(np.mean(episode_rewards[-101:-1])), 1) num_episodes = len(episode_rewards) if self.verbose >= 1 and done and log_interval is not None and len(episode_rewards) % log_interval == 0: fps = int(step / (time.time() - start_time)) logger.logkv("episodes", num_episodes) logger.logkv("mean 100 episode reward", mean_reward) logger.logkv('ep_rewmean', safe_mean([ep_info['r'] for ep_info in ep_info_buf])) logger.logkv('eplenmean', safe_mean([ep_info['l'] for ep_info in ep_info_buf])) logger.logkv("n_updates", self.n_updates) logger.logkv("current_lr", current_lr) logger.logkv("fps", fps) logger.logkv('time_elapsed', "{:.2f}".format(time.time() - start_time)) if len(infos_values) > 0: for (name, val) in zip(self.infos_names, infos_values): logger.logkv(name, val) logger.logkv("total timesteps", step) logger.dumpkvs() # Reset infos: infos_values = [] if is_teleop_env: self.env.is_training = False # Use last batch print("Final optimization before saving") self.env.reset() mb_infos_vals = self.optimize(step, writer, current_lr) fr.close() fv.close() fl.close() return self
def learn(self, total_timesteps, callback=None, log_interval=1, tb_log_name="PPO2", reset_num_timesteps=True): # Transform to callable if needed self.learning_rate = get_schedule_fn(self.learning_rate) self.cliprange = get_schedule_fn(self.cliprange) cliprange_vf = get_schedule_fn(self.cliprange_vf) new_tb_log = self._init_num_timesteps(reset_num_timesteps) with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: self._setup_learn() t_first_start = time.time() n_updates = total_timesteps // self.n_batch for update in range(1, n_updates + 1): assert self.n_batch % self.nminibatches == 0, ("The number of minibatches (`nminibatches`) " "is not a factor of the total number of samples " "collected per rollout (`n_batch`), " "some samples won't be used." ) batch_size = self.n_batch // self.nminibatches t_start = time.time() frac = 1.0 - (update - 1.0) / n_updates lr_now = self.learning_rate(frac) cliprange_now = self.cliprange(frac) cliprange_vf_now = cliprange_vf(frac) # true_reward is the reward without discount obs, returns, masks, actions, values, neglogpacs, states, ep_infos, true_reward = self.runner.run() self.num_timesteps += self.n_batch self.ep_info_buf.extend(ep_infos) mb_loss_vals = [] if states is None: # non-recurrent version update_fac = self.n_batch // self.nminibatches // self.noptepochs + 1 inds = np.arange(self.n_batch) for epoch_num in range(self.noptepochs): np.random.shuffle(inds) for start in range(0, self.n_batch, batch_size): timestep = self.num_timesteps // update_fac + ((self.noptepochs * self.n_batch + epoch_num * self.n_batch + start) // batch_size) end = start + batch_size mbinds = inds[start:end] slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mb_loss_vals.append(self._train_step(lr_now, cliprange_now, *slices, writer=writer, update=timestep, cliprange_vf=cliprange_vf_now)) else: # recurrent version update_fac = self.n_batch // self.nminibatches // self.noptepochs // self.n_steps + 1 assert self.n_envs % self.nminibatches == 0 env_indices = np.arange(self.n_envs) flat_indices = np.arange(self.n_envs * self.n_steps).reshape(self.n_envs, self.n_steps) envs_per_batch = batch_size // self.n_steps for epoch_num in range(self.noptepochs): np.random.shuffle(env_indices) for start in range(0, self.n_envs, envs_per_batch): timestep = self.num_timesteps // update_fac + ((self.noptepochs * self.n_envs + epoch_num * self.n_envs + start) // envs_per_batch) end = start + envs_per_batch mb_env_inds = env_indices[start:end] mb_flat_inds = flat_indices[mb_env_inds].ravel() slices = (arr[mb_flat_inds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mb_states = states[mb_env_inds] mb_loss_vals.append(self._train_step(lr_now, cliprange_now, *slices, update=timestep, writer=writer, states=mb_states, cliprange_vf=cliprange_vf_now)) loss_vals = np.mean(mb_loss_vals, axis=0) t_now = time.time() fps = int(self.n_batch / (t_now - t_start)) if writer is not None: total_episode_reward_logger(self.episode_reward, true_reward.reshape((self.n_envs, self.n_steps)), masks.reshape((self.n_envs, self.n_steps)), writer, self.num_timesteps) if self.verbose >= 1 and (update % log_interval == 0 or update == 1): explained_var = explained_variance(values, returns) logger.logkv("serial_timesteps", update * self.n_steps) logger.logkv("n_updates", update) logger.logkv("learning_rate", self.curr_lr) logger.logkv("total_timesteps", self.num_timesteps) logger.logkv("fps", fps) 
logger.logkv("explained_variance", float(explained_var)) if len(self.ep_info_buf) > 0 and len(self.ep_info_buf[0]) > 0: logger.logkv('ep_reward_mean', safe_mean([ep_info['r'] for ep_info in self.ep_info_buf])) logger.logkv('ep_len_mean', safe_mean([ep_info['l'] for ep_info in self.ep_info_buf])) logger.logkv('time_elapsed', t_start - t_first_start) for (loss_val, loss_name) in zip(loss_vals, self.loss_names): logger.logkv(loss_name, loss_val) logger.dumpkvs() if callback is not None: # Only stop training if return value is False, not when it is None. This is for backwards # compatibility with callbacks that have no return statement. if callback(locals(), globals()) is False: break return self
def learn(self, total_timesteps, callback=None, seed=None, log_interval=1, tb_log_name="PPO2", reset_num_timesteps=True): # Transform to callable if needed self.learning_rate = get_schedule_fn(self.learning_rate) self.cliprange = get_schedule_fn(self.cliprange) cliprange_vf = get_schedule_fn(self.cliprange_vf) new_tb_log = self._init_num_timesteps(reset_num_timesteps) with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: self._setup_learn(seed) # runner = DistributedRunner(env=self.env, model=self, n_steps=self.n_steps, gamma=self.gamma, lam=self.lam) ctx = multiprocessing.get_context('spawn') q = ctx.Queue() p = ctx.Process( target=runDRunner, kwargs={'examples_queue': q} ) #, 'env' : self.env, 'model' : self, 'n_steps' : self.n_steps, 'gamma' : self.gamma, 'lam':self.lam}) p.start() print("STarted up queue from master") self.episode_reward = np.zeros((self.n_envs, )) ep_info_buf = deque(maxlen=100) t_first_start = time.time() n_updates = total_timesteps // self.n_batch print("about to run...", n_updates, "updates and batch size", self.n_batch) for update in range(1, n_updates + 1): print("In loop.") assert self.n_batch % self.nminibatches == 0 batch_size = self.n_batch // self.nminibatches t_start = time.time() frac = 1.0 - (update - 1.0) / n_updates lr_now = self.learning_rate(frac) cliprange_now = self.cliprange(frac) cliprange_vf_now = cliprange_vf(frac) # true_reward is the reward without discount # pull from queue print("Pulling from quee...") obs, returns, masks, actions, values, neglogpacs, states, ep_infos, true_reward = q.get( block=True) print("Got something!") self.num_timesteps += self.n_batch ep_info_buf.extend(ep_infos) mb_loss_vals = [] #non-recurrent version update_fac = self.n_batch // self.nminibatches // self.noptepochs + 1 inds = np.arange(self.n_batch) for epoch_num in range(self.noptepochs): np.random.shuffle(inds) for start in range(0, self.n_batch, batch_size): timestep = self.num_timesteps // update_fac + ( (self.noptepochs * self.n_batch + epoch_num * self.n_batch + start) // batch_size) end = start + batch_size mbinds = inds[start:end] slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mb_loss_vals.append( self._train_step(lr_now, cliprange_now, *slices, writer=writer, update=timestep, cliprange_vf=cliprange_vf_now)) loss_vals = np.mean(mb_loss_vals, axis=0) t_now = time.time() fps = int(self.n_batch / (t_now - t_start)) ## BRODCAST WEIGHTS if writer is not None: self.episode_reward = total_episode_reward_logger( self.episode_reward, true_reward.reshape((self.n_envs, self.n_steps)), masks.reshape((self.n_envs, self.n_steps)), writer, self.num_timesteps) if self.verbose >= 1 and (update % log_interval == 0 or update == 1): explained_var = explained_variance(values, returns) logger.logkv("serial_timesteps", update * self.n_steps) logger.logkv("n_updates", update) logger.logkv("total_timesteps", self.num_timesteps) logger.logkv("fps", fps) logger.logkv("explained_variance", float(explained_var)) if len(ep_info_buf) > 0 and len(ep_info_buf[0]) > 0: logger.logkv( 'ep_reward_mean', safe_mean( [ep_info['r'] for ep_info in ep_info_buf])) logger.logkv( 'ep_len_mean', safe_mean( [ep_info['l'] for ep_info in ep_info_buf])) logger.logkv('time_elapsed', t_start - t_first_start) for (loss_val, loss_name) in zip(loss_vals, self.loss_names): logger.logkv(loss_name, loss_val) logger.dumpkvs() if callback is not None: # Only stop training if return value is False, not when it 
                # is None. This is for backwards compatibility with callbacks that
                # have no return statement.
                if callback(locals(), globals()) is False:
                    break

        return self
def learn_jirl(self, total_timesteps, joystick=None, callback=None, seed=None, log_interval=1, tb_log_name="SAC", print_freq=100, base_policy=None, stochastic_actor=True, expert_guidance_steps=50000, save_path=None): with TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name) as writer: # Add path to model in this function self._setup_learn(seed) # Joystick object js = JoyStick() # Transform to callable if needed self.learning_rate = get_schedule_fn(self.learning_rate) episode_rewards = [0.0] # Reset the environment obs = self.env.reset() # Book keeping self.episode_reward = np.zeros((1, )) ep_info_buf = deque(maxlen=100) ep_len = 0 self.n_updates = 0 n_crashes = 0 infos_values = [] mb_infos_vals = [] pred_action_info = deque(maxlen=20) mean_info = deque(maxlen=50) std_info = deque(maxlen=50) throttle_info = deque(maxlen=1000) is_action_expert = False is_action_actor = True was_last_action_actor = False last_action_actor = None last_obs = None # steps in which expert takes control expert_control_steps = [] state = {} # for the imitation learning agent MAX_LEN = 10 is_ratios_target_expert = deque( maxlen=MAX_LEN) # IS ratios over the last few steps is_ratios_target_actor = deque(maxlen=MAX_LEN) EPS = 1e-10 # Stats for plotting rew_per_step = [] rew_per_step_rl = [] rl_control = [] # Buffer to control the threshold dynamically thresh_buffer = deque(maxlen=1000) std_buffer = deque(maxlen=10000) mean_buffer = deque(maxlen=10000) import time start_time = time.time() try: for step in range(total_timesteps): # Compute current learning_rate frac = 1.0 - step / total_timesteps current_lr = self.learning_rate(frac) if callback is not None: # Only stop training if return value is False, not when it is None. This is for backwards # compatibility with callbacks that have no return statement. 
if callback(locals(), globals()) is False: break # Get prediction from base policy steerCmd = float(base_policy.predict(obs)[0][0]) # print("Steering from IL: ", steerCmd) throttleCmd = -1 action_expert = [steerCmd, throttleCmd] # mean_exp, std_exp = il_model.get_proba_actions(state) # print(scipy.stats.multivariate_normal(mean = mean, cov = std).pdf(action_expert)) # Test with hard coded variance # std_exp = [0.1, 0.1] # proba_expert_policy = scipy.stats.norm(mean_exp[0], std_exp[0]).pdf(action_expert[0]) # proba_expert_policy = scipy.stats.norm(mean_exp[0], std_exp[0]).cdf(action_expert[0] + EPS) - scipy.stats.norm(mean_exp[0], std_exp[0]).cdf(action_expert[0] - EPS) # if 2*np.pi*np.prod(std) <= 1: # proba_expert_policy = 2*np.pi*np.prod(std)*scipy.stats.multivariate_normal(mean = mean, cov = std).pdf(action_expert) # else: # proba_expert_policy = scipy.stats.multivariate_normal(mean = mean, cov = std).pdf(action_expert) ## ====== Test code snippet ====== # action_expert, _ = model.predict(obs, deterministic=True) # new_obs, reward, done, info = self.env.step(action_expert) ## =============================== if not stochastic_actor: action_actor = self.policy_tf.step( obs[None], deterministic=True).flatten() else: action_actor = self.policy_tf.step( obs[None], deterministic=False).flatten() if step >= expert_guidance_steps: action_actor = self.policy_tf.step( obs[None], deterministic=True).flatten() mean_act, std_act = self.policy_tf.proba_step(obs[None]) # print(scipy.stats.multivariate_normal(mean = mean.flatten(), cov = std.flatten()).pdf(action_actor)) proba_actor_policy = scipy.stats.norm( mean_act.flatten()[0], std_act.flatten()[0]).pdf(action_actor[0]) proba_expert_policy = scipy.stats.norm( mean_act.flatten()[0], std_act.flatten()[0]).pdf(action_expert[0]) # proba_actor_policy = scipy.stats.norm(mean_act.flatten()[0], std_act.flatten()[0]).cdf(action_actor[0] + EPS) - scipy.stats.norm(mean_act.flatten()[0], std_act.flatten()[0]).cdf(action_actor[0] - EPS) # if 2*np.pi*np.prod(std) <= 1: # proba_actor_policy = 2*np.pi*np.prod(std.flatten())*scipy.stats.multivariate_normal(mean = mean.flatten(), cov = std.flatten()).pdf(action_actor) # else: # proba_actor_policy = scipy.stats.multivariate_normal(mean = mean.flatten(), cov = std.flatten()).pdf(action_actor) # Update entropy buffer std_buffer.append(std_act) # Update mean difference buffer mean_buffer.append(np.linalg.norm(mean_act - action_expert)) # mean_buffer.append(np.linalg.norm(action_actor - action_expert)) rho = round(float(step) / expert_guidance_steps, 2) # THRESH = (1 - rho) * (scipy.stats.norm(0, 0.1).pdf(0) - 1.0)**MAX_LEN # _THRESH = (1 - rho) * (scipy.stats.norm(0, 0.1).pdf(0) - 2.0) _THRESH = (np.mean(std_buffer) + np.mean(mean_buffer)) * (1 - rho) THRESH = _THRESH**MAX_LEN if step >= expert_guidance_steps: # Only let the RL control the car # If this doesn't work, tune MAX_LEN THRESH = _THRESH = 0 if js.is_on(): ## ===================================== ## MANUAL CONTROL ## ===================================== # Execute commands from the joystick in the environment action_js = [js.get_steer(), -1] new_obs, reward, done, info = self.env.step(action_js) # Store transition in the replay buffer. 
self.replay_buffer.add(obs, action_js, reward, new_obs, float(done)) ## ========================================== sigma_p = 0.01 reward_hat = reward * np.exp( -np.linalg.norm(action_actor - action_js) / sigma_p) self.replay_buffer.add(obs, action_actor, reward_hat, new_obs, float(done)) ## ========================================== if was_last_action_actor: # Train the actor when the expert's actions are executed # mb_infos_vals = self.optimize(step, writer, current_lr) penalty = -1 #-10 self.replay_buffer.add(last_obs, last_action_actor, penalty, obs, float(done)) is_ratios_target_expert = deque(maxlen=MAX_LEN) was_last_action_actor = False last_action_actor = None last_obs = None is_action_actor = False # print("Actor IS ratio: ", is_ratio) # if ep_len > 700: # print("Expert: ", np.prod(is_ratios_target_actor)) if (len(is_ratios_target_actor) == MAX_LEN) and np.all( [(p > _THRESH) for p in is_ratios_target_actor]): # Switch control to actor in the next step is_action_actor = True rew_per_step_rl.append(0.0) rl_control.append(0) # else: elif is_action_actor: ## ===================================== ## RL CONTROL ## ===================================== # Execute actor's actions in the environment new_obs, reward, done, info = self.env.step( action_actor) # Update IS ratio is_ratio = self.importance_sampling_ratio( 1.0, proba_expert_policy) is_ratios_target_expert.append(is_ratio) # Store transition in the replay buffer. self.replay_buffer.add(obs, action_actor, reward, new_obs, float(done)) if not was_last_action_actor: is_ratios_target_actor = deque(maxlen=MAX_LEN) is_action_actor = True # print("Actor: ", np.prod(is_ratios_target_expert)) # Per step safety check if is_ratio < _THRESH: # Switch control to the expert is_action_actor = False # Safety check for a sequence of states if (len(is_ratios_target_actor) == MAX_LEN) and np.all( [(p > _THRESH) for p in is_ratios_target_actor]): #if (len(is_ratios_target_expert) == MAX_LEN) and (np.prod(is_ratios_target_expert) <= THRESH): # Switch control to expert in the next step is_action_actor = False was_last_action_actor = True last_action_actor = action_actor last_obs = obs rew_per_step_rl.append(reward) rl_control.append(1) else: ## ======================================= ## EXPERT CONTROL ## ======================================= # Execute expert action in the environment new_obs, reward, done, info = self.env.step( action_expert) # Update IS ratio # is_ratio = self.importance_sampling_ratio(1.0, proba_actor_policy) is_ratio = self.importance_sampling_ratio( 1.0, proba_expert_policy) is_ratios_target_actor.append(is_ratio) # print("Expert ", is_ratio) # Store transition in the replay buffer. 
self.replay_buffer.add(obs, action_expert, reward, new_obs, float(done)) ## ========================================== # # NOTE: Figure out what's going wrong here # # Without the penalized reward the policy diverges (mean doesn't go towards 0 # # Also test with stochastic actions from the RL policy # # # Add penalized reward to actor's action # # r_hat: penalized reward sigma_p = 0.01 reward_hat = reward * np.exp( -np.linalg.norm(action_actor - action_expert) / sigma_p) self.replay_buffer.add(obs, action_actor, reward_hat, new_obs, float(done)) ## ========================================== if was_last_action_actor: # Train the actor when the expert's actions are executed # mb_infos_vals = self.optimize(step, writer, current_lr) penalty = -1 #-10 self.replay_buffer.add(last_obs, last_action_actor, penalty, obs, float(done)) is_ratios_target_expert = deque(maxlen=MAX_LEN) was_last_action_actor = False last_action_actor = None last_obs = None is_action_actor = False # print("Actor IS ratio: ", is_ratio) # if ep_len > 700: # print("Expert: ", np.prod(is_ratios_target_actor)) # if (len(is_ratios_target_actor) == MAX_LEN) and (np.prod(is_ratios_target_actor) > THRESH): if (len(is_ratios_target_actor) == MAX_LEN) and np.all( [(p > _THRESH) for p in is_ratios_target_actor]): # Switch control to actor in the next step is_action_actor = True rew_per_step_rl.append(0.0) rl_control.append(0) throttle_info.append(float(self.env.last_throttle)) rew_per_step.append(reward) pred_action_info.append( np.abs(action_actor[0] - action_expert[0])) # mean_info.append([mean_exp[0], mean_act.flatten()[0]]) # std_info.append([std_exp[0], std_act.flatten()[0]]) ep_len += 1 obs = new_obs if ep_len % 400 == 0: print("Mean error pred actions: {}".format( np.mean(pred_action_info))) print("Mean difference: {}".format( np.mean(mean_buffer))) print("Mean std: {}".format(np.mean(std_buffer))) # print("Mean: ", [np.mean([x[0] for x in mean_info]), np.mean([x[1] for x in mean_info])]) # print("Std: ", [np.mean([x[0] for x in std_info]), np.mean([x[1] for x in std_info])]) # print(np.prod(is_ratios_target_actor)) # Train every step ---under consideratioon if (ep_len % 400) == 0: self.env.jet.apply_throttle(0) mb_infos_vals = self.optimize(step, writer, current_lr) # if print_freq > 0 and ep_len % print_freq == 0 and ep_len > 0: # print("{} steps".format(ep_len)) # Retrieve reward and episode length if using Monitor wrapper maybe_ep_info = info.get('episode') if maybe_ep_info is not None: ep_info_buf.extend([maybe_ep_info]) if writer is not None: # Write reward per episode to tensorboard ep_reward = np.array([reward]).reshape((1, -1)) ep_done = np.array([done]).reshape((1, -1)) self.episode_reward = total_episode_reward_logger( self.episode_reward, ep_reward, ep_done, writer, step) episode_rewards[-1] += reward # Log losses and entropy, useful for monitor training if len(mb_infos_vals) > 0: infos_values = np.mean(mb_infos_vals, axis=0) if len(episode_rewards[-101:-1]) == 0: mean_reward = -np.inf else: mean_reward = round( float(np.mean(episode_rewards[-101:-1])), 1) if len(rl_control) < 1000: mean_rl_control = round( 100 * float(np.mean(rl_control)), 3) else: mean_rl_control = round( 100 * float(np.mean(rl_control[-1001:-1])), 3) num_episodes = len(episode_rewards) if self.verbose >= 1 and (ep_len % 400) == 0: logger.logkv("episodes", num_episodes) logger.logkv("mean 100 episode reward", mean_reward) logger.logkv( 'ep_rewmean', safe_mean( [ep_info['r'] for ep_info in ep_info_buf])) logger.logkv( 'eplenmean', safe_mean( 
[ep_info['l'] for ep_info in ep_info_buf])) logger.logkv("n_updates", self.n_updates) logger.logkv("current_lr", current_lr) logger.logkv("mean RL control percent", mean_rl_control) logger.logkv("mean of throttle values", mean(throttle_info)) logger.logkv("time elapsed", int(time.time() - start_time)) #logger.logkv("n_crashes", n_crashes) if len(infos_values) > 0: for (name, val) in zip(self.infos_names, infos_values): logger.logkv(name, val) logger.logkv("total timesteps", step) logger.dumpkvs() # Reset infos: infos_values = [] except KeyboardInterrupt: print("Exiting") self.env.reset() import sys sys.exit(0) # Use last batch print("Final optimization before saving") self.env.reset() mb_infos_vals = self.optimize(step, writer, current_lr) # save stats np.save(save_path + '/episode_reward', episode_rewards) np.save(save_path + '/stepwise_reward', rew_per_step) np.save(save_path + '/stepwise_reward_rl', rew_per_step_rl) print("Saving complete. Give a keyboard interrupt to end") return self
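# A minimal sketch of the two ingredients learn_jirl switches on above: (1) the penalized reward
# assigned to the actor's (unexecuted) action while the expert or joystick is in control,
# reward_hat = reward * exp(-||a_actor - a_executed|| / sigma_p), and (2) the hand-off rule that
# returns control to the RL actor only when a fixed-length window of probabilities all clear the
# threshold. Names and constants mirror the code above but this block is illustrative only.
from collections import deque

import numpy as np


def penalized_reward(reward, action_actor, action_executed, sigma_p=0.01):
    """Discount the reward by how far the actor's action was from the executed one."""
    diff = np.asarray(action_actor) - np.asarray(action_executed)
    return reward * np.exp(-np.linalg.norm(diff) / sigma_p)


def actor_may_take_over(prob_window, thresh, max_len=10):
    """Hand control to the actor only when the whole window clears the threshold."""
    return len(prob_window) == max_len and all(p > thresh for p in prob_window)


window = deque(maxlen=10)
for p in np.linspace(0.3, 0.9, 12):
    window.append(p)
print(penalized_reward(1.0, [0.3, -1.0], [0.25, -1.0]))
print(actor_may_take_over(window, thresh=0.25))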
def learn(self, total_timesteps, callback=None, log_interval=100, tb_log_name="DQN", reset_num_timesteps=True, replay_wrapper=None, distinct_replay_buffer=False): new_tb_log = self._init_num_timesteps(reset_num_timesteps) for i, m in enumerate(self.sub_models): m.learning_rate = get_schedule_fn(m.learning_rate) if len(self.replay_wrappers) != 0: m.replay_buffer = self.replay_wrappers[i](m.replay_buffer) m._setup_learn() with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: self._setup_learn() # Create the replay buffer if self.prioritized_replay: self.replay_buffer = PrioritizedReplayBuffer( self.buffer_size, alpha=self.prioritized_replay_alpha) if self.prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = total_timesteps else: prioritized_replay_beta_iters = self.prioritized_replay_beta_iters self.beta_schedule = LinearSchedule( prioritized_replay_beta_iters, initial_p=self.prioritized_replay_beta0, final_p=1.0) else: self.replay_buffer = ReplayBuffer(self.buffer_size) self.beta_schedule = None if replay_wrapper is not None: assert not self.prioritized_replay, "Prioritized replay buffer is not supported by HER" self.replay_buffer = replay_wrapper(self.replay_buffer) # Create the schedule for exploration starting from 1. self.exploration = LinearSchedule( schedule_timesteps=int(self.exploration_fraction * total_timesteps), initial_p=self.exploration_initial_eps, final_p=self.exploration_final_eps) episode_rewards = [0.0] episode_successes = [] obs = self.env.reset() reset = True macro_count = 0 macro_len = self.macro_len macro_choices = [] n_updates = 0 for step in range(total_timesteps): if callback is not None: # Only stop training if return value is False, not when it is None. This is for backwards # compatibility with callbacks that have no return statement. if callback(locals(), globals()) is False: break # Take action and update exploration to the newest value kwargs = {} if not self.param_noise: update_eps = self.exploration.value(self.num_timesteps) update_param_noise_threshold = 0. else: update_eps = 0. # Compute the threshold such that the KL divergence between perturbed and non-perturbed # policy is comparable to eps-greedy exploration with eps = exploration.value(t). # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017 # for detailed explanation. update_param_noise_threshold = \ -np.log(1. 
- self.exploration.value(self.num_timesteps) + self.exploration.value(self.num_timesteps) / float(self.env.action_space.n)) kwargs['reset'] = reset kwargs[ 'update_param_noise_threshold'] = update_param_noise_threshold kwargs['update_param_noise_scale'] = True with self.sess.as_default(): if reset or macro_count % macro_len == 0: macro_action = self.act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0] # macro_action = 1 macro_obs = obs reward_in_one_macro = 0 macro_count += 1 macro_choices.append(macro_action) # use sub_model to decide action # env_action = self.sub_models[macro_action] current_sub = self.sub_models[macro_action] if self.num_timesteps < self.learning_starts or np.random.rand( ) < current_sub.random_exploration: # actions sampled from action space are from range specific to the environment # but algorithm operates on tanh-squashed actions therefore simple scaling is used unscaled_action = self.env.action_space.sample() action = scale_action(self.env.action_space, unscaled_action) else: action = current_sub.policy_tf.step( obs[None], deterministic=False).flatten() # Add noise to the action (improve exploration, # not needed in general) if current_sub.action_noise is not None: action = np.clip(action + current_sub.action_noise(), -1, 1) # inferred actions need to be transformed to environment action_space before stepping unscaled_action = unscale_action(self.env.action_space, action) assert action.shape == self.env.action_space.shape reset = False new_obs, rew, done, info = self.env.step(unscaled_action) episode_rewards[-1] += rew # rew -= self.args.policy_cost_coef * self.args.sub_policy_costs[macro_action] reward_in_one_macro += rew - self.args.policy_cost_coef * self.args.sub_policy_costs[ macro_action] # Store transition in the replay buffer. if macro_count % macro_len == 0 or done: self.replay_buffer.add(macro_obs, macro_action, reward_in_one_macro, new_obs, float(done)) for i, m in enumerate(self.sub_models): if distinct_replay_buffer: if i == macro_action: m.replay_buffer.add(obs, action, rew, new_obs, float(done)) else: m.replay_buffer.add(obs, action, rew, new_obs, float(done)) obs = new_obs if writer is not None: ep_rew = np.array([rew]).reshape((1, -1)) ep_done = np.array([done]).reshape((1, -1)) total_episode_reward_logger(self.episode_reward, ep_rew, ep_done, writer, self.num_timesteps) # print("step: %d, done: %d" % (self.num_timesteps, done)) if done: maybe_is_success = info.get('is_success') if maybe_is_success is not None: episode_successes.append(float(maybe_is_success)) if not isinstance(self.env, VecEnv): obs = self.env.reset() episode_rewards.append(0.0) reset = True macro_action = None macro_count = 0 prev_macro_choices = macro_choices macro_choices = [] # Do not train if the warmup phase is not over # or if there are not enough samples in the replay buffer can_sample = self.replay_buffer.can_sample(self.batch_size) if can_sample and self.num_timesteps > self.learning_starts \ and self.num_timesteps % self.train_freq == 0: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. 
# pytype:disable=bad-unpacking if self.prioritized_replay: assert self.beta_schedule is not None, \ "BUG: should be LinearSchedule when self.prioritized_replay True" experience = self.replay_buffer.sample( self.batch_size, beta=self.beta_schedule.value(self.num_timesteps)) (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience else: obses_t, actions, rewards, obses_tp1, dones = self.replay_buffer.sample( self.batch_size) weights, batch_idxes = np.ones_like(rewards), None # pytype:enable=bad-unpacking if writer is not None: # run loss backprop with summary, but once every 100 steps save the metadata # (memory, compute time, ...) if (1 + self.num_timesteps) % 100 == 0: run_options = tf.RunOptions( trace_level=tf.RunOptions.FULL_TRACE) run_metadata = tf.RunMetadata() summary, td_errors = self._train_step( obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, sess=self.sess, options=run_options, run_metadata=run_metadata) writer.add_run_metadata( run_metadata, 'step%d' % self.num_timesteps) else: summary, td_errors = self._train_step( obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, sess=self.sess) writer.add_summary(summary, self.num_timesteps) else: _, td_errors = self._train_step(obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, sess=self.sess) if self.prioritized_replay: new_priorities = np.abs( td_errors) + self.prioritized_replay_eps assert isinstance(self.replay_buffer, PrioritizedReplayBuffer) self.replay_buffer.update_priorities( batch_idxes, new_priorities) if can_sample and self.num_timesteps > self.learning_starts and \ self.num_timesteps % self.target_network_update_freq == 0: # Update target network periodically. self.update_target(sess=self.sess) if step % self.sub_models[0].train_freq == 0: mb_infos_vals = [] for m in self.sub_models: # Update policy, critics and target networks for grad_step in range(m.gradient_steps): # Break if the warmup phase is not over # or if there are not enough samples in the replay buffer if not m.replay_buffer.can_sample(m.batch_size) \ or self.num_timesteps < m.learning_starts: break n_updates += 1 # Compute current learning_rate frac = 1.0 - step / total_timesteps current_lr = m.learning_rate(frac) # Update policy and critics (q functions) mb_infos_vals.append( m._train_step(step, writer, current_lr)) # Update target network if (step + grad_step) % m.target_update_interval == 0: # Update target network m.sess.run(m.target_update_op) if len(episode_rewards[-101:-1]) == 0: mean_100ep_reward = -np.inf else: mean_100ep_reward = round( float(np.mean(episode_rewards[-101:-1])), 1) num_episodes = len(episode_rewards) # print(done, log_interval, len(episode_rewards), self.num_timesteps) if self.verbose >= 1 and done and log_interval is not None and len( episode_rewards) % log_interval == 0: logger.record_tabular("steps", self.num_timesteps) prev_macro_choices = np.array(prev_macro_choices) macro_choices_ratio = [ '%.2f' % ((prev_macro_choices[prev_macro_choices == i]).size / prev_macro_choices.size) for i in range(self.n_actions) ] logger.record_tabular("macro choices", macro_choices_ratio) logger.record_tabular("episodes", num_episodes) if len(episode_successes) > 0: logger.logkv("success rate", np.mean(episode_successes[-100:])) logger.record_tabular("mean 100 episode reward", mean_100ep_reward) logger.record_tabular( "% time spent exploring", int(100 * self.exploration.value(self.num_timesteps))) logger.logkv("n_updates_of_sub", n_updates) logger.dump_tabular() print("macro choices", 
prev_macro_choices) self.num_timesteps += 1 return self
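# A minimal sketch of the macro/sub-policy split used in the hierarchical DQN learn() above: a macro
# action is (re)selected every `macro_len` low-level steps (or on reset), and the reward accumulated
# while it is active is what the high-level buffer would store. `select_macro` is a hypothetical
# stand-in for the epsilon-greedy act() call; the rewards here are random placeholders.
import numpy as np


def run_macro_loop(n_steps=12, macro_len=4, n_macro_actions=3, seed=0):
    rng = np.random.default_rng(seed)
    select_macro = lambda: int(rng.integers(n_macro_actions))  # stand-in for self.act(...)
    macro_action, macro_count, reward_in_one_macro = None, 0, 0.0
    for step in range(n_steps):
        if macro_count % macro_len == 0:
            macro_action = select_macro()       # high-level choice, held for macro_len steps
            reward_in_one_macro = 0.0
        macro_count += 1
        rew = float(rng.normal())               # stand-in for env.step(...)
        reward_in_one_macro += rew
        if macro_count % macro_len == 0:
            # this is the transition the high-level replay buffer would store
            print(f"step {step}: macro {macro_action} finished, reward {reward_in_one_macro:.2f}")


run_macro_loop()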
def learn(self, total_timesteps, callback=None, seed=None, log_interval=4, tb_log_name="TD3", reset_num_timesteps=True, replay_wrapper=None): new_tb_log = self._init_num_timesteps(reset_num_timesteps) last_replay_update = 0 if replay_wrapper is not None: self.replay_buffer = replay_wrapper(self.replay_buffer) with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: self._setup_learn(seed) # Transform to callable if needed self.learning_rate = get_schedule_fn(self.learning_rate) # Initial learning rate current_lr = self.learning_rate(1) start_time = time.time() episode_rewards = [0.0] episode_successes = [] if self.action_noise is not None: self.action_noise.reset() obs = self.env.reset() self.episode_reward = np.zeros((1, )) ep_info_buf = deque(maxlen=100) n_updates = 0 infos_values = [] self.active_sampling = False initial_step = self.num_timesteps if self.buffer_is_prioritized and \ ((replay_wrapper is not None and self.replay_buffer.replay_buffer.__name__ == "ReplayBuffer") or (replay_wrapper is None and self.replay_buffer.__name__ == "ReplayBuffer")) \ and self.num_timesteps >= self.prioritization_starts: self._set_prioritized_buffer() for step in range(initial_step, total_timesteps): if callback is not None: # Only stop training if return value is False, not when it is None. This is for backwards # compatibility with callbacks that have no return statement. if callback(locals(), globals()) is False: break # Before training starts, randomly sample actions # from a uniform distribution for better exploration. # Afterwards, use the learned policy # if random_exploration is set to 0 (normal setting) if (self.num_timesteps < self.learning_starts or np.random.rand() < self.random_exploration): # No need to rescale when sampling random action rescaled_action = action = self.env.action_space.sample() else: action = self.policy_tf.step(obs[None]).flatten() # Add noise to the action, as the policy # is deterministic, this is required for exploration if self.action_noise is not None: action = np.clip(action + self.action_noise(), -1, 1) # Rescale from [-1, 1] to the correct bounds rescaled_action = action * np.abs(self.action_space.low) assert action.shape == self.env.action_space.shape new_obs, reward, done, info = self.env.step(rescaled_action) # Store transition in the replay buffer. 
self.replay_buffer.add( obs, action, reward, new_obs, float(done if not self.time_aware else done and info["termination"] != "steps")) obs = new_obs if ((replay_wrapper is not None and self.replay_buffer.replay_buffer.__name__ == "RankPrioritizedReplayBuffer")\ or self.replay_buffer.__name__ == "RankPrioritizedReplayBuffer") and \ self.num_timesteps % self.buffer_size == 0: self.replay_buffer.rebalance() # Retrieve reward and episode length if using Monitor wrapper maybe_ep_info = info.get('episode') if maybe_ep_info is not None: ep_info_buf.extend([maybe_ep_info]) if writer is not None: # Write reward per episode to tensorboard ep_reward = np.array([reward]).reshape((1, -1)) ep_done = np.array([done]).reshape((1, -1)) self.episode_reward = total_episode_reward_logger( self.episode_reward, ep_reward, ep_done, writer, self.num_timesteps) if step % self.train_freq == 0: mb_infos_vals = [] # Update policy, critics and target networks for grad_step in range(self.gradient_steps): # Break if the warmup phase is not over # or if there are not enough samples in the replay buffer if not self.replay_buffer.can_sample(self.batch_size) \ or self.num_timesteps < self.learning_starts: break n_updates += 1 # Compute current learning_rate frac = 1.0 - self.num_timesteps / total_timesteps current_lr = self.learning_rate(frac) # Update policy and critics (q functions) # Note: the policy is updated less frequently than the Q functions # this is controlled by the `policy_delay` parameter step_writer = writer if grad_step % self.write_freq == 0 else None mb_infos_vals.append( self._train_step(step, step_writer, current_lr, (step + grad_step) % self.policy_delay == 0)) # Log losses and entropy, useful for monitor training if len(mb_infos_vals) > 0: infos_values = np.mean(mb_infos_vals, axis=0) episode_rewards[-1] += reward if done: if isinstance(self.replay_buffer, DiscrepancyReplayBuffer ) and n_updates - last_replay_update >= 5000: self.replay_buffer.update_priorities() last_replay_update = n_updates if self.action_noise is not None: self.action_noise.reset() if not isinstance(self.env, VecEnv): if self.active_sampling: sample_obs, sample_state = self.env.get_random_initial_states( 25) obs_discrepancies = self.policy_tf.get_q_discrepancy( sample_obs) obs = self.env.reset( **sample_state[np.argmax(obs_discrepancies)]) else: obs = self.env.reset() episode_rewards.append(0.0) maybe_is_success = info.get('is_success') if maybe_is_success is not None: episode_successes.append(float(maybe_is_success)) if len(episode_rewards[-101:-1]) == 0: mean_reward = -np.inf else: mean_reward = round( float(np.mean(episode_rewards[-101:-1])), 1) num_episodes = len(episode_rewards) self.num_timesteps += 1 if self.buffer_is_prioritized and \ ((replay_wrapper is not None and self.replay_buffer.replay_buffer.__name__ == "ReplayBuffer") or (replay_wrapper is None and self.replay_buffer.__name__ == "ReplayBuffer"))\ and self.num_timesteps >= self.prioritization_starts: self._set_prioritized_buffer() # Display training infos if self.verbose >= 1 and done and log_interval is not None and len( episode_rewards) % log_interval == 0: fps = int(step / (time.time() - start_time)) logger.logkv("episodes", num_episodes) logger.logkv("mean 100 episode reward", mean_reward) if len(ep_info_buf) > 0 and len(ep_info_buf[0]) > 0: logger.logkv( 'ep_rewmean', safe_mean( [ep_info['r'] for ep_info in ep_info_buf])) logger.logkv( 'eplenmean', safe_mean( [ep_info['l'] for ep_info in ep_info_buf])) logger.logkv("n_updates", n_updates) 
logger.logkv("current_lr", current_lr) logger.logkv("fps", fps) logger.logkv('time_elapsed', int(time.time() - start_time)) if len(episode_successes) > 0: logger.logkv("success rate", np.mean(episode_successes[-100:])) if len(infos_values) > 0: for (name, val) in zip(self.infos_names, infos_values): logger.logkv(name, val) logger.logkv("total timesteps", self.num_timesteps) logger.dumpkvs() # Reset infos: infos_values = [] return self
def learn(self, total_timesteps, callback=None, seed=None, log_interval=1, tb_log_name="SAC", print_freq=100): with TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name) as writer: self._setup_learn(seed) # Transform to callable if needed self.learning_rate = get_schedule_fn(self.learning_rate) start_time = time.time() episode_rewards = [0.0] is_teleop_env = hasattr(self.env, "wait_for_teleop_reset") # TeleopEnv if is_teleop_env: print("Waiting for teleop") obs = self.env.wait_for_teleop_reset() else: obs = self.env.reset() self.episode_reward = np.zeros((1, )) ep_info_buf = deque(maxlen=100) ep_len = 0 self.n_updates = 0 infos_values = [] mb_infos_vals = [] for step in range(total_timesteps): # Compute current learning_rate frac = 1.0 - step / total_timesteps current_lr = self.learning_rate(frac) if callback is not None: # Only stop training if return value is False, not when it is None. This is for backwards # compatibility with callbacks that have no return statement. if callback(locals(), globals()) is False: break # Before training starts, randomly sample actions # from a uniform distribution for better exploration. # Afterwards, use the learned policy. if step < self.learning_starts: action = self.env.action_space.sample() # No need to rescale when sampling random action rescaled_action = action else: action = self.policy_tf.step( obs[None], deterministic=False).flatten() # Rescale from [-1, 1] to the correct bounds rescaled_action = action * np.abs(self.action_space.low) assert action.shape == self.env.action_space.shape new_obs, reward, done, info = self.env.step(rescaled_action) ep_len += 1 if print_freq > 0 and ep_len % print_freq == 0 and ep_len > 0: print("{} steps".format(ep_len)) # Store transition in the replay buffer. self.replay_buffer.add(obs, action, reward, new_obs, float(done)) obs = new_obs # Retrieve reward and episode length if using Monitor wrapper maybe_ep_info = info.get('episode') if maybe_ep_info is not None: ep_info_buf.extend([maybe_ep_info]) if writer is not None: # Write reward per episode to tensorboard ep_reward = np.array([reward]).reshape((1, -1)) ep_done = np.array([done]).reshape((1, -1)) self.episode_reward = total_episode_reward_logger( self.episode_reward, ep_reward, ep_done, writer, step) if ep_len > self.train_freq: print("Additional training") self.env.reset() mb_infos_vals = self.optimize(step, writer, current_lr) done = True episode_rewards[-1] += reward if done: if not (isinstance(self.env, VecEnv) or is_teleop_env): obs = self.env.reset() print("Episode finished. 
Reward: {:.2f} {} Steps".format( episode_rewards[-1], ep_len)) episode_rewards.append(0.0) ep_len = 0 mb_infos_vals = self.optimize(step, writer, current_lr) # Refresh obs when using TeleopEnv if is_teleop_env: print("Waiting for teleop") obs = self.env.wait_for_teleop_reset() # Log losses and entropy, useful for monitor training if len(mb_infos_vals) > 0: infos_values = np.mean(mb_infos_vals, axis=0) if len(episode_rewards[-101:-1]) == 0: mean_reward = -np.inf else: mean_reward = round( float(np.mean(episode_rewards[-101:-1])), 1) num_episodes = len(episode_rewards) if self.verbose >= 1 and done and log_interval is not None and len( episode_rewards) % log_interval == 0: fps = int(step / (time.time() - start_time)) logger.logkv("episodes", num_episodes) logger.logkv("mean 100 episode reward", mean_reward) logger.logkv( 'ep_rewmean', safe_mean([ep_info['r'] for ep_info in ep_info_buf])) logger.logkv( 'eplenmean', safe_mean([ep_info['l'] for ep_info in ep_info_buf])) logger.logkv("n_updates", self.n_updates) logger.logkv("current_lr", current_lr) logger.logkv("fps", fps) logger.logkv('time_elapsed', "{:.2f}".format(time.time() - start_time)) if len(infos_values) > 0: for (name, val) in zip(self.infos_names, infos_values): logger.logkv(name, val) logger.logkv("total timesteps", step) logger.dumpkvs() # Reset infos: infos_values = [] if is_teleop_env: self.env.is_training = False # Use last batch print("Final optimization before saving") self.env.reset() mb_infos_vals = self.optimize(step, writer, current_lr) return self
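# A minimal sketch of the action rescaling used above: the policy emits tanh-squashed actions in
# [-1, 1], and multiplying by |action_space.low| maps them back to the environment range. This
# implicitly assumes symmetric bounds (low == -high); asymmetric bounds would need an affine map.
import numpy as np

low = np.array([-1.0, -0.5])        # assumed symmetric action bounds
action = np.array([0.8, -0.2])      # policy output in [-1, 1]
rescaled_action = action * np.abs(low)
print(rescaled_action)              # -> [ 0.8 -0.1]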
def learn(self, total_timesteps, callback=None, seed=None, log_interval=1, tb_log_name="PPO2", reset_num_timesteps=True): # Transform to callable if needed self.learning_rate = get_schedule_fn(self.learning_rate) self.cliprange = get_schedule_fn(self.cliprange) cliprange_vf = get_schedule_fn(self.cliprange_vf) new_tb_log = self._init_num_timesteps(reset_num_timesteps) with TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) as writer: # self._setup_learn(seed) runner = Runner(env=self.env, model=self, n_steps=self.n_steps, gamma=self.gamma, lam=self.lam) self.episode_reward = np.zeros((self.n_envs, )) self.total_episode_reward = np.zeros((1, )) ep_info_buf = deque(maxlen=100) t_first_start = time.time() n_timesteps = 0 # nupdates = total_timesteps // self.n_batch for timestep in range(1, total_timesteps + 1): assert self.n_batch % self.nminibatches == 0 batch_size = self.n_batch // self.nminibatches t_start = time.time() frac = 1.0 - timestep / total_timesteps lr_now = self.learning_rate(frac) cliprangenow = self.cliprange(frac) cliprange_vf_now = cliprange_vf(frac) # true_reward is the reward without discount obs, returns, masks, actions, values, neglogpacs, states, ep_infos, true_reward = runner.run( ) n_timesteps += len(obs) ep_info_buf.extend(ep_infos) mb_loss_vals = [] if states is None: # nonrecurrent version inds = np.arange(self.n_batch) for epoch_num in range(self.noptepochs): np.random.shuffle(inds) for start in range(0, self.n_batch, batch_size): end = start + batch_size mbinds = inds[start:end] slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mb_loss_vals.append( self._train_step( lr_now, cliprangenow, *slices, writer=writer, update=n_timesteps, cliprange_vf=cliprange_vf_now)) else: # recurrent version assert self.n_envs % self.nminibatches == 0 env_indices = np.arange(self.n_envs) flat_indices = np.arange(self.n_envs * self.n_steps).reshape( self.n_envs, self.n_steps) envs_per_batch = batch_size // self.n_steps for epoch_num in range(self.noptepochs): np.random.shuffle(env_indices) for start in range(0, self.n_envs, envs_per_batch): # timestep = ((update * self.noptepochs * self.n_envs + epoch_num * self.n_envs + start) // # envs_per_batch) end = start + envs_per_batch mb_env_inds = env_indices[start:end] mb_flat_inds = flat_indices[mb_env_inds].ravel() slices = (arr[mb_flat_inds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mb_states = states[mb_env_inds] mb_loss_vals.append( self._train_step(lr_now, cliprangenow, *slices, update=n_timesteps, writer=writer, states=mb_states)) loss_vals = np.mean(mb_loss_vals, axis=0) t_now = time.time() fps = int(self.n_batch / (t_now - t_start)) # if writer is not None: # self.episode_reward = total_episode_reward_logger(self.episode_reward, # true_reward.reshape((self.n_envs, self.n_steps)), # masks.reshape((self.n_envs, self.n_steps)), # writer, n_timesteps) if self.verbose >= 1 and (timestep % log_interval == 0 or timestep == 1): explained_var = explained_variance(values, returns) logger.logkv("total_timesteps", n_timesteps) logger.logkv("fps", fps) logger.logkv("explained_variance", float(explained_var)) if len(ep_info_buf) > 0 and len(ep_info_buf[0]) > 0: logger.logkv( 'ep_reward_mean', safe_mean( [ep_info['r'] for ep_info in ep_info_buf])) logger.logkv( 'ep_len_mean', safe_mean( [ep_info['l'] for ep_info in ep_info_buf])) logger.logkv('time_elapsed', t_start - t_first_start) for (loss_val, loss_name) in zip(loss_vals, self.loss_names): 
logger.logkv(loss_name, loss_val) logger.dumpkvs() self.total_episode_reward = runner.total_episode_reward if callback is not None: # Only stop training if return value is False, not when it is None. This is for backwards # compatibility with callbacks that have no return statement. if callback(locals(), globals()) is False: break if n_timesteps > total_timesteps: break return self
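# A minimal sketch of the PPO minibatch schedule used in the nonrecurrent branch above: one rollout
# of n_batch samples is reused for noptepochs epochs, and each epoch shuffles the sample indices and
# slices them into nminibatches minibatches of size n_batch // nminibatches. Toy sizes for clarity.
import numpy as np

n_batch, nminibatches, noptepochs = 8, 2, 2
batch_size = n_batch // nminibatches
inds = np.arange(n_batch)
for epoch_num in range(noptepochs):
    np.random.shuffle(inds)
    for start in range(0, n_batch, batch_size):
        mbinds = inds[start:start + batch_size]
        print(f"epoch {epoch_num}: minibatch indices {mbinds}")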
def learn(self, total_timesteps, env_eval, callback=None, seed=None, path=None, dis_path=None, score_path=None, dis_eval_interval=100, log_interval=4, tb_log_name="SAC", reset_num_timesteps=True, replay_wrapper=None): self.eval_env = env_eval new_tb_log = self._init_num_timesteps(reset_num_timesteps) if replay_wrapper is not None: self.replay_buffer = replay_wrapper(self.replay_buffer) with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: self._setup_learn(seed) # Transform to callable if needed self.learning_rate = get_schedule_fn(self.learning_rate) # Initial learning rate current_lr = self.learning_rate(1) start_time = time.time() episode_rewards = [0.0] episode_successes = [] if self.action_noise is not None: self.action_noise.reset() obs = self.env.reset() self.episode_reward = np.zeros((1,)) ep_info_buf = deque(maxlen=100) n_updates = 0 infos_values = [] dis_eval_array = [] # (total_step % eval_intervel) x 2 x n_batch self.ep_length = 0 for step in range(total_timesteps): if callback is not None: # Only stop training if return value is False, not when it is None. This is for backwards # compatibility with callbacks that have no return statement. if callback(locals(), globals()) is False: break # Before training starts, randomly sample actions # from a uniform distribution for better exploration. # Afterwards, use the learned policy # if random_exploration is set to 0 (normal setting) if (self.num_timesteps < self.learning_starts or np.random.rand() < self.random_exploration): # No need to rescale when sampling random action rescaled_action = action = self.env.action_space.sample() noise = np.zeros(self.noise_dim) else: noise = self.policy_tf.gen_noise(obs[None]).flatten() action = self.policy_tf.step(obs[None],noise[None] ,deterministic=False).flatten() # Add noise to the action (improve exploration, # not needed in general) if self.action_noise is not None: action = np.clip(action + self.action_noise(), -1, 1) # Rescale from [-1, 1] to the correct bounds rescaled_action = action * np.abs(self.action_space.low) assert action.shape == self.env.action_space.shape new_obs, reward, done, info = self.env.step(rescaled_action) self.ep_length += 1 # Store transition in the replay buffer. 
self.replay_buffer.add(obs, action, reward, new_obs, float(done), noise) episode_rewards[-1] += reward reset_flag = done or self.ep_length >= self.max_ep_length if reset_flag: if self.action_noise is not None: self.action_noise.reset() obs = self.env.reset() episode_rewards.append(0.0) self.ep_length = 0 maybe_is_success = info.get('is_success') if maybe_is_success is not None: episode_successes.append(float(maybe_is_success)) else: obs = new_obs # Retrieve reward and episode length if using Monitor wrapper maybe_ep_info = info.get('episode') if maybe_ep_info is not None: ep_info_buf.extend([maybe_ep_info]) if writer is not None: # Write reward per episode to tensorboard ep_reward = np.array([reward]).reshape((1, -1)) ep_done = np.array([done]).reshape((1, -1)) self.episode_reward = total_episode_reward_logger(self.episode_reward, ep_reward, ep_done, writer, self.num_timesteps) if step % self.train_freq == 0: mb_infos_vals = [] # Update policy, critics and target networks for grad_step in range(self.gradient_steps): # Break if the warmup phase is not over # or if there are not enough samples in the replay buffer if not self.replay_buffer.can_sample(self.batch_size) \ or self.num_timesteps < self.learning_starts: break n_updates += 1 # Compute current learning_rate frac = 1.0 - step / total_timesteps current_lr = self.learning_rate(frac) # Update policy and critics (q functions) mb_infos_vals.append(self._train_step(step, writer, current_lr, dis_eval_array, dis_eval_interval, dis_path)) # Update target network if (step + grad_step) % self.target_update_interval == 0: # Update target network self.sess.run(self.target_update_op) # Log losses and entropy, useful for monitor training if len(mb_infos_vals) > 0: infos_values = np.mean(mb_infos_vals, axis=0) if self.num_timesteps % 2000 == 0: eval_ob = self.eval_env.reset() eval_epi_rewards = 0 eval_epis = 0 eval_performance = [] eval_ep_step = 0 while True: eval_noise = self.policy_tf.gen_noise(eval_ob[None]).flatten() eval_action = self.policy_tf.step(eval_ob[None], eval_noise[None], deterministic=True).flatten() eval_rescaled_action = eval_action * np.abs(self.action_space.low) eval_new_obs, eval_reward, eval_done, eval_info = self.eval_env.step(eval_rescaled_action) eval_epi_rewards += eval_reward eval_ob = eval_new_obs eval_ep_step += 1 if eval_done or eval_ep_step >= self.max_ep_length: eval_ob = self.eval_env.reset() eval_performance.append(eval_epi_rewards) eval_epi_rewards = 0 eval_epis += 1 eval_ep_step = 0 if eval_epis > 5: break with open(score_path, 'a') as f2: f2.write("%i %f\n" % (self.num_timesteps, np.mean(eval_performance))) if len(episode_rewards[-101:-1]) == 0: mean_reward = -np.inf else: mean_reward = round(float(np.mean(episode_rewards[-101:-1])), 1) num_episodes = len(episode_rewards) self.num_timesteps += 1 # Display training infos if self.verbose >= 1 and reset_flag and log_interval is not None and len(episode_rewards) % log_interval == 0: fps = int(step / (time.time() - start_time)) logger.logkv("episodes", num_episodes) logger.logkv("mean 100 episode reward", mean_reward) with open(path,'a') as f1: f1.write("%f " % step) f1.write("%f " % mean_reward) if len(ep_info_buf) > 0 and len(ep_info_buf[0]) > 0: logger.logkv('ep_rewmean', safe_mean([ep_info['r'] for ep_info in ep_info_buf])) logger.logkv('eplenmean', safe_mean([ep_info['l'] for ep_info in ep_info_buf])) with open(path,'a') as f1: f1.write("%f " % safe_mean([ep_info['r'] for ep_info in ep_info_buf])) f1.write("%f " % safe_mean([ep_info['l'] for ep_info in 
ep_info_buf])) logger.logkv("n_updates", n_updates) with open(path,'a') as f1: f1.write("%f " % n_updates) logger.logkv("current_lr", current_lr) logger.logkv("fps", fps) logger.logkv('time_elapsed', int(time.time() - start_time)) if len(episode_successes) > 0: logger.logkv("success rate", np.mean(episode_successes[-100:])) if len(infos_values) > 0: for (name, val) in zip(self.infos_names, infos_values): logger.logkv(name, val) logger.logkv("total timesteps", self.num_timesteps) logger.dumpkvs() # Reset infos: infos_values = [] return self
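# A generic sketch of the periodic evaluation block above: every 2000 timesteps the deterministic
# policy is rolled in a separate eval env for a handful of episodes and the mean return is appended
# to score_path. `policy_fn` and `env` below are hypothetical stand-ins following the old gym API
# (reset() -> obs, step() -> (obs, reward, done, info)) used throughout this file.
import numpy as np


def evaluate(policy_fn, env, n_episodes=5, max_ep_length=1000):
    """Return the mean undiscounted episode return of a deterministic policy."""
    returns = []
    for _ in range(n_episodes):
        obs, ep_return = env.reset(), 0.0
        for _ in range(max_ep_length):
            obs, reward, done, _ = env.step(policy_fn(obs))
            ep_return += reward
            if done:
                break
        returns.append(ep_return)
    return float(np.mean(returns))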
def learn(self, total_timesteps, callback=None, log_interval=1, tb_log_name="SAC", print_freq=100, save_path=None): with TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name) as writer: self._setup_learn() # Transform to callable if needed self.learning_rate = get_schedule_fn(self.learning_rate) start_time = time.time() episode_rewards = [0.0] is_teleop_env = hasattr(self.env, "wait_for_teleop_reset") # TeleopEnv if is_teleop_env: print("Waiting for teleop") obs = self.env.wait_for_teleop_reset() else: obs = self.env.reset() self.episode_reward = np.zeros((1, )) ep_info_buf = deque(maxlen=100) ep_len = 0 self.n_updates = 0 infos_values = [] mb_infos_vals = [] model_path = "--Path to model--/myNewModdel.h5" # model_path= None if model_path is not None: cfg = dk.load_config( config_path='--Path to config file inside mycar/config.py') kl = KerasLinear() kl.load(model_path) # vae = self.env.get_vae() self.training_started = False self.start_training = False for step in range(total_timesteps): # Compute current learning_rate frac = 1.0 - step / total_timesteps current_lr = self.learning_rate(frac) if callback is not None: # Only stop training if return value is False, not when it is None. This is for backwards # compatibility with callbacks that have no return statement. if callback(locals(), globals()) is False: break # Before training starts, randomly sample actions # from a uniform distribution for better exploration. # Afterwards, use the learned policy. if step < self.learning_starts and not self.training_started: if model_path is not None: try: img_arr = self.env.get_images() # print(img_arr[0].shape) img_arr = np.asarray(img_arr[0]) img_arr = normalize_and_crop(img_arr, cfg) croppedImgH = img_arr.shape[0] croppedImgW = img_arr.shape[1] if img_arr.shape[2] == 3 and cfg.IMAGE_DEPTH == 1: img_arr = dk.utils.rgb2gray(img_arr).reshape( croppedImgH, croppedImgW, 1) steering, throttle = kl.run(img_arr) action = [steering, throttle / 6.0] action = np.asarray(action) # rescaled_action = action * np.abs(self.action_space.low) rescaled_action = action print('Predicted action :', action) except Exception as e: print(e) action = self.env.action_space.sample() rescaled_action = action else: action = self.env.action_space.sample() rescaled_action = action print(action) # No need to rescale when sampling random action elif not self.training_started: self.start_training = True obs = self.env.reset() else: action = self.policy_tf.step( obs[None], deterministic=False).flatten() # Rescale from [-1, 1] to the correct bounds rescaled_action = action * np.abs(self.action_space.low) assert action.shape == self.env.action_space.shape new_obs, reward, done, info = self.env.step(rescaled_action) ep_len += 1 if print_freq > 0 and ep_len % print_freq == 0 and ep_len > 0: print("{} steps".format(ep_len)) # Store transition in the replay buffer. 
self.replay_buffer.add(obs, action, reward, new_obs, float(done)) obs = new_obs # Retrieve reward and episode length if using Monitor wrapper maybe_ep_info = info.get('episode') if maybe_ep_info is not None: ep_info_buf.extend([maybe_ep_info]) if writer is not None: # Write reward per episode to tensorboard ep_reward = np.array([reward]).reshape((1, -1)) ep_done = np.array([done]).reshape((1, -1)) self.episode_reward = total_episode_reward_logger( self.episode_reward, ep_reward, ep_done, writer, step) if ep_len > self.train_freq: print("Additional training") self.env.reset() mb_infos_vals = self.optimize(step, writer, current_lr) done = True episode_rewards[-1] += reward if done or self.start_training: self.start_training = False if not (isinstance(self.env, VecEnv) or is_teleop_env): obs = self.env.reset() print("Episode finished. Reward: {:.2f} {} Steps".format( episode_rewards[-1], ep_len)) episode_rewards.append(0.0) ep_len = 0 mb_infos_vals = self.optimize(step, writer, current_lr) # Refresh obs when using TeleopEnv if is_teleop_env: print("Waiting for teleop") obs = self.env.wait_for_teleop_reset() # Log losses and entropy, useful for monitor training if len(mb_infos_vals) > 0: infos_values = np.mean(mb_infos_vals, axis=0) if len(episode_rewards[-101:-1]) == 0: mean_reward = -np.inf else: mean_reward = round( float(np.mean(episode_rewards[-101:-1])), 1) num_episodes = len(episode_rewards) if self.verbose >= 1 and done and log_interval is not None and len( episode_rewards) % log_interval == 0: fps = int(step / (time.time() - start_time)) logger.logkv("episodes", num_episodes) logger.logkv("mean 100 episode reward", mean_reward) logger.logkv( 'ep_rewmean', safe_mean([ep_info['r'] for ep_info in ep_info_buf])) logger.logkv( 'eplenmean', safe_mean([ep_info['l'] for ep_info in ep_info_buf])) logger.logkv("n_updates", self.n_updates) logger.logkv("current_lr", current_lr) logger.logkv("fps", fps) logger.logkv('time_elapsed', "{:.2f}".format(time.time() - start_time)) if len(infos_values) > 0: for (name, val) in zip(self.infos_names, infos_values): logger.logkv(name, val) logger.logkv("total timesteps", step) logger.dumpkvs() # Reset infos: infos_values = [] if is_teleop_env: self.env.is_training = False # Use last batch print("Final optimization before saving") self.env.reset() mb_infos_vals = self.optimize(step, writer, current_lr) plt.figure(1) plt.plot(episode_rewards) plt.title('Episode Rewards') plt.ylabel("Reward") plt.xlabel("Epoch") filename = "training" + str(random.random()) + ".png" plt.savefig(filename) plt.show() return self
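# A minimal sketch of the warm-up action selection above: before `learning_starts`, actions come
# from a pretrained supervised controller (a Keras steering model in the code above) when one is
# available, otherwise from uniform sampling; afterwards the SAC policy takes over.
# `pretrained_predict`, `sample_action` and `policy_step` are hypothetical callables standing in
# for the real objects; the fallback-on-exception mirrors the try/except in the code above.
def choose_action(step, learning_starts, obs, pretrained_predict, sample_action, policy_step):
    if step < learning_starts:
        if pretrained_predict is not None:
            try:
                return pretrained_predict(obs)   # imitation-style warm start
            except Exception:
                return sample_action()           # fall back to random exploration
        return sample_action()
    return policy_step(obs)                      # learned policy after warm-up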
def learn(self, total_timesteps, callback=None, log_interval=4, tb_log_name="SAC", reset_num_timesteps=True, replay_wrapper=None): new_tb_log = self._init_num_timesteps(reset_num_timesteps) if replay_wrapper is not None: self.replay_buffer = replay_wrapper(self.replay_buffer) with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: self._setup_learn() # Transform to callable if needed self.learning_rate = get_schedule_fn(self.learning_rate) # Initial learning rate current_lr = self.learning_rate(1) start_time = time.time() episode_rewards = [0.0] episode_successes = [] if self.action_noise is not None: self.action_noise.reset() obs = self.env.reset() self.episode_reward = np.zeros((1, )) ep_info_buf = deque(maxlen=100) n_updates = 0 infos_values = [] for step in range(total_timesteps): if callback is not None: # Only stop training if return value is False, not when it is None. This is for backwards # compatibility with callbacks that have no return statement. if callback(locals(), globals()) is False: break # Before training starts, randomly sample actions # from a uniform distribution for better exploration. # Afterwards, use the learned policy # if random_exploration is set to 0 (normal setting) if self.num_timesteps < self.learning_starts or np.random.rand( ) < self.random_exploration: # actions sampled from action space are from range specific to the environment # but algorithm operates on tanh-squashed actions therefore simple scaling is used unscaled_action = self.env.action_space.sample() action = scale_action(self.action_space, unscaled_action) else: action = self.policy_tf.step( obs[None], deterministic=False).flatten() # Add noise to the action (improve exploration, # not needed in general) if self.action_noise is not None: action = np.clip(action + self.action_noise(), -1, 1) # inferred actions need to be transformed to environment action_space before stepping unscaled_action = unscale_action(self.action_space, action) assert action.shape == self.env.action_space.shape new_obs, reward, done, info = self.env.step(unscaled_action) # Store transition in the replay buffer. 
self.replay_buffer.add(obs, action, reward, new_obs, float(done)) obs = new_obs # Retrieve reward and episode length if using Monitor wrapper maybe_ep_info = info.get('episode') if maybe_ep_info is not None: ep_info_buf.extend([maybe_ep_info]) if writer is not None: # Write reward per episode to tensorboard ep_reward = np.array([reward]).reshape((1, -1)) ep_done = np.array([done]).reshape((1, -1)) self.episode_reward = total_episode_reward_logger( self.episode_reward, ep_reward, ep_done, writer, self.num_timesteps) if step % self.train_freq == 0: mb_infos_vals = [] # Update policy, critics and target networks for grad_step in range(self.gradient_steps): # Break if the warmup phase is not over # or if there are not enough samples in the replay buffer if not self.replay_buffer.can_sample(self.batch_size) \ or self.num_timesteps < self.learning_starts: break n_updates += 1 # Compute current learning_rate frac = 1.0 - step / total_timesteps current_lr = self.learning_rate(frac) # Update policy and critics (q functions) mb_infos_vals.append( self._train_step(step, writer, current_lr)) # Update target network if (step + grad_step) % self.target_update_interval == 0: # Update target network self.sess.run(self.target_update_op) # Log losses and entropy, useful for monitor training if len(mb_infos_vals) > 0: infos_values = np.mean(mb_infos_vals, axis=0) episode_rewards[-1] += reward if done: if self.action_noise is not None: self.action_noise.reset() if not isinstance(self.env, VecEnv): obs = self.env.reset() episode_rewards.append(0.0) maybe_is_success = info.get('is_success') if maybe_is_success is not None: episode_successes.append(float(maybe_is_success)) if len(episode_rewards[-101:-1]) == 0: mean_reward = -np.inf else: mean_reward = round( float(np.mean(episode_rewards[-101:-1])), 1) num_episodes = len(episode_rewards) self.num_timesteps += 1 # Display training infos if self.verbose >= 1 and done and log_interval is not None and len( episode_rewards) % log_interval == 0: fps = int(step / (time.time() - start_time)) logger.logkv("episodes", num_episodes) logger.logkv("mean 100 episode reward", mean_reward) logger.logkv("episode reward", episode_rewards[-2]) if len(ep_info_buf) > 0 and len(ep_info_buf[0]) > 0: logger.logkv( 'ep_rewmean', safe_mean( [ep_info['r'] for ep_info in ep_info_buf])) logger.logkv( 'eplenmean', safe_mean( [ep_info['l'] for ep_info in ep_info_buf])) logger.logkv("n_updates", n_updates) logger.logkv("current_lr", current_lr) logger.logkv("fps", fps) logger.logkv('time_elapsed', int(time.time() - start_time)) if len(episode_successes) > 0: logger.logkv("success rate", np.mean(episode_successes[-100:])) if len(infos_values) > 0: for (name, val) in zip(self.infos_names, infos_values): logger.logkv(name, val) logger.logkv("total timesteps", self.num_timesteps) logger.dumpkvs() # Reset infos: infos_values = [] return self
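# A minimal sketch of the [-1, 1] <-> environment-bounds mapping that the scale_action /
# unscale_action helpers used above presumably perform for a Box action space: the standard
# affine map between the tanh range and [low, high]. Written out here as an assumption for
# illustration, not as the library's exact implementation.
import numpy as np


def scale_action(low, high, action):
    """Environment action -> tanh range [-1, 1]."""
    return 2.0 * (action - low) / (high - low) - 1.0


def unscale_action(low, high, scaled_action):
    """Tanh range [-1, 1] -> environment action."""
    return low + 0.5 * (scaled_action + 1.0) * (high - low)


low, high = np.array([-2.0]), np.array([2.0])
a = np.array([1.0])
assert np.allclose(unscale_action(low, high, scale_action(low, high, a)), a)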
def learn(self, total_timesteps, callback=None, seed=None, log_interval=4, tb_log_name="SAC", reset_num_timesteps=True, replay_wrapper=None): new_tb_log = self._init_num_timesteps(reset_num_timesteps) if replay_wrapper is not None: self.replay_buffer = replay_wrapper(self.replay_buffer) if self.priority_buffer: self.replay_buffer.set_model(self) with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: self._setup_learn(seed) self.env_id = self.env.env.get_attr('spec')[0].id # Transform to callable if needed self.learning_rate = get_schedule_fn(self.learning_rate) # Initial learning rate current_lr = self.learning_rate(1) start_time = time.time() store_time = 0.0 step_time = 0.0 train_time = 0.0 episode_rewards = [[0.0] for _ in range(self.env.env.num_envs)] episode_successes = [[] for _ in range(self.env.env.num_envs)] if self.action_noise is not None: self.action_noise.reset() assert isinstance(self.env.env, VecEnv) self.episode_reward = np.zeros((1, )) ep_info_buf = deque(maxlen=100) n_updates = 0 infos_values = [] pp_sr_buf = deque(maxlen=5) stack_sr_buf = deque(maxlen=5) start_decay = total_timesteps if self.sequential and 'FetchStack' in self.env_id: current_max_nobject = 2 self.env.env.env_method('set_task_array', [[(2, 0), (2, 1), (1, 0)]] * self.env.env.num_envs) print('Set task_array to ', self.env.env.get_attr('task_array')[0]) self.env.env.env_method('set_random_ratio', [0.7] * self.env.env.num_envs) obs = self.env.reset() print(obs.shape) for step in range(total_timesteps): if callback is not None: # Only stop training if return value is False, not when it is None. This is for backwards # compatibility with callbacks that have no return statement. if callback(locals(), globals()) is False: break if self.curriculum and step % 3000 == 0: if 'FetchStack' in self.env.env.get_attr('spec')[0].id: # Stacking pp_sr = eval_model( self.eval_env, self, current_max_nobject if self.sequential else self.env.env.get_attr('n_object')[0], 1.0, init_on_table=(self.env.env.get_attr('spec')[0].id == 'FetchStack-v2')) pp_sr_buf.append(pp_sr) stack_sr = eval_model( self.eval_env, self, current_max_nobject if self.sequential else self.env.env.get_attr('n_object')[0], 0.0, init_on_table=(self.env.env.get_attr('spec')[0].id == 'FetchStack-v2')) stack_sr_buf.append(stack_sr) print('Pick-and-place success rate', np.mean(pp_sr_buf)) if self.sequential: if self.env.env.get_attr('random_ratio')[ 0] > 0.5 and np.mean(pp_sr_buf) > 0.8: _ratio = 0.3 elif self.env.env.get_attr('random_ratio')[0] < 0.5 \ and current_max_nobject < self.env.env.get_attr('n_object')[0] \ and np.mean(stack_sr_buf) > 1 / current_max_nobject: _ratio = 0.7 current_max_nobject += 1 previous_task_array = self.env.env.get_attr( 'task_array')[0] self.env.env.env_method( 'set_task_array', [ previous_task_array + [(current_max_nobject, j) for j in range(current_max_nobject)] ] * self.env.env.num_envs) print('Set task_array to', self.env.env.get_attr('task_array')[0]) else: _ratio = self.env.env.get_attr( 'random_ratio')[0] else: if start_decay == total_timesteps and np.mean( pp_sr_buf) > 0.8: start_decay = step _ratio = np.clip(0.7 - (step - start_decay) / 2e6, 0.3, 0.7) # from 0.7 to 0.3 elif 'FetchPushWallObstacle' in self.env_id: _ratio = max(1.0 - step / total_timesteps, 0.0) else: raise NotImplementedError self.env.env.env_method('set_random_ratio', [_ratio] * self.env.env.num_envs) print('Set random_ratio to', self.env.env.get_attr('random_ratio')[0]) # Before training 
starts, randomly sample actions # from a uniform distribution for better exploration. # Afterwards, use the learned policy # if random_exploration is set to 0 (normal setting) if (self.num_timesteps < self.learning_starts or np.random.rand() < self.random_exploration): rescaled_action = np.stack([ self.env.action_space.sample() for _ in range(self.env.env.num_envs) ], axis=0) action = rescaled_action else: action = self.policy_tf.step(obs, deterministic=False) # Add noise to the action (improve exploration, # not needed in general) if self.action_noise is not None: action = np.clip(action + self.action_noise(), -1, 1) # Rescale from [-1, 1] to the correct bounds rescaled_action = action * np.abs(self.action_space.low) assert action.shape == ( self.env.env.num_envs, ) + self.env.action_space.shape step_time0 = time.time() new_obs, reward, done, info = self.env.step(rescaled_action) step_time += time.time() - step_time0 next_obs = new_obs.copy() for idx, _done in enumerate(done): if _done: next_obs[idx] = self.env.convert_dict_to_obs( info[idx]['terminal_observation']) # Store transition in the replay buffer. store_time0 = time.time() self.replay_buffer.add(obs, action, reward, next_obs, done) store_time += time.time() - store_time0 obs = new_obs for idx, _done in enumerate(done): episode_rewards[idx][-1] += reward[idx] if _done: episode_rewards[idx].append(0.0) maybe_is_success = info[idx].get('is_success') if maybe_is_success is not None: episode_successes[idx].append( float(maybe_is_success)) # Retrieve reward and episode length if using Monitor wrapper for _info in info: maybe_ep_info = _info.get('episode') if maybe_ep_info is not None: ep_info_buf.extend([maybe_ep_info]) if writer is not None: # Write reward per episode to tensorboard ep_reward = np.reshape(reward, (self.env.env.num_envs, -1)) ep_done = np.reshape(done, (self.env.env.num_envs, -1)) self.episode_reward = total_episode_reward_logger( self.episode_reward, ep_reward, ep_done, writer, self.num_timesteps) train_time0 = time.time() if step % self.train_freq == 0: mb_infos_vals = [] # Update policy, critics and target networks for grad_step in range(self.gradient_steps): # Break if the warmup phase is not over # or if there are not enough samples in the replay buffer if not self.replay_buffer.can_sample(self.batch_size) \ or self.num_timesteps < self.learning_starts: break n_updates += 1 # Compute current learning_rate frac = 1.0 - step / total_timesteps current_lr = self.learning_rate(frac) # Update policy and critics (q functions) mb_infos_vals.append( self._train_step(step, writer, current_lr)) # Update target network if (step + grad_step) % self.target_update_interval == 0: # Update target network self.sess.run(self.target_update_op) # Log losses and entropy, useful for monitor training if len(mb_infos_vals) > 0: infos_values = np.mean(mb_infos_vals, axis=0) train_time += time.time() - train_time0 if len(episode_rewards[0][-101:-1]) == 0: mean_reward = -np.inf else: mean_reward = round( float( np.mean( np.concatenate([ episode_rewards[i][-101:-1] for i in range(self.env.env.num_envs) ]))), 1) num_episodes = sum([ len(episode_rewards[i]) for i in range(len(episode_rewards)) ]) self.num_timesteps += self.env.env.num_envs # Display training infos if self.verbose >= 1 and done[ 0] and log_interval is not None and len( episode_rewards[0]) % (log_interval // self.env.env.num_envs) == 0: fps = int(self.num_timesteps / (time.time() - start_time)) logger.logkv("episodes", num_episodes) logger.logkv("mean 100 episode reward", 
mean_reward) if len(ep_info_buf) > 0 and len(ep_info_buf[0]) > 0: logger.logkv( 'ep_rewmean', safe_mean( [ep_info['r'] for ep_info in ep_info_buf])) logger.logkv( 'eplenmean', safe_mean( [ep_info['l'] for ep_info in ep_info_buf])) logger.logkv("n_updates", n_updates) logger.logkv("current_lr", current_lr) logger.logkv("fps", fps) logger.logkv('time_elapsed', int(time.time() - start_time)) if len(episode_successes[0]) > 0: logger.logkv( "success rate", np.mean( np.concatenate([ episode_successes[i][-100:] for i in range(self.env.env.num_envs) ]))) if len(infos_values) > 0: for (name, val) in zip(self.infos_names, infos_values): logger.logkv(name, val) logger.logkv("total timesteps", self.num_timesteps) if hasattr(self.eval_env.unwrapped, 'random_ratio'): logger.logkv("random_ratio", self.env.env.get_attr('random_ratio')[0]) logger.dumpkvs() # Reset infos: infos_values = [] return self
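# A minimal sketch of the success-rate-driven curriculum above: a rolling buffer of evaluation
# success rates decides when the "easy task" ratio starts decaying; once the mean pick-and-place
# success rate exceeds 0.8, random_ratio follows np.clip(0.7 - (step - start_decay) / 2e6, 0.3, 0.7),
# as in the non-sequential branch above. The step/success values below are made up for illustration.
from collections import deque

import numpy as np


def random_ratio(step, start_decay):
    return float(np.clip(0.7 - (step - start_decay) / 2e6, 0.3, 0.7))


pp_sr_buf = deque(maxlen=5)
start_decay = None
for step, sr in [(0, 0.6), (3000, 0.85), (6000, 0.9), (9000, 0.95), (1_009_000, 0.95)]:
    pp_sr_buf.append(sr)
    if start_decay is None and np.mean(pp_sr_buf) > 0.8:
        start_decay = step
    ratio = 0.7 if start_decay is None else random_ratio(step, start_decay)
    print(step, round(ratio, 3))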