def fit(self, paths, targvals):
    X = np.concatenate([self._preproc(p) for p in paths])
    y = np.concatenate(targvals)
    logger.record_tabular(
        "EVBefore", explained_variance(self._predict(X), y)
    )
    for _ in range(25):
        self.do_update(X, y)
    logger.record_tabular("EVAfter", explained_variance(self._predict(X), y))
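# For reference, a minimal sketch of the explained_variance helper used
# above; it mirrors baselines.common.explained_variance and returns
# 1 - Var[y - ypred] / Var[y]: 1 for a perfect value fit, 0 when the
# prediction is no better than a constant.
def explained_variance(ypred, y):
    assert y.ndim == 1 and ypred.ndim == 1
    vary = np.var(y)
    return np.nan if vary == 0 else 1 - np.var(y - ypred) / vary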
def call(self, on_policy):
    env_runner, model, buffer, steps = self.env_runner, self.model, \
        self.buffer, self.steps
    if on_policy:
        enc_obs, obs, actions, rewards, mus, dones, masks = env_runner.run()
        self.episode_stats.feed(rewards, dones)
        if buffer is not None:
            buffer.put(enc_obs, actions, rewards, mus, dones, masks)
    else:
        # get obs, actions, rewards, mus, dones from buffer.
        obs, actions, rewards, mus, dones, masks = buffer.get()

    # reshape stuff correctly
    obs = obs.reshape(env_runner.batch_ob_shape)
    actions = actions.reshape([env_runner.nbatch])
    rewards = rewards.reshape([env_runner.nbatch])
    mus = mus.reshape([env_runner.nbatch, env_runner.nact])
    dones = dones.reshape([env_runner.nbatch])
    masks = masks.reshape([env_runner.batch_ob_shape[0]])

    names_ops, values_ops = model.predict(
        obs, actions, rewards, dones, mus, model.initial_state, masks, steps
    )

    if on_policy and (int(steps / env_runner.nbatch) % self.log_interval == 0):
        logger.record_tabular("total_timesteps", steps)
        logger.record_tabular("fps", int(steps / (time.time() - self.tstart)))
        # IMP: In EpisodicLife env, during training, we get done=True at
        # each loss of life, not just at the terminal state. Thus, this is
        # mean until end of life, not end of episode. For true episode
        # rewards, see the monitor files in the log folder.
        logger.record_tabular(
            "mean_episode_length", self.episode_stats.mean_length()
        )
        logger.record_tabular(
            "mean_episode_reward", self.episode_stats.mean_reward()
        )
        for name, val in zip(names_ops, values_ops):
            logger.record_tabular(name, float(val))
        logger.dump_tabular()
def find_best_alpha_val(kargs):
    if len(kargs['alpha']) == 1:
        return {'alpha': kargs['alpha'][0]}
    args = kargs.copy()
    pool = mp.Pool(mp.cpu_count())
    results = []
    for alpha in kargs['alpha']:
        args['alpha'] = alpha
        res = [
            res['val_acc']
            for res in pool.map(run_nn_peer_val, make_arg_list(args))
        ]
        res = np.mean(res, axis=0)[-1]
        if 'verbose' in args.keys() and args['verbose']:
            logger.record_tabular(f'[PEER] alpha = {alpha}', res)
        results.append(res)
    pool.close()
    pool.join()
    logger.dump_tabular()
    best_alpha = kargs['alpha'][np.argmax(results)]
    return {'alpha': best_alpha}
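# Hypothetical sketch of the make_arg_list helper assumed above (it is not
# shown in this snippet): the surrounding code suggests it expands one
# config dict into several per-seed copies so pool.map repeats the same
# hyperparameter setting across random seeds before averaging.
def make_arg_list(args, n_seeds=8):
    return [{**args, 'seed': seed} for seed in range(n_seeds)]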
def KL_summary(expert_samples, agent_emp_states, env_steps: int,
               policy_type: str, show_ent=False):
    start = time.time()
    fkl = collect.forward_kl_knn_based(expert_samples.copy(),
                                       agent_emp_states.copy())
    rkl = collect.reverse_kl_knn_based(expert_samples.copy(),
                                       agent_emp_states.copy())
    print("*****************************************")
    print(
        f'env_steps: {env_steps:d}: {policy_type} fkl: {fkl:.3f} '
        f'rkl: {rkl:.3f} time: {time.time() - start:.0f}s'
    )
    print("*****************************************")
    logger.record_tabular(f"{policy_type} Forward KL", round(fkl, 4))
    logger.record_tabular(f"{policy_type} Reverse KL", round(rkl, 4))
    if show_ent:
        ent = collect.entropy(agent_emp_states)
        print(f'ent: {ent:.3f}')
        logger.record_tabular(f"{policy_type} Entropy", round(ent, 4))
        return {'fkl': fkl, 'rkl': rkl, 'ent': ent}
    else:
        return {'fkl': fkl, 'rkl': rkl}
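# A minimal sketch of the kNN-based KL estimator assumed behind
# collect.forward_kl_knn_based (the exact conventions in `collect` are
# assumptions). It follows the Wang et al. (2009) estimator:
#   D(P||Q) ~= (d/n) * sum_i log(nu_k(i) / rho_k(i)) + log(m / (n - 1)),
# where rho_k(i) is the k-NN distance of sample i within the P samples and
# nu_k(i) is its k-NN distance to the Q samples.
import numpy as np
from scipy.spatial import cKDTree

def knn_kl_estimate(p_samples, q_samples, k=1):
    n, d = p_samples.shape
    m = q_samples.shape[0]
    nu = cKDTree(q_samples).query(p_samples, k=k)[0]
    if k > 1:
        nu = nu[:, -1]
    # query k + 1 neighbors within P so the zero-distance self-match is skipped
    rho = cKDTree(p_samples).query(p_samples, k=k + 1)[0][:, -1]
    # duplicate points would make rho zero; assumes samples are distinct
    return d * np.mean(np.log(nu / rho)) + np.log(m / (n - 1))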
def try_evaluate(itr: int, policy_type: str, sac_info):
    assert policy_type in ["Running"]
    update_time = itr * v['reward']['gradient_step']
    env_steps = itr * v['sac']['epochs'] * v['env']['T']
    agent_emp_states = samples[0].copy()
    assert agent_emp_states.shape[0] == v['irl']['training_trajs']

    metrics = eval.KL_summary(
        expert_samples,
        agent_emp_states.reshape(-1, agent_emp_states.shape[2]),
        env_steps, policy_type)

    # eval real reward
    real_return_det = eval.evaluate_real_return(sac_agent.get_action,
                                                env_fn(),
                                                v['irl']['eval_episodes'],
                                                v['env']['T'], True)
    metrics['Real Det Return'] = real_return_det
    print(f"real det return avg: {real_return_det:.2f}")
    logger.record_tabular("Real Det Return", round(real_return_det, 2))

    real_return_sto = eval.evaluate_real_return(sac_agent.get_action,
                                                env_fn(),
                                                v['irl']['eval_episodes'],
                                                v['env']['T'], False)
    metrics['Real Sto Return'] = real_return_sto
    print(f"real sto return avg: {real_return_sto:.2f}")
    logger.record_tabular("Real Sto Return", round(real_return_sto, 2))

    if v['obj'] in ["emd"]:
        eval_len = int(0.1 * len(critic_loss["main"]))
        emd = -np.array(critic_loss["main"][-eval_len:]).mean()
        metrics['emd'] = emd
        logger.record_tabular(f"{policy_type} EMD", emd)

    # plot_disc(v['obj'], log_folder, env_steps,
    #           sac_info, critic_loss if v['obj'] in ["emd"] else disc_loss,
    #           metrics)
    if "PointMaze" in env_name:
        visual_disc(agent_emp_states, reward_func.get_scalar_reward,
                    disc.log_density_ratio, v['obj'], log_folder, env_steps,
                    gym_env.range_lim, sac_info, disc_loss, metrics)

    logger.record_tabular(f"{policy_type} Update Time", update_time)
    logger.record_tabular(f"{policy_type} Env Steps", env_steps)
    return real_return_det, real_return_sto
def try_evaluate(itr: int, policy_type: str, sac_info, old_reward=None):
    assert policy_type in ["Running"]
    update_time = itr * v['reward']['gradient_step']
    env_steps = itr * v['sac']['epochs'] * v['env']['T']
    agent_emp_states = samples[0].copy()

    metrics = eval.KL_summary(
        expert_samples,
        agent_emp_states.reshape(-1, agent_emp_states.shape[2]),
        env_steps, policy_type, task_name == 'uniform')

    if v['obj'] in ["emd"]:
        eval_len = int(0.1 * len(critic_loss["main"]))
        emd = -np.array(critic_loss["main"][-eval_len:]).mean()
        metrics['emd'] = emd
        logger.record_tabular(f"{policy_type} EMD", emd)
        plot_disc(agent_emp_states, reward_func.get_scalar_reward,
                  critic.value, v['obj'], log_folder, env_steps, range_lim,
                  sac_info, critic_loss, metrics)
    elif v['density']['model'] == "disc":
        plot_disc(agent_emp_states, reward_func.get_scalar_reward,
                  disc.log_density_ratio, v['obj'], log_folder, env_steps,
                  range_lim, sac_info, disc_loss, metrics)
    elif env_name == 'ReacherDraw-v0':
        plot_submission(agent_emp_states, reward_func.get_scalar_reward,
                        v['obj'], log_folder, env_steps, range_lim, metrics,
                        rho_expert)
    else:  # kde
        plot(agent_emp_states, reward_func.get_scalar_reward,
             agent_density.score_samples,
             lambda x: np.log(rho_expert(x)) - agent_density.score_samples(x),
             v['obj'], log_folder, env_steps, range_lim, sac_info, metrics,
             reward_losses, old_reward=old_reward.get_scalar_reward)

    logger.record_tabular(f"{policy_type} Update Time", update_time)
    logger.record_tabular(f"{policy_type} Env Steps", env_steps)
def try_evaluate(itr: int, policy_type: str):
    assert policy_type in ["Running"]
    update_time = itr * v['bc']['eval_freq']

    # eval real reward
    real_return_det = eval.evaluate_real_return(sac_agent.get_action,
                                                env_fn(),
                                                v['bc']['eval_episodes'],
                                                v['env']['T'], True)
    print(f"real det return avg: {real_return_det:.2f}")
    logger.record_tabular("Real Det Return", round(real_return_det, 2))

    real_return_sto = eval.evaluate_real_return(sac_agent.get_action,
                                                env_fn(),
                                                v['bc']['eval_episodes'],
                                                v['env']['T'], False)
    print(f"real sto return avg: {real_return_sto:.2f}")
    logger.record_tabular("Real Sto Return", round(real_return_sto, 2))

    logger.record_tabular(f"{policy_type} Update Time", update_time)
    return real_return_det, real_return_sto
def _evaluate(self, epoch):
    """Perform evaluation for the current policy.

    :param epoch: The epoch number.
    :return: None
    """
    if self._eval_n_episodes < 1:
        return

    with self._policy.deterministic(self._eval_deterministic):
        paths = rollouts(
            self._eval_env,
            self._policy,
            self.sampler._max_path_length,
            self._eval_n_episodes,
        )

    total_returns = [path['rewards'].sum() for path in paths]
    episode_lengths = [len(p['rewards']) for p in paths]

    logger.record_tabular('return-average', np.mean(total_returns))
    logger.record_tabular('return-min', np.min(total_returns))
    logger.record_tabular('return-max', np.max(total_returns))
    logger.record_tabular('return-std', np.std(total_returns))
    logger.record_tabular('episode-length-avg', np.mean(episode_lengths))
    logger.record_tabular('episode-length-min', np.min(episode_lengths))
    logger.record_tabular('episode-length-max', np.max(episode_lengths))
    logger.record_tabular('episode-length-std', np.std(episode_lengths))

    self._eval_env.log_diagnostics(paths)
    if self._eval_render:
        self._eval_env.render(paths)

    iteration = epoch * self._epoch_length
    batch = self.sampler.random_batch()
    self.log_diagnostics(iteration, batch)
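# A minimal sketch (assumption) of the rollouts helper called above: it runs
# the policy for n_paths episodes of at most path_length steps and returns a
# list of path dicts, each carrying at least a 'rewards' array, which is all
# _evaluate reads from the paths. The (action, info) return convention of
# policy.get_action is an assumption borrowed from rllab-style policies.
import numpy as np

def rollouts(env, policy, path_length, n_paths):
    paths = []
    for _ in range(n_paths):
        obs = env.reset()
        rewards = []
        for _ in range(path_length):
            action, _ = policy.get_action(obs)
            obs, reward, done, _ = env.step(action)
            rewards.append(reward)
            if done:
                break
        paths.append({'rewards': np.array(rewards)})
    return paths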
def step(self, action):
    if sum(np.isnan(action)) > 0:
        raise ValueError("Passed in nan to step! Action: " + str(action))
    action = np.clip(action, -2, 2)

    # [-2, 2] --> [-0.8, 2]
    linear_vel = -0.8 + (2 - (-0.8)) * (action[0] - (-2)) / (2 - (-2))
    # For ppo, do nothing to ang_vel:
    # angular_vel = action[1]
    # For ddpg, [-2, 2] --> [-0.8, 0.8]
    angular_vel = -0.8 + (0.8 - (-0.8)) * (action[1] - (-2)) / (2 - (-2))
    # For trpo, clip to [-1, 1], then [-1, 1] --> [-0.5, 0.5]:
    # angular_vel = np.clip(action[1], -1, 1)
    # angular_vel = -0.5 + (0.5 - (-0.5)) * (angular_vel - (-1)) / (1 - (-1))

    vel_cmd = Twist()
    vel_cmd.linear.x = linear_vel
    vel_cmd.angular.z = angular_vel
    # print("vel_cmd", vel_cmd)
    # print("angvel:", angular_vel)
    self.vel_pub.publish(vel_cmd)

    # Unpause simulation only for obtaining observation
    rospy.wait_for_service('/gazebo/unpause_physics')
    try:
        self.unpause()
    except rospy.ServiceException as e:
        print("/gazebo/unpause_physics service call failed")

    contact_data = None
    laser_data = None
    while contact_data is None or laser_data is None:
        contact_data = rospy.wait_for_message('/gazebo_ros_bumper',
                                              ContactsState, timeout=50)
        laser_data = rospy.wait_for_message('/scan', LaserScan, timeout=50)

    # Pause the simulation to do other operations
    rospy.wait_for_service('/gazebo/pause_physics')
    try:
        self.pause()
    except rospy.ServiceException as e:
        print("/gazebo/pause_physics service call failed")

    dynamic_data = None
    rospy.wait_for_service("/gazebo/get_model_state")
    while dynamic_data is None:
        dynamic_data = self.get_model_state(model_name="mobile_base")

    obsrv = self.get_obsrv(laser_data, dynamic_data)
    # --- special solution for nan/inf observation (especially in case of
    # any invalid sensor readings) ---
    if any(np.isnan(np.array(obsrv))) or any(np.isinf(np.array(obsrv))):
        logger.record_tabular("found nan or inf in observation:", obsrv)
        obsrv = self.pre_obsrv
        done = True
        self.step_counter = 0
    self.pre_obsrv = obsrv

    assert self.reward_type is not None
    reward = 0
    if self.reward_type == 'hand_craft':
        # reward = 1
        reward += 0
    else:
        raise ValueError("reward type is invalid!")

    done = False
    suc = False
    self.step_counter += 1
    event_flag = None  # {'collision', 'safe', 'goal', 'steps exceeding', 'fast rotation'}

    # 1. when a collision happens, done = True
    # if self._in_obst(laser_data):
    #     reward += self.collision_reward
    #     done = True
    #     self.step_counter = 0
    #     event_flag = 'collision'
    # temporary change for ddpg only. For PPO, use the block above.
    if self._in_obst(contact_data):
        print("collision")
        reward += self.collision_reward
        done = True
        self.step_counter = 0
        event_flag = 'collision'

    # 2. In the neighborhood of the goal state, done is True as well.
    # Only considering velocity and pos.
    if self._in_goal(np.array(obsrv[:3])):
        print("goal")
        reward += self.goal_reward
        done = True
        suc = True
        self.step_counter = 0
        event_flag = 'goal'

    if self.step_counter >= 300:
        print("steps exceed")
        reward += self.collision_reward
        done = True
        self.step_counter = 0
        event_flag = 'steps exceeding'

    # cur_w = dynamic_data.twist.angular.z
    # print("cur_w:", cur_w)
    # if cur_w > np.pi:
    #     print("rotate fast")
    #     print("cur_w:", cur_w)
    #     input()
    #     done = True
    #     reward += self.collision_reward / 2
    #     self.step_counter = 0
    #     event_flag = 'fast rotation'

    if event_flag is None:
        event_flag = 'safe'

    return np.asarray(obsrv), reward, done, {'suc': suc, 'event': event_flag}
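# The inline arithmetic above is an affine rescaling between intervals. A
# small helper (a sketch; the env inlines the expression instead of calling
# this) makes the mapping explicit:
def rescale(x, src_lo, src_hi, dst_lo, dst_hi):
    """Map x linearly from [src_lo, src_hi] to [dst_lo, dst_hi]."""
    return dst_lo + (dst_hi - dst_lo) * (x - src_lo) / (src_hi - src_lo)

# e.g. linear_vel = rescale(action[0], -2, 2, -0.8, 2)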
                             device, expert_trajs=expert_trajs_train)
        reward_losses.append(loss.item())
        print(f"{v['obj']} loss: {loss}")

        reward_optimizer.zero_grad()
        loss.backward()
        reward_optimizer.step()

        # evaluating the learned reward
        real_return_det, real_return_sto = try_evaluate(itr, "Running",
                                                        sac_info)
        if real_return_det > max_real_return_det and \
                real_return_sto > max_real_return_sto:
            max_real_return_det, max_real_return_sto = \
                real_return_det, real_return_sto
            torch.save(
                reward_func.state_dict(),
                os.path.join(
                    logger.get_dir(),
                    f"model/reward_model_itr{itr}_det{max_real_return_det:.0f}"
                    f"_sto{max_real_return_sto:.0f}.pkl"))

        logger.record_tabular("Iteration", itr)
        logger.record_tabular("Reward Loss", loss.item())
        if v['sac']['automatic_alpha_tuning']:
            logger.record_tabular("alpha", sac_agent.alpha.item())
        # if v['irl']['save_interval'] > 0 and \
        #         (itr % v['irl']['save_interval'] == 0
        #          or itr == v['irl']['n_itrs'] - 1):
        #     torch.save(reward_func.state_dict(),
        #                os.path.join(logger.get_dir(),
        #                             f"model/reward_model_{itr}.pkl"))
        logger.dump_tabular()
# ------- logger initialize and configuration -------
logger.configure(dir=args['RUN_DIR'])
# ---------------------------------------------------

# Initialize environment and reward type
env = gym.make(args['gym_env'],
               reward_type=args['reward_type'],
               set_additional_goal=args['set_additional_goal'])

# Set random seeds in hopes of reproducibility
env.seed(args['seed'])
np.random.seed(args['seed'])
tf.set_random_seed(args['seed'])

logger.record_tabular("algo", args['algo'])
logger.record_tabular("env", args['gym_env'])
logger.record_tabular("env.set_additional_goal", env.set_additional_goal)
logger.record_tabular("env.reward_type", env.reward_type)
logger.dump_tabular()

if args['algo'] == "ppo":
    # Make necessary directories
    maybe_mkdir(args['RUN_DIR'])
    maybe_mkdir(args['MODEL_DIR'])
    maybe_mkdir(args['FIGURE_DIR'])
    maybe_mkdir(args['RESULT_DIR'])
    ppo_params_json = os.environ['PROJ_HOME_3'] + '/ppo1/ppo_params.json'
def fit(
        env,
        q_func,
        lr=5e-4,
        max_timesteps=100000,
        buffer_size=50000,
        exploration_fraction=0.1,
        exploration_final_eps=0.02,
        train_freq=1,
        batch_size=32,
        print_freq=100,
        checkpoint_freq=10000,
        checkpoint_path=None,
        learning_starts=1000,
        gamma=1.0,
        target_network_update_freq=500,
        prioritized_replay=False,
        prioritized_replay_alpha=0.6,
        prioritized_replay_beta0=0.4,
        prioritized_replay_beta_iters=None,
        prioritized_replay_eps=1e-6,
        param_noise=False,
        callback=None
):
    """Train a deepq model.

    Parameters
    ----------
    env: gym.Env
        environment to train on
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values
        of every action.
    lr: float
        learning rate for adam optimizer
    max_timesteps: int
        number of env steps to optimize for
    buffer_size: int
        size of the replay buffer
    exploration_fraction: float
        fraction of entire training period over which the exploration rate
        is annealed
    exploration_final_eps: float
        final value of random action probability
    train_freq: int
        update the model every `train_freq` steps.
    batch_size: int
        size of a batch sampled from replay buffer for training
    print_freq: int
        how often to print out training progress.
        set to None to disable printing
    checkpoint_freq: int
        how often to save the model. This is so that the best version is
        restored at the end of the training. If you do not wish to restore
        the best version at the end of the training set this variable to
        None.
    learning_starts: int
        how many steps of the model to collect transitions for before
        learning starts
    gamma: float
        discount factor
    target_network_update_freq: int
        update the target network every `target_network_update_freq` steps.
    prioritized_replay: bool
        if True prioritized replay buffer will be used.
    prioritized_replay_alpha: float
        alpha parameter for prioritized replay buffer
    prioritized_replay_beta0: float
        initial value of beta for prioritized replay buffer
    prioritized_replay_beta_iters: int
        number of iterations over which beta will be annealed from initial
        value to 1.0. If set to None equals to max_timesteps.
    prioritized_replay_eps: float
        epsilon to add to the TD errors when updating priorities.
    callback: (locals, globals) -> None
        function called at every step with state of the algorithm.
        If callback returns true training stops.

    Returns
    -------
    act: ActWrapper
        Wrapper over act function. Adds ability to save it and load it.
        See header of baselines/deepq/categorical.py for details on the
        act function.
""" # Create all the functions necessary to train the model model = DeepDQN() sess = model.init_session().__enter__() # capture the shape outside the closure so that the env object is # not serialized by cloudpickle when serializing make_obs_ph def make_obs_ph(name): return ObservationInput(env.observation_space, name=name) act, train, update_target, debug = model.build_train( make_obs_ph, q_func, env.action_space.n, tf.train.AdamOptimizer(learning_rate=lr), 10, gamma, param_noise ) act_params = { 'make_obs_ph': make_obs_ph, 'q_func': q_func, 'num_actions': env.action_space.n, } act = ActWrapper(act, act_params) # Create the replay buffer if prioritized_replay: replay_buffer = PrioritizedReplayBuffer( buffer_size, alpha=prioritized_replay_alpha) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = max_timesteps beta_schedule = LinearSchedule( prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) else: replay_buffer = ReplayBuffer(buffer_size) beta_schedule = None # Create the schedule for exploration starting from 1. exploration = LinearSchedule( schedule_timesteps=int(exploration_fraction * max_timesteps), initial_p=1.0, final_p=exploration_final_eps) # Initialize the parameters and copy them to the target network. model.init_vars() update_target() episode_rewards = [0.0] saved_mean_reward = None obs = env.reset() reset = True with tempfile.TemporaryDirectory() as td: td = checkpoint_path or td model_file = os.path.join(td, "model") model_saved = False if tf.train.latest_checkpoint(td) is not None: model.load_state(model_file) logger.log('Loaded model from {}'.format(model_file)) model_saved = True for t in range(max_timesteps): if callback is not None: if callback(locals(), globals()): break # Take action and update exploration to the newest value kwargs = {} if not param_noise: update_eps = exploration.value(t) update_param_noise_threshold = 0. else: update_eps = 0. # Compute the threshold such that the KL divergence # between perturbed and non-perturbed # policy is comparable to eps-greedy exploration with # eps = exploration.value(t). See Appendix C.1 in # Parameter Space Noise for Exploration, Plappert et # al., 2017 # for detailed explanation. update_param_noise_threshold = -np.log(1. - exploration.value( t) + exploration.value(t) / float(env.action_space.n)) kwargs['reset'] = reset kwargs['update_param_noise_threshold'] = \ update_param_noise_threshold kwargs['update_param_noise_scale'] = True action = act( np.array(obs)[None], update_eps=update_eps, **kwargs )[0] env_action = action reset = False new_obs, rew, done, _ = env.step(env_action) # Store transition in the replay buffer. replay_buffer.add(obs, action, rew, new_obs, float(done)) obs = new_obs episode_rewards[-1] += rew if done: obs = env.reset() episode_rewards.append(0.0) reset = True if t > learning_starts and t % train_freq == 0: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. 
                if prioritized_replay:
                    experience = replay_buffer.sample(
                        batch_size, beta=beta_schedule.value(t)
                    )
                    (obses_t, actions, rewards, obses_tp1, dones, weights,
                     batch_idxes) = experience
                else:
                    obses_t, actions, rewards, obses_tp1, dones = \
                        replay_buffer.sample(batch_size)
                    weights, batch_idxes = np.ones_like(rewards), None
                td_errors = train(
                    obses_t, actions, rewards, obses_tp1, dones, weights
                )
                if prioritized_replay:
                    new_priorities = np.abs(td_errors) + prioritized_replay_eps
                    replay_buffer.update_priorities(
                        batch_idxes, new_priorities
                    )

            if t > learning_starts and t % target_network_update_freq == 0:
                # Update target network periodically.
                update_target()

            mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
            num_episodes = len(episode_rewards)
            if done and print_freq is not None and \
                    len(episode_rewards) % print_freq == 0:
                logger.record_tabular("steps", t)
                logger.record_tabular("episodes", num_episodes)
                logger.record_tabular("mean 100 episode reward",
                                      mean_100ep_reward)
                logger.record_tabular("% time spent exploring",
                                      int(100 * exploration.value(t)))
                logger.dump_tabular()

            if (checkpoint_freq is not None and t > learning_starts and
                    num_episodes > 100 and t % checkpoint_freq == 0):
                if saved_mean_reward is None or \
                        mean_100ep_reward > saved_mean_reward:
                    if print_freq is not None:
                        logger.log(
                            "Saving model due to mean reward increase: "
                            "{} -> {}".format(saved_mean_reward,
                                              mean_100ep_reward)
                        )
                    model.save_state(model_file)
                    model_saved = True
                    saved_mean_reward = mean_100ep_reward

        if model_saved:
            if print_freq is not None:
                logger.log("Restored model with mean reward: {}".format(
                    saved_mean_reward)
                )
            model.load_state(model_file)

    return act
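# Hypothetical usage sketch for fit above. The q_func builder name is an
# assumption; any callable matching the documented
# (observation_in, num_actions, scope, reuse) signature works:
#
#     import gym
#     env = gym.make("CartPole-v0")
#     act = fit(env, q_func=build_mlp_q_func, lr=1e-3,
#               max_timesteps=100000, print_freq=10)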
def learn(
        env,
        policy_fn,
        timesteps_per_actorbatch,  # timesteps per actor per update
        clip_param,
        entcoeff,  # clipping parameter epsilon, entropy coeff
        optim_epochs,
        optim_stepsize,
        optim_batchsize,  # optimization hypers
        gamma,
        lam,  # advantage estimation
        max_timesteps=0,
        max_episodes=0,
        max_iters=0,
        max_seconds=0,  # time constraint
        callback=None,  # you can do anything in the callback, since it takes locals(), globals()
        adam_epsilon=1e-5,
        schedule='constant'  # annealing for stepsize parameters (epsilon and adam)
):
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_fn("pi", ob_space, ac_space)  # Construct network for new policy
    oldpi = policy_fn("oldpi", ob_space, ac_space)  # Network for old policy
    atarg = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    lrmult = tf.placeholder(
        name='lrmult', dtype=tf.float32,
        shape=[])  # learning rate multiplier, updated with schedule
    clip_param = clip_param * lrmult  # Annealed clipping parameter epsilon

    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = tf.reduce_mean(kloldnew)
    meanent = tf.reduce_mean(ent)
    pol_entpen = (-entcoeff) * meanent

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # pnew / pold
    surr1 = ratio * atarg  # surrogate from conservative policy iteration
    surr2 = tf.clip_by_value(ratio, 1.0 - clip_param,
                             1.0 + clip_param) * atarg
    pol_surr = -tf.reduce_mean(tf.minimum(
        surr1, surr2))  # PPO's pessimistic surrogate (L^CLIP)
    vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret))
    total_loss = pol_surr + pol_entpen + vf_loss
    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

    var_list = pi.get_trainable_variables()
    lossandgrad = U.function([ob, ac, atarg, ret, lrmult],
                             losses + [U.flatgrad(total_loss, var_list)])
    adam = MpiAdam(var_list, epsilon=adam_epsilon)

    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv, newv) in zipsame(oldpi.get_variables(),
                                        pi.get_variables())
        ])
    compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses)

    U.initialize()
    adam.sync()

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi, env, timesteps_per_actorbatch,
                                     stochastic=True)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100)  # rolling buffer for episode rewards

    assert sum(
        [max_iters > 0, max_timesteps > 0, max_episodes > 0,
         max_seconds > 0]) == 1, "Only one time constraint permitted"

    while True:
        if callback:
            callback(locals(), globals())
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        elif max_seconds and time.time() - tstart >= max_seconds:
            break

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0)
        else:
            raise NotImplementedError

        logger.log("********** Iteration %i ************" % iters_so_far)

        seg = seg_gen.__next__()
        add_vtarg_and_adv(seg, gamma, lam)

        # ob, ac, atarg, ret, td1ret = map(np.concatenate,
        #                                  (obs, acs, atargs, rets, td1rets))
seg["ac"], seg["adv"], seg[ "tdlamret"] vpredbefore = seg["vpred"] # predicted value function before udpate atarg = (atarg - atarg.mean() ) / atarg.std() # standardized advantage function estimate d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret), shuffle=not pi.recurrent) optim_batchsize = optim_batchsize or ob.shape[0] if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy assign_old_eq_new() # set old parameter values to new parameter values logger.log("Optimizing...") logger.log(fmt_row(13, loss_names)) # Here we do a bunch of optimization epochs over the data for _ in range(optim_epochs): losses = [ ] # list of tuples, each of which gives the loss for a minibatch for batch in d.iterate_once(optim_batchsize): *newlosses, g = lossandgrad(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult) adam.update(g, optim_stepsize * cur_lrmult) losses.append(newlosses) logger.log(fmt_row(13, np.mean(losses, axis=0))) logger.log("Evaluating losses...") losses = [] for batch in d.iterate_once(optim_batchsize): newlosses = compute_losses(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult) losses.append(newlosses) meanlosses, _, _ = mpi_moments(losses, axis=0) logger.log(fmt_row(13, meanlosses)) for (lossval, name) in zipsame(meanlosses, loss_names): logger.record_tabular("loss_" + name, lossval) logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret)) lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples lens, rews = map(flatten_lists, zip(*listoflrpairs)) lenbuffer.extend(lens) rewbuffer.extend(rews) logger.record_tabular("EpLenMean", np.mean(lenbuffer)) logger.record_tabular("EpRewMean", np.mean(rewbuffer)) logger.record_tabular("EpThisIter", len(lens)) episodes_so_far += len(lens) timesteps_so_far += sum(lens) iters_so_far += 1 logger.record_tabular("EpisodesSoFar", episodes_so_far) logger.record_tabular("TimestepsSoFar", timesteps_so_far) logger.record_tabular("TimeElapsed", time.time() - tstart) if MPI.COMM_WORLD.Get_rank() == 0: logger.dump_tabular()
def train(env_name, num_episodes, gamma, lam, kl_targ, batch_size, eval_freq,
          hid1_mult, init_policy_logvar, seed):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        batch_size: number of episodes per policy training batch
        eval_freq: number of training batches before test
        hid1_mult: hid1 size for policy and value_f (multiplier of obs dimension)
        init_policy_logvar: natural log of initial policy variance
        seed: random seed for all modules with randomness
    """
    # set seeds
    set_global_seed(seed)
    # configure log
    configure_log_info(env_name, seed)

    # create env
    env = gym.make(env_name)
    env.seed(seed)  # set env seed
    obs_dim = env.observation_space.shape[0]
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    act_dim = env.action_space.shape[0]

    # create scaler
    scaler = Scaler(obs_dim)

    # create policy
    policy = Policy(obs_dim, act_dim, kl_targ, hid1_mult,
                    init_policy_logvar).to(device)

    # create value_function
    value_function = ValueFunction(obs_dim, hid1_mult).to(device)

    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, policy, scaler, episodes=5)

    # train & test models
    num_iteration = num_episodes // eval_freq
    current_episodes = 0
    current_steps = 0
    for iter in range(num_iteration):
        # train models
        for i in range(eval_freq):
            # rollout
            trajectories, steps = run_policy(env, policy, scaler,
                                             episodes=batch_size)
            # process data
            current_episodes += len(trajectories)
            current_steps += steps
            add_value(trajectories, value_function)  # add estimated values to episodes
            add_disc_sum_rew(trajectories, gamma)  # calculate discounted sum of Rs
            add_gae(trajectories, gamma, lam)  # calculate advantage
            # concatenate all episodes into single NumPy arrays
            observes, actions, advantages, disc_sum_rew = \
                build_train_set(trajectories)
            train_returns = [np.sum(t["rewards"]) for t in trajectories]
            logger.info('[train] average return:{0}, std return: {1}'.format(
                np.mean(train_returns), np.std(train_returns)))
            # add various stats to training log:
            # log_batch_stats(observes, actions, advantages, disc_sum_rew)
            # update policy
            policy.update(observes, actions, advantages)
            # update value function
            value_function.update(observes, disc_sum_rew)

        # test models
        num_test_episodes = 10
        trajectories, _ = run_policy(env, policy, scaler,
                                     episodes=num_test_episodes)
        avg_return = np.mean([np.sum(t["rewards"]) for t in trajectories])
        std_return = np.std([np.sum(t["rewards"]) for t in trajectories])
        logger.record_tabular('iteration', iter)
        logger.record_tabular('episodes', current_episodes)
        logger.record_tabular('steps', current_steps)
        logger.record_tabular('avg_return', avg_return)
        logger.record_tabular('std_return', std_return)
        logger.dump_tabular()
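# A minimal sketch of the discounted-return helper assumed behind
# add_disc_sum_rew: each trajectory gains a 'disc_sum_rew' key holding the
# reward-to-go sum_{k >= t} gamma^(k - t) * r_k. The lfilter call computes
# the backward recursion in one vectorized pass.
import numpy as np
import scipy.signal

def discount(x, gamma):
    """Discounted cumulative sums over a 1-D reward array."""
    return scipy.signal.lfilter([1.0], [1.0, -gamma], x[::-1])[::-1]

def add_disc_sum_rew(trajectories, gamma):
    for trajectory in trajectories:
        trajectory['disc_sum_rew'] = discount(trajectory['rewards'], gamma)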
                                         agent_density, reward_func, device)
        elif v['obj'] in ['maxentirl']:
            loss = f_div_current_state_loss(v['obj'], samples, rho_expert,
                                            agent_density, reward_func,
                                            device)
        elif v['obj'] == 'emd':
            loss, _ = ipm_loss(v['obj'], v['IS'], samples, critic.value,
                               reward_func, device)

        reward_losses.append(loss.item())
        print(f"{v['obj']} loss: {loss}")

        reward_optimizer.zero_grad()
        loss.backward()
        reward_optimizer.step()

        # evaluating the learned reward
        try_evaluate(itr, "Running", sac_info, old_reward)

        logger.record_tabular("Iteration", itr)
        logger.record_tabular("Reward Loss", loss.item())
        if v['irl']['save_interval'] > 0 and (
                itr % v['irl']['save_interval'] == 0
                or itr == v['irl']['n_itrs'] - 1):
            torch.save(
                reward_func.state_dict(),
                os.path.join(logger.get_dir(),
                             f"model/reward_model_{itr}.pkl"))
        logger.dump_tabular()
def log_diagnostics(self, iteration, batch):
    """Record diagnostic information to the logger.

    Records mean and standard deviation of Q-function and state value
    function, and TD-loss (mean squared Bellman error) for the sample batch.

    Also calls the `draw` method of the plotter, if plotter defined.
    """
    feed_dict = self._get_feed_dict(iteration, batch)
    qf1, qf2, vf, td_loss1, td_loss2 = self._sess.run(
        (self._qf1_t, self._qf2_t, self._vf_t, self._td_loss1_t,
         self._td_loss2_t), feed_dict)

    logger.record_tabular('qf1-avg', np.mean(qf1))
    logger.record_tabular('qf1-std', np.std(qf1))
    # note: the original logged qf1 statistics under the qf2 keys; fixed here
    logger.record_tabular('qf2-avg', np.mean(qf2))
    logger.record_tabular('qf2-std', np.std(qf2))
    logger.record_tabular('mean-qf-diff', np.mean(np.abs(qf1 - qf2)))
    logger.record_tabular('vf-avg', np.mean(vf))
    logger.record_tabular('vf-std', np.std(vf))
    logger.record_tabular('mean-sq-bellman-error1', td_loss1)
    logger.record_tabular('mean-sq-bellman-error2', td_loss2)

    self._policy.log_diagnostics(iteration, batch)
    if self._plotter:
        self._plotter.draw()
def step(self, action):
    # Check for possible nan action
    if sum(np.isnan(action)) > 0:
        raise ValueError("Passed in nan to step! Action: " + str(action))
    action = np.clip(action, -2, 2)

    # For linear vel, [-2, 2] --> [-0.8, 2]
    linear_vel = -0.8 + (2 - (-0.8)) * (action[0] - (-2)) / (2 - (-2))
    # For angular vel, [-2, 2] --> [-0.8, 0.8]. If something goes wrong,
    # check the old code for the PPO, DDPG or TRPO variants.
    # angular_vel = -0.8 + (0.8 - (-0.8)) * (action[1] - (-2)) / (2 - (-2))  # for ddpg (or TD3), use this line
    angular_vel = action[1]  # for ppo, use this line

    # Publish control command
    vel_cmd = Twist()
    vel_cmd.linear.x = linear_vel
    vel_cmd.angular.z = angular_vel
    self.vel_pub.publish(vel_cmd)
    # print("before sending cmd, linear_vel: {}; angular_vel: {}".format(
    #     vel_cmd.linear.x, vel_cmd.angular.z))

    # Prepare to receive sensor readings. Laser data is part of the obs;
    # contact data is used for collision detection.
    contact_data = self.get_contact()
    laser_data = self.get_laser()
    new_contact_data = contact_data
    new_laser_data = laser_data

    # Unpause simulation only for obtaining valid data streaming
    rospy.wait_for_service('/gazebo/unpause_physics')
    try:
        self.unpause()
    except rospy.ServiceException as e:
        print("/gazebo/unpause_physics service call failed")

    while new_contact_data.header.stamp <= contact_data.header.stamp or \
            new_laser_data.header.stamp <= laser_data.header.stamp:
        new_contact_data = self.get_contact()
        new_laser_data = self.get_laser()

    # Pause the simulation to do other operations
    rospy.wait_for_service('/gazebo/pause_physics')
    try:
        self.pause()
    except rospy.ServiceException as e:
        print("/gazebo/pause_physics service call failed")

    # Call a service to get the model state
    dynamic_data = None
    rospy.wait_for_service("/gazebo/get_model_state")
    while dynamic_data is None:
        dynamic_data = self.get_model_state(model_name="mobile_base")

    obsrv = self.get_obsrv(new_laser_data, dynamic_data)
    # special solution for nan/inf observation (especially in case of any
    # invalid sensor readings)
    if any(np.isnan(np.array(obsrv))) or any(np.isinf(np.array(obsrv))):
        logger.record_tabular("found nan or inf in observation:", obsrv)
        obsrv = self.pre_obsrv
        done = True
        self.step_counter = 0
    self.pre_obsrv = obsrv

    assert self.reward_type is not None
    reward = 0
    if self.reward_type == 'hand_craft':
        reward += 0
    else:
        raise ValueError("reward type is invalid!")

    done = False
    suc = False
    self.step_counter += 1
    event_flag = None  # {'collision', 'safe', 'goal', 'steps exceeding'}

    # 1. Check collision. If something goes wrong, check the old code for
    # the alternative _in_obst function.
    # if self._in_obst(new_contact_data):
    #     reward += self.collision_reward
    #     done = True
    #     self.step_counter = 0
    #     event_flag = 'collision'
    if self._in_obst(laser_data):
        reward += self.collision_reward
        done = True
        self.step_counter = 0
        event_flag = 'collision'

    # 2. In the neighborhood of the goal state, done is True as well.
    # Only considering velocity and pos.
    if self._in_goal(np.array(obsrv[:3])):
        reward += self.goal_reward
        done = True
        suc = True
        self.step_counter = 0
        event_flag = 'goal'

    # 3. If reaching the maximum episode step, reset and give a penalty.
    if self.step_counter >= 300:
        reward += self.collision_reward
        done = True
        self.step_counter = 0
        event_flag = 'steps exceeding'

    if event_flag is None:
        event_flag = 'safe'

    return np.asarray(obsrv), reward, done, {
        'suc': suc,
        'event': event_flag
    }
def run(args): logger.configure( f'logs/{args["dataset"]}/pam/{datetime.datetime.now().strftime("%y-%m-%d-%H-%M-%S")}' ) logger.info(args) pool = mp.Pool(mp.cpu_count()) pam_arg = args.copy() if 'margin' not in pam_arg.keys(): best_margin = pool.map(find_best_margin, make_arg_list(pam_arg)) best_margin = np.mean(best_margin, 0) if 'verbose' in pam_arg.keys() and pam_arg['verbose']: for i in range(len(best_margin)): logger.record_tabular(f'[PAM] margin = {MARGINS[i]}', best_margin[i]) logger.dump_tabular() best_margin = MARGINS[best_margin.argmax()] logger.record_tabular('[PAM] best margin', best_margin) pam_arg['margin'] = best_margin results_pam = pool.map(run_pam, make_arg_list(pam_arg)) logger.record_tabular('[PAM] accuracy mean', np.mean(results_pam)) logger.record_tabular('[PAM] accuracy max', np.max(results_pam)) logger.record_tabular('[PAM] accuracy min', np.min(results_pam)) logger.record_tabular('[PAM] accuracy std', np.std(results_pam)) logger.dump_tabular()
def run_one_agent(index, args, unknown_args, actor_status):
    from tensorflow.keras.backend import set_session
    import tensorflow.compat.v1 as tf

    # Set 'allow_growth'
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    set_session(tf.Session(config=config))

    # Connect to learner
    context = zmq.Context()
    context.linger = 0  # For removing linger behavior
    socket = context.socket(zmq.REQ)
    socket.connect(f'tcp://{args.ip}:{args.data_port}')

    # Initialize environment and agent instance
    env, agent = init_components(args, unknown_args)

    # Configure logging only in one process
    if index == 0:
        logger.configure(str(args.log_path))
        save_yaml_config(args.exp_path / 'config.yaml', args, 'actor', agent)
    else:
        logger.configure(str(args.log_path), format_strs=[])

    # Create local queues for collecting data
    transitions = []  # A list to store raw transitions within an episode
    mem_pool = MemPool()  # A pool to store prepared training data

    # Initialize values
    model_id = -1
    episode_rewards = [0.0]
    episode_lengths = [0]
    num_episodes = 0
    mean_10ep_reward = 0
    mean_10ep_length = 0
    send_time_start = time.time()

    state = env.reset()
    for step in range(args.num_steps):
        # Do some updates
        agent.update_sampling(step, args.num_steps)

        # Sample action
        action, extra_data = agent.sample(state)
        next_state, reward, done, info = env.step(action)

        # Record current transition
        transitions.append(
            (state, action, reward, next_state, done, extra_data))
        episode_rewards[-1] += reward
        episode_lengths[-1] += 1

        state = next_state

        is_terminal = done or episode_lengths[-1] >= args.max_episode_length > 0
        if is_terminal or len(mem_pool) + len(
                transitions) >= args.max_steps_per_update:
            # Current episode is terminated or a trajectory of enough
            # training data is collected
            data = agent.prepare_training_data(transitions)
            transitions.clear()
            mem_pool.push(data)

            if is_terminal:
                # Log information at the end of episode
                num_episodes = len(episode_rewards)
                mean_10ep_reward = round(np.mean(episode_rewards[-10:]), 2)
                mean_10ep_length = round(np.mean(episode_lengths[-10:]), 2)
                episode_rewards.append(0.0)
                episode_lengths.append(0)

                # Reset environment
                state = env.reset()

        if len(mem_pool) >= args.max_steps_per_update:
            # Send training data after enough training data
            # (>= 'args.max_steps_per_update') is collected
            post_processed_data = agent.post_process_training_data(
                mem_pool.sample())
            socket.send(serialize(post_processed_data).to_buffer())
            socket.recv()
            mem_pool.clear()

            send_data_interval = time.time() - send_time_start
            send_time_start = time.time()

            if num_episodes > 0:
                # Log information
                logger.record_tabular(
                    "iteration", (step + 1) // args.max_steps_per_update)
                logger.record_tabular("steps", step)
                logger.record_tabular("episodes", len(episode_rewards))
                logger.record_tabular("mean 10 episode reward",
                                      mean_10ep_reward)
                logger.record_tabular("mean 10 episode length",
                                      mean_10ep_length)
                logger.record_tabular(
                    "send data fps",
                    args.max_steps_per_update // send_data_interval)
                logger.record_tabular("send data interval",
                                      send_data_interval)
                logger.dump_tabular()

        # Update weights
        new_weights, model_id = find_new_weights(model_id, args.ckpt_path)
        if new_weights is not None:
            agent.set_weights(new_weights)

    actor_status[index] = 1
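# A hypothetical sketch of the find_new_weights helper polled above: it is
# assumed to look for a checkpoint newer than the current model_id and
# return (weights, new_model_id), or (None, model_id) when nothing new has
# been written by the learner. The file layout used here is an assumption.
import pickle

def find_new_weights(current_model_id, ckpt_path):
    ckpts = sorted(ckpt_path.glob('*.ckpt'))
    new_model_id = len(ckpts) - 1
    if not ckpts or new_model_id == current_model_id:
        return None, current_model_id
    with open(ckpts[-1], 'rb') as f:
        return pickle.load(f), new_model_id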
def learn(
        env,
        policy_fn,
        *,
        timesteps_per_actorbatch,  # timesteps per actor per update
        optim_stepsize,
        optim_batchsize,  # optimization hypers
        gamma,
        lam,  # advantage estimation
        entcoeff=0.0,
        max_episodes=0,
        max_iters=0,
        max_seconds=0,  # time constraint
        callback=None,  # you can do anything in the callback, since it takes locals(), globals()
        adam_epsilon=1e-5,
        schedule='constant',  # annealing for stepsize parameters (epsilon and adam)
        args):
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_fn("pi", ob_space, ac_space)  # Construct network for new policy
    oldpi = policy_fn("oldpi", ob_space, ac_space)  # Network for old policy

    # Ops to reassign params from new to old
    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv, newv) in zipsame(oldpi.get_variables(),
                                        pi.get_variables())
        ])

    atarg = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    lrmult = tf.placeholder(
        name='lrmult', dtype=tf.float32,
        shape=[])  # learning rate multiplier, updated with schedule

    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = tf.reduce_mean(kloldnew)
    meanent = tf.reduce_mean(ent)
    pol_entpen = (-entcoeff) * meanent

    newprob = tf.exp(pi.pd.logp(ac))
    oldprob = tf.exp(oldpi.pd.logp(ac))

    ratio = newprob / oldprob

    kl = pi.pd.kl(oldpi.pd)
    mean_kl = tf.reduce_mean(kl)
    get_kl = U.function([ob, ac], kl)
    get_mean_kl = U.function([ob, ac], mean_kl)

    threshold = kl < args.kl_threshold
    threshold = tf.cast(threshold, tf.float32)

    pol_surr = (kl - ratio * atarg / args.sepg_lam) * threshold
    pol_surr = tf.reduce_mean(pol_surr)

    vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret))
    total_loss = pol_surr + pol_entpen + vf_loss
    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

    var_list = pi.get_trainable_variables()
    lossandgrad = U.function([ob, ac, atarg, ret, lrmult],
                             losses + [U.flatgrad(total_loss, var_list)])
    adam = MpiAdam(var_list, epsilon=adam_epsilon)

    compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses)

    U.initialize()
    adam.sync()

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi, env, timesteps_per_actorbatch,
                                     stochastic=True)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100)  # rolling buffer for episode rewards
    running_scores = []

    assert sum([
        max_iters > 0, args.num_timesteps > 0, max_episodes > 0,
        max_seconds > 0
    ]) == 1, "Only one time constraint permitted"

    while True:
        if callback:
            callback(locals(), globals())
        if args.num_timesteps and timesteps_so_far >= args.num_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        elif max_seconds and time.time() - tstart >= max_seconds:
            break

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult = max(
                1.0 - float(timesteps_so_far) / args.num_timesteps, 0)
        else:
            raise NotImplementedError

        if MPI.COMM_WORLD.Get_rank() == 0:
            logger.log("********** Iteration %i ************" % iters_so_far)

        seg = seg_gen.__next__()
        add_vtarg_and_adv(seg, gamma, lam)

        # ob, ac, atarg, ret, td1ret = map(np.concatenate,
        #                                  (obs, acs, atargs, rets, td1rets))
        ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[
            "tdlamret"]
        vpredbefore = seg["vpred"]  # predicted value function before update
        atarg = (atarg - atarg.mean()) / (
            atarg.std() + 1e-8)  # standardized advantage function estimate
        optim_batchsize = optim_batchsize or ob.shape[0]

        if hasattr(pi, "ob_rms"):
            pi.ob_rms.update(ob)  # update running mean/std for policy

        assign_old_eq_new()  # set old parameter values to new parameter values

        d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret),
                    shuffle=not pi.recurrent)

        # Here we do a bunch of optimization epochs over the data
        for num_epoch in count():
            losses = []  # list of tuples, each of which gives the loss for a minibatch
            for batch in d.iterate_once(optim_batchsize):
                *newlosses, g = lossandgrad(batch["ob"], batch["ac"],
                                            batch["atarg"], batch["vtarg"],
                                            cur_lrmult)
                g = np.nan_to_num(g)
                adam.update(g, optim_stepsize * cur_lrmult)
                losses.append(newlosses)

            agg_mean_kl = get_mean_kl(ob, ac)
            if agg_mean_kl > args.agg_kl_threshold or \
                    num_epoch == args.optim_epochs:
                break

        lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        rewbuffer.extend(rews)

        mean_score = None
        if rewbuffer:
            mean_score = np.mean(rewbuffer)
            running_scores.append((timesteps_so_far, mean_score))

        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1
        if MPI.COMM_WORLD.Get_rank() == 0:
            logger.record_tabular("EpRewMean", mean_score)
            logger.record_tabular("EpThisIter", len(lens))
            logger.record_tabular("EpisodesSoFar", episodes_so_far)
            logger.record_tabular("TimestepsSoFar", timesteps_so_far)
            logger.record_tabular("TimeElapsed", time.time() - tstart)
            logger.record_tabular("NumEpoch", num_epoch)
            logger.dump_tabular()

    return running_scores
def _train(self, env, policy, initial_exploration_policy, pool):
    """Perform RL training.

    Args:
        env (`rllab.Env`): Environment used for training
        policy (`Policy`): Policy used for training
        initial_exploration_policy ('Policy'): Policy used for exploration
            If None, then all exploration is done using policy
        pool (`PoolBase`): Sample pool to add samples to
    """
    self._init_training(env, policy, pool)

    if initial_exploration_policy is None:
        self.sampler.initialize(env, policy, pool)
        initial_exploration_done = True
    else:
        self.sampler.initialize(env, initial_exploration_policy, pool)
        initial_exploration_done = False

    with self._sess.as_default():
        gt.rename_root('RLAlgorithm')
        gt.reset()
        gt.set_def_unique(False)

        for epoch in gt.timed_for(range(self._n_epochs + 1), save_itrs=True):
            # logger.push_prefix()
            logger.log('Epoch #%d | ' % epoch)

            for t in range(self._epoch_length):
                # TODO.codeconsolidation: Add control interval to sampler
                if not initial_exploration_done:
                    if self._epoch_length * epoch >= \
                            self._n_initial_exploration_steps:
                        self.sampler.set_policy(policy)
                        initial_exploration_done = True
                self.sampler.sample()
                if not self.sampler.batch_ready():
                    continue
                gt.stamp('sample')

                for i in range(self._n_train_repeat):
                    self._do_training(
                        iteration=t + epoch * self._epoch_length,
                        batch=self.sampler.random_batch())
                gt.stamp('train')

            self._evaluate(epoch)

            params = self.get_snapshot(epoch)
            # logger.save_itr_params(epoch, params)
            times_itrs = gt.get_times().stamps.itrs

            eval_time = times_itrs['eval'][-1] if epoch > 1 else 0
            total_time = gt.get_times().total
            logger.record_tabular('time-train', times_itrs['train'][-1])
            logger.record_tabular('time-eval', eval_time)
            logger.record_tabular('time-sample', times_itrs['sample'][-1])
            logger.record_tabular('time-total', total_time)
            logger.record_tabular('epoch', epoch)

            self.sampler.log_diagnostics()

            # logger.dump_tabular(with_prefix=False)
            logger.dump_tabular()
            # logger.pop_prefix()

            gt.stamp('eval')

        self.sampler.terminate()
def rollouts(self):
    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = self.traj_segment_generator(self.pi, self.env,
                                          self.timesteps_per_actorbatch,
                                          stochastic=True)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100)  # rolling buffer for episode rewards

    assert sum([
        self.max_iters > 0, self.max_timesteps > 0, self.max_episodes > 0,
        self.max_seconds > 0
    ]) == 1, "Only one time constraint permitted"

    while True:
        if self.callback:
            self.callback(locals(), globals())
        if self.max_timesteps and timesteps_so_far >= self.max_timesteps:
            break
        elif self.max_episodes and episodes_so_far >= self.max_episodes:
            break
        elif self.max_iters and iters_so_far >= self.max_iters:
            break
        elif self.max_seconds and time.time() - tstart >= self.max_seconds:
            break

        if self.schedule == 'constant':
            cur_lrmult = 1.0
        elif self.schedule == 'linear':
            cur_lrmult = max(
                1.0 - float(timesteps_so_far) / self.max_timesteps, 0)
        else:
            raise NotImplementedError

        logger.log("********** Iteration %i ************" % iters_so_far)

        seg = seg_gen.__next__()
        self.add_vtarg_and_adv(seg, self.gamma, self.lam)

        ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], \
            seg["tdlamret"]
        vpredbefore = seg["vpred"]  # predicted value function before update
        atarg = (atarg - atarg.mean()) / atarg.std(
        )  # standardized advantage function estimate
        d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret),
                    shuffle=not self.pi.recurrent)
        optim_batchsize = self.optim_batchsize or ob.shape[0]

        if hasattr(self.pi, "ob_rms"):
            self.pi.ob_rms.update(ob)  # update running mean/std for policy

        self.assign_old_eq_new()  # set old parameter values to new parameter values
        logger.log("Optimizing...")
        logger.log(fmt_row(13, self.loss_names))
        # Here we do a bunch of optimization epochs over the data
        for _ in range(self.optim_epochs):
            losses = []  # list of tuples, each of which gives the loss for a minibatch
            for batch in d.iterate_once(optim_batchsize):
                *newlosses, g = self.lossandgrad(batch["ob"], batch["ac"],
                                                 batch["atarg"],
                                                 batch["vtarg"], cur_lrmult)
                self.adam.update(g, self.optim_stepsize * cur_lrmult)
                losses.append(newlosses)
            logger.log(fmt_row(13, np.mean(losses, axis=0)))

        logger.log("Evaluating losses...")
        losses = []
        for batch in d.iterate_once(optim_batchsize):
            newlosses = self.loss(batch["ob"], batch["ac"], batch["atarg"],
                                  batch["vtarg"], cur_lrmult)
            losses.append(newlosses)
        meanlosses, _, _ = mpi_moments(losses, axis=0)
        logger.log(fmt_row(13, meanlosses))
        for (lossval, name) in zipsame(meanlosses, self.loss_names):
            logger.record_tabular("loss_" + name, lossval)
        logger.record_tabular("ev_tdlam_before",
                              explained_variance(vpredbefore, tdlamret))

        lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews = map(self.flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)
        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1
        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)
        if MPI.COMM_WORLD.Get_rank() == 0:
            logger.dump_tabular()

    return self.pi
def run(args): logger.configure( f'logs/{args["dataset"]}/svm/{datetime.datetime.now().strftime("%y-%m-%d-%H-%M-%S")}' ) logger.info(args) pool = mp.Pool(mp.cpu_count()) svm_arg = args.copy() if 'C1' not in svm_arg.keys(): best_c1 = pool.map(find_best_c1, make_arg_list(svm_arg)) best_c1 = np.mean(best_c1, 0) if 'verbose' in svm_arg.keys() and svm_arg['verbose']: for i in range(len(best_c1)): logger.record_tabular(f'[C-SVM] C1 = {CLASS_WEIGHTS[i]}', best_c1[i]) logger.dump_tabular() best_c1 = CLASS_WEIGHTS[best_c1.argmax()] logger.record_tabular('[C-SVM] best C1', best_c1) svm_arg['C1'] = best_c1 results_svm = pool.map(run_c_svm, make_arg_list(svm_arg)) logger.record_tabular('[C-SVM] accuracy mean', np.mean(results_svm)) logger.record_tabular('[C-SVM] accuracy max', np.max(results_svm)) logger.record_tabular('[C-SVM] accuracy min', np.min(results_svm)) logger.record_tabular('[C-SVM] accuracy std', np.std(results_svm)) logger.dump_tabular()
def main():
    # Parse input parameters
    args, unknown_args = parser.parse_known_args()
    args.num_steps = int(args.num_steps)
    unknown_args = parse_cmdline_kwargs(unknown_args)

    # Load config file
    load_yaml_config(args, 'learner')

    # Expose socket to actor(s)
    context = zmq.Context()
    weights_socket = context.socket(zmq.PUB)
    weights_socket.bind(f'tcp://*:{args.param_port}')

    _, agent = init_components(args, unknown_args)

    # Configure experiment directory
    create_experiment_dir(args, 'LEARNER-')
    save_yaml_config(args.exp_path / 'config.yaml', args, 'learner', agent)

    args.log_path = args.exp_path / 'log'
    args.ckpt_path = args.exp_path / 'ckpt'
    args.ckpt_path.mkdir()
    args.log_path.mkdir()

    logger.configure(str(args.log_path))

    # Record commit hash
    with open(args.exp_path / 'hash', 'w') as f:
        f.write(
            str(
                subprocess.run('git rev-parse HEAD'.split(),
                               stdout=subprocess.PIPE).stdout.decode('utf-8')))

    # Variables to control the frequency of training
    receiving_condition = multiprocessing.Condition()
    num_receptions = multiprocessing.Value('i', 0)

    # Start memory pool in another process
    manager = MemPoolManager()
    manager.start()
    mem_pool = manager.MemPool(capacity=args.pool_size)
    Process(target=recv_data,
            args=(args.data_port, mem_pool, receiving_condition,
                  num_receptions, args.keep_training)).start()

    # Print throughput statistics
    Process(target=MultiprocessingMemPool.record_throughput,
            args=(mem_pool, args.record_throughput_interval)).start()

    freq = 0
    learn_flag = 0
    while True:
        if learn_flag == 0:
            weights_socket.send(pickle.dumps(agent.get_weights()))
        if len(mem_pool) >= args.batch_size:
            # Sync weights to actor
            weights = agent.get_weights()
            if hvd.rank() == 0:
                weights_socket.send(pickle.dumps(weights))

                if freq % args.ckpt_save_freq == 0:
                    if args.ckpt_save_type == 'checkpoint':
                        agent.save(args.ckpt_path / 'ckpt')
                    elif args.ckpt_save_type == 'weight':
                        with open(args.ckpt_path / 'weight.ckpt', 'wb') as f:
                            pickle.dump(weights, f)

            if args.keep_training:
                agent.learn(mem_pool.sample(size=args.batch_size))
            else:
                with receiving_condition:
                    while num_receptions.value < args.training_freq:
                        receiving_condition.wait()
                    data = mem_pool.sample(size=args.batch_size)
                    num_receptions.value -= args.training_freq
                # Training
                stat = agent.learn(data)
                learn_flag = 1
                if stat is not None:
                    for k, v in stat.items():
                        logger.record_tabular(k, v)
                    logger.dump_tabular()

            freq += 1
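# A minimal sketch (assumption) of the recv_data worker launched above: it
# answers the actors' REQ sockets, pushes each batch into the shared pool,
# and bumps the reception counter that gates training when keep_training is
# off. The deserialize helper is assumed to be the counterpart of the
# actors' serialize.
def recv_data(data_port, mem_pool, receiving_condition, num_receptions,
              keep_training):
    context = zmq.Context()
    data_socket = context.socket(zmq.REP)
    data_socket.bind(f'tcp://*:{data_port}')
    while True:
        data = deserialize(data_socket.recv())
        data_socket.send(b'200')
        if keep_training:
            mem_pool.push(data)
        else:
            with receiving_condition:
                mem_pool.push(data)
                num_receptions.value += 1
                receiving_condition.notify()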
def main(_):
    # create visualizer
    # visualizer = TensorboardVisualizer()
    monitor = Monitor(FLAGS)
    # log_dir = monitor.log_dir
    # visualizer.initialize(log_dir, None)
    saved_mean_reward = None

    # openAI logger
    L.configure(monitor.log_dir, format_strs=['stdout', 'csv'])

    # initialize env
    atari_env = AtariEnv(monitor)
    # screen_shot_subgoal(atari_env)

    # we should probably follow deepmind-style env:
    # stack 4 frames and scale float
    env = wrapper.wrap_deepmind(atari_env, frame_stack=True, scale=True)

    # get default tf_session
    sess = U.get_session()

    # create q networks for controller
    controller_optimizer = tf.train.AdamOptimizer(learning_rate=LEARNING_RATE)
    controller_network = Q_network(env.observation_space, env.action_space.n,
                                   controller_optimizer, scope='controller')
    controller = Controller(controller_network, env.action_space.n)

    # create q networks for meta-controller
    num_goals = env.unwrapped.goals_space.n
    metacontroller_optimizer = tf.train.AdamOptimizer(
        learning_rate=LEARNING_RATE)
    metacontroller_network = Q_network(env.observation_space, num_goals,
                                       metacontroller_optimizer,
                                       scope='metacontroller')
    metacontroller = MetaController(metacontroller_network, num_goals)

    # Create the schedule for exploration starting from 1.
    exploration2 = LinearSchedule(
        schedule_timesteps=int(EXPLORATION_FRACTION * monitor.num_timesteps),
        initial_p=1.0,
        final_p=EXPLORATION_FINAL_EPS)

    # initialize experience replay
    controller_replay_buffer = ReplayBuffer(D1_MEMORY_SIZE)
    metacontroller_replay_buffer = ReplayBuffer(D2_MEMORY_SIZE)

    # initialize critic
    critic = Critic(env.unwrapped)

    total_extrinsic_reward = []
    # for success rate
    total_goal_reached = np.zeros(num_goals, dtype=np.int32)
    total_goal_sampled = np.zeros(num_goals, dtype=np.int32)
    total_goal_epsilon = np.ones(num_goals, dtype=np.float32)
    ep = 0
    total_step = 0
    init_ob = env.reset()

    U.initialize()
    # initialize target network in both controller and meta
    sess.run(metacontroller.network.update_target_op)
    sess.run(controller.network.update_target_op)

    # load ckpt if present
    model_path = tf.train.latest_checkpoint(monitor.ckpt_dir)
    model_saved = False
    model_file = os.path.join(monitor.ckpt_dir, 'model')
    if model_path is not None:
        U.load_variables(model_file)
        L.log('loaded model from %s' % model_file)
        model_saved = True

    while ep < MAX_EPISODE:  # count number of steps
        # init environment game play variables
        init_ob = env.reset()
        observation = np.reshape(init_ob['observation'],
                                 (1, ) + init_ob['observation'].shape)
        desired_goal = metacontroller.sample_act(sess, observation,
                                                 update_eps=1.0)[0]
        env.unwrapped.desired_goal = desired_goal
        total_goal_sampled[desired_goal] += 1

        # given the predicted goal, we encode this goal bounding mask into
        # the observation np array
        ob_with_g = env.unwrapped._add_goal_mask(init_ob['observation'],
                                                 desired_goal)

        # NOTE: the code below verifies the mask was added correctly
        # for i in range(ob_with_g.shape[-1]):
        #     ob = ob_with_g[:, :, i]
        #     image = Image.fromarray(ob)
        #     image = image.convert('RGB')
        #     image.save('test_%i.png' % i)

        done = False
        reached_goal = False

        while not done:
            extrinsic_rewards = 0
            s0 = init_ob['observation']

            while not (done or reached_goal):
                update_eps1_with_respect_to_g = get_epsilon(
                    total_goal_epsilon, total_goal_reached,
                    total_goal_sampled, desired_goal, total_step,
                    EXPLORATION_WARM_UP)
                ob_with_g_reshaped = np.reshape(ob_with_g,
                                                (1, ) + ob_with_g.shape)
                primitive_action_t = controller.sample_act(
                    sess, ob_with_g_reshaped,
                    update_eps=update_eps1_with_respect_to_g)[0]
                # obtain extrinsic reward from environment
                ob_tp1, extrinsic_reward_t, done_t, info = env.step(
                    primitive_action_t)
                reached_goal = env.unwrapped.reached_goal(desired_goal)
                ob_with_g_tp1 = env.unwrapped._add_goal_mask(
                    ob_tp1['observation'], desired_goal)

                intrinsic_reward_t = critic.criticize(desired_goal,
                                                      reached_goal,
                                                      primitive_action_t,
                                                      done_t)
                controller_replay_buffer.add(ob_with_g, primitive_action_t,
                                             intrinsic_reward_t,
                                             ob_with_g_tp1, done_t)

                # sample from replay_buffer1 to train controller
                obs_with_g_t, primitive_actions_t, intrinsic_rewards_t, \
                    obs_with_g_tp1, dones_t = \
                    controller_replay_buffer.sample(TRAIN_BATCH_SIZE)
                weights, batch_idxes = np.ones_like(intrinsic_rewards_t), None
                # get q estimate for tp1 as 'supervised'
                ob_with_g_tp1_reshaped = np.reshape(ob_with_g_tp1,
                                                    (1, ) + ob_with_g.shape)
                q_tp1 = controller.get_q(sess, ob_with_g_tp1_reshaped)[0]
                td_error = controller.train(sess, obs_with_g_t,
                                            primitive_actions_t,
                                            intrinsic_rewards_t,
                                            obs_with_g_tp1, dones_t, weights,
                                            q_tp1)

                # joint training of the meta-controller: only sample from
                # replay_buffer2 to train it
                if total_step >= WARMUP_STEPS:
                    L.log('join train has started ----- step %d', total_step)
                    # sample from replay_buffer2 to train meta-controller
                    init_obs, goals_t, extrinsic_rewards_t, \
                        obs_terminate_in_g, dones_t = \
                        metacontroller_replay_buffer.sample(TRAIN_BATCH_SIZE)
                    weights, batch_idxes = \
                        np.ones_like(extrinsic_rewards_t), None
                    # get q estimate for tp1 as 'supervised'
                    obs_terminate_in_g_reshaped = np.reshape(
                        obs_terminate_in_g, (1, ) + obs_terminate_in_g.shape)
                    q_tp1 = metacontroller.get_q(
                        sess, obs_terminate_in_g_reshaped)[0]
                    td_error = metacontroller.train(
                        sess, init_obs, goals_t, extrinsic_rewards_t,
                        obs_terminate_in_g, dones_t, weights, q_tp1)

                if total_step % UPDATE_TARGET_NETWORK_FREQ == 0:
                    # L.log('UPDATE BOTH CONTROLLER Q NETWORKS ----- step %d',
                    #       step)
                    sess.run(controller.network.update_target_op)
                    # it's fine, we aren't really training the meta dqn
                    # until after a certain number of steps
sess.run(metacontroller.network.update_target_op) extrinsic_rewards += extrinsic_reward_t ob_with_g = ob_with_g_tp1 done = done_t total_step += 1 # we are done / reached_goal # store transitions of init_ob, goal, all the extrinsic rewards, current ob in D2 # print("ep %d : step %d, goal extrinsic total %d" % (ep, step, extrinsic_rewards)) # clean observation without goal encoded metacontroller_replay_buffer.add(init_ob['observation'], desired_goal, extrinsic_rewards, ob_tp1['observation'], done) # if we are here then we have finished the desired goal if not done: #print("ep %d : goal %d reached, not yet done, extrinsic %d" % (ep, desired_goal, extrinsic_rewards)) exploration_ep = 1.0 total_goal_reached[env.unwrapped.achieved_goal] += 1 if total_step >= WARMUP_STEPS: t = total_step - WARMUP_STEPS exploration_ep = exploration2.value(t) ob_with_g_reshaped = np.reshape(ob_with_g, (1, )+ob_with_g.shape) while env.unwrapped.achieved_goal == desired_goal: desired_goal = metacontroller.sample_act(sess, ob_with_g_reshaped, update_eps=exploration_ep)[0] env.unwrapped.desired_goal = desired_goal total_goal_sampled[desired_goal] += 1 L.log('ep %d : achieved goal was %d ----- new goal --- %d' % (ep, env.unwrapped.achieved_goal, desired_goal)) # start again reached_goal = False # finish an episode total_extrinsic_reward.append(extrinsic_rewards) ep += 1 mean_100ep_reward = round(np.mean(total_extrinsic_reward[-101:-1]), 1) if ep % monitor.print_freq == 0 : L.record_tabular("steps", total_step) L.record_tabular("episodes", ep) L.record_tabular("mean 100 episode reward", mean_100ep_reward) L.dump_tabular() if total_step % monitor.ckpt_freq == 0: if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward: L.log("Saving model due to mean reward increase: {} -> {}".format( saved_mean_reward, mean_100ep_reward)) U.save_variables(model_file) model_saved = True saved_mean_reward = mean_100ep_reward # verified our model was saved if model_saved: L.log('restored model with mean reward: %d' % saved_mean_reward) U.load_variables(model_file)
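# `get_epsilon` is not defined in this excerpt. Below is a minimal sketch consistent
# with the call above, assuming the per-goal epsilon is annealed by that goal's
# empirical success rate; the schedule itself is hypothetical, not the author's exact one.
def get_epsilon(goal_epsilon, goal_reached, goal_sampled, goal, step, warmup_steps,
                final_eps=0.1):
    """Per-goal exploration rate: keep epsilon high during warm-up, then decay it
    toward final_eps as the goal's success rate improves (hypothetical sketch)."""
    if step < warmup_steps or goal_sampled[goal] == 0:
        return goal_epsilon[goal]  # still exploring this goal
    success_rate = goal_reached[goal] / float(goal_sampled[goal])
    # goals the controller reaches reliably need less exploration
    goal_epsilon[goal] = max(final_eps, 1.0 - success_rate)
    return goal_epsilon[goal]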
def train(self):
    """
    CG: the function that conducts ensemble training.
    :return:
    """
    # Set up parameters for the training process.
    self._n_epochs = self._base_ac_params['n_epochs']
    self._epoch_length = self._base_ac_params['epoch_length']
    self._n_train_repeat = self._base_ac_params['n_train_repeat']
    self._n_initial_exploration_steps = self._base_ac_params[
        'n_initial_exploration_steps']
    self._eval_render = self._base_ac_params['eval_render']
    self._eval_n_episodes = self._base_ac_params['eval_n_episodes']
    self._eval_deterministic = self._base_ac_params['eval_deterministic']

    # Set up the evaluation environment.
    if self._eval_n_episodes > 0:
        with tf.variable_scope("low_level_policy", reuse=True):
            self._eval_env = deep_clone(self._env)

    # Import required libraries for training.
    import random
    import math
    import operator
    import numpy as np

    # Initialize the sampler with a randomly chosen AC instance.
    alg_ins = random.choice(self._alg_instances)
    self._sampler.initialize(self._env, alg_ins[0].policy, self._pool)

    # Perform the training/evaluation process.
    num_episode = 0.
    with self._sess.as_default():
        gt.rename_root('RLAlgorithm')
        gt.reset()
        gt.set_def_unique(False)

        for epoch in gt.timed_for(range(self._n_epochs + 1), save_itrs=True):
            logger.log('Epoch #%d | ' % epoch)

            for t in range(self._epoch_length):
                isEpisodeEnd = self._sampler.sample()

                # If an episode has ended, update the performance statistics of the
                # current AC instance and pick another instance for the next
                # episode of exploration.
                if isEpisodeEnd:
                    num_episode = num_episode + 1.
                    alg_ins[1] = 0.9 * alg_ins[1] + 0.1 * self._sampler._last_path_return
                    alg_ins[2] = alg_ins[2] + 1.

                    if self._use_ucb:
                        # Select an algorithm instance based on UCB.
                        selected = False
                        for ains in self._alg_instances:
                            if ains[2] < 1.:
                                alg_ins = ains
                                selected = True
                                break
                            else:
                                ains[3] = ains[1] + math.sqrt(
                                    2.0 * math.log(num_episode) / ains[2])
                        if not selected:
                            alg_ins = max(self._alg_instances,
                                          key=operator.itemgetter(3))
                    else:
                        # Select an algorithm instance uniformly at random.
                        alg_ins = random.choice(self._alg_instances)
                    self._sampler.set_policy(alg_ins[0].policy)

                if not self._sampler.batch_ready():
                    continue
                gt.stamp('sample')

                # ================
                # Perform training.
                # ================
                for i in range(self._n_train_repeat):
                    batch = self._sampler.random_batch()

                    # ====================================
                    # Perform training over all AC instances.
                    # ====================================
                    for ains in self._alg_instances:
                        ains[0]._do_training(
                            iteration=t + epoch * self._epoch_length,
                            batch=batch)

                    # =================================================
                    # Perform training of the action-selection Q-function.
                    # =================================================
                    # Set up the feed dictionary.
                    feed_dict = {
                        self._observations_ens_ph: batch['observations'],
                        self._obv_act_ph: batch['actions'],
                        self._observations_ens_next_ph: batch['next_observations'],
                        self._rewards_ph: batch['rewards'],
                        self._terminals_ph: batch['terminals'],
                    }
                    for i, ains in enumerate(self._alg_instances):
                        with ains[0].policy.deterministic(self._eval_deterministic):
                            feed_dict[self._acts_next_phs[i]] = \
                                ains[0].policy.get_actions(batch['next_observations'])

                    # Perform training on the action-selection Q-function.
                    self._sess.run(self._q_ens_train_operator, feed_dict)

                gt.stamp('train')

            # ============================================================
            # Perform evaluation after one full epoch of training is completed.
            # ============================================================
            if self._eval_n_episodes < 1:
                continue

            if self._evaluation_strategy == 'ensemble':
                # Use the whole ensemble of AC instances for evaluation.
                paths = rollouts(self._eval_env, self,
                                 self._sampler._max_path_length,
                                 self._eval_n_episodes)
            elif self._evaluation_strategy == 'best-policy':
                # Choose the AC instance with the highest observed performance
                # so far for evaluation.
                eval_alg_ins = max(self._alg_instances, key=operator.itemgetter(1))
                with eval_alg_ins[0].policy.deterministic(self._eval_deterministic):
                    paths = rollouts(self._eval_env, eval_alg_ins[0].policy,
                                     self._sampler._max_path_length,
                                     self._eval_n_episodes)
            else:
                paths = None

            if paths is not None:
                total_returns = [path['rewards'].sum() for path in paths]
                episode_lengths = [len(p['rewards']) for p in paths]
                logger.record_tabular('return-average', np.mean(total_returns))
                logger.record_tabular('return-min', np.min(total_returns))
                logger.record_tabular('return-max', np.max(total_returns))
                logger.record_tabular('return-std', np.std(total_returns))
                logger.record_tabular('episode-length-avg', np.mean(episode_lengths))
                logger.record_tabular('episode-length-min', np.min(episode_lengths))
                logger.record_tabular('episode-length-max', np.max(episode_lengths))
                logger.record_tabular('episode-length-std', np.std(episode_lengths))
                self._eval_env.log_diagnostics(paths)
                if self._eval_render:
                    self._eval_env.render(paths)

            # Produce log info after each epoch of training and evaluation.
            times_itrs = gt.get_times().stamps.itrs
            eval_time = times_itrs['eval'][-1] if epoch > 1 else 0
            total_time = gt.get_times().total
            logger.record_tabular('time-train', times_itrs['train'][-1])
            logger.record_tabular('time-eval', eval_time)
            logger.record_tabular('time-sample', times_itrs['sample'][-1])
            logger.record_tabular('time-total', total_time)
            logger.record_tabular('epoch', epoch)
            self._sampler.log_diagnostics()
            logger.dump_tabular()
            # logger.pop_prefix()
            gt.stamp('eval')

    # Terminate the sampler after the training process is completed.
    self._sampler.terminate()
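# The instance-selection rule above is UCB-1 over AC instances, where each instance
# is a list [algo, running_mean_return, n_episodes, ucb_score]. As a standalone
# reference, here is a minimal sketch of that rule (function name hypothetical):
import math

def ucb1_select(instances, num_episode):
    """Pick the instance maximizing mean_return + sqrt(2 ln N / n_i);
    play every instance at least once before applying the UCB bonus."""
    for ins in instances:
        if ins[2] < 1.:
            return ins  # unplayed arm: try it first
        ins[3] = ins[1] + math.sqrt(2.0 * math.log(num_episode) / ins[2])
    return max(instances, key=lambda ins: ins[3])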
def fit(policy, env, seed, nsteps=5, total_timesteps=int(80e6), vf_coef=0.5,
        ent_coef=0.01, max_grad_norm=0.5, lr=7e-4, lrschedule='linear',
        epsilon=1e-5, alpha=0.99, gamma=0.99, log_interval=100):
    set_global_seeds(seed)

    model = A2C(policy=policy,
                observation_space=env.observation_space,
                action_space=env.action_space,
                nenvs=env.num_envs,
                nsteps=nsteps,
                ent_coef=ent_coef,
                vf_coef=vf_coef,
                max_grad_norm=max_grad_norm,
                lr=lr,
                alpha=alpha,
                epsilon=epsilon,
                total_timesteps=total_timesteps,
                lrschedule=lrschedule)
    session = model.init_session()
    tf.global_variables_initializer().run(session=session)

    env_runner = Environment(env, model, nsteps=nsteps, gamma=gamma)
    nbatch = env.num_envs * nsteps
    tstart = time.time()
    writer = tf.summary.FileWriter('output', session.graph)

    for update in range(1, total_timesteps // nbatch + 1):
        obs, states, rewards, masks, actions, values = env_runner.run(session)
        policy_loss, value_loss, policy_entropy = model.predict(
            observations=obs, states=states, rewards=rewards,
            masks=masks, actions=actions, values=values, session=session)
        nseconds = time.time() - tstart
        fps = int((update * nbatch) / nseconds)
        if update % log_interval == 0 or update == 1:
            ev = explained_variance(values, rewards)
            logger.record_tabular("nupdates", update)
            logger.record_tabular("total_timesteps", update * nbatch)
            logger.record_tabular("fps", fps)
            logger.record_tabular("policy_entropy", float(policy_entropy))
            logger.record_tabular("value_loss", float(value_loss))
            logger.record_tabular("explained_variance", float(ev))
            logger.dump_tabular()

    env.close()
    writer.close()
    session.close()
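# `explained_variance` is the standard 1 - Var[y - y_pred] / Var[y] diagnostic for
# how well the value function predicts the returns. A minimal sketch consistent
# with how it is used above (1 = perfect fit, 0 = no better than predicting the
# mean, negative = worse than the mean):
import numpy as np

def explained_variance(ypred, y):
    """Fraction of the variance of y explained by ypred."""
    assert y.ndim == 1 and ypred.ndim == 1
    vary = np.var(y)
    return np.nan if vary == 0 else 1 - np.var(y - ypred) / vary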
def step(self, action):
    if len(np.shape(action)) > 1:
        high_dim_ac_form = True
        action = np.squeeze(action)
    else:
        high_dim_ac_form = False

    if np.isnan(action).any():
        raise ValueError("Passed in nan to step! Action: " + str(action))

    # --- previously: directly shift the mean of the Gaussian from 0 to around 8.8 ---
    # print("action:", action)
    # action = action + 8.8  # a little more power for easier launch away from ground
    # --------------------------------------------------------------------------
    # --- then: action transformation [-2, 2] -> [7, 10] (for PPO and TRPO, because
    #     we are not using a strict action range for them) ---
    # action = [7 + (10 - 7) * (a_i - (-2)) / (2 - (-2)) for a_i in action]
    # --------------------------------------------------------------------------
    # --- now: applies whether or not we use pol_load. Remember, when doing supervised
    #     learning, normalize obs and rescale actions. [-1, 1] is used to be consistent
    #     with the default DDPG and PPO2 action range (from stable_baselines);
    #     rescale [-1, 1] -> [0, 12] because the MPC control range is [0, 12], not [7, 10] ---
    # print("action before:", action)
    action = np.clip(action, -2, 2)
    # action = 2.0 * np.tanh(action)
    # action = [0 + (12 - 0) * (a_i - (-1)) / (1 - (-1)) for a_i in action]
    action = [7 + (10 - 7) * (a_i - (-2)) / (2 - (-2)) for a_i in action]
    # print("action after:", action)

    # apply the commanded thrusts as a wrench in the world frame
    pre_phi = self.pre_obsrv[4]
    wrench = Wrench()
    wrench.force.x = (action[0] + action[1]) * np.sin(pre_phi)
    wrench.force.y = 0
    # wrench.force.z = action[0] + action[1]
    wrench.force.z = (action[0] + action[1]) * np.cos(pre_phi)
    wrench.torque.x = 0
    # wrench.torque.y = (action[0] - action[1]) * 0.5
    wrench.torque.y = (action[0] - action[1]) * 0.4
    # wrench.torque.y = 1.0
    wrench.torque.z = 0

    rospy.wait_for_service('/gazebo/apply_body_wrench')
    self.force(body_name="base_link",
               reference_frame="world",
               wrench=wrench,
               start_time=rospy.Time().now(),
               duration=rospy.Duration(1))
    # self.force(body_name="base_link", reference_frame="base_link", wrench=wrench,
    #            start_time=rospy.Time().now(), duration=rospy.Duration(1))

    dynamic_data = self.get_model_state(model_name="quadrotor")
    # print("dynamics data after one step:", dynamic_data)

    rospy.wait_for_service('/gazebo/unpause_physics')
    try:
        self.unpause()
    except rospy.ServiceException as e:
        print("/gazebo/unpause_physics service call failed")

    laser_data = rospy.wait_for_message('/scan', LaserScan, timeout=20)
    # contact_data = rospy.wait_for_message('/gazebo_ros_bumper', ContactsState, timeout=50)
    # print("contact data:", contact_data)

    rospy.wait_for_service('/gazebo/pause_physics')
    try:
        self.pause()
    except rospy.ServiceException as e:
        print("/gazebo/pause_physics service call failed")

    done = False
    suc = False
    self.step_counter += 1
    event_flag = None  # one of {'collision', 'safe', 'goal', 'steps exceeding', 'highly tilt'}

    obsrv = self.get_obsrv(laser_data, dynamic_data)

    # --- special handling for nan/inf observations (e.g. invalid sensor readings) ---
    if any(np.isnan(np.array(obsrv))) or any(np.isinf(np.array(obsrv))):
        logger.record_tabular("found nan or inf in observation:", obsrv)
        obsrv = self.pre_obsrv
        done = True
        self.step_counter = 0

    self.pre_obsrv = obsrv

    assert self.reward_type is not None
    reward = 0

    if self.reward_type == 'hand_craft':
        reward += 0
    elif self.reward_type == 'hand_craft_mpc':
        # reward = -self.control_reward_coff * (action[0] ** 2 + action[1] ** 2)
        # reward = -1
        # reward += 0
        # print("using hand_craft_mpc")
        delta_x = obsrv[0] - GOAL_STATE[0]
        delta_z = obsrv[2] - GOAL_STATE[2]
        delta_theta = obsrv[4] - GOAL_STATE[4]
        reward += -1.0 * (action[0]**2 + action[1]**2)
        reward += -10000.0 * (delta_x**2 + delta_z**2)
        reward = reward * 0.0001
        # print("delta x: {}".format(delta_x), "delta z: {}".format(delta_z),
        #       "reward from control: {}".format(-1.0 * (action[0] ** 2 + action[1] ** 2)),
        #       "reward from state diff: {}".format(-100 * (delta_x ** 2 + delta_z ** 2)))
    elif self.reward_type == "hand_craft_mpc_without_control":
        delta_x = obsrv[0] - GOAL_STATE[0]
        delta_z = obsrv[2] - GOAL_STATE[2]
        delta_theta = obsrv[4] - GOAL_STATE[4]
        reward += -np.sqrt(delta_x**2 + delta_z**2 + 10.0 * delta_theta**2)
    elif self.reward_type == "hand_craft_mpc_without_control_2":
        delta_x = obsrv[0] - GOAL_STATE[0]
        delta_z = obsrv[2] - GOAL_STATE[2]
        reward += -np.sqrt(delta_x**2 + delta_z**2)
    elif self.reward_type == 'ttr' and self.brsEngine is not None:
        # Note: the z-axis of the TTR space is defined on (-5, 5), while in Gazebo it
        # is (0, 10), so subtract 5 to get the correct TTR reward.
        ttr_obsrv = copy.deepcopy(obsrv)
        ttr_obsrv[2] = ttr_obsrv[2] - 5
        ttr = self.brsEngine.evaluate_ttr(np.reshape(ttr_obsrv[:6], (1, -1)))
        reward += -ttr
    elif self.reward_type == 'distance':
        reward += -(Euclid_dis((obsrv[0], obsrv[2]), (GOAL_STATE[0], GOAL_STATE[2])))
        # reward += (-Euclid_dis((obsrv[0], obsrv[2]), (GOAL_STATE[0], GOAL_STATE[2]))
        #            - abs(obsrv[1] - GOAL_STATE[1]) - abs(obsrv[3] - GOAL_STATE[3]))
    elif self.reward_type == 'distance_lambda_0.1':
        delta_x = obsrv[0] - GOAL_STATE[0]
        delta_z = obsrv[2] - GOAL_STATE[2]
        delta_theta = obsrv[4] - GOAL_STATE[4]
        reward += -np.sqrt(delta_x**2 + delta_z**2 + 0.1 * delta_theta**2)
    elif self.reward_type == 'distance_lambda_1':
        delta_x = obsrv[0] - GOAL_STATE[0]
        delta_z = obsrv[2] - GOAL_STATE[2]
        delta_theta = obsrv[4] - GOAL_STATE[4]
        reward += -np.sqrt(delta_x**2 + delta_z**2 + 1.0 * delta_theta**2)
    elif self.reward_type == 'distance_lambda_10':
        delta_x = obsrv[0] - GOAL_STATE[0]
        delta_z = obsrv[2] - GOAL_STATE[2]
        delta_theta = obsrv[4] - GOAL_STATE[4]
        reward += -np.sqrt(delta_x**2 + delta_z**2 + 10.0 * delta_theta**2)
    else:
        raise ValueError("no option for step reward!")
    # print("step reward:", reward)
    # print("self.reward_type:", self.reward_type)

    # 1. when a collision happens, done = True
    if self._in_obst(laser_data, dynamic_data):
        reward += self.collision_reward
        done = True
        self.step_counter = 0
        event_flag = 'collision'
    """
    if self._in_obst(contact_data):
        reward += self.collision_reward
        done = True
        self.step_counter = 0
        # print("obstacle!")
    """

    # 2. in the neighborhood of the goal state, done = True as well
    #    (only considering velocity and position)
    if self._in_goal(np.array(obsrv[:6])):
        reward += self.goal_reward
        done = True
        suc = True
        self.step_counter = 0
        event_flag = 'goal'
        # print("in goal")

    # if abs(obsrv[4] - self.goal_state[4]) < 0.40:
    #     print("good tilting!")
    # Amend: modified by xlv, abs(obsrv[4]) > 1.2 -> abs(obsrv[4]) > 1.4
    if obsrv[4] > 1.4 or obsrv[4] < -1.4:
        reward += self.collision_reward * 2
        done = True
        self.step_counter = 0
        event_flag = 'highly tilt'
        # print("tilt too much")

    # maximum episode length allowed
    if self.step_counter >= 100:
        done = True
        self.step_counter = 0
        event_flag = 'steps exceeding'
        # print('exceed max length')

    if event_flag is None:
        event_flag = 'safe'

    if high_dim_ac_form:
        # for PPO2 vectorized env
        return np.asarray(obsrv), np.asarray([reward]), np.asarray([done]), [{'suc': suc}]
    else:
        return np.asarray(obsrv), reward, done, {'suc': suc, 'event': event_flag}
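# The clip-then-rescale step above is the generic affine map between intervals,
# dst_lo + (dst_hi - dst_lo) * (a - src_lo) / (src_hi - src_lo). A small worked
# helper (the function name is ours, not from the original code):
import numpy as np

def rescale(a, src=(-2.0, 2.0), dst=(7.0, 10.0)):
    """Clip a to [src_lo, src_hi], then map it affinely onto [dst_lo, dst_hi]."""
    a = np.clip(np.asarray(a, dtype=np.float64), src[0], src[1])
    return dst[0] + (dst[1] - dst[0]) * (a - src[0]) / (src[1] - src[0])

# e.g. rescale([-2.0, 0.0, 2.0]) -> array([ 7. ,  8.5, 10. ])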
def train(env_name, start_episodes, num_episodes, gamma, tau, noise_std,
          batch_size, eval_freq, seed):
    """
    Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        start_episodes: how many episodes the purely random policy is run for
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor
        tau: target network update rate
        noise_std: std of the exploration noise added to actions
        batch_size: number of episodes per policy training batch
        eval_freq: number of training batches between tests
        seed: random seed for all modules with randomness
    """
    # set seeds
    set_global_seed(seed)
    # configure logging
    configure_log_info(env_name, seed)

    # create env
    env = gym.make(env_name)
    env.seed(seed)  # set env seed
    obs_dim = env.observation_space.shape[0]
    obs_dim += 1  # add 1 to obs dimension for the time-step feature (see run_episode())
    act_dim = env.action_space.shape[0]

    # create actor and target actor
    actor = Actor(obs_dim, act_dim, float(env.action_space.high[0])).to(device)
    target_actor = Actor(obs_dim, act_dim, float(env.action_space.high[0])).to(device)
    # create critic and target critic
    critic = Critic(obs_dim, act_dim).to(device)
    target_critic = Critic(obs_dim, act_dim).to(device)

    # create DDPG agent (hollow object) and sync the target networks
    agent = DDPG(actor, critic, target_actor, target_critic, noise_std, gamma, tau)
    agent.align_target()

    # create replay buffer
    replay_buffer = ReplayBuffer()

    # run a few episodes of the untrained policy to initialize the scaler
    # and fill the replay buffer
    run_policy(env, agent, replay_buffer, mode="random", episodes=start_episodes)

    num_iteration = num_episodes // eval_freq
    current_episodes = 0
    current_steps = 0
    for iteration in range(num_iteration):
        # train models
        for i in range(eval_freq):
            # sample transitions
            train_returns, total_steps = run_policy(env, agent, replay_buffer,
                                                    mode="train", episodes=batch_size)
            current_episodes += batch_size
            current_steps += total_steps
            logger.info('[train] average return:{0}, std return: {1}'.format(
                np.mean(train_returns), np.std(train_returns)))
            # train
            num_epoch = total_steps // batch_size
            for e in range(num_epoch):
                observation, action, reward, next_obs, done = replay_buffer.sample()
                agent.update(observation, action, reward, next_obs, done)

        # test models
        num_test_episodes = 10
        returns, _ = run_policy(env, agent, replay_buffer, mode="test",
                                episodes=num_test_episodes)
        avg_return = np.mean(returns)
        std_return = np.std(returns)
        logger.record_tabular('iteration', iteration)
        logger.record_tabular('episodes', current_episodes)
        logger.record_tabular('steps', current_steps)
        logger.record_tabular('avg_return', avg_return)
        logger.record_tabular('std_return', std_return)
        logger.dump_tabular()
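# `agent.align_target()` and the `tau` update rate above refer to the usual
# hard-copy initialization and Polyak-averaged target updates in DDPG. A minimal
# PyTorch sketch under that assumption (not the author's exact implementation):
import torch

@torch.no_grad()
def align_target(net, target_net):
    """Hard copy: target <- online (used once at initialization)."""
    target_net.load_state_dict(net.state_dict())

@torch.no_grad()
def soft_update(net, target_net, tau):
    """Polyak averaging after each gradient step:
    target <- tau * online + (1 - tau) * target."""
    for p, tp in zip(net.parameters(), target_net.parameters()):
        tp.mul_(1.0 - tau).add_(tau * p)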
# keep the first num_expert_trajs expert trajectories and drop the first step of each
expert_action_trajs = expert_action_trajs[:num_expert_trajs, 1:, :]
expert_actions = expert_action_trajs.reshape(-1, gym_env.action_space.shape[0])

replay_buffer = ReplayBuffer(
    state_size, action_size,
    device=device,
    size=v['sac']['buffer_size'])

sac_agent = SAC(env_fn, replay_buffer,
                steps_per_epoch=v['env']['T'],
                update_after=v['env']['T'] * v['sac']['random_explore_episodes'],
                max_ep_len=v['env']['T'],
                seed=seed,
                start_steps=v['env']['T'] * v['sac']['random_explore_episodes'],
                reward_state_indices=state_indices,
                device=device,
                **v['sac'])

for itr in range(v['bc']['epochs'] // v['bc']['eval_freq']):
    loss = stochastic_bc(sac_agent, expert_states, expert_actions,
                         epochs=v['bc']['eval_freq'])
    # loss = mse_bc(sac_agent, expert_states, expert_actions, epochs=1)
    logger.record_tabular("BC loss", loss.item())

    real_return_det, real_return_sto = try_evaluate(itr, "Running")
    logger.record_tabular("Iteration", itr)
    logger.dump_tabular()
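# `stochastic_bc` is not shown in this excerpt. A plausible minimal sketch is
# maximum-likelihood behavior cloning on the stochastic SAC policy: minimize the
# negative log-likelihood of expert actions. Both the function and the
# `policy.log_prob(states, actions)` interface below are assumptions for
# illustration, not the original code.
import torch

def stochastic_bc_step(policy, optimizer, expert_states, expert_actions):
    """One hypothetical BC step: minimize -log pi(a_E | s_E)."""
    states = torch.as_tensor(expert_states, dtype=torch.float32)
    actions = torch.as_tensor(expert_actions, dtype=torch.float32)
    loss = -policy.log_prob(states, actions).mean()  # assumed interface
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss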