def collect_samples(self, min_batch_size):
    '''Collect at least min_batch_size transitions, splitting the rollout
    across self.num_threads processes (one Worker per extra thread).'''
    t_start = time.time()
    if use_gpu:
        self.policy = self.policy.cpu()
    thread_batch_size = int(math.floor(min_batch_size / self.num_threads))
    queue = multiprocessing.Queue()
    memory = Memory()
    workers = []

    # Spawn one worker per extra thread; the remaining share of the batch is
    # collected in the current process below.
    for i in range(self.num_threads - 1):
        workers.append(
            Worker(queue, self.env_list[i + 1], self.policy,
                   self.custom_reward, self.tensor, False,
                   self.running_state, thread_batch_size))
    for worker in workers:
        worker.start()

    log = collect_samples(self.env_list[0], memory, self.policy,
                          self.custom_reward, self.tensor, self.render,
                          self.running_state, True, thread_batch_size)

    # Gather each worker's memory and log from the shared queue.
    worker_logs = []
    for _ in workers:
        worker_memory, worker_log = queue.get()
        memory.append(worker_memory)
        worker_logs.append(worker_log)

    batch = memory.sample()
    if self.num_threads > 1:
        log_list = [log] + worker_logs
        log = merge_log(log_list)
    if use_gpu:
        self.policy = self.policy.cuda()
    t_end = time.time()
    log['sample_time'] = t_end - t_start
    return batch, log
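# collect_samples() above assumes each Worker, once start()-ed, runs its own
# rollout in a child process and reports back by putting a (memory, log) pair
# on the shared queue, which the parent then drains with queue.get(). The
# class below is an illustrative sketch of that contract only (hence the name
# _WorkerSketch); the real Worker and its rollout logic live elsewhere in the
# codebase, and the constructor signature mirrors the call made above.
class _WorkerSketch(multiprocessing.Process):
    def __init__(self, queue, env, policy, custom_reward, tensor,
                 render, running_state, batch_size):
        super(_WorkerSketch, self).__init__()
        self.queue = queue
        self.env = env
        self.policy = policy
        self.custom_reward = custom_reward
        self.tensor = tensor
        self.render = render
        self.running_state = running_state
        self.batch_size = batch_size

    def run(self):
        # The real Worker fills worker_memory by rolling out self.policy in
        # self.env for at least self.batch_size steps and builds a log dict of
        # episode statistics; only the queue hand-off is shown here.
        worker_memory = Memory()
        worker_log = {}
        self.queue.put((worker_memory, worker_log))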
def train_gail(self, expert):
    '''Train Info-GAIL.'''
    args, dtype = self.args, self.dtype
    results = {
        'average_reward': [],
        'episode_reward': [],
        'true_traj': {},
        'pred_traj': {}
    }
    self.train_step_count, self.gail_step_count = 0, 0

    for ep_idx in range(args.num_epochs):
        memory = Memory()

        num_steps = 0
        reward_batch, true_reward_batch = [], []
        expert_true_reward_batch = []
        true_traj_curr_episode, gen_traj_curr_episode = [], []

        while num_steps < args.batch_size:
            traj_expert = expert.sample(size=1)
            state_expert, action_expert, _, _ = traj_expert

            # Expert state and actions
            state_expert = state_expert[0]
            action_expert = action_expert[0]
            expert_episode_len = len(state_expert)

            # Sample start state or should we just choose the start state
            # from the expert trajectory sampled above.
            # curr_state_obj = self.sample_start_state()
            curr_state_obj = State(state_expert[0], self.obstacles)
            curr_state_feat = self.get_state_features(
                curr_state_obj, self.args.use_state_features)

            # Add history to state
            if args.history_size > 1:
                curr_state = -1 * np.ones(
                    (args.history_size * curr_state_feat.shape[0]),
                    dtype=np.float32)
                curr_state[(args.history_size - 1)
                           * curr_state_feat.shape[0]:] = curr_state_feat
            else:
                curr_state = curr_state_feat

            # TODO: Make this a separate function. Can be parallelized.
            ep_reward, ep_true_reward, expert_true_reward = 0, 0, 0
            true_traj, gen_traj = [], []
            gen_traj_dict = {
                'features': [],
                'actions': [],
                'c': [],
                'mask': []
            }
            disc_reward, posterior_reward = 0.0, 0.0

            # Use a hard-coded list for memory to gather experience since we
            # need to mutate it before finally creating a memory object.
            c_sampled = np.zeros((self.num_goals), dtype=np.float32)
            c_sampled[np.random.randint(0, self.num_goals)] = 1.0
            c_sampled_tensor = torch.zeros((1)).type(torch.LongTensor)
            c_sampled_tensor[0] = int(np.argmax(c_sampled))
            if self.args.cuda:
                c_sampled_tensor = torch.cuda.LongTensor(c_sampled_tensor)

            memory_list = []
            for t in range(expert_episode_len):
                action = self.select_action(
                    np.concatenate((curr_state, c_sampled)))
                action_numpy = action.data.cpu().numpy()

                # Save generated and true trajectories
                true_traj.append((state_expert[t], action_expert[t]))
                gen_traj.append((curr_state_obj.coordinates, action_numpy))
                gen_traj_dict['features'].append(
                    self.get_state_features(curr_state_obj,
                                            self.args.use_state_features))
                gen_traj_dict['actions'].append(action_numpy)
                gen_traj_dict['c'].append(c_sampled)

                action = epsilon_greedy_linear_decay(action_numpy,
                                                     args.num_epochs * 0.5,
                                                     ep_idx,
                                                     self.action_size,
                                                     low=0.05,
                                                     high=0.3)

                # Get the discriminator reward
                disc_reward_t = float(self.reward_net(torch.cat(
                    (Variable(torch.from_numpy(
                        curr_state).unsqueeze(0)).type(dtype),
                     Variable(torch.from_numpy(oned_to_onehot(
                         action, self.action_size)).unsqueeze(0)).type(dtype)),
                    1)).data.cpu().numpy()[0, 0])

                if args.use_log_rewards and disc_reward_t < 1e-6:
                    disc_reward_t += 1e-6

                disc_reward_t = -math.log(disc_reward_t) \
                    if args.use_log_rewards else -disc_reward_t
                disc_reward += disc_reward_t

                # Predict c given (x_t)
                predicted_posterior = self.posterior_net(
                    Variable(torch.from_numpy(
                        curr_state).unsqueeze(0)).type(dtype))
                posterior_reward_t = self.criterion_posterior(
                    predicted_posterior,
                    Variable(c_sampled_tensor)).data.cpu().numpy()[0]
                posterior_reward += (self.args.lambda_posterior
                                     * posterior_reward_t)

                # Update Rewards
                ep_reward += (disc_reward_t + posterior_reward_t)

                true_goal_state = [int(x) for x in state_expert[-1].tolist()]
                if self.args.flag_true_reward == 'grid_reward':
                    ep_true_reward += self.true_reward.reward_at_location(
                        curr_state_obj.coordinates, goals=[true_goal_state])
                    expert_true_reward += self.true_reward.reward_at_location(
                        state_expert[t], goals=[true_goal_state])
                elif self.args.flag_true_reward == 'action_reward':
                    ep_true_reward += self.true_reward.reward_at_location(
                        np.argmax(action_expert[t]), action)
                    expert_true_reward += self.true_reward.corret_action_reward
                else:
                    raise ValueError("Incorrect true reward type")

                # Update next state
                next_state_obj = self.transition_func(
                    curr_state_obj, Action(action), 0)
                next_state_feat = self.get_state_features(
                    next_state_obj, self.args.use_state_features)
                # next_state = running_state(next_state)

                mask = 0 if t == expert_episode_len - 1 else 1

                # Push to memory
                memory_list.append([
                    curr_state,
                    np.array([oned_to_onehot(action, self.action_size)]),
                    mask,
                    next_state_feat,
                    disc_reward_t + posterior_reward_t,
                    c_sampled,
                    c_sampled
                ])

                if args.render:
                    env.render()

                if not mask:
                    break

                curr_state_obj = next_state_obj
                curr_state_feat = next_state_feat

                if args.history_size > 1:
                    curr_state[:(args.history_size - 1)
                               * curr_state_feat.shape[0]] = \
                        curr_state[curr_state_feat.shape[0]:]
                    curr_state[(args.history_size - 1)
                               * curr_state_feat.shape[0]:] = curr_state_feat
                else:
                    curr_state = curr_state_feat

            assert memory_list[-1][2] == 0, \
                "Mask for final end state is not 0."
            for memory_t in memory_list:
                memory.push(*memory_t)

            self.logger.summary_writer.add_scalars(
                'gen_traj/gen_reward',
                {
                    'discriminator': disc_reward,
                    'posterior': posterior_reward,
                },
                self.train_step_count)

            num_steps += (t - 1)
            reward_batch.append(ep_reward)
            true_reward_batch.append(ep_true_reward)
            expert_true_reward_batch.append(expert_true_reward)
            results['episode_reward'].append(ep_reward)

            # Append trajectories
            true_traj_curr_episode.append(true_traj)
            gen_traj_curr_episode.append(gen_traj)

        results['average_reward'].append(np.mean(reward_batch))

        # Add to tensorboard
        self.logger.summary_writer.add_scalars(
            'gen_traj/reward',
            {
                'average': np.mean(reward_batch),
                'max': np.max(reward_batch),
                'min': np.min(reward_batch)
            },
            self.train_step_count)
        self.logger.summary_writer.add_scalars(
            'gen_traj/true_reward',
            {
                'average': np.mean(true_reward_batch),
                'max': np.max(true_reward_batch),
                'min': np.min(true_reward_batch),
                'expert_true': np.mean(expert_true_reward_batch)
            },
            self.train_step_count)

        # Add predicted and generated trajectories to results
        if ep_idx % self.args.save_interval == 0:
            results['true_traj'][ep_idx] = copy.deepcopy(
                true_traj_curr_episode)
            results['pred_traj'][ep_idx] = copy.deepcopy(
                gen_traj_curr_episode)

        # Update parameters
        gen_batch = memory.sample()

        # We do not get the context variable from expert trajectories.
        # Hence we need to fill it in later.
        expert_batch = expert.sample(size=args.num_expert_trajs)

        self.update_params(gen_batch, expert_batch, ep_idx,
                           args.optim_epochs, args.optim_batch_size)

        self.train_step_count += 1

        if ep_idx > 0 and ep_idx % args.log_interval == 0:
            print('Episode [{}/{}] Avg R: {:.2f} Max R: {:.2f} \t'
                  'True Avg {:.2f} True Max R: {:.2f} '
                  'Expert (Avg): {:.2f}'.format(
                      ep_idx, args.num_epochs, np.mean(reward_batch),
                      np.max(reward_batch), np.mean(true_reward_batch),
                      np.max(true_reward_batch),
                      np.mean(expert_true_reward_batch)))

        results_path = os.path.join(args.results_dir, 'results.pkl')
        with open(results_path, 'wb') as results_f:
            pickle.dump(results, results_f, protocol=2)
        # print("Did save results to {}".format(results_path))

        if ep_idx % args.save_interval == 0:
            checkpoint_filepath = self.model_checkpoint_filepath(ep_idx)
            torch.save(self.checkpoint_data_to_save(), checkpoint_filepath)
            print("Did save checkpoint: {}".format(checkpoint_filepath))
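# The rollout in train_gail leans on two small helpers: oned_to_onehot, which
# turns a discrete action index into a one-hot vector fed to the discriminator,
# and epsilon_greedy_linear_decay, which anneals exploration over the first
# half of training (args.num_epochs * 0.5). Plausible implementations are
# sketched below under those assumptions; they carry a _sketch suffix because
# the repository's own versions may differ in detail.

def oned_to_onehot_sketch(action_idx, num_actions):
    # One-hot encode a scalar action index.
    onehot = np.zeros((num_actions,), dtype=np.float32)
    onehot[int(action_idx)] = 1.0
    return onehot

def epsilon_greedy_linear_decay_sketch(action_vector, decay_epochs, ep_idx,
                                       num_actions, low=0.1, high=0.9):
    # With probability epsilon, linearly decayed from `high` to `low` over
    # `decay_epochs` epochs, take a random action; otherwise act greedily on
    # the policy's action scores.
    epsilon = max(low, high - (high - low) * (ep_idx / float(decay_epochs)))
    if np.random.rand() < epsilon:
        return np.random.randint(num_actions)
    return int(np.argmax(action_vector))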