def pretrain_q_with_bc_data(self):
    logger.remove_tabular_output('progress.csv', relative_to_snapshot_dir=True)
    logger.add_tabular_output('pretrain_q.csv', relative_to_snapshot_dir=True)

    self.update_policy = False
    # First train only the Q function.
    for i in range(self.q_num_pretrain_steps):
        self.eval_statistics = dict()
        self._need_to_update_eval_statistics = True
        train_data = self.replay_buffer.random_batch(128)
        train_data = np_to_pytorch_batch(train_data)
        obs = train_data['observations']
        next_obs = train_data['next_observations']
        if self.goal_conditioned:
            goals = train_data['resampled_goals']
            train_data['observations'] = torch.cat((obs, goals), dim=1)
            train_data['next_observations'] = torch.cat((next_obs, goals), dim=1)
        self.train_from_torch(train_data)
        logger.record_dict(self.eval_statistics)
        logger.dump_tabular(with_prefix=True, with_timestamp=False)

    self.update_policy = True
    # Then train the policy and Q function together.
    for i in range(self.q_num_pretrain_steps):
        self.eval_statistics = dict()
        self._need_to_update_eval_statistics = True
        train_data = self.replay_buffer.random_batch(128)
        train_data = np_to_pytorch_batch(train_data)
        obs = train_data['observations']
        next_obs = train_data['next_observations']
        if self.goal_conditioned:
            goals = train_data['resampled_goals']
            train_data['observations'] = torch.cat((obs, goals), dim=1)
            train_data['next_observations'] = torch.cat((next_obs, goals), dim=1)
        self.train_from_torch(train_data)
        logger.record_dict(self.eval_statistics)
        logger.dump_tabular(with_prefix=True, with_timestamp=False)

    logger.remove_tabular_output(
        'pretrain_q.csv', relative_to_snapshot_dir=True,
    )
    logger.add_tabular_output(
        'progress.csv', relative_to_snapshot_dir=True,
    )

def random_batch(self, batch_size):
    traj_i = np.random.choice(self.size, batch_size)
    trans_i = np.random.choice(self.traj_length - 1, batch_size)
    # Some datasets store a dedicated 'env' frame; fall back to the first
    # observation of each trajectory when that key is absent.
    try:
        env = normalize_image(self.data['env'][traj_i, :])
    except KeyError:
        env = normalize_image(self.data['observations'][traj_i, 0, :])
    x_t = normalize_image(self.data['observations'][traj_i, trans_i, :])
    x_next = normalize_image(self.data['observations'][traj_i, trans_i + 1, :])
    episode_num = np.random.randint(0, self.size)
    episode_obs = normalize_image(
        self.data['observations'][episode_num, :8, :])
    data_dict = {
        'x_t': x_t,
        'x_next': x_next,
        'env': env,
        'actions': self.data['actions'][traj_i, trans_i, :],
        'episode_obs': episode_obs,
        'episode_acts': self.data['actions'][episode_num, :7, :],
    }
    return np_to_pytorch_batch(data_dict)

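# The sampler above relies on NumPy "paired" fancy indexing: indexing a
# (num_trajs, traj_length, obs_dim) array with two equal-length index arrays
# selects one (trajectory, timestep) pair per batch element. A minimal,
# self-contained demonstration (array shapes are illustrative only):
import numpy as np

data = np.arange(4 * 5 * 3).reshape(4, 5, 3)   # (trajs, steps, obs_dim)
traj_i = np.array([0, 2, 3])
trans_i = np.array([1, 4, 0])
batch = data[traj_i, trans_i, :]               # shape (3, 3): one row per pair
assert batch.shape == (3, 3)
assert (batch[1] == data[2, 4]).all()
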
def random_batch(self, batch_size):
    env_i = np.random.choice(self.num_envs, batch_size)
    trans_i = np.random.choice(self.sample_size, batch_size)
    match_i = np.random.choice(self.num_envs, batch_size)
    trans_x = np.random.choice(self.sample_size, batch_size)
    trans_y = np.random.choice(self.sample_size, batch_size)
    rand_a = np.random.choice(self.num_envs - 1, batch_size // 2)
    rand_b = np.add(rand_a, np.ones(batch_size // 2)).astype(int)
    trans_m = np.random.choice(self.sample_size, batch_size // 2)
    trans_n = np.random.choice(self.sample_size, batch_size // 2)
    matches = np.random.uniform(0, 0.1, batch_size // 2)
    nonmatches = np.random.uniform(0.9, 1, batch_size // 2)
    # Swap 5% of the labels. Copy the slices first: a tuple swap of NumPy
    # views mutates `matches` before `nonmatches` reads from it, so without
    # .copy() the second assignment writes back the already-swapped values.
    swap_count = int(batch_size * 0.05)
    matches[:swap_count], nonmatches[:swap_count] = (
        nonmatches[:swap_count].copy(),
        matches[:swap_count].copy(),
    )
    labels = np.concatenate([matches, nonmatches])
    # NOTE: rand_a/rand_b, trans_m/trans_n, and labels are computed but never
    # returned; only the observation pairs below are used downstream.
    data_dict = {
        'observations': self.data['observations'][env_i, trans_i, :],
        'env_set_1': self.data['observations'][match_i, trans_x, :],
        'env_set_2': self.data['observations'][match_i, trans_y, :],
    }
    return np_to_pytorch_batch(data_dict)

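# Why the .copy() above matters: with NumPy views, the right-hand side of a
# tuple swap is not snapshotted, so the second assignment reads data that the
# first assignment already overwrote. A minimal demonstration:
import numpy as np

a = np.zeros(4)
b = np.ones(4)
a[:2], b[:2] = b[:2], a[:2]          # buggy: b[:2] reads the mutated a
assert (b[:2] == 1).all()            # b was never actually swapped

a = np.zeros(4)
b = np.ones(4)
a[:2], b[:2] = b[:2].copy(), a[:2].copy()    # correct swap
assert (a[:2] == 1).all() and (b[:2] == 0).all()
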
def random_batch(self, batch_size):
    # Sample with replacement only when the buffer holds fewer entries
    # than the requested batch size.
    i = np.random.choice(self.size, batch_size, replace=(self.size < batch_size))
    data_dict = {
        'observations': self.data[i, :],
    }
    return np_to_pytorch_batch(data_dict)

def get_test_batch(self):
    batch = self.test_replay_buffer.random_batch(self.batch_size)
    batch = np_to_pytorch_batch(batch)
    obs = batch['observations']
    next_obs = batch['next_observations']
    goals = batch['resampled_goals']
    batch['observations'] = torch.cat((obs, goals), dim=1)
    batch['next_observations'] = torch.cat((next_obs, goals), dim=1)
    return batch

def random_batch(self, batch_size):
    traj_i = np.random.choice(self.size, batch_size)
    trans_i = np.random.choice(self.traj_length - 1, batch_size)
    data_dict = {
        'observations': self.data['observations'][traj_i, trans_i, :],
        'next_observations': self.data['observations'][traj_i, trans_i + 1, :],
        'actions': self.data['actions'][traj_i, trans_i, :],
    }
    return np_to_pytorch_batch(data_dict)

def random_batch(self, batch_size):
    i = np.random.choice(self.size, batch_size, replace=(self.size < batch_size))
    obs = self.data[i, :]
    if self.normalize:
        obs = normalize_image(obs)
    data_dict = {
        'observations': obs,
    }
    return np_to_pytorch_batch(data_dict)

def get_batch(self):
    # Draw half the batch from each buffer and concatenate per key.
    sample_size = self.batch_size // 2
    batch1 = self.replay_buffer1().random_batch(sample_size)
    batch2 = self.replay_buffer2().random_batch(sample_size)
    new_batch = {}
    for k, v in batch1.items():
        new_batch[k] = np.concatenate((v, batch2[k]), axis=0)
    return np_to_pytorch_batch(new_batch)

def random_batch(self, batch_size):
    # The flat buffer stores fixed-length trajectories back to back, so a
    # (trajectory, timestep) pair maps to flat index traj_i * horizon + trans_i.
    num_traj = self.replay_buffer._size // self.horizon
    traj_i = np.random.choice(num_traj, batch_size)
    # Leave room for two successor frames: trans_i <= horizon - 3.
    trans_i = np.random.choice(self.horizon - 2, batch_size)
    indices = traj_i * self.horizon + trans_i
    batch = dict(
        x0=self.replay_buffer._obs["image_observation"][indices],
        x1=self.replay_buffer._obs["image_observation"][indices + 1],
        x2=self.replay_buffer._obs["image_observation"][indices + 2],
    )
    return np_to_pytorch_batch(batch)

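# The flat-index arithmetic above can be sanity-checked in isolation: with
# trajectories stored contiguously, traj_i * horizon + trans_i recovers the
# row of step trans_i in trajectory traj_i. Shapes are illustrative only:
import numpy as np

horizon = 5
flat = np.arange(3 * horizon)            # 3 trajectories, flattened
traj_i = np.array([0, 2])
trans_i = np.array([1, 3])
indices = traj_i * horizon + trans_i
assert (flat[indices] == np.array([1, 13])).all()
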
def _statistics_from_paths(self, paths, stat_prefix):
    rewards, terminals, obs, actions, next_obs = split_paths(paths)
    np_batch = dict(
        rewards=rewards,
        terminals=terminals,
        observations=obs,
        actions=actions,
        next_observations=next_obs,
    )
    batch = np_to_pytorch_batch(np_batch)
    statistics = self._statistics_from_batch(batch, stat_prefix)
    statistics.update(create_stats_ordered_dict(
        'Num Paths', len(paths), stat_prefix=stat_prefix,
    ))
    return statistics

def get_batch_from_buffer(self, replay_buffer):
    batch = replay_buffer.random_batch(self.bc_batch_size)
    batch = np_to_pytorch_batch(batch)
    # NOTE: concatenating 'resampled_goals' onto the observations and next
    # observations is intentionally disabled here.
    return batch

def get_batch(self):
    batch = self.replay_buffer.random_batch_random_tau(
        self.batch_size, self.max_tau)
    """
    Update the goal states/rewards
    """
    num_steps_left = self._sample_taus_for_training(batch)
    obs = batch['observations']
    actions = batch['actions']
    next_obs = batch['next_observations']
    goals = batch['training_goals']
    rewards = self._compute_rewards_np(batch, obs, actions, next_obs, goals)
    terminals = batch['terminals']

    if self.tau_sample_strategy == 'all_valid':
        # Train on every valid horizon: repeat each transition
        # (max_tau + 1) times along the batch axis, once per tau value
        # in [0, max_tau].
        obs = np.repeat(obs, self.max_tau + 1, 0)
        actions = np.repeat(actions, self.max_tau + 1, 0)
        next_obs = np.repeat(next_obs, self.max_tau + 1, 0)
        goals = np.repeat(goals, self.max_tau + 1, 0)
        rewards = np.repeat(rewards, self.max_tau + 1, 0)
        terminals = np.repeat(terminals, self.max_tau + 1, 0)

    if self.finite_horizon:
        terminals = 1 - (1 - terminals) * (num_steps_left != 0)
    if self.terminate_when_goal_reached:
        diff = self.env.convert_obs_to_goals(next_obs) - goals
        goal_not_reached = (
            np.linalg.norm(diff, axis=1, keepdims=True)
            > self.goal_reached_epsilon
        )
        terminals = 1 - (1 - terminals) * goal_not_reached
    if not self.dense_rewards:
        rewards = rewards * terminals

    """
    Update the batch
    """
    batch['rewards'] = rewards
    batch['terminals'] = terminals
    batch['actions'] = actions
    batch['num_steps_left'] = num_steps_left
    batch['goals'] = goals
    batch['observations'] = obs
    batch['next_observations'] = next_obs
    return np_to_pytorch_batch(batch)

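# How the 'all_valid' expansion behaves: np.repeat along axis 0 duplicates
# each row (max_tau + 1) times, keeping the duplicates adjacent, so every
# transition can be paired with each horizon value. Illustrative only:
import numpy as np

obs = np.array([[1.0, 2.0], [3.0, 4.0]])   # batch of 2 observations
max_tau = 2
expanded = np.repeat(obs, max_tau + 1, 0)  # shape (6, 2)
assert (expanded[:3] == obs[0]).all()      # rows 0-2 are copies of obs[0]
assert (expanded[3:] == obs[1]).all()      # rows 3-5 are copies of obs[1]
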
def get_batch(self, training=True):
    if self.replay_buffer_is_split:
        replay_buffer = self.replay_buffer.get_replay_buffer(training)
    else:
        replay_buffer = self.replay_buffer
    batch = replay_buffer.random_batch(self.batch_size)

    """
    Update the goal states/rewards
    """
    num_steps_left = np.random.randint(
        0, self.max_tau + 1, (self.batch_size, 1))
    terminals = 1 - (1 - batch['terminals']) * (num_steps_left != 0)
    batch['terminals'] = terminals

    obs = batch['observations']
    next_obs = batch['next_observations']
    if self.sample_train_goals_from == 'her':
        goals = batch['goals']
    else:
        goals = self._sample_goals_for_training()
    goal_differences = np.abs(
        self.env.convert_obs_to_goals(next_obs)
        # - self.env.convert_obs_to_goals(obs)
        - goals
    )
    batch['goal_differences'] = goal_differences * self.reward_scale
    batch['goals'] = goals

    """
    Update the observations
    """
    batch['observations'] = merge_into_flat_obs(
        obs=batch['observations'],
        goals=batch['goals'],
        num_steps_left=num_steps_left,
    )
    batch['next_observations'] = merge_into_flat_obs(
        obs=batch['next_observations'],
        goals=batch['goals'],
        num_steps_left=num_steps_left - 1,
    )
    return np_to_pytorch_batch(batch)

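# merge_into_flat_obs is assumed here to pack observation, goal, and
# remaining horizon into one flat vector per row, so goal-conditioned
# networks consume a single input. A minimal sketch under that assumption
# (the real helper may order or process the pieces differently):
import numpy as np

def merge_into_flat_obs_sketch(obs, goals, num_steps_left):
    """Concatenate [obs | goal | tau] along the feature axis."""
    return np.concatenate([obs, goals, num_steps_left], axis=1)

flat = merge_into_flat_obs_sketch(
    np.zeros((4, 6)), np.zeros((4, 3)), np.zeros((4, 1)))
assert flat.shape == (4, 10)
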
def random_batch(self, batch_size):
    traj_i = np.random.choice(self.size, batch_size)
    trans_i = np.random.choice(self.traj_length, batch_size)
    # conditioning = np.random.choice(self.traj_length, batch_size)
    # env = normalize_image(self.data['observations'][traj_i, conditioning, :])
    # Fall back to each trajectory's first frame when no dedicated 'env'
    # image was stored.
    try:
        env = normalize_image(self.data['env'][traj_i, :])
    except KeyError:
        env = normalize_image(self.data['observations'][traj_i, 0, :])
    x_t = normalize_image(self.data['observations'][traj_i, trans_i, :])
    episode_num = np.random.randint(0, self.size)
    episode_obs = normalize_image(
        self.data['observations'][episode_num, :8, :])
    data_dict = {
        'x_t': x_t,
        'env': env,
        'episode_obs': episode_obs,
    }
    return np_to_pytorch_batch(data_dict)

def fix_data_set(self):
    for training in [True, False]:
        replay_buffer = self.replay_buffer.get_replay_buffer(training)
        batch_dict = {}
        for i in range(self.num_unique_batches):
            batch_size = min(
                replay_buffer.num_steps_can_sample(),
                self.batch_size,
            )
            batch = replay_buffer.random_batch(batch_size)
            goal_states = self.sample_goal_states(batch_size, training)
            new_rewards = self.env.compute_rewards(
                batch['observations'],
                batch['actions'],
                batch['next_observations'],
                goal_states,
            )
            batch['goal_states'] = goal_states
            batch['rewards'] = new_rewards
            torch_batch = np_to_pytorch_batch(batch)
            batch_dict[i] = torch_batch
        self.mode_to_batch_iterator[training] = create_batch_iterator(
            self.num_unique_batches, batch_dict,
        )

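# create_batch_iterator is assumed to cycle through the pre-built batches
# indefinitely, so training keeps drawing from a fixed (frozen) data set.
# A minimal sketch under that assumption:
def create_batch_iterator(num_unique_batches, batch_dict):
    """Yield the pre-computed batches in order, repeating forever."""
    while True:
        for i in range(num_unique_batches):
            yield batch_dict[i]
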
def _do_training(self):
    beta = self.per_beta_schedule.get_value(self._n_train_steps_total)
    batches = []
    if self.train_with in ['on_policy', 'both']:
        batches.append(
            self.replay_buffer.most_recent_path_batch(beta=beta))
    if self.train_with in ['off_policy', 'both']:
        batches.append(
            self.replay_buffer.random_batch(self.batch_size, beta=beta))
    for np_batch in batches:
        next_obs = np_batch['next_observations']
        goals = np_batch['goals']
        terminals = np_batch['terminals']
        indices = np_batch['indices']
        events = self.detect_event(next_obs, goals)
        np_batch['events'] = events
        # A transition is terminal if it already was, or if the event fired.
        terminals = 1 - (1 - terminals) * (1 - events)
        if self.finite_horizon:
            terminals = 1 - (1 - terminals) * (np_batch['num_steps_left'] != 0)
        np_batch['terminals'] = terminals
        batch = np_to_pytorch_batch(np_batch)

        terminals = batch['terminals']
        obs = batch['observations']
        actions = batch['actions']
        next_obs = batch['next_observations']
        num_steps_left = batch['num_steps_left']
        goals = batch['goals']
        events = batch['events']

        if self.finite_horizon:
            next_num_steps_left = num_steps_left - 1
        else:
            next_num_steps_left = num_steps_left
        # (Commented out in the original: TD3-style target-policy smoothing
        # with clipped noise and a twin target Q taking the elementwise min,
        # along with a second Q loss/optimizer.)
        next_actions = self.policy(
            observations=next_obs,
            goals=goals,
            num_steps_left=next_num_steps_left,
        )
        next_beta = self.beta_q(
            observations=next_obs,
            actions=next_actions,
            goals=goals,
            num_steps_left=next_num_steps_left,
        )
        if not self.finite_horizon:
            next_beta = next_beta * self.discount
        targets = (terminals * events + (1 - terminals) * next_beta).detach()
        predictions = self.beta_q(obs, actions, goals, num_steps_left)
        if self.prioritized_replay:
            weights = ptu.from_numpy(np_batch['is_weights']).float()
            self.q_criterion.weight = weights
            priorities = ptu.get_numpy(torch.abs(predictions - targets))
            self.replay_buffer.update_priorities(indices, priorities)
        beta_q_loss = self.q_criterion(predictions, targets)

        policy_actions = self.policy(obs, goals, num_steps_left)
        policy_actions.register_hook(
            self.create_save_gradient_norm_hook('dQ/da'))
        beta_q_output = self.beta_q(
            observations=obs,
            actions=policy_actions,
            goals=goals,
            num_steps_left=num_steps_left,
        )
        beta_v_loss = self.v_criterion(
            self.beta_v(obs, goals, num_steps_left),
            beta_q_output.detach(),
        )
        self.beta_v_optimizer.zero_grad()
        beta_v_loss.backward()
        self.beta_v_optimizer.step()

        policy_loss = -beta_q_output.mean()
        if self.training_policy or self.train_simultaneously:
            self.policy_optimizer.zero_grad()
            policy_loss.backward()
            policy_grad_norms = []
            for param in self.policy.parameters():
                policy_grad_norms.append(param.grad.data.norm())
            self.eval_statistics.update(create_stats_ordered_dict(
                'Policy Gradient Norms',
                policy_grad_norms,
            ))
            self.policy_optimizer.step()
        if not self.training_policy or self.train_simultaneously:
            self.beta_q_optimizer.zero_grad()
            beta_q_loss.backward()
            beta_q_grad_norms = []
            for param in self.beta_q.parameters():
                beta_q_grad_norms.append(param.grad.data.norm())
            self.eval_statistics.update(create_stats_ordered_dict(
                'Beta Q Gradient Norms',
                beta_q_grad_norms,
            ))
            self.beta_q_optimizer.step()
        # Alternate between policy-only and Q-only updates.
        if self._n_train_steps_total % self.flip_training_period == 0:
            self.training_policy = not self.training_policy
        # (Commented out in the original: soft target-network updates for
        # the policy and both Q functions via ptu.soft_update_from_to.)
        if self.need_to_update_eval_statistics:
            self.need_to_update_eval_statistics = False
            self.eval_statistics['Policy Loss'] = np.mean(
                ptu.get_numpy(policy_loss))
            self.eval_statistics['Beta Q Loss'] = np.mean(
                ptu.get_numpy(beta_q_loss))
            self.eval_statistics['Beta V Loss'] = np.mean(
                ptu.get_numpy(beta_v_loss))
            self.eval_statistics.update(create_stats_ordered_dict(
                'Beta Q Targets',
                ptu.get_numpy(targets),
            ))
            self.eval_statistics.update(create_stats_ordered_dict(
                'Beta Q Predictions',
                ptu.get_numpy(predictions),
            ))
            real_goal = np.array([0., 4.])
            is_real_goal = (np_batch['goals'] == real_goal).all(axis=1)
            goal_is_corner = (np.abs(np_batch['goals']) == 4).all(axis=1)
            self.eval_statistics['Event Prob'] = np_batch['events'].mean()
            self.eval_statistics['Goal is Current Obs Prob'] = (
                self.detect_event(
                    np_batch['observations'],
                    np_batch['goals'],
                ).mean())
            self.eval_statistics['Training Goal is (0, 4) Prob'] = (
                is_real_goal.mean())
            self.eval_statistics['Training Goal is Corner'] = (
                goal_is_corner.mean())
            self.eval_statistics.update(self.extra_eval_statistics)
            self.extra_eval_statistics = OrderedDict()

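# The soft target updates summarized in the comment above follow the standard
# Polyak rule: target <- tau * source + (1 - tau) * target. A minimal sketch
# of what an rlkit-style ptu.soft_update_from_to helper does (exact signature
# assumed):
def soft_update_from_to(source, target, tau):
    """Polyak-average source network parameters into the target network."""
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(
            target_param.data * (1.0 - tau) + param.data * tau
        )
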
def get_batch(self):
    batch = self.replay_buffer.random_batch_random_tau(
        self.batch_size, max_tau=self.time_horizon)
    return np_to_pytorch_batch(batch)

def train(self, np_batch):
    self._num_train_steps += 1
    batch = np_to_pytorch_batch(np_batch)
    self.train_from_torch(batch)

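# Every sampler here funnels through np_to_pytorch_batch. A minimal sketch of
# the conversion it is expected to perform: move each NumPy array in the dict
# onto float torch tensors, skipping object-dtype entries. The real rlkit
# helper also handles nested tuples and device placement; this sketch covers
# only the flat-dict case used above.
import numpy as np
import torch

def np_to_pytorch_batch_sketch(np_batch, device='cpu'):
    return {
        k: torch.from_numpy(v).float().to(device)
        for k, v in np_batch.items()
        if isinstance(v, np.ndarray) and v.dtype != np.dtype('O')
    }

batch = np_to_pytorch_batch_sketch({'observations': np.zeros((2, 3))})
assert batch['observations'].shape == (2, 3)
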
def pretrain_q_with_bc_data(self):
    logger.remove_tabular_output(
        'progress.csv', relative_to_snapshot_dir=True,
    )
    logger.add_tabular_output(
        'pretrain_q.csv', relative_to_snapshot_dir=True,
    )

    self.update_policy = False
    # First train only the Q function.
    for i in range(self.q_num_pretrain1_steps):
        self.eval_statistics = dict()
        train_data = self.replay_buffer.random_batch(self.bc_batch_size)
        train_data = np_to_pytorch_batch(train_data)
        obs = train_data['observations']
        next_obs = train_data['next_observations']
        # Goal concatenation (via 'resampled_goals') is disabled here.
        train_data['observations'] = obs
        train_data['next_observations'] = next_obs
        self.train_from_torch(train_data)
        if i % self.pretraining_logging_period == 0:
            logger.record_dict(self.eval_statistics)
            logger.dump_tabular(with_prefix=True, with_timestamp=False)

    self.update_policy = True
    # Then train the policy and Q function together.
    prev_time = time.time()
    for i in range(self.q_num_pretrain2_steps):
        self.eval_statistics = dict()
        if i % self.pretraining_logging_period == 0:
            self._need_to_update_eval_statistics = True
        train_data = self.replay_buffer.random_batch(self.bc_batch_size)
        train_data = np_to_pytorch_batch(train_data)
        obs = train_data['observations']
        next_obs = train_data['next_observations']
        # Goal concatenation is likewise disabled here.
        train_data['observations'] = obs
        train_data['next_observations'] = next_obs
        self.train_from_torch(train_data)

        if (self.do_pretrain_rollouts
                and i % self.pretraining_env_logging_period == 0):
            total_ret = self.do_rollouts()
            print("Return at step {} : {}".format(i, total_ret / 20))

        if i % self.pretraining_logging_period == 0:
            if self.do_pretrain_rollouts:
                self.eval_statistics["pretrain_bc/avg_return"] = total_ret / 20
            self.eval_statistics["batch"] = i
            self.eval_statistics["epoch_time"] = time.time() - prev_time
            logger.record_dict(self.eval_statistics)
            logger.dump_tabular(with_prefix=True, with_timestamp=False)
            prev_time = time.time()

    logger.remove_tabular_output(
        'pretrain_q.csv', relative_to_snapshot_dir=True,
    )
    logger.add_tabular_output(
        'progress.csv', relative_to_snapshot_dir=True,
    )
    self._need_to_update_eval_statistics = True
    self.eval_statistics = dict()

    if self.post_pretrain_hyperparams:
        self.set_algorithm_weights(**self.post_pretrain_hyperparams)

def get_batch_from_buffer(self, replay_buffer, batch_size):
    batch = replay_buffer.random_batch(batch_size)
    batch = np_to_pytorch_batch(batch)
    return batch

def get_batch(self):
    batch = self.replay_buffer.random_batch(self.batch_size)
    return np_to_pytorch_batch(batch)

def get_batch(self):
    batch = self.replay_buffer.get_training_data()
    return np_to_pytorch_batch(batch)