def run_vae_forward_model(self, agent, trial):
    """Training loop for a DDPG agent with a VAE-based forward-model motivation module."""
    config = self._config
    trial = trial + config.shift
    forward_model = agent.get_motivation_module()
    vae = forward_model.get_fm_network()
    step_limit = int(config.steps * 1e6)
    steps = 0

    states = None
    if config.check('generate_states'):
        states = []
    if config.check('collect_stats'):
        states = torch.tensor(numpy.load('./{0:s}_states.npy'.format(self._env_name)), dtype=torch.float32)

    action_list = []
    value_list = []
    fm_error_list = []
    reward_list = []

    steps_per_episode = []
    train_fm_errors = []
    train_ext_rewards = []
    train_int_rewards = []
    train_vae_losses = []

    reward_avg = RunningAverageWindow(100)
    step_avg = RunningAverageWindow(100)
    bar = ProgressBar(config.steps * 1e6, max_width=40)
    exploration = GaussianExploration(config.sigma, 0.01, config.steps * config.exploration_time * 1e6)

    while steps < step_limit:
        if config.check('collect_stats'):
            actions, values, fm_errors, rewards = self.fm_activations(self._env, agent, forward_model, states)
            action_list.append(actions)
            value_list.append(values)
            fm_error_list.append(fm_errors)
            reward_list.append(rewards)

        state0 = torch.tensor(self._env.reset(), dtype=torch.float32).unsqueeze(0)
        done = False
        train_ext_reward = 0
        train_int_reward = 0
        train_vae_loss = 0
        train_steps = 0

        while not done:
            train_steps += 1
            if config.check('generate_states'):
                states.append(state0.numpy())

            action0 = exploration.explore(agent.get_action(state0))
            next_state, reward, done, _ = self._env.step(action0.squeeze(0).numpy())
            reward = self.transform_reward(reward)
            state1 = torch.tensor(next_state, dtype=torch.float32).unsqueeze(0)

            agent.train(state0, action0, state1, reward, done)
            forward_model.train(state0, action0, state1)

            train_ext_reward += reward
            train_int_reward += forward_model.reward(state0, action0, state1).item()
            train_vae_loss += vae.loss_function(state0, action0, state1).item()
            train_fm_error = forward_model.error(state0, action0, state1).item()
            train_fm_errors.append(train_fm_error)

            state0 = state1

        steps += train_steps
        if steps > step_limit:
            train_steps -= steps - step_limit
        bar.numerator = steps
        exploration.update(steps)

        reward_avg.update(train_ext_reward)
        step_avg.update(train_steps)
        steps_per_episode.append(train_steps)
        train_ext_rewards.append(train_ext_reward)
        train_int_rewards.append(train_int_reward)
        train_vae_losses.append(train_vae_loss)

        print(
            'Run {0} step {1:d} sigma {2:f} training [ext. reward {3:f} int. reward {4:f} VAE loss {5:f} steps {6:d}] avg. ext. reward {7:f} avg. steps {8:f}'.format(
                trial, steps, exploration.sigma, train_ext_reward, train_int_reward, train_vae_loss, train_steps,
                reward_avg.value(), step_avg.value()))
        print(bar)

    agent.save('./models/{0:s}_{1}_{2:d}'.format(self._env_name, config.model, trial))

    print('Saving data...')
    save_data = {
        'steps': numpy.array(steps_per_episode),
        're': numpy.array(train_ext_rewards),
        'ri': numpy.array(train_int_rewards),
        'fme': numpy.array(train_fm_errors[:step_limit]),
        'vl': numpy.array(train_vae_losses),
    }
    numpy.save('ddpg_{0}_{1}_{2:d}'.format(config.name, config.model, trial), save_data)

    if config.check('generate_states'):
        self.generate_states(states)

    if config.check('collect_stats'):
        action_list = torch.stack(action_list)
        value_list = torch.stack(value_list)
        fm_error_list = torch.stack(fm_error_list)
        reward_list = torch.stack(reward_list)
        numpy.save('ddpg_{0}_{1}_{2:d}_actions'.format(config.name, config.model, trial), action_list)
        numpy.save('ddpg_{0}_{1}_{2:d}_values'.format(config.name, config.model, trial), value_list)
        numpy.save('ddpg_{0}_{1}_{2:d}_prediction_errors'.format(config.name, config.model, trial), fm_error_list)
        numpy.save('ddpg_{0}_{1}_{2:d}_rewards'.format(config.name, config.model, trial), reward_list)
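# ---------------------------------------------------------------------------
# Illustrative sketch (assumption, not part of the original file): the DDPG
# runners above and below rely on a GaussianExploration helper defined
# elsewhere in this repository. The class below is a hypothetical minimal
# version consistent with how it is called here -- explore(action),
# update(steps), a readable .sigma, and a linear decay from the initial sigma
# down to sigma_end over decay_steps environment steps. The real helper may
# differ in its decay schedule.
class _GaussianExplorationSketch:
    def __init__(self, sigma_start, sigma_end, decay_steps):
        self.sigma_start = sigma_start
        self.sigma_end = sigma_end
        self.decay_steps = decay_steps
        self.sigma = sigma_start

    def explore(self, action):
        # add zero-mean Gaussian noise scaled by the current sigma
        return action + torch.randn_like(action) * self.sigma

    def update(self, steps):
        # linearly anneal sigma with the number of environment steps taken so far
        progress = min(steps / self.decay_steps, 1.0)
        self.sigma = self.sigma_start + (self.sigma_end - self.sigma_start) * progress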
def run_rnd_model(self, agent, trial):
    """Training loop for a DDPG agent with an RND motivation module providing intrinsic reward."""
    config = self._config
    trial = trial + config.shift
    step_limit = int(config.steps * 1e6)
    steps = 0

    steps_per_episode = []
    train_fm_errors = []
    train_ext_rewards = []
    train_int_rewards = []

    reward_avg = RunningAverageWindow(100)
    step_avg = RunningAverageWindow(100)
    bar = ProgressBar(config.steps * 1e6, max_width=40)
    exploration = GaussianExploration(config.sigma, 0.01, config.steps * config.exploration_time * 1e6)

    while steps < step_limit:
        state0 = torch.tensor(self._env.reset(), dtype=torch.float32).unsqueeze(0)
        done = False
        train_ext_reward = 0
        train_int_reward = 0
        train_steps = 0

        while not done:
            action0 = exploration.explore(agent.get_action(state0))
            next_state, reward, done, _ = self._env.step(action0.squeeze(0).numpy())
            reward = self.transform_reward(reward)
            state1 = torch.tensor(next_state, dtype=torch.float32).unsqueeze(0)
            reward = torch.tensor([reward], dtype=torch.float32).unsqueeze(0)
            mask = torch.tensor([done], dtype=torch.float32).unsqueeze(0)

            agent.train(state0, action0, state1, reward, mask)

            train_steps += 1
            train_ext_reward += reward.item()
            train_int_reward += agent.motivation.reward(state0).item()
            train_fm_error = agent.motivation.error(state0).item()
            train_fm_errors.append(train_fm_error)

            state0 = state1

        steps += train_steps
        if steps > step_limit:
            train_steps -= steps - step_limit
        bar.numerator = steps
        exploration.update(steps)

        reward_avg.update(train_ext_reward)
        step_avg.update(train_steps)
        steps_per_episode.append(train_steps)
        train_ext_rewards.append(train_ext_reward)
        train_int_rewards.append(train_int_reward)

        print(
            'Run {0:d} step {1:d} sigma {2:f} training [ext. reward {3:f} int. reward {4:f} steps {5:d}] avg. ext. reward {6:f} avg. steps {7:f}'.format(
                trial, steps, exploration.sigma, train_ext_reward, train_int_reward, train_steps,
                reward_avg.value(), step_avg.value()))
        print(bar)

    agent.save('./models/{0:s}_{1}_{2:d}'.format(self._env_name, config.model, trial))

    print('Saving data...')
    save_data = {
        'steps': numpy.array(steps_per_episode),
        're': numpy.array(train_ext_rewards),
        'ri': numpy.array(train_int_rewards),
        'fme': numpy.array(train_fm_errors[:step_limit])
    }
    numpy.save('ddpg_{0}_{1}_{2:d}'.format(config.name, config.model, trial), save_data)
def run_forward_inverse_model(self, agent, trial):
    """Training loop for a DDPG agent with a forward/inverse-model motivation module; also computes state and latent distance matrices at the end."""
    config = self._config
    trial = trial + config.shift
    step_limit = int(config.steps * 1e6)
    steps = 0

    states = []
    steps_per_episode = []
    train_fm_errors = []
    train_im_errors = []
    train_ext_rewards = []
    train_int_rewards = []

    reward_avg = RunningAverageWindow(100)
    step_avg = RunningAverageWindow(100)
    bar = ProgressBar(config.steps * 1e6, max_width=40)
    exploration = GaussianExploration(config.sigma, 0.01, config.steps * config.exploration_time * 1e6)

    while steps < step_limit:
        state0 = torch.tensor(self._env.reset(), dtype=torch.float32).unsqueeze(0)
        done = False
        train_ext_reward = 0
        train_int_reward = 0
        train_steps = 0

        while not done:
            train_steps += 1
            states.append(state0.squeeze(0))

            action0 = exploration.explore(agent.get_action(state0))
            next_state, reward, done, _ = self._env.step(action0.squeeze(0).numpy())
            reward = self.transform_reward(reward)
            state1 = torch.tensor(next_state, dtype=torch.float32).unsqueeze(0)

            agent.train(state0, action0, state1, reward, done)

            train_ext_reward += reward
            train_int_reward += agent.motivation.reward(state0, action0, state1).item()
            train_fm_error, train_im_error = agent.motivation.error(state0, action0, state1)
            train_fm_errors.append(train_fm_error.item())
            train_im_errors.append(train_im_error.item())

            state0 = state1

        steps += train_steps
        if steps > step_limit:
            train_steps -= steps - step_limit
        bar.numerator = steps
        exploration.update(steps)

        reward_avg.update(train_ext_reward)
        step_avg.update(train_steps)
        steps_per_episode.append(train_steps)
        train_ext_rewards.append(train_ext_reward)
        train_int_rewards.append(train_int_reward)

        print(
            'Run {0:d} step {1:d} sigma {2:f} training [ext. reward {3:f} int. reward {4:f} steps {5:d}] avg. ext. reward {6:f} avg. steps {7:f}'.format(
                trial, steps, exploration.sigma, train_ext_reward, train_int_reward, train_steps,
                reward_avg.value(), step_avg.value()))
        print(bar)

    agent.save('./models/{0:s}_{1}_{2:d}'.format(self._env_name, config.model, trial))

    print('Calculating distance matrices')
    states = self.generate_states(torch.stack(states[:step_limit]), 500)
    state_dist = cdist(states.flatten(1), states.flatten(1), 'euclidean')
    index_list = numpy.argsort(numpy.linalg.norm(state_dist, axis=1))
    states = states[index_list]
    state_dist = cdist(states.flatten(1), states.flatten(1), 'euclidean')
    latent_states = agent.network.encoder(states).detach()
    latent_dist = torch.cdist(latent_states, latent_states)

    print('Saving data...')
    save_data = {
        'steps': numpy.array(steps_per_episode),
        're': numpy.array(train_ext_rewards),
        'ri': numpy.array(train_int_rewards),
        'fme': numpy.array(train_fm_errors[:step_limit]),
        'ime': numpy.array(train_im_errors[:step_limit]),
        'sdm': state_dist,
        'ldm': latent_dist.numpy()
    }
    numpy.save('ddpg_{0}_{1}_{2:d}'.format(config.name, config.model, trial), save_data)
def run_baseline(self, agent, trial):
    """Baseline PPO training loop over n_env parallel environments (no intrinsic motivation)."""
    config = self._config
    n_env = config.n_env
    trial = trial + config.shift
    step_counter = StepCounter(int(config.steps * 1e6))

    steps_per_episode = []
    train_ext_rewards = []
    train_ext_reward = numpy.zeros((n_env, 1), dtype=numpy.float32)
    train_steps = numpy.zeros((n_env, 1), dtype=numpy.int32)
    reward_avg = RunningAverageWindow(100)
    # time_avg = RunningAverageWindow(100)

    s = numpy.zeros((n_env,) + self._env.observation_space.shape, dtype=numpy.float32)
    for i in range(n_env):
        s[i] = self._env.reset(i)
    state0 = self.process_state(s)

    while step_counter.running():
        with torch.no_grad():
            value, action0, probs0 = agent.get_action(state0)
        # start = time.time()
        next_state, reward, done, info = self._env.step(agent.convert_action(action0.cpu()))
        # end = time.time()
        # time_avg.update(end - start)
        # print('Duration {0:.3f}s'.format(time_avg.value()))

        train_steps += 1
        train_ext_reward += reward

        env_indices = numpy.nonzero(numpy.squeeze(done, axis=1))[0]
        for i in env_indices:
            if step_counter.steps + train_steps[i] > step_counter.limit:
                train_steps[i] = step_counter.limit - step_counter.steps
            step_counter.update(train_steps[i].item())

            steps_per_episode.append(train_steps[i].item())
            train_ext_rewards.append(train_ext_reward[i].item())
            reward_avg.update(train_ext_reward[i].item())
            print('Run {0:d} step {1:d} training [ext. reward {2:f} steps {3:d} avg. reward {4:f}]'.format(
                trial, step_counter.steps, train_ext_reward[i].item(), train_steps[i].item(), reward_avg.value()))
            step_counter.print()

            train_ext_reward[i] = 0
            train_steps[i] = 0
            next_state[i] = self._env.reset(i)

        state1 = self.process_state(next_state)
        reward = torch.tensor(reward, dtype=torch.float32)
        done = 1 - torch.tensor(done, dtype=torch.float32)
        agent.train(state0, value, action0, probs0, state1, reward, done)
        state0 = state1

    agent.save('./models/{0:s}_{1}_{2:d}'.format(self._env_name, config.model, trial))

    print('Saving data...')
    save_data = {
        'steps': numpy.array(steps_per_episode),
        're': numpy.array(train_ext_rewards)
    }
    numpy.save('ppo_{0}_{1}_{2:d}'.format(config.name, config.model, trial), save_data)
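# ---------------------------------------------------------------------------
# Illustrative sketch (assumption, not part of the original file): the
# multi-environment baseline above uses a StepCounter helper defined elsewhere
# in the repository. The hypothetical class below only reproduces the
# interface exercised here -- running(), update(delta), .steps, .limit, and
# print() -- and is not the project's actual implementation.
class _StepCounterSketch:
    def __init__(self, limit):
        self.limit = limit
        self.steps = 0

    def running(self):
        # keep the training loop going until the global step budget is exhausted
        return self.steps < self.limit

    def update(self, delta):
        # accumulate the steps of a finished episode into the global counter
        self.steps += delta

    def print(self):
        print('Steps: {0:d}/{1:d}'.format(self.steps, self.limit))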
def run_baseline(self, agent, trial):
    """Baseline PPO training loop for a single environment (no intrinsic motivation)."""
    config = self._config
    trial = trial + config.shift
    step_limit = int(config.steps * 1e6)
    steps = 0
    bar = ProgressBar(step_limit, max_width=80)

    steps_per_episode = []
    train_ext_rewards = []
    reward_avg = RunningAverageWindow(100)
    prob_avg = RunningAverageWindow(1000, 4)
    value_avg = RunningAverageWindow(1000)

    while steps < step_limit:
        state0 = self.process_state(self._env.reset())
        done = False
        train_ext_reward = 0
        train_steps = 0

        while not done:
            value, action0, probs0 = agent.get_action(state0)
            value_avg.update(value.numpy())
            prob_avg.update(probs0.numpy())
            next_state, reward, done, info = self._env.step(agent.convert_action(action0.cpu()))

            if isinstance(reward, numpy.ndarray):
                reward = reward[0]
            reward = torch.tensor([reward], dtype=torch.float32).unsqueeze(-1)
            state1 = self.process_state(next_state)

            mask = torch.tensor([1], dtype=torch.float32)
            if done:
                mask[0] = 0
            mask = mask.unsqueeze(-1)

            agent.train(state0, value, action0, probs0, state1, reward, mask)
            state0 = state1

            if info is not None and 'raw_score' in info:
                train_ext_reward += info['raw_score']
            else:
                train_ext_reward += reward.item()
            train_steps += 1

        if steps + train_steps > step_limit:
            train_steps = step_limit - steps
        steps += train_steps
        bar.numerator = steps

        steps_per_episode.append(train_steps)
        train_ext_rewards.append(train_ext_reward)
        reward_avg.update(train_ext_reward)

        print(
            'Run {0:d} step {1:d} training [ext. reward {2:f} steps {3:d} mean reward {4:f}] prob {5:s} value {6:f}'.format(
                trial, steps, train_ext_reward, train_steps, reward_avg.value().item(),
                numpy.array2string(prob_avg.value()), value_avg.value().item()))
        print(bar)

    agent.save('./models/{0:s}_{1}_{2:d}'.format(self._env_name, config.model, trial))

    print('Saving data...')
    save_data = {
        'steps': numpy.array(steps_per_episode),
        're': numpy.array(train_ext_rewards)
    }
    numpy.save('ppo_{0}_{1}_{2:d}'.format(config.name, config.model, trial), save_data)
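# ---------------------------------------------------------------------------
# Illustrative sketch (assumption, not part of the original file): every
# runner in this module keeps sliding-window statistics through a
# RunningAverageWindow helper defined elsewhere in the repository. The
# hypothetical class below matches the two constructor forms used here
# (RunningAverageWindow(100) and RunningAverageWindow(1000, 4)) and the
# update()/value() calls, returning a numpy scalar for scalar windows so both
# .item() and ':f' formatting work; the real helper may be implemented
# differently.
class _RunningAverageWindowSketch:
    def __init__(self, window, dim=1):
        self._window = window
        self._buffer = numpy.zeros((window, dim), dtype=numpy.float64)
        self._count = 0
        self._index = 0

    def update(self, value):
        # store the newest sample in a circular buffer of the last `window` updates
        self._buffer[self._index] = numpy.asarray(value, dtype=numpy.float64).reshape(-1)
        self._index = (self._index + 1) % self._window
        self._count = min(self._count + 1, self._window)

    def value(self):
        # mean over the samples collected so far (at most `window` of them)
        data = self._buffer if self._count == 0 else self._buffer[:self._count]
        mean = data.mean(axis=0)
        return mean if mean.size > 1 else numpy.float64(mean.item())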
def run_dop_model(self, agent, trial):
    """PPO training loop with a DOP (multi-head) motivation module; tracks which policy head is selected at each step."""
    config = self._config
    trial = trial + config.shift
    step_limit = int(config.steps * 1e6)
    steps = 0

    steps_per_episode = []
    train_fm_errors = []
    train_ext_rewards = []
    train_int_rewards = []
    train_head_index = []

    bar = ProgressBar(step_limit, max_width=80)
    reward_avg = RunningAverageWindow(100)

    while steps < step_limit:
        head_index_density = numpy.zeros(4)
        state0 = self.process_state(self._env.reset())
        done = False
        train_ext_reward = 0
        train_int_reward = 0
        train_error = 0
        train_steps = 0

        while not done:
            train_steps += 1
            value, action0, probs0 = agent.get_action(state0)
            agent.motivation.update_state_average(state0)
            action0, head_index = action0
            next_state, reward, done, info = self._env.step(agent.convert_action(action0))
            state1 = self.process_state(next_state)

            reward = torch.tensor([reward], dtype=torch.float32).unsqueeze(0)
            mask = torch.tensor([1], dtype=torch.float32)
            if done:
                mask[0] = 0

            agent.train(state0, value, action0, probs0, state1, reward, mask)

            train_ext_reward += reward.item()
            train_int_reward += agent.motivation.reward(state0, probs0).item()
            train_fm_error = agent.motivation.error(state0, probs0).item()
            train_error += train_fm_error
            train_fm_errors.append(train_fm_error)
            head_index_density[head_index.item()] += 1

            state0 = state1

        steps += train_steps
        if steps > step_limit:
            train_steps -= steps - step_limit
        bar.numerator = steps

        steps_per_episode.append(train_steps)
        train_ext_rewards.append(train_ext_reward)
        train_int_rewards.append(train_int_reward)
        train_head_index.append(head_index_density)
        reward_avg.update(train_ext_reward)

        print(
            'Run {0:d} step {1:d} training [ext. reward {2:f} error {3:f} steps {4:d} ({5:f} err/step) mean reward {6:f} density {7:s}]'.format(
                trial, steps, train_ext_reward, train_error, train_steps, train_error / train_steps,
                reward_avg.value(), numpy.array2string(head_index_density)))
        print(bar)

    agent.save('./models/{0:s}_{1}_{2:d}'.format(self._env_name, config.model, trial))

    print('Saving data...')
    save_data = {
        'steps': numpy.array(steps_per_episode),
        're': numpy.array(train_ext_rewards),
        'ri': numpy.array(train_int_rewards),
        'fme': numpy.array(train_fm_errors[:step_limit]),
        'hid': numpy.stack(train_head_index)
    }
    numpy.save('ppo_{0}_{1}_{2:d}'.format(config.name, config.model, trial), save_data)
def run_rnd_model(self, agent, trial):
    """PPO training loop with an RND motivation module; extrinsic and intrinsic rewards are concatenated and passed to the agent."""
    config = self._config
    trial = trial + config.shift
    step_limit = int(config.steps * 1e6)
    steps = 0

    steps_per_episode = []
    train_fm_errors = []
    train_ext_rewards = []
    train_int_rewards = []

    bar = ProgressBar(step_limit, max_width=80)
    reward_avg = RunningAverageWindow(100)

    while steps < step_limit:
        state0 = self.process_state(self._env.reset())
        done = False
        train_ext_reward = 0
        train_int_reward = 0
        train_steps = 0

        while not done:
            train_steps += 1
            value, action0, probs0 = agent.get_action(state0)
            agent.motivation.update_state_average(state0)
            next_state, reward, done, info = self._env.step(agent.convert_action(action0.cpu()))
            state1 = self.process_state(next_state)

            ext_reward = torch.tensor([reward], dtype=torch.float32).unsqueeze(0)
            int_reward = agent.motivation.reward(state0).cpu()
            reward = torch.cat([ext_reward, int_reward], dim=1)

            mask = torch.tensor([1], dtype=torch.float32)
            if done:
                mask[0] = 0
            mask = mask.unsqueeze(-1)

            agent.train(state0, value, action0, probs0, state1, reward, mask)

            train_ext_reward += ext_reward.item()
            train_int_reward += int_reward.item()
            train_fm_error = agent.motivation.error(state0).item()
            train_fm_errors.append(train_fm_error)

            state0 = state1

        steps += train_steps
        if steps > step_limit:
            train_steps -= steps - step_limit
        bar.numerator = steps

        steps_per_episode.append(train_steps)
        train_ext_rewards.append(train_ext_reward)
        train_int_rewards.append(train_int_reward)
        reward_avg.update(train_ext_reward)

        print(
            'Run {0:d} step {1:d} training [ext. reward {2:f} int. reward {3:f} steps {4:d} ({5:f}) mean reward {6:f}]'.format(
                trial, steps, train_ext_reward, train_int_reward, train_steps,
                train_int_reward / train_steps, reward_avg.value()))
        print(bar)

    agent.save('./models/{0:s}_{1}_{2:d}'.format(self._env_name, config.model, trial))

    print('Saving data...')
    save_data = {
        'steps': numpy.array(steps_per_episode),
        're': numpy.array(train_ext_rewards),
        'ri': numpy.array(train_int_rewards),
        'fme': numpy.array(train_fm_errors[:step_limit])
    }
    numpy.save('ppo_{0}_{1}_{2:d}'.format(config.name, config.model, trial), save_data)