# TRPO (PyTorch): pickle the policy net, value net, and observation running
# statistics together so the agent can be restored from a single file.
def save(self, save_path):
    """save model"""
    check_path(save_path)
    pickle.dump(
        (self.policy_net, self.value_net, self.running_state),
        open("{}/{}_trpo.p".format(save_path, self.env_id), "wb"),
    )

# SAC (alpha variant): pickle the policy and twin Q-networks plus the
# running state.
def save(self, save_path):
    """save model"""
    check_path(save_path)
    pickle.dump(
        (self.policy_net, self.q_net_1, self.q_net_2, self.running_state),
        open('{}/{}_sac_alpha.p'.format(save_path, self.env_id), 'wb'))

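# A minimal load-side sketch for the pickle-based savers (the PPO and A2C
# variants below follow the same pattern): unpack the tuple in the order it
# was dumped. The checkpoint directory and env id here are hypothetical, and
# the network class definitions must be importable for unpickling to succeed.
import pickle

with open("checkpoints/Hopper-v2_trpo.p", "rb") as f:
    policy_net, value_net, running_state = pickle.load(f)
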
# Adversarial imitation components (discriminator / policy / value): each
# module is saved as a separate .pt file via torch.save.
def save_model(self, save_path):
    check_path(save_path)
    # torch.save((self.discriminator, self.policy, self.value), f"{save_path}/{self.exp_name}.pt")
    torch.save(self.discriminator, f"{save_path}/{self.env_id}_Discriminator.pt")
    torch.save(self.policy, f"{save_path}/{self.env_id}_Policy.pt")
    torch.save(self.value, f"{save_path}/{self.env_id}_Value.pt")

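# Hedged counterpart for save_model above: torch.save on a full module loads
# back with torch.load, again requiring the class definitions to be
# importable. The paths mirror the f-strings in save_model; the directory and
# env id are made up for illustration.
import torch

discriminator = torch.load("checkpoints/Hopper-v2_Discriminator.pt")
policy = torch.load("checkpoints/Hopper-v2_Policy.pt")
value = torch.load("checkpoints/Hopper-v2_Value.pt")
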
# Dueling DQN (TensorFlow 2): the running state is pickled, while the network
# weights go through tf.keras save_weights.
def save(self, save_path):
    """save model"""
    check_path(save_path)
    pickle.dump(
        self.running_state,
        open('{}/{}_dueling_dqn_tf2.p'.format(save_path, self.env_id), 'wb'))
    self.value_net.save_weights(
        "{}/{}_dueling_dqn_tf2".format(save_path, self.env_id))

# REINFORCE (TensorFlow 2): same split as above, pickled running state plus
# policy weights.
def save(self, save_path):
    """save model"""
    check_path(save_path)
    pickle.dump(
        self.running_state,
        open('{}/{}_reinforce_tf2.p'.format(save_path, self.env_id), 'wb'))
    self.policy_net.save_weights(
        "{}/{}_reinforce_tf2".format(save_path, self.env_id))

# TRPO (TensorFlow 2): policy and value weights go to separate checkpoint
# prefixes (_p and _v).
def save(self, save_path):
    """save model"""
    check_path(save_path)
    pickle.dump(
        self.running_state,
        open("{}/{}_trpo_tf2.p".format(save_path, self.env_id), "wb"),
    )
    self.policy_net.save_weights(
        "{}/{}_trpo_tf2_p".format(save_path, self.env_id))
    self.value_net.save_weights(
        "{}/{}_trpo_tf2_v".format(save_path, self.env_id))

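# Sketch of restoring one of the TF2 agents: tf.keras load_weights needs a
# freshly constructed network of the same architecture, while the running
# state comes back from its pickle file. PolicyNet and the paths are
# assumptions, not names from the original code.
import pickle

running_state = pickle.load(
    open("checkpoints/Hopper-v2_trpo_tf2.p", "rb"))
policy_net = PolicyNet(num_states, num_actions)  # hypothetical constructor
policy_net.load_weights("checkpoints/Hopper-v2_trpo_tf2_p")
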
# PPO (PyTorch): same pickle pattern as the TRPO and SAC savers above.
def save(self, save_path):
    """save model"""
    check_path(save_path)
    pickle.dump(
        (self.policy_net, self.value_net, self.running_state),
        open('{}/{}_ppo.p'.format(save_path, self.env_id), 'wb'))

# A2C (PyTorch): actor and critic share one network (ac_net), so only that
# and the running state are pickled.
def save(self, save_path):
    """save model"""
    check_path(save_path)
    pickle.dump(
        (self.ac_net, self.running_state),
        open('{}/{}_a2c.p'.format(save_path, self.env_id), 'wb'))

# Trajectory collection: roll out a pre-trained PPO agent and dump the
# transitions to an .npz file.
def main(env_id, n_trajs, model_path, data_path, render, seed, obs_type):
    """Collect trajectories from a model pre-trained with PPO."""
    if data_path is not None:
        check_path(data_path)

    env, _, num_states, num_actions = get_env_info(env_id)

    # seed everything for reproducibility
    env.seed(seed)
    torch.manual_seed(seed)
    np.random.seed(seed)

    model = pickle.load(open(model_path, 'rb'))
    model.running_state.fix = True  # freeze normalization statistics while collecting

    states, actions, rewards, dones, next_states = [], [], [], [], []

    for i_iter in range(1, n_trajs + 1):
        state = env.reset()
        ep_reward = 0
        n_step = 0
        ep_states, ep_actions, ep_rewards, ep_dones, ep_next_states = [], [], [], [], []

        while True:
            if render:
                env.render()
            normalized_state = model.running_state(state)
            action = model.choose_action(normalized_state)
            next_state, reward, done, _ = env.step(action)
            normalized_next_state = model.running_state(next_state)
            ep_reward += reward
            n_step += 1

            # obs_type == 0 stores raw observations, otherwise normalized ones
            ep_states.append(state if obs_type == 0 else normalized_state)
            ep_actions.append(action)
            ep_rewards.append(reward)
            ep_dones.append(done)
            ep_next_states.append(
                next_state if obs_type == 0 else normalized_next_state)

            if done:
                states.extend(ep_states)
                actions.extend(ep_actions)
                rewards.extend(ep_rewards)
                dones.extend(ep_dones)
                next_states.extend(ep_next_states)
                print(f"Iter: {i_iter}, step: {n_step}, episode reward: {ep_reward}")
                break

            state = next_state

    env.close()

    states = np.r_[states].reshape((-1, num_states))
    next_states = np.r_[next_states].reshape((-1, num_states))
    # note: reshaping actions to (-1, 1) assumes scalar actions; vector actions
    # would need (-1, num_actions) to keep rows aligned with observations
    actions = np.r_[actions].reshape((-1, 1))
    rewards = np.r_[rewards].reshape((-1, 1))
    dones = np.r_[dones].reshape((-1, 1))

    numpy_dict = {
        'obs': states,
        'action': actions,
        'reward': rewards,
        'done': dones,
        'next_obs': next_states,
    }  # type: Dict[str, np.ndarray]

    save_path = f"{data_path}/{env_id}" if data_path is not None else env_id
    np.savez(f"{save_path}.npz", **numpy_dict)

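# Reading the collected trajectories back: np.savez stores each array under
# its keyword name, so np.load yields a mapping with the same keys. The file
# name is hypothetical.
import numpy as np

data = np.load("data/Hopper-v2.npz")
obs, action = data["obs"], data["action"]
reward, done, next_obs = data["reward"], data["done"], data["next_obs"]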