def _init_model(self): """init model from parameters""" self.env, env_continuous, num_states, num_actions = get_env_info( self.env_id) tf.keras.backend.set_floatx('float64') # seeding np.random.seed(self.seed) tf.random.set_seed(self.seed) self.env.seed(self.seed) if env_continuous: self.policy_net = Policy(num_states, num_actions) # current policy else: self.policy_net = DiscretePolicy(num_states, num_actions) self.running_state = ZFilter((num_states, ), clip=5) if self.model_path: print("Loading Saved Model {}_reinforce_tf2.p".format(self.env_id)) self.running_state = pickle.load( open( '{}/{}_reinforce_tf2.p'.format(self.model_path, self.env_id), "rb")) self.policy_net.load_weights("{}/{}_reinforce_tf2".format( self.model_path, self.env_id)) self.collector = MemoryCollector(self.env, self.policy_net, render=self.render, running_state=self.running_state, num_process=self.num_process) self.optimizer_p = optim.Adam(lr=self.lr_p, clipnorm=20)
def _init_model(self): """init model from parameters""" self.env, env_continuous, num_states, self.num_actions = get_env_info( self.env_id) assert env_continuous, "SAC is only applicable to continuous environment !!!!" self.action_low, self.action_high = self.env.action_space.low[ 0], self.env.action_space.high[0] self.target_entropy = -np.prod(self.env.action_space.shape) # seeding np.random.seed(self.seed) torch.manual_seed(self.seed) self.env.seed(self.seed) self.policy_net = Policy(num_states, self.num_actions, max_action=self.action_high, use_sac=True).double().to(device) self.q_net_1 = QValue(num_states, self.num_actions).double().to(device) self.q_net_target_1 = QValue(num_states, self.num_actions).double().to(device) self.q_net_2 = QValue(num_states, self.num_actions).double().to(device) self.q_net_target_2 = QValue(num_states, self.num_actions).double().to(device) # self.alpha init self.alpha = torch.exp(torch.zeros( 1, device=device).double()).requires_grad_() self.running_state = ZFilter((num_states, ), clip=5) if self.model_path: print("Loading Saved Model {}_sac_alpha.p".format(self.env_id)) self.policy_net, self.q_net_1, self.q_net_2, self.running_state \ = pickle.load(open('{}/{}_sac_alpha.p'.format(self.model_path, self.env_id), "rb")) self.q_net_target_1.load_state_dict(self.q_net_1.state_dict()) self.q_net_target_2.load_state_dict(self.q_net_2.state_dict()) self.optimizer_p = optim.Adam(self.policy_net.parameters(), lr=self.lr_p) self.optimizer_a = optim.Adam([self.alpha], lr=self.lr_a) self.optimizer_q_1 = optim.Adam(self.q_net_1.parameters(), lr=self.lr_q) self.optimizer_q_2 = optim.Adam(self.q_net_2.parameters(), lr=self.lr_q)
def _init_model(self): """init model from parameters""" self.env, env_continuous, num_states, num_actions = get_env_info( self.env_id ) tf.keras.backend.set_floatx("float64") # seeding np.random.seed(self.seed) tf.random.set_seed(self.seed) self.env.seed(self.seed) if env_continuous: self.policy_net = Policy(num_states, num_actions) else: self.policy_net = DiscretePolicy(num_states, num_actions) self.value_net = Value(num_states, l2_reg=1e-3) self.running_state = ZFilter((num_states,), clip=5) if self.model_path: print("Loading Saved Model {}_trpo_tf2.p".format(self.env_id)) self.running_state = pickle.load( open( "{}/{}_trpo_tf2.p".format(self.model_path, self.env_id), "rb", ) ) self.policy_net.load_weights( "{}/{}_trpo_tf2_p".format(self.model_path, self.env_id) ) self.value_net.load_weights( "{}/{}_trpo_tf2_v".format(self.model_path, self.env_id) ) self.collector = MemoryCollector( self.env, self.policy_net, render=self.render, running_state=self.running_state, num_process=self.num_process, ) self.optimizer_v = optim.Adam(lr=self.lr_v)
def _init_model(self): """init model from parameters""" self.env, env_continuous, num_states, self.num_actions = get_env_info( self.env_id) assert env_continuous, "TD3 is only applicable to continuous environment !!!!" self.action_low, self.action_high = self.env.action_space.low[ 0], self.env.action_space.high[0] # seeding np.random.seed(self.seed) torch.manual_seed(self.seed) self.env.seed(self.seed) self.policy_net = Policy(num_states, self.num_actions, self.action_high).double().to(device) self.policy_net_target = Policy(num_states, self.num_actions, self.action_high).double().to(device) self.value_net_1 = Value(num_states, self.num_actions).double().to(device) self.value_net_target_1 = Value(num_states, self.num_actions).double().to(device) self.value_net_2 = Value(num_states, self.num_actions).double().to(device) self.value_net_target_2 = Value(num_states, self.num_actions).double().to(device) self.running_state = ZFilter((num_states, ), clip=5) if self.model_path: print("Loading Saved Model {}_td3.p".format(self.env_id)) self.policy_net, self.value_net_1, self.value_net_2, self.running_state = pickle.load( open('{}/{}_td3.p'.format(self.model_path, self.env_id), "rb")) self.policy_net_target.load_state_dict(self.policy_net.state_dict()) self.value_net_target_1.load_state_dict(self.value_net_1.state_dict()) self.value_net_target_2.load_state_dict(self.value_net_2.state_dict()) self.optimizer_p = optim.Adam(self.policy_net.parameters(), lr=self.lr_p) self.optimizer_v_1 = optim.Adam(self.value_net_1.parameters(), lr=self.lr_v) self.optimizer_v_2 = optim.Adam(self.value_net_2.parameters(), lr=self.lr_v)
def _init_model(self): """init model from parameters""" self.env, env_continuous, num_states, num_actions = get_env_info( self.env_id) # seeding torch.manual_seed(self.seed) self.env.seed(self.seed) if env_continuous: self.policy_net = Policy(num_states, num_actions).double().to( device) # current policy self.policy_net_old = Policy(num_states, num_actions).double().to( device) # old policy else: self.policy_net = DiscretePolicy(num_states, num_actions).double().to(device) self.policy_net_old = DiscretePolicy( num_states, num_actions).double().to(device) self.value_net = Value(num_states).double().to(device) self.running_state = ZFilter((num_states, ), clip=5) if self.model_path: print("Loading Saved Model {}_ppo.p".format(self.env_id)) self.policy_net, self.value_net, self.running_state = pickle.load( open('{}/{}_ppo.p'.format(self.model_path, self.env_id), "rb")) self.policy_net_old.load_state_dict(self.policy_net.state_dict()) self.collector = MemoryCollector(self.env, self.policy_net_old, render=self.render, running_state=self.running_state, num_process=self.num_process) self.optimizer_p = optim.Adam(self.policy_net.parameters(), lr=self.lr_p) self.optimizer_v = optim.Adam(self.value_net.parameters(), lr=self.lr_v)
ppo_epochs = 10
num_iters = 2000

env = gym.make(env_id)
# env = env.unwrapped
num_states = env.observation_space.shape[0]
if isinstance(env.action_space, Discrete):
    num_actions = env.action_space.n
else:
    num_actions = env.action_space.shape[0]

actor = ActorContinuous(num_states, num_actions).double().to(device)
critic = Critic(num_states).double().to(device)
running_state = ZFilter((num_states,), clip=5)
agent = MemoryCollector(env, actor, running_state=running_state, num_process=4)

opt_p = opt.Adam(actor.parameters(), lr=lr)
opt_v = opt.Adam(critic.parameters(), lr=lr)


def train(memory):
    batch = memory.sample()
    batch_states = DOUBLE(batch.state).to(device)
    batch_actions = DOUBLE(batch.action).to(device)
    batch_log_probs = DOUBLE(batch.log_prob).to(device)
    batch_masks = DOUBLE(batch.mask).to(device)
    batch_rewards = DOUBLE(batch.reward).to(device)

    batch_size = batch_states.shape[0]
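# --- Hedged sketch (not the rest of this train function, which is truncated
# above): a common next step is estimating advantages with GAE from the
# rewards, masks and critic values. estimate_advantages, gamma=0.99 and
# tau=0.95 are illustrative assumptions.
import torch


def estimate_advantages(rewards, masks, values, gamma=0.99, tau=0.95):
    # delta_t = r_t + gamma * V(s_{t+1}) * mask_t - V(s_t)
    # A_t     = delta_t + gamma * tau * mask_t * A_{t+1}
    advantages = torch.zeros_like(rewards)
    prev_value, prev_advantage = 0.0, 0.0
    for t in reversed(range(rewards.size(0))):
        delta = rewards[t] + gamma * prev_value * masks[t] - values[t]
        advantages[t] = delta + gamma * tau * prev_advantage * masks[t]
        prev_value, prev_advantage = values[t], advantages[t]
    returns = advantages + values
    advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)
    return advantages, returns


# toy call with dummy trajectories
rewards = torch.ones(8, dtype=torch.float64)
masks = torch.ones(8, dtype=torch.float64)
values = torch.zeros(8, dtype=torch.float64)
advantages, returns = estimate_advantages(rewards, masks, values)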