Example #1
    def __init__(self, env, gamma, tau, v_lr, q_lr, policy_lr, buffer_maxlen):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        
        self.env = env        
        self.action_range = [env.action_space.low, env.action_space.high]
        self.obs_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]

        # hyperparameters
        self.gamma = gamma
        self.tau = tau       

        # initialize networks 
        self.value_net = ValueNetwork(self.obs_dim, 1).to(self.device)
        self.target_value_net = ValueNetwork(self.obs_dim, 1).to(self.device)
        self.q_net1 = SoftQNetwork(self.obs_dim, self.action_dim).to(self.device)
        self.q_net2 = SoftQNetwork(self.obs_dim, self.action_dim).to(self.device)
        self.policy_net = GaussianPolicy(self.obs_dim, self.action_dim).to(self.device)
        
        # copy params to target param
        for target_param, param in zip(self.target_value_net.parameters(), self.value_net.parameters()):
            target_param.data.copy_(param)
            
        # initialize optimizers 
        self.value_optimizer = optim.Adam(self.value_net.parameters(), lr=v_lr)
        self.q1_optimizer = optim.Adam(self.q_net1.parameters(), lr=q_lr)
        self.q2_optimizer = optim.Adam(self.q_net2.parameters(), lr=q_lr)
        self.policy_optimizer = optim.Adam(self.policy_net.parameters(), lr=policy_lr)

        self.replay_buffer = Buffer(buffer_maxlen)
Example #2
    def __init__(self, env, gamma, tau, alpha, q_lr, policy_lr, a_lr,
                 buffer_maxlen):
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        self.env = env
        self.action_range = [env.action_space.low, env.action_space.high]
        self.obs_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]

        # hyperparameters
        self.gamma = gamma
        self.tau = tau
        self.update_step = 0
        self.delay_step = 2

        # initialize networks
        self.q_net1 = SoftQNetwork(self.obs_dim,
                                   self.action_dim).to(self.device)
        self.q_net2 = SoftQNetwork(self.obs_dim,
                                   self.action_dim).to(self.device)
        self.target_q_net1 = SoftQNetwork(self.obs_dim,
                                          self.action_dim).to(self.device)
        self.target_q_net2 = SoftQNetwork(self.obs_dim,
                                          self.action_dim).to(self.device)
        self.policy_net = PolicyNetwork(self.obs_dim,
                                        self.action_dim).to(self.device)

        # copy params to target param
        for target_param, param in zip(self.target_q_net1.parameters(),
                                       self.q_net1.parameters()):
            target_param.data.copy_(param)

        for target_param, param in zip(self.target_q_net2.parameters(),
                                       self.q_net2.parameters()):
            target_param.data.copy_(param)

        # initialize optimizers
        self.q1_optimizer = optim.Adam(self.q_net1.parameters(), lr=q_lr)
        self.q2_optimizer = optim.Adam(self.q_net2.parameters(), lr=q_lr)
        self.policy_optimizer = optim.Adam(self.policy_net.parameters(),
                                           lr=policy_lr)

        # entropy temperature
        self.alpha = alpha
        self.target_entropy = -torch.prod(
            torch.Tensor(self.env.action_space.shape).to(self.device)).item()
        self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
        self.alpha_optim = optim.Adam([self.log_alpha], lr=a_lr)

        self.replay_buffer = BasicBuffer(buffer_maxlen)
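
Example #2 sets up the automatic entropy-temperature objects (alpha, log_alpha, target_entropy, alpha_optim) but its update step is not included here. Below is a minimal sketch of how they are typically applied, mirroring the temperature update shown in Example #7 further down; the standalone helper and its name are illustrative, not part of the original code (log_pi would come from the policy's sample() call inside update()).

import torch
import torch.optim as optim


def update_temperature(log_alpha, alpha_optim, log_pi, target_entropy):
    # Temperature (dual) loss: pushes the policy entropy toward target_entropy.
    # log_pi is detached so only log_alpha receives gradient.
    alpha_loss = (log_alpha * (-log_pi - target_entropy).detach()).mean()

    alpha_optim.zero_grad()
    alpha_loss.backward()
    alpha_optim.step()

    # Return the updated temperature used in subsequent critic/policy updates.
    return log_alpha.exp()

Inside update() this would be called as self.alpha = update_temperature(self.log_alpha, self.alpha_optim, log_pi, self.target_entropy).
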
Example #3
class SACAgent:
    def __init__(self, env, gamma, tau, v_lr, q_lr, policy_lr, buffer_maxlen):
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        self.env = env
        self.action_range = [env.action_space.low, env.action_space.high]
        self.obs_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]

        # hyperparameters
        self.gamma = gamma
        self.tau = tau
        self.update_step = 0
        self.delay_step = 2

        # initialize networks
        self.value_net = ValueNetwork(self.obs_dim, 1).to(self.device)
        self.target_value_net = ValueNetwork(self.obs_dim, 1).to(self.device)
        self.q_net1 = SoftQNetwork(self.obs_dim,
                                   self.action_dim).to(self.device)
        self.q_net2 = SoftQNetwork(self.obs_dim,
                                   self.action_dim).to(self.device)
        self.policy_net = PolicyNetwork(self.obs_dim,
                                        self.action_dim).to(self.device)

        # copy params to target param
        for target_param, param in zip(self.target_value_net.parameters(),
                                       self.value_net.parameters()):
            target_param.data.copy_(param)

        # initialize optimizers
        self.value_optimizer = optim.Adam(self.value_net.parameters(), lr=v_lr)
        self.q1_optimizer = optim.Adam(self.q_net1.parameters(), lr=q_lr)
        self.q2_optimizer = optim.Adam(self.q_net2.parameters(), lr=q_lr)
        self.policy_optimizer = optim.Adam(self.policy_net.parameters(),
                                           lr=policy_lr)

        self.replay_buffer = BasicBuffer(buffer_maxlen)

    def get_action(self, state):
        state = torch.FloatTensor(state).unsqueeze(0).to(self.device)
        mean, log_std = self.policy_net.forward(state)
        std = log_std.exp()

        normal = Normal(mean, std)
        z = normal.sample()
        action = torch.tanh(z)
        action = action.cpu().detach().squeeze(0).numpy()

        return self.rescale_action(action)

    def rescale_action(self, action):
        return action * (self.action_range[1] - self.action_range[0]) / 2.0 +\
            (self.action_range[1] + self.action_range[0]) / 2.0

    def update(self, batch_size):
        states, actions, rewards, next_states, dones = self.replay_buffer.sample(
            batch_size)
        states = torch.FloatTensor(states).to(self.device)
        actions = torch.FloatTensor(actions).to(self.device)
        rewards = torch.FloatTensor(rewards).to(self.device)
        next_states = torch.FloatTensor(next_states).to(self.device)
        dones = torch.FloatTensor(dones).to(self.device)
        dones = dones.view(dones.size(0), -1)

        next_actions, next_log_pi = self.policy_net.sample(next_states)
        next_q1 = self.q_net1(next_states, next_actions)
        next_q2 = self.q_net2(next_states, next_actions)
        next_v = self.target_value_net(next_states)

        # value loss
        next_v_target = torch.min(next_q1, next_q2) - next_log_pi
        curr_v = self.value_net.forward(states)
        v_loss = F.mse_loss(curr_v, next_v_target.detach())

        # q loss
        curr_q1 = self.q_net1.forward(states, actions)
        curr_q2 = self.q_net2.forward(states, actions)
        expected_q = rewards + (1 - dones) * self.gamma * next_v
        q1_loss = F.mse_loss(curr_q1, expected_q.detach())
        q2_loss = F.mse_loss(curr_q2, expected_q.detach())

        # update value network and q networks
        self.value_optimizer.zero_grad()
        v_loss.backward()
        self.value_optimizer.step()

        self.q1_optimizer.zero_grad()
        q1_loss.backward()
        self.q1_optimizer.step()

        self.q2_optimizer.zero_grad()
        q2_loss.backward()
        self.q2_optimizer.step()

        # delayed update for policy net and target value net
        if self.update_step % self.delay_step == 0:
            new_actions, log_pi = self.policy_net.sample(states)
            min_q = torch.min(self.q_net1.forward(states, new_actions),
                              self.q_net2.forward(states, new_actions))
            policy_loss = (log_pi - min_q).mean()

            self.policy_optimizer.zero_grad()
            policy_loss.backward()
            self.policy_optimizer.step()

            # target networks
            for target_param, param in zip(self.target_value_net.parameters(),
                                           self.value_net.parameters()):
                target_param.data.copy_(self.tau * param +
                                        (1 - self.tau) * target_param)

        self.update_step += 1
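
The network classes themselves (ValueNetwork, SoftQNetwork, PolicyNetwork/GaussianPolicy) are not part of these snippets. Below is a minimal sketch of the tanh-squashed Gaussian policy that the sample() calls above appear to assume, with the standard change-of-variables correction on the log-probability; the hidden size, clamp bounds, and epsilon are assumptions rather than values from the original code.

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Normal


class PolicyNetwork(nn.Module):
    def __init__(self, obs_dim, action_dim, hidden_size=256,
                 log_std_min=-20, log_std_max=2):
        super().__init__()
        self.log_std_min = log_std_min
        self.log_std_max = log_std_max

        self.fc1 = nn.Linear(obs_dim, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.mean_head = nn.Linear(hidden_size, action_dim)
        self.log_std_head = nn.Linear(hidden_size, action_dim)

    def forward(self, state):
        x = F.relu(self.fc1(state))
        x = F.relu(self.fc2(x))
        mean = self.mean_head(x)
        log_std = torch.clamp(self.log_std_head(x),
                              self.log_std_min, self.log_std_max)
        return mean, log_std

    def sample(self, state, epsilon=1e-6):
        mean, log_std = self.forward(state)
        std = log_std.exp()

        normal = Normal(mean, std)
        z = normal.rsample()   # reparameterized sample, keeps gradients for the policy loss
        action = torch.tanh(z)

        # tanh change-of-variables correction for the log-probability
        log_pi = normal.log_prob(z) - torch.log(1 - action.pow(2) + epsilon)
        log_pi = log_pi.sum(dim=1, keepdim=True)
        return action, log_pi
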
Example #4
def train_sac(args):
    '''
    Parse arguments and construct objects for training the SAC model
    '''
    device = torch.device(
        'cuda') if torch.cuda.is_available() else torch.device('cpu')
    token_tables = op.build_token_tables()
    # initialize tensorboard for logging output
    from os import path
    train_logger = None
    if args.log_dir is not None:
        train_logger = tb.SummaryWriter(path.join(args.log_dir, 'train'),
                                        flush_secs=1)

    # Load Models
    policy = RobustFill(string_size=len(op.CHARACTER),
                        string_embedding_size=args.embedding_size,
                        decoder_inp_size=128,
                        hidden_size=args.hidden_size,
                        program_size=len(token_tables.op_token_table),
                        device=device)
    q_1 = SoftQNetwork(128, len(token_tables.op_token_table), args.hidden_size)
    q_2 = SoftQNetwork(128, len(token_tables.op_token_table), args.hidden_size)

    tgt_q_1 = SoftQNetwork(128, len(token_tables.op_token_table),
                           args.hidden_size).eval()
    tgt_q_2 = SoftQNetwork(128, len(token_tables.op_token_table),
                           args.hidden_size).eval()

    if args.continue_training_policy:
        policy.load_state_dict(
            torch.load(path.join(path.dirname(path.abspath(__file__)),
                                 args.checkpoint_filename),
                       map_location=device))
    elif args.continue_training:
        policy.load_state_dict(
            torch.load(path.join(path.dirname(path.abspath(__file__)),
                                 args.checkpoint_filename),
                       map_location=device))
        q_1.load_state_dict(
            torch.load(path.join(path.dirname(path.abspath(__file__)),
                                 args.q1_checkpoint_filename),
                       map_location=device))
        q_2.load_state_dict(
            torch.load(path.join(path.dirname(path.abspath(__file__)),
                                 args.q2_checkpoint_filename),
                       map_location=device))

    for target_param, param in zip(tgt_q_1.parameters(), q_1.parameters()):
        target_param.data.copy_(param.data)
    for target_param, param in zip(tgt_q_2.parameters(), q_2.parameters()):
        target_param.data.copy_(param.data)
    for param in tgt_q_1.parameters():
        param.requires_grad = False
    for param in tgt_q_2.parameters():
        param.requires_grad = False

    policy = policy.to(device)
    policy.set_device(device)
    q_1 = q_1.to(device)
    q_2 = q_2.to(device)
    tgt_q_1 = tgt_q_1.to(device)
    tgt_q_2 = tgt_q_2.to(device)

    # Initialize optimizers
    if args.optimizer == 'sgd':
        policy_opt = optim.SGD(policy.parameters(), lr=args.lr)
        q_1_opt = optim.SGD(q_1.parameters(), lr=args.lr)
        q_2_opt = optim.SGD(q_2.parameters(), lr=args.lr)
        entropy_opt = optim.SGD([policy.log_alpha], lr=args.lr)
    else:
        policy_opt = optim.Adam(policy.parameters(), lr=args.lr)
        q_1_opt = optim.Adam(q_1.parameters(), lr=args.lr)
        q_2_opt = optim.Adam(q_2.parameters(), lr=args.lr)
        entropy_opt = optim.Adam([policy.log_alpha], lr=args.lr)

    # Other necessary objects
    env = RobustFillEnv()
    replay_buffer_size = 1_000_000
    replay_buffer = Replay_Buffer(replay_buffer_size, args.batch_size)
    her = HER()

    train_sac_(args, policy, q_1, q_2, tgt_q_1, tgt_q_2, policy_opt, q_1_opt,
               q_2_opt, entropy_opt, replay_buffer, her, env, train_logger,
               args.checkpoint_filename, args.checkpoint_step_size,
               args.print_tensors)
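
Example #4 reads a number of fields from args, but the argument parser is not shown. Below is a hedged sketch of a parser exposing exactly the flags the snippet uses; every default value is a placeholder, not something taken from the original project.

import argparse


def parse_args():
    parser = argparse.ArgumentParser(description='Train RobustFill with SAC')
    # logging / checkpoints
    parser.add_argument('--log_dir', default=None)
    parser.add_argument('--checkpoint_filename', default='policy.pth')
    parser.add_argument('--q1_checkpoint_filename', default='q1.pth')
    parser.add_argument('--q2_checkpoint_filename', default='q2.pth')
    parser.add_argument('--checkpoint_step_size', type=int, default=1000)
    parser.add_argument('--continue_training', action='store_true')
    parser.add_argument('--continue_training_policy', action='store_true')
    # model / optimization
    parser.add_argument('--embedding_size', type=int, default=128)
    parser.add_argument('--hidden_size', type=int, default=512)
    parser.add_argument('--optimizer', choices=['adam', 'sgd'], default='adam')
    parser.add_argument('--lr', type=float, default=3e-4)
    parser.add_argument('--batch_size', type=int, default=32)
    parser.add_argument('--print_tensors', action='store_true')
    return parser.parse_args()
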
Example #5
    def __init__(self, env: object, gamma: float, tau: float,
                 buffer_maxlen: int, critic_lr: float, actor_lr: float,
                 reward_scale: int):

        # Select the device to use: CUDA (GPU) if available, otherwise CPU
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        # Creating the Gym environments for training and evaluation
        self.env = env
        # Get max and min values of the action of this environment
        self.action_range = [
            self.env.action_space.low, self.env.action_space.high
        ]
        # Get dimensions of the state and the action
        self.obs_dim = self.env.observation_space.shape[0]
        self.action_dim = self.env.action_space.shape[0]

        # hyperparameters
        self.gamma = gamma
        self.tau = tau
        self.critic_lr = critic_lr
        self.actor_lr = actor_lr
        self.buffer_maxlen = buffer_maxlen
        self.reward_scale = reward_scale

        # Scaling and bias factors for the actions: the policy outputs values in [-1, 1], but each environment has its own min and max action values
        self.scale = (self.action_range[1] - self.action_range[0]) / 2.0
        self.bias = (self.action_range[1] + self.action_range[0]) / 2.0

        # initialize networks
        self.q_net1 = SoftQNetwork(self.obs_dim,
                                   self.action_dim).to(self.device)
        self.target_q_net1 = SoftQNetwork(self.obs_dim,
                                          self.action_dim).to(self.device)
        self.q_net2 = SoftQNetwork(self.obs_dim,
                                   self.action_dim).to(self.device)
        self.target_q_net2 = SoftQNetwork(self.obs_dim,
                                          self.action_dim).to(self.device)
        self.policy = PolicyNetwork(self.obs_dim,
                                    self.action_dim).to(self.device)

        # copy weight parameters to the target Q networks
        for target_param, param in zip(self.target_q_net1.parameters(),
                                       self.q_net1.parameters()):
            target_param.data.copy_(param)

        for target_param, param in zip(self.target_q_net2.parameters(),
                                       self.q_net2.parameters()):
            target_param.data.copy_(param)

        # initialize optimizers
        self.q1_optimizer = optim.Adam(self.q_net1.parameters(),
                                       lr=self.critic_lr)
        self.q2_optimizer = optim.Adam(self.q_net2.parameters(),
                                       lr=self.critic_lr)
        self.policy_optimizer = optim.Adam(self.policy.parameters(),
                                           lr=self.actor_lr)

        # Create a replay buffer
        self.replay_buffer = BasicBuffer(self.buffer_maxlen)
Example #6
class SACAgent():
    def __init__(self, env: object, gamma: float, tau: float,
                 buffer_maxlen: int, critic_lr: float, actor_lr: float,
                 reward_scale: int):

        # Select the device to use: CUDA (GPU) if available, otherwise CPU
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        # Creating the Gym environments for training and evaluation
        self.env = env
        # Get max and min values of the action of this environment
        self.action_range = [
            self.env.action_space.low, self.env.action_space.high
        ]
        # Get dimensions of the state and the action
        self.obs_dim = self.env.observation_space.shape[0]
        self.action_dim = self.env.action_space.shape[0]

        # hyperparameters
        self.gamma = gamma
        self.tau = tau
        self.critic_lr = critic_lr
        self.actor_lr = actor_lr
        self.buffer_maxlen = buffer_maxlen
        self.reward_scale = reward_scale

        # Scaling and bias factors for the actions: the policy outputs values in [-1, 1], but each environment has its own min and max action values
        self.scale = (self.action_range[1] - self.action_range[0]) / 2.0
        self.bias = (self.action_range[1] + self.action_range[0]) / 2.0

        # initialize networks
        self.q_net1 = SoftQNetwork(self.obs_dim,
                                   self.action_dim).to(self.device)
        self.target_q_net1 = SoftQNetwork(self.obs_dim,
                                          self.action_dim).to(self.device)
        self.q_net2 = SoftQNetwork(self.obs_dim,
                                   self.action_dim).to(self.device)
        self.target_q_net2 = SoftQNetwork(self.obs_dim,
                                          self.action_dim).to(self.device)
        self.policy = PolicyNetwork(self.obs_dim,
                                    self.action_dim).to(self.device)

        # copy weight parameters to the target Q networks
        for target_param, param in zip(self.target_q_net1.parameters(),
                                       self.q_net1.parameters()):
            target_param.data.copy_(param)

        for target_param, param in zip(self.target_q_net2.parameters(),
                                       self.q_net2.parameters()):
            target_param.data.copy_(param)

        # initialize optimizers
        self.q1_optimizer = optim.Adam(self.q_net1.parameters(),
                                       lr=self.critic_lr)
        self.q2_optimizer = optim.Adam(self.q_net2.parameters(),
                                       lr=self.critic_lr)
        self.policy_optimizer = optim.Adam(self.policy.parameters(),
                                           lr=self.actor_lr)

        # Create a replay buffer
        self.replay_buffer = BasicBuffer(self.buffer_maxlen)

    def update(self, batch_size: int):
        # Sampling experiences from the replay buffer
        states, actions, rewards, next_states, dones = self.replay_buffer.sample(
            batch_size)

        # Convert numpy arrays of experience tuples into pytorch tensors
        states = torch.FloatTensor(states).to(self.device)
        actions = torch.FloatTensor(actions).to(self.device)
        rewards = self.reward_scale * torch.FloatTensor(rewards).to(
            self.device)  # in SAC we do reward scaling for the sampled rewards
        next_states = torch.FloatTensor(next_states).to(self.device)
        dones = torch.FloatTensor(dones).to(self.device)
        dones = dones.view(dones.size(0), -1)

        # Critic update (computing the loss)
        # Please refer to equation (6) in the paper for details
        # Sample actions for the next states (s_t+1) using the current policy
        next_actions, next_log_pi, _, _ = self.policy.sample(
            next_states, self.scale)
        next_actions = self.rescale_action(next_actions)

        # Compute Q(s_t+1,a_t+1) by giving the states and actions to the Q network and choose the minimum from 2 target Q networks
        next_q1 = self.target_q_net1(next_states, next_actions)
        next_q2 = self.target_q_net2(next_states, next_actions)
        min_q = torch.min(next_q1,
                          next_q2)  # find minimum between next_q1 and next_q2

        # Compute the soft Q target: min(Q1, Q2) - next_log_pi (this variant uses reward scaling instead of an explicit alpha, i.e. alpha = 1)
        next_q_target = (min_q - next_log_pi)

        # Compute the Q(s_t,a_t) using s_t and a_t from the replay buffer
        curr_q1 = self.q_net1.forward(states, actions)
        curr_q2 = self.q_net2.forward(states, actions)

        # Find expected Q, i.e., r(t) + gamma*next_q_target
        expected_q = rewards + (1 - dones) * self.gamma * next_q_target

        # Compute loss between Q network and expected Q
        q1_loss = F.mse_loss(curr_q1, expected_q.detach())
        q2_loss = F.mse_loss(curr_q2, expected_q.detach())

        # Backpropagate the losses and update Q network parameters
        self.q1_optimizer.zero_grad()
        q1_loss.backward()
        self.q1_optimizer.step()

        self.q2_optimizer.zero_grad()
        q2_loss.backward()
        self.q2_optimizer.step()

        # Policy update (computing the loss)
        # Sample new actions for the current states (s_t) using the current policy
        new_actions, log_pi, _, _ = self.policy.sample(states, self.scale)
        new_actions = self.rescale_action(new_actions)

        # Compute Q(s_t,a_t) and choose the minimum from 2 Q networks
        new_q1 = self.q_net1.forward(states, new_actions)
        new_q2 = self.q_net2.forward(states, new_actions)
        min_q = torch.min(new_q1, new_q2)

        # Compute the policy loss, i.e., log_pi - min Q(s_t, a_t), eq. (7) with alpha = 1
        policy_loss = (log_pi - min_q).mean()

        # Backpropagate the losses and update policy network parameters
        self.policy_optimizer.zero_grad()
        policy_loss.backward()
        self.policy_optimizer.step()

        # Updating target networks with soft update using update rate tau
        for target_param, param in zip(self.target_q_net1.parameters(),
                                       self.q_net1.parameters()):
            target_param.data.copy_(self.tau * param +
                                    (1 - self.tau) * target_param)

        for target_param, param in zip(self.target_q_net2.parameters(),
                                       self.q_net2.parameters()):
            target_param.data.copy_(self.tau * param +
                                    (1 - self.tau) * target_param)

    def get_action(
            self, state: np.ndarray,
            stochastic: bool) -> Tuple[np.ndarray, torch.Tensor, torch.Tensor]:
        # state: the state input to the pi network
        # stochastic: boolean (True -> use noisy action, False -> use noiseless (deterministic action))
        state = torch.FloatTensor(state).unsqueeze(0).to(self.device)

        # Get mean and sigma from the policy network
        mean, log_std = self.policy.forward(state)
        std = log_std.exp()

        # Stochastic mode is used for training, deterministic mode is used for evaluation
        if stochastic:
            normal = Normal(mean, std)
            z = normal.sample()
            action = torch.tanh(z)
            action = action.cpu().detach().squeeze(0).numpy()
        else:
            # Deterministic evaluation action: squash the mean directly
            # (Normal(mean, 0) is invalid because the scale must be positive)
            action = torch.tanh(mean)
            action = action.cpu().detach().squeeze(0).numpy()

        # Return the rescaled action, along with the mean and standard deviation of the action distribution.
        # The action is rescaled because the policy network outputs values in [-1, 1], while the MuJoCo environments may use an arbitrary range [-n, n].
        return self.rescale_action(action), mean, std

    def rescale_action(self, action: np.ndarray) -> np.ndarray:
        # The policy network outputs actions in [-1, 1]; rescale them to the environment's action range.
        # scale -> scalar multiplication
        # bias -> scalar offset
        return action * self.scale[0] + self.bias[0]

    def Actor_save(self, WORKSPACE: str):
        # Save the policy model for each node
        print("Save the torch model")
        savePath = WORKSPACE + "./policy_model5_Hop_.pth"
        torch.save(self.policy.state_dict(), savePath)

    def Actor_load(self, WORKSPACE: str):
        # Load the policy model for each node
        print("load the torch model")
        savePath = WORKSPACE + "./policy_model5_Hop_.pth"  # Best
        self.policy = PolicyNetwork(self.obs_dim,
                                    self.action_dim).to(self.device)
        self.policy.load_state_dict(torch.load(savePath))
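
BasicBuffer is referenced by nearly all of these agents but never defined in the snippets. Below is a minimal deque-based sketch matching the interface used above (push(...) during rollout, sample(batch_size) returning numpy batches, and len() support); the original implementation may store transitions differently.

import random
from collections import deque

import numpy as np


class BasicBuffer:
    def __init__(self, max_size):
        self.buffer = deque(maxlen=max_size)

    def push(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        batch = random.sample(self.buffer, batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)
        # rewards/dones as float column vectors so they broadcast against Q-values
        return (np.stack(states),
                np.stack(actions),
                np.array(rewards, dtype=np.float32).reshape(-1, 1),
                np.stack(next_states),
                np.array(dones, dtype=np.float32).reshape(-1, 1))

    def __len__(self):
        return len(self.buffer)
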
Example #7
class SACAgent:
    def __init__(self, env, gamma, tau, alpha, q_lr, policy_lr, a_lr,
                 buffer_maxlen):
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        self.env = env
        self.action_range = [0, 250]
        self.obs_dim = env.state_dim
        self.action_dim = env.action_dim

        # hyperparameters
        self.gamma = gamma
        self.tau = tau
        self.update_step = 0
        self.delay_step = 2

        # initialize networks
        self.q_net1 = SoftQNetwork(self.obs_dim,
                                   self.action_dim).to(self.device)
        self.q_net2 = SoftQNetwork(self.obs_dim,
                                   self.action_dim).to(self.device)
        self.target_q_net1 = SoftQNetwork(self.obs_dim,
                                          self.action_dim).to(self.device)
        self.target_q_net2 = SoftQNetwork(self.obs_dim,
                                          self.action_dim).to(self.device)
        self.policy_net = PolicyNetwork(self.obs_dim,
                                        self.action_dim).to(self.device)

        # copy params to target param
        for target_param, param in zip(self.target_q_net1.parameters(),
                                       self.q_net1.parameters()):
            target_param.data.copy_(param)

        for target_param, param in zip(self.target_q_net2.parameters(),
                                       self.q_net2.parameters()):
            target_param.data.copy_(param)

        # initialize optimizers
        self.q1_optimizer = optim.Adam(self.q_net1.parameters(), lr=q_lr)
        self.q2_optimizer = optim.Adam(self.q_net2.parameters(), lr=q_lr)
        self.policy_optimizer = optim.Adam(self.policy_net.parameters(),
                                           lr=policy_lr)

        # entropy temperature
        self.alpha = alpha
        self.target_entropy = -torch.prod(
            torch.Tensor([self.action_dim, 1]).to(self.device)).item()
        self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
        self.alpha_optim = optim.Adam([self.log_alpha], lr=a_lr)

        self.replay_buffer = BasicBuffer(buffer_maxlen)

    def get_action(self, state):
        state = torch.FloatTensor(state).unsqueeze(0).to(self.device)
        mean, log_std = self.policy_net.forward(state)
        std = log_std.exp()

        normal = Normal(mean, std)
        z = normal.sample()
        action = torch.tanh(z)
        action = action.cpu().detach().squeeze(0).numpy()

        return self.rescale_action(action)

    def rescale_action(self, action):
        return action * (self.action_range[1] - self.action_range[0]) / 2.0 +\
            (self.action_range[1] + self.action_range[0]) / 2.0

    def update(self, batch_size):
        states, actions, rewards, next_states, dones = self.replay_buffer.sample(
            batch_size)
        states = torch.FloatTensor(states).to(self.device)
        actions = torch.FloatTensor(actions).to(self.device)
        rewards = torch.FloatTensor(rewards).to(self.device)
        next_states = torch.FloatTensor(next_states).to(self.device)
        dones = torch.FloatTensor(dones).to(self.device)
        dones = dones.view(dones.size(0), -1)

        next_actions, next_log_pi = self.policy_net.sample(next_states)
        next_q1 = self.target_q_net1(next_states, next_actions)
        next_q2 = self.target_q_net2(next_states, next_actions)
        next_q_target = torch.min(next_q1, next_q2) - self.alpha * next_log_pi
        expected_q = rewards + (1 - dones) * self.gamma * next_q_target

        # q loss
        curr_q1 = self.q_net1.forward(states, actions)
        curr_q2 = self.q_net2.forward(states, actions)
        q1_loss = F.mse_loss(curr_q1, expected_q.detach())
        q2_loss = F.mse_loss(curr_q2, expected_q.detach())

        # update q networks
        self.q1_optimizer.zero_grad()
        q1_loss.backward()
        self.q1_optimizer.step()

        self.q2_optimizer.zero_grad()
        q2_loss.backward()
        self.q2_optimizer.step()

        # delayed update for policy network and target q networks
        new_actions, log_pi = self.policy_net.sample(states)
        if self.update_step % self.delay_step == 0:
            min_q = torch.min(self.q_net1.forward(states, new_actions),
                              self.q_net2.forward(states, new_actions))
            policy_loss = (self.alpha * log_pi - min_q).mean()

            self.policy_optimizer.zero_grad()
            policy_loss.backward()
            self.policy_optimizer.step()

            # target networks
            for target_param, param in zip(self.target_q_net1.parameters(),
                                           self.q_net1.parameters()):
                target_param.data.copy_(self.tau * param +
                                        (1 - self.tau) * target_param)

            for target_param, param in zip(self.target_q_net2.parameters(),
                                           self.q_net2.parameters()):
                target_param.data.copy_(self.tau * param +
                                        (1 - self.tau) * target_param)

        # update temperature
        alpha_loss = (self.log_alpha *
                      (-log_pi - self.target_entropy).detach()).mean()

        self.alpha_optim.zero_grad()
        alpha_loss.backward()
        self.alpha_optim.step()
        self.alpha = self.log_alpha.exp()

        self.update_step += 1
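
None of the examples show how the agent is driven. Below is a hedged sketch of a training loop for an agent with Example #2's constructor signature (Example #7 is the same agent but wraps a custom env exposing state_dim/action_dim). It assumes the classic Gym API where reset() returns the state and step() returns (next_state, reward, done, info), and that the replay buffer supports push() and len(); the environment name and every hyperparameter value below are placeholders.

import gym

env = gym.make("Pendulum-v1")
agent = SACAgent(env, gamma=0.99, tau=0.005, alpha=0.2,
                 q_lr=3e-4, policy_lr=3e-4, a_lr=3e-4,
                 buffer_maxlen=1_000_000)

max_episodes, max_steps, batch_size = 100, 200, 64

for episode in range(max_episodes):
    state = env.reset()
    episode_reward = 0.0

    for step in range(max_steps):
        action = agent.get_action(state)
        next_state, reward, done, _ = env.step(action)
        agent.replay_buffer.push(state, action, reward, next_state, done)
        episode_reward += reward

        # start learning once the buffer holds at least one batch
        if len(agent.replay_buffer) > batch_size:
            agent.update(batch_size)

        if done:
            break
        state = next_state

    print(f"Episode {episode}: reward {episode_reward:.1f}")
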
Example #8
    def __init__(self, env, gamma, tau, v_lr, q_lr, policy_lr, buffer_maxlen):
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        self.firsttime = 0

        self.env = env
        self.action_range = [env.action_space.low, env.action_space.high]
        #self.obs_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]  #1

        self.conv_channels = 4
        self.kernel_size = (3, 3)

        self.img_size = (500, 500, 3)

        print("Diagnostics:")
        print(f"action_range: {self.action_range}")
        #print(f"obs_dim: {self.obs_dim}")
        print(f"action_dim: {self.action_dim}")

        # hyperparameters
        self.gamma = gamma
        self.tau = tau
        self.update_step = 0
        self.delay_step = 2

        # initialize networks
        self.feature_net = FeatureExtractor(self.img_size[2],
                                            self.conv_channels,
                                            self.kernel_size).to(self.device)
        print("Feature net init'd successfully")

        input_dim = self.feature_net.get_output_size(self.img_size)
        self.input_size = input_dim[0] * input_dim[1] * input_dim[2]
        print(f"input_size: {self.input_size}")

        self.value_net = ValueNetwork(self.input_size, 1).to(self.device)
        self.target_value_net = ValueNetwork(self.input_size,
                                             1).to(self.device)
        self.q_net1 = SoftQNetwork(self.input_size,
                                   self.action_dim).to(self.device)
        self.q_net2 = SoftQNetwork(self.input_size,
                                   self.action_dim).to(self.device)
        self.policy_net = PolicyNetwork(self.input_size,
                                        self.action_dim).to(self.device)

        print("Finished initing all nets")

        # copy params to target param
        for target_param, param in zip(self.target_value_net.parameters(),
                                       self.value_net.parameters()):
            target_param.data.copy_(param)

        print("Finished copying targets")

        # initialize optimizers
        self.value_optimizer = optim.Adam(self.value_net.parameters(), lr=v_lr)
        self.q1_optimizer = optim.Adam(self.q_net1.parameters(), lr=q_lr)
        self.q2_optimizer = optim.Adam(self.q_net2.parameters(), lr=q_lr)
        self.policy_optimizer = optim.Adam(self.policy_net.parameters(),
                                           lr=policy_lr)

        print("Finished initing optimizers")

        self.replay_buffer = BasicBuffer(buffer_maxlen)
        print("End of init")
Example #9
class SACAgent:
    def __init__(self, env, gamma, tau, v_lr, q_lr, policy_lr, buffer_maxlen):
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        self.firsttime = 0

        self.env = env
        self.action_range = [env.action_space.low, env.action_space.high]
        #self.obs_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]  #1

        self.conv_channels = 4
        self.kernel_size = (3, 3)

        self.img_size = (500, 500, 3)

        print("Diagnostics:")
        print(f"action_range: {self.action_range}")
        #print(f"obs_dim: {self.obs_dim}")
        print(f"action_dim: {self.action_dim}")

        # hyperparameters
        self.gamma = gamma
        self.tau = tau
        self.update_step = 0
        self.delay_step = 2

        # initialize networks
        self.feature_net = FeatureExtractor(self.img_size[2],
                                            self.conv_channels,
                                            self.kernel_size).to(self.device)
        print("Feature net init'd successfully")

        input_dim = self.feature_net.get_output_size(self.img_size)
        self.input_size = input_dim[0] * input_dim[1] * input_dim[2]
        print(f"input_size: {self.input_size}")

        self.value_net = ValueNetwork(self.input_size, 1).to(self.device)
        self.target_value_net = ValueNetwork(self.input_size,
                                             1).to(self.device)
        self.q_net1 = SoftQNetwork(self.input_size,
                                   self.action_dim).to(self.device)
        self.q_net2 = SoftQNetwork(self.input_size,
                                   self.action_dim).to(self.device)
        self.policy_net = PolicyNetwork(self.input_size,
                                        self.action_dim).to(self.device)

        print("Finished initing all nets")

        # copy params to target param
        for target_param, param in zip(self.target_value_net.parameters(),
                                       self.value_net.parameters()):
            target_param.data.copy_(param)

        print("Finished copying targets")

        # initialize optimizers
        self.value_optimizer = optim.Adam(self.value_net.parameters(), lr=v_lr)
        self.q1_optimizer = optim.Adam(self.q_net1.parameters(), lr=q_lr)
        self.q2_optimizer = optim.Adam(self.q_net2.parameters(), lr=q_lr)
        self.policy_optimizer = optim.Adam(self.policy_net.parameters(),
                                           lr=policy_lr)

        print("Finished initing optimizers")

        self.replay_buffer = BasicBuffer(buffer_maxlen)
        print("End of init")

    def get_action(self, state):
        if state.shape != self.img_size:
            print(
                f"Invalid size, expected shape {self.img_size}, got {state.shape}"
            )
            return None

        inp = torch.from_numpy(state).float().permute(2, 0, 1).unsqueeze(0).to(
            self.device)
        features = self.feature_net(inp)
        features = features.view(-1, self.input_size)

        mean, log_std = self.policy_net.forward(features)
        std = log_std.exp()

        normal = Normal(mean, std)
        z = normal.sample()
        action = torch.tanh(z)
        action = action.cpu().detach().squeeze(0).numpy()

        return self.rescale_action(action)

    def rescale_action(self, action):
        return action * (self.action_range[1] - self.action_range[0]) / 2.0 +\
            (self.action_range[1] + self.action_range[0]) / 2.0

    def update(self, batch_size):
        states, actions, rewards, next_states, dones = self.replay_buffer.sample(
            batch_size)

        # states and next states are lists of ndarrays, np.stack converts them to
        # ndarrays of shape (batch_size, height, width, num_channels)
        states = np.stack(states)
        next_states = np.stack(next_states)

        states = torch.FloatTensor(states).permute(0, 3, 1, 2).to(self.device)
        actions = torch.FloatTensor(actions).to(self.device)
        rewards = torch.FloatTensor(rewards).to(self.device)
        next_states = torch.FloatTensor(next_states).permute(0, 3, 1,
                                                             2).to(self.device)
        dones = torch.FloatTensor(dones).to(self.device)
        dones = dones.view(dones.size(0), -1)

        # Process images
        features = self.feature_net(
            states)  #.contiguous() # Properly shaped due to batching
        next_features = self.feature_net(next_states)  #.contiguous()

        # flatten conv features; infer the batch dimension instead of hardcoding it
        features = torch.reshape(features, (-1, self.input_size))
        next_features = torch.reshape(next_features, (-1, self.input_size))

        next_actions, next_log_pi = self.policy_net.sample(next_features)
        next_q1 = self.q_net1(next_features, next_actions)
        next_q2 = self.q_net2(next_features, next_actions)
        next_v = self.target_value_net(next_features)

        next_v_target = torch.min(next_q1, next_q2) - next_log_pi
        curr_v = self.value_net.forward(features)
        v_loss = F.mse_loss(curr_v, next_v_target.detach())

        # q loss
        expected_q = rewards + (1 - dones) * self.gamma * next_v
        curr_q1 = self.q_net1.forward(features, actions)
        curr_q2 = self.q_net2.forward(features, actions)
        q1_loss = F.mse_loss(curr_q1, expected_q.detach())
        q2_loss = F.mse_loss(curr_q2, expected_q.detach())

        # update value and q networks
        self.value_optimizer.zero_grad()
        v_loss.backward(retain_graph=True)
        self.value_optimizer.step()

        self.q1_optimizer.zero_grad()
        q1_loss.backward(retain_graph=True)
        self.q1_optimizer.step()

        self.q2_optimizer.zero_grad()
        q2_loss.backward(retain_graph=True)
        self.q2_optimizer.step()

        # delayed update for policy network and target q networks
        if self.update_step % self.delay_step == 0:
            new_actions, log_pi = self.policy_net.sample(features)
            min_q = torch.min(self.q_net1.forward(features, new_actions),
                              self.q_net2.forward(features, new_actions))
            policy_loss = (log_pi - min_q).mean()

            self.policy_optimizer.zero_grad()
            policy_loss.backward(retain_graph=True)
            self.policy_optimizer.step()

            # target networks
            for target_param, param in zip(self.target_value_net.parameters(),
                                           self.value_net.parameters()):
                target_param.data.copy_(self.tau * param +
                                        (1 - self.tau) * target_param)

        self.update_step += 1
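
For completeness, below is a minimal sketch of the critic networks the continuous-control examples instantiate (SoftQNetwork(obs_dim, action_dim) with forward(state, action), and ValueNetwork(obs_dim, 1) with forward(state)); the hidden sizes are assumptions, and the RobustFill variant in Example #4 uses a different SoftQNetwork signature.

import torch
import torch.nn as nn
import torch.nn.functional as F


class SoftQNetwork(nn.Module):
    def __init__(self, obs_dim, action_dim, hidden_size=256):
        super().__init__()
        self.fc1 = nn.Linear(obs_dim + action_dim, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, 1)

    def forward(self, state, action):
        # Q(s, a): concatenate state and action along the feature dimension
        x = torch.cat([state, action], dim=1)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return self.fc3(x)


class ValueNetwork(nn.Module):
    def __init__(self, obs_dim, output_dim=1, hidden_size=256):
        super().__init__()
        self.fc1 = nn.Linear(obs_dim, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, output_dim)

    def forward(self, state):
        # V(s): state-only value estimate
        x = F.relu(self.fc1(state))
        x = F.relu(self.fc2(x))
        return self.fc3(x)
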