Example #1
 def __init__(self, state_dim, action_dim, action_low, action_high,
              mem_size, train_batch_size, gamma, actor_lr, critic_lr, tau,
              eps, update_epoach):
     self.mem_size, self.train_batch_size = mem_size, train_batch_size
     self.gamma, self.actor_lr, self.critic_lr = gamma, actor_lr, critic_lr
     self.global_step = 0
     self.tau, self.eps = tau, eps
     self.state_dim, self.action_dim = state_dim, action_dim
     #self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     self.replay_mem = SlidingMemory(mem_size)
     self.device = 'cpu'
     self.action_low, self.action_high = action_low, action_high
     self.actor_policy_net = CAC_a_fc_network(state_dim, action_dim,
                                              action_low,
                                              action_high).to(self.device)
     self.actor_target_net = CAC_a_fc_network(state_dim, action_dim,
                                              action_low,
                                              action_high).to(self.device)
     self.critic_policy_net = AC_v_fc_network(state_dim).to(self.device)
     self.critic_target_net = AC_v_fc_network(state_dim).to(self.device)
     self.actor_optimizer = optim.Adam(self.actor_policy_net.parameters(),
                                       self.actor_lr)
     self.critic_optimizer = optim.Adam(self.critic_policy_net.parameters(),
                                        self.critic_lr)
     self.hard_update(self.actor_target_net, self.actor_policy_net)
     self.hard_update(self.critic_target_net, self.critic_policy_net)
     self.update_epoach = update_epoach
Example #2
    def __init__(self,
                 state_dim,
                 action_dim,
                 mem_size=10000,
                 train_batch_size=32,
                 gamma=0.99,
                 lr=1e-3,
                 tau=0.1,
                 if_dueling=False,
                 if_PER=False,
                 load_path=None):
        self.mem_size, self.train_batch_size = mem_size, train_batch_size
        self.gamma, self.lr = gamma, lr
        self.global_step = 0
        self.tau = tau
        self.state_dim, self.action_dim = state_dim, action_dim
        self.if_PER = if_PER
        self.replay_mem = PERMemory(mem_size) if if_PER else SlidingMemory(
            mem_size)
        self.policy_net = DQN_fc_network(state_dim, action_dim, 1)
        self.target_net = DQN_fc_network(state_dim, action_dim, 1)
        self.epsilon, self.min_eps = 0.9, 0.4

        if if_dueling:
            self.policy_net = DQN_dueling_network(state_dim, action_dim, 1)
            self.target_net = DQN_dueling_network(state_dim, action_dim, 1)

        # load the checkpoint only after the (optional) dueling networks are
        # built, so the loaded weights are not discarded by the reassignment
        if load_path is not None:
            self.policy_net.load_state_dict(torch.load(load_path))

        self.optimizer = optim.RMSprop(self.policy_net.parameters(), self.lr)
        self.hard_update(self.target_net, self.policy_net)
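A minimal instantiation sketch for the constructor above. The enclosing class name DQNAgent is a placeholder assumption (the original snippet does not show the class), and the state/action dimensions are illustrative values for a CartPole-like task.

# Hypothetical usage, assuming the constructor above belongs to a class
# named DQNAgent (the real class name is not shown in the snippet).
agent = DQNAgent(state_dim=4,          # e.g. CartPole observation size
                 action_dim=2,         # e.g. two discrete actions
                 mem_size=10000,
                 train_batch_size=32,
                 gamma=0.99,
                 lr=1e-3,
                 tau=0.1,
                 if_dueling=True,      # swaps in DQN_dueling_network
                 if_PER=False)         # SlidingMemory instead of PERMemory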
Example #3
 def __init__(self,
              state_dim,
              action_dim,
              mem_size,
              train_batch_size,
              gamma,
              actor_lr,
              critic_lr,
              tau,
              if_PER=True):
     self.mem_size, self.train_batch_size = mem_size, train_batch_size
     self.gamma, self.actor_lr, self.critic_lr = gamma, actor_lr, critic_lr
     self.global_step = 0
     self.tau, self.if_PER = tau, if_PER
     self.state_dim, self.action_dim = state_dim, action_dim
     self.replay_mem = PERMemory(mem_size) if if_PER else SlidingMemory(
         mem_size)
     #self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     self.device = 'cpu'
     self.cret = nn.MSELoss()
     self.actor_policy_net = AC_a_fc_network(state_dim,
                                             action_dim).to(self.device)
     self.actor_target_net = AC_a_fc_network(state_dim,
                                             action_dim).to(self.device)
     self.critic_policy_net = AC_v_fc_network(state_dim).to(self.device)
     self.critic_target_net = AC_v_fc_network(state_dim).to(self.device)
     self.actor_optimizer = optim.Adam(self.actor_policy_net.parameters(),
                                       self.actor_lr)
     self.critic_optimizer = optim.Adam(self.critic_policy_net.parameters(),
                                        self.critic_lr)
     self.hard_update(self.actor_target_net, self.actor_policy_net)
     self.hard_update(self.critic_target_net, self.critic_policy_net)
Example #4
    def __init__(self, args, noise, flag=False, if_PER=False):
        self.args = args
        self.mem_size, self.train_batch_size = args.replay_size, args.batch_size
        self.gamma, self.lr = args.gamma, args.lr
        self.global_step = 0
        self.tau, self.explore = args.tau, noise
        self.state_dim, self.action_dim = args.state_dim, args.action_dim
        self.action_high, self.action_low = args.action_high, args.action_low
        self.if_PER = if_PER
        self.replay_mem = PERMemory(
            self.mem_size) if if_PER else SlidingMemory(self.mem_size)
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        self.policy_net = NAF_network(self.state_dim, self.action_dim,
                                      self.action_low, self.action_high,
                                      self.device).to(self.device)
        self.target_net = NAF_network(self.state_dim, self.action_dim,
                                      self.action_low, self.action_high,
                                      self.device).to(self.device)
        self.policy_net.apply(self._weight_init)
        if self.args.optimizer == 'adam':
            self.optimizer = optim.Adam(self.policy_net.parameters(), self.lr)
        elif self.args.optimizer == 'rmsprop':
            self.optimizer = optim.RMSprop(self.policy_net.parameters(),
                                           self.lr)
        else:
            print('Invalid Optimizer!')
            exit()
        self.hard_update(self.target_net, self.policy_net)

        self.flag = flag
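The args-based constructors in this collection read their hyperparameters as attributes of a namespace object. Below is a sketch of such a namespace built with argparse; the attribute names are the ones the constructor above reads, while the flag defaults are assumptions (later examples additionally read attributes such as formulation, window_size, a_lr and c_lr).

import argparse

# Illustrative argument namespace for the args-based constructors.
# Attribute names mirror what __init__ reads; default values are assumptions.
parser = argparse.ArgumentParser()
parser.add_argument('--replay_size', type=int, default=100000)
parser.add_argument('--batch_size', type=int, default=64)
parser.add_argument('--gamma', type=float, default=0.99)
parser.add_argument('--lr', type=float, default=1e-3)
parser.add_argument('--tau', type=float, default=0.1)
parser.add_argument('--state_dim', type=int, default=3)
parser.add_argument('--action_dim', type=int, default=1)
parser.add_argument('--action_low', type=float, default=-1.0)
parser.add_argument('--action_high', type=float, default=1.0)
parser.add_argument('--optimizer', type=str, default='adam',
                    choices=['adam', 'rmsprop'])
args = parser.parse_args()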
Example #5
    def __init__(self,
                 state_dim,
                 action_dim,
                 mem_size,
                 train_batch_size,
                 gamma,
                 lr,
                 action_low,
                 action_high,
                 tau,
                 noise,
                 flag,
                 if_PER=True):
        self.mem_size, self.train_batch_size = mem_size, train_batch_size
        self.gamma, self.lr = gamma, lr
        self.global_step = 0
        self.tau, self.explore = tau, noise
        self.state_dim, self.action_dim = state_dim, action_dim
        self.action_high, self.action_low = action_high, action_low
        self.if_PER = if_PER
        self.replay_mem = PERMemory(mem_size) if if_PER else SlidingMemory(
            mem_size)
        #self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.device = 'cpu'
        self.policy_net = NAF_network(state_dim, action_dim, action_low,
                                      action_high).to(self.device)
        self.target_net = NAF_network(state_dim, action_dim, action_low,
                                      action_high).to(self.device)
        self.optimizer = optim.Adam(self.policy_net.parameters(), self.lr)
        self.hard_update(self.target_net, self.policy_net)

        self.flag = flag
Example #6
    def __init__(self, args, if_dueling=True, if_PER=False):
        self.args = args
        self.mem_size, self.train_batch_size = args.replay_size, args.batch_size
        self.gamma, self.lr = args.gamma, args.lr
        self.global_step = 0
        self.tau = args.tau
        self.state_dim, self.action_dim = args.state_dim, args.action_dim
        self.if_PER = if_PER
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.replay_mem = PERMemory(self.mem_size) if if_PER else SlidingMemory(self.mem_size)
        self.policy_net = DQN_fc_network(self.state_dim, self.action_dim, hidden_layers=1).to(self.device)
        self.target_net = DQN_fc_network(self.state_dim, self.action_dim, hidden_layers=1).to(self.device)
        self.epsilon = 1.0
        
        if if_dueling:
            self.policy_net = DQN_dueling_network(self.state_dim, self.action_dim, hidden_layers=1).to(self.device)
            self.target_net = DQN_dueling_network(self.state_dim, self.action_dim, hidden_layers=1).to(self.device)
        
        if args.formulation == 'FCONV':
            self.policy_net = DQN_FCONV_network(self.args.window_size, self.action_dim).to(self.device)
            self.target_net = DQN_FCONV_network(self.args.window_size, self.action_dim).to(self.device)

        self.policy_net.apply(self._weight_init)
        self.hard_update(self.target_net, self.policy_net)
        if args.optimizer == 'adam':
            self.optimizer = optim.Adam(self.policy_net.parameters(), self.lr)
        elif args.optimizer == 'rmsprop':
            self.optimizer = optim.RMSprop(self.policy_net.parameters(), self.lr)
        else:
            print('Error: Invalid Optimizer')
            exit()
Example #7
 def __init__(self, args, if_PER = False):
     self.args = args
     self.mem_size, self.train_batch_size = args.replay_size, args.batch_size
     self.gamma = args.gamma
     self.actor_lr = args.a_lr
     self.critic_lr = args.c_lr
     self.global_step = 0
     self.tau = args.tau
     self.if_PER = if_PER
     self.state_dim, self.action_dim = args.state_dim, args.action_dim
     self.replay_mem = PERMemory(self.mem_size) if if_PER else SlidingMemory(self.mem_size)
     self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     # self.device = 'cpu'
     self.cret = nn.MSELoss()
     self.actor_policy_net = AC_a_fc_network(self.state_dim, self.action_dim).to(self.device)
     self.actor_target_net = AC_a_fc_network(self.state_dim, self.action_dim).to(self.device)
     self.critic_policy_net = AC_v_fc_network(self.state_dim).to(self.device)
     self.critic_target_net = AC_v_fc_network(self.state_dim).to(self.device)
     self.critic_policy_net.apply(self._weight_init)
     self.actor_policy_net.apply(self._weight_init)
     self.actor_optimizer = optim.Adam(self.actor_policy_net.parameters(), self.actor_lr)
     self.critic_optimizer = optim.Adam(self.critic_policy_net.parameters(), self.critic_lr)
     self.hard_update(self.actor_target_net, self.actor_policy_net)
     self.hard_update(self.critic_target_net, self.critic_policy_net)
     self.save_path = './record/'
Example #8
 def __init__(self,
              state_dim,
              action_dim,
              mem_size,
              train_batch_size,
              gamma,
              actor_lr,
              critic_lr,
              action_low,
              action_high,
              tau,
              noise,
              if_PER=True,
              save_path='./record/ddpg'):
     self.mem_size, self.train_batch_size = mem_size, train_batch_size
     self.gamma, self.actor_lr, self.critic_lr = gamma, actor_lr, critic_lr
     self.global_step = 0
     self.tau, self.explore = tau, noise
     self.state_dim, self.action_dim = state_dim, action_dim
     self.action_high, self.action_low = action_high, action_low
     self.replay_mem = PERMemory(mem_size) if if_PER else SlidingMemory(
         mem_size)
     # self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     self.device = 'cpu'
     self.if_PER = if_PER
     self.actor_policy_net = DDPG_actor_network(state_dim, action_dim,
                                                action_low,
                                                action_high).to(self.device)
     self.actor_target_net = DDPG_actor_network(state_dim, action_dim,
                                                action_low,
                                                action_high).to(self.device)
     self.critic_policy_net = DDPG_critic_network(state_dim, action_dim).to(
         self.device)
     self.critic_target_net = DDPG_critic_network(state_dim, action_dim).to(
         self.device)
     # self.critic_policy_net = NAF_network(state_dim, action_dim, action_low, action_high, self.device).to(self.device)
     # self.critic_target_net = NAF_network(state_dim, action_dim, action_low, action_high, self.device).to(self.device)
     self.critic_policy_net.apply(self._weight_init)
     self.actor_policy_net.apply(self._weight_init)
     self.actor_optimizer = optim.RMSprop(
         self.actor_policy_net.parameters(), self.actor_lr)
     self.critic_optimizer = optim.RMSprop(
         self.critic_policy_net.parameters(), self.critic_lr)
     self.hard_update(self.actor_target_net, self.actor_policy_net)
     self.hard_update(self.critic_target_net, self.critic_policy_net)
     self.save_path = save_path
Example #9
 def __init__(self, state_dim, action_dim, mem_size = 10000, train_batch_size = 64, \
              gamma = 0.99, actor_lr = 1e-4, critic_lr = 1e-4, \
              action_low = -1.0, action_high = 1.0, tau = 0.1, \
              sigma = 2, if_PER = True, save_path = './record/cac'):
     
     self.mem_size, self.train_batch_size = mem_size, train_batch_size
     self.gamma, self.actor_lr, self.critic_lr = gamma, actor_lr, critic_lr
     self.global_step = 0
     self.tau, self.if_PER= tau, if_PER
     self.state_dim, self.action_dim = state_dim, action_dim
     self.replay_mem = PERMemory(mem_size) if if_PER else SlidingMemory(mem_size)
     self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     # self.device = 'cpu'
     self.action_low, self.action_high = action_low, action_high
     # CAC_a_fc_network actor (kept for reference; immediately overridden below)
     # self.actor_policy_net = CAC_a_fc_network(state_dim, action_dim, action_low, action_high, sigma, self.device).to(self.device)
     # self.actor_target_net = CAC_a_fc_network(state_dim, action_dim, action_low, action_high, sigma, self.device).to(self.device)
     self.actor_policy_net = CAC_a_sigma_fc_network(state_dim, action_dim, action_low, action_high, sigma).to(self.device)
     self.actor_target_net = CAC_a_sigma_fc_network(state_dim, action_dim, action_low, action_high, sigma).to(self.device)
     self.critic_policy_net = AC_v_fc_network(state_dim).to(self.device)
     self.critic_target_net = AC_v_fc_network(state_dim).to(self.device)
     self.actor_optimizer = optim.Adam(self.actor_policy_net.parameters(), self.actor_lr)
     self.critic_optimizer = optim.Adam(self.critic_policy_net.parameters(), self.critic_lr)
     self.hard_update(self.actor_target_net, self.actor_policy_net)
     self.hard_update(self.critic_target_net, self.critic_policy_net)
Example #10
import numpy as np
import torch
import torch.optim as optim

# SlidingMemory, CAC_a_fc_network and AC_v_fc_network come from this
# project's own modules; their import paths are not shown here.


class PPO():
    def __init__(self, state_dim, action_dim, action_low, action_high,
                 mem_size, train_batch_size, gamma, actor_lr, critic_lr, tau,
                 eps, update_epoach):
        self.mem_size, self.train_batch_size = mem_size, train_batch_size
        self.gamma, self.actor_lr, self.critic_lr = gamma, actor_lr, critic_lr
        self.global_step = 0
        self.tau, self.eps = tau, eps
        self.state_dim, self.action_dim = state_dim, action_dim
        #self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.replay_mem = SlidingMemory(mem_size)
        self.device = 'cpu'
        self.action_low, self.action_high = action_low, action_high
        self.actor_policy_net = CAC_a_fc_network(state_dim, action_dim,
                                                 action_low,
                                                 action_high).to(self.device)
        self.actor_target_net = CAC_a_fc_network(state_dim, action_dim,
                                                 action_low,
                                                 action_high).to(self.device)
        self.critic_policy_net = AC_v_fc_network(state_dim).to(self.device)
        self.critic_target_net = AC_v_fc_network(state_dim).to(self.device)
        self.actor_optimizer = optim.Adam(self.actor_policy_net.parameters(),
                                          self.actor_lr)
        self.critic_optimizer = optim.Adam(self.critic_policy_net.parameters(),
                                           self.critic_lr)
        self.hard_update(self.actor_target_net, self.actor_policy_net)
        self.hard_update(self.critic_target_net, self.critic_policy_net)
        self.update_epoach = update_epoach

    def soft_update(self, target, source, tau):
        for target_param, param in zip(target.parameters(),
                                       source.parameters()):
            target_param.data.copy_(target_param.data * (1.0 - tau) +
                                    param.data * tau)

    def hard_update(self, target, source):
        for target_param, param in zip(target.parameters(),
                                       source.parameters()):
            target_param.data.copy_(param.data)

    #  training process
    def train(self, pre_state, action, reward, next_state, if_end):

        self.replay_mem.add(pre_state, action, reward, next_state, if_end)

        if self.replay_mem.num() < self.mem_size:
            return

        print("train epoach!")
        self.hard_update(self.actor_target_net, self.actor_policy_net)

        for i in range(self.update_epoach):

            train_batch = self.replay_mem.sample(self.train_batch_size)

            # adjust dtype to suit the gym default dtype
            pre_state_batch = torch.tensor([x[0] for x in train_batch],
                                           dtype=torch.float,
                                           device=self.device)
            action_batch = torch.tensor([x[1] for x in train_batch],
                                        dtype=torch.float,
                                        device=self.device)
            # view to make later computation happy
            reward_batch = torch.tensor([x[2] for x in train_batch],
                                        dtype=torch.float,
                                        device=self.device).view(
                                            self.train_batch_size, 1)
            next_state_batch = torch.tensor([x[3] for x in train_batch],
                                            dtype=torch.float,
                                            device=self.device)
            if_end = [x[4] for x in train_batch]
            if_end = torch.tensor(np.array(if_end).astype(float),
                                  device=self.device,
                                  dtype=torch.float).view(
                                      self.train_batch_size, 1)

            # use the target critic network to compute the TD target value
            with torch.no_grad():
                v_next_state = self.critic_target_net(
                    next_state_batch).detach()
                v_target = self.gamma * v_next_state * (1 -
                                                        if_end) + reward_batch

            v_pred = self.critic_policy_net(pre_state_batch)

            # advantage: TD target minus current value estimate, detached so
            # the actor update does not backprop into the critic
            advantage = (v_target - v_pred).detach()

            # log-probability of the taken actions under the old (frozen) policy
            with torch.no_grad():
                old_action_prob = self.actor_target_net(
                    pre_state_batch).log_prob(action_batch)

            self.actor_optimizer.zero_grad()
            log_action_prob = self.actor_policy_net(pre_state_batch).log_prob(
                action_batch)

            # PPO probability ratio exp(log pi_new - log pi_old), rather than
            # a ratio of the log-probabilities themselves
            ratio = torch.exp(log_action_prob - old_action_prob)
            aloss1 = ratio * advantage
            aloss2 = torch.clamp(ratio, 1 - self.eps, 1 + self.eps) * advantage
            aloss = -torch.min(aloss1, aloss2)
            aloss = aloss.mean()
            aloss.backward()
            torch.nn.utils.clip_grad_norm_(self.actor_policy_net.parameters(),
                                           1)
            self.actor_optimizer.step()

        self.critic_optimizer.zero_grad()
        closs = (v_pred - v_target)**2
        closs = closs.mean()
        closs.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_policy_net.parameters(), 1)
        self.critic_optimizer.step()

        # clear the on-policy buffer, then softly update the critic target network
        self.replay_mem.clear()
        self.soft_update(self.critic_target_net, self.critic_policy_net,
                         self.tau)
        self.global_step += 1

    # store the (pre_s, action, reward, next_state, if_end) tuple in the replay memory;
    # SlidingMemory enforces its own capacity, as in train() above
    def perceive(self, pre_s, action, reward, next_state, if_end):
        self.replay_mem.add(pre_s, action, reward, next_state, if_end)

    # query the actor network for an action: sample from the policy
    # distribution, or take its mean when sample=False
    def action(self,
               s,
               sample=True):  # use flag to suit other models' action interface
        s = torch.tensor(s, dtype=torch.float, device=self.device).unsqueeze(0)
        with torch.no_grad():
            m = self.actor_policy_net(s)
            a = m.sample().clamp(self.action_low,
                                 self.action_high) if sample else m.mean
            return a.cpu().numpy()[0]
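A minimal training-loop sketch for the PPO class above, assuming the classic Gym API (env.reset() returning only the observation, env.step() returning a 4-tuple). The environment name and all hyperparameter values are illustrative assumptions, not taken from the original source.

import gym

# Illustrative driver loop; environment and hyperparameters are assumptions.
env = gym.make('Pendulum-v1')          # any continuous-action task
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]

agent = PPO(state_dim, action_dim,
            action_low=float(env.action_space.low[0]),
            action_high=float(env.action_space.high[0]),
            mem_size=2048,             # train() updates once the buffer is full
            train_batch_size=64,
            gamma=0.99,
            actor_lr=3e-4,
            critic_lr=1e-3,
            tau=0.1,
            eps=0.2,                   # PPO clipping parameter
            update_epoach=10)

for episode in range(200):
    state = env.reset()
    done = False
    while not done:
        action = agent.action(state)
        next_state, reward, done, _ = env.step(action)
        # train() stores the transition, updates only once the sliding
        # memory holds mem_size transitions, then clears the buffer
        agent.train(state, action, reward, next_state, done)
        state = next_state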