Example #1
class SAC_Alpha:
    def __init__(self,
                 env,
                 render=False,
                 num_process=1,
                 memory_size=1000000,
                 lr_p=1e-3,
                 lr_a=3e-4,
                 lr_q=1e-3,
                 gamma=0.99,
                 polyak=0.995,
                 batch_size=100,
                 min_update_step=1000,
                 update_step=50,
                 target_update_delay=1,
                 seed=1,
                 ):
        self.env = env
        self.gamma = gamma
        self.polyak = polyak
        self.memory = FixedMemory(memory_size)
        self.render = render
        self.num_process = num_process
        self.lr_p = lr_p
        self.lr_a = lr_a
        self.lr_q = lr_q
        self.batch_size = batch_size
        self.min_update_step = min_update_step
        self.update_step = update_step
        self.target_update_delay = target_update_delay
        self.seed = seed

        self._init_model()

    def _init_model(self):
        """init model from parameters"""
        self.num_states = self.env.observation_space.shape[0]
        self.num_actions = self.env.action_space.shape[0]

        self.action_low, self.action_high = self.env.action_space.low[0], self.env.action_space.high[0]

        # target entropy heuristic: -dim(A), the negative action dimensionality
        self.target_entropy = -np.prod(self.env.action_space.shape)
        # seeding
        np.random.seed(self.seed)
        torch.manual_seed(self.seed)
        self.env.seed(self.seed)

        self.policy_net = Actor(self.num_states, self.num_actions, action_limit=self.action_high).to(device)

        self.q_net_1 = Value(self.num_states + self.num_actions).to(device)
        self.q_net_target_1 = Value(self.num_states + self.num_actions).to(device)
        self.q_net_2 = Value(self.num_states + self.num_actions).to(device)
        self.q_net_target_2 = Value(self.num_states + self.num_actions).to(device)

        # learnable temperature alpha, initialized to exp(0) = 1 and optimized directly
        self.alpha = torch.exp(torch.zeros(1, device=device)).requires_grad_()

        self.q_net_target_1.load_state_dict(self.q_net_1.state_dict())
        self.q_net_target_2.load_state_dict(self.q_net_2.state_dict())

        self.optimizer_p = optim.Adam(self.policy_net.parameters(), lr=self.lr_p)
        self.optimizer_a = optim.Adam([self.alpha], lr=self.lr_a)
        self.optimizer_q_1 = optim.Adam(self.q_net_1.parameters(), lr=self.lr_q)
        self.optimizer_q_2 = optim.Adam(self.q_net_2.parameters(), lr=self.lr_q)

    def choose_action(self, state):
        """select action"""
        state = FLOAT(state).unsqueeze(0).to(device)
        with torch.no_grad():
            action, _ = self.policy_net.get_action_log_prob(state)
        action = action.cpu().numpy()[0]
        return action, None

    def eval(self, i_iter, render=False):
        """evaluate model"""
        state = self.env.reset()
        test_reward = 0
        while True:
            if render:
                self.env.render()
            action, _ = self.choose_action(state)
            state, reward, done, _ = self.env.step(action)

            test_reward += reward
            if done:
                break
        print(f"Iter: {i_iter}, test Reward: {test_reward}")
        self.env.close()

    def learn(self, writer, i_iter, step):
        """interact"""

        state = self.env.reset()
        episode_reward = 0

        while True:

            if self.render:
                self.env.render()

            action, _ = self.choose_action(state)

            next_state, reward, done, _ = self.env.step(action)
            mask = 0 if done else 1
            # ('state', 'action', 'reward', 'next_state', 'mask')
            self.memory.push(state, action, reward, next_state, mask)

            episode_reward += reward

            if step >= self.min_update_step and step % self.update_step == 0:
                for k in range(1, self.update_step + 1):
                    batch = self.memory.sample(self.batch_size)  # random sample batch
                    self.update(batch, k)

            if done:
                break

            state = next_state

        self.env.close()

        print(f"Iter: {i_iter}, reward: {episode_reward}")
        # record reward information
        writer.add_scalar("sac_alpha/reward", episode_reward, i_iter)

    def update(self, batch, k_iter):
        """learn model"""
        batch_state = FLOAT(batch.state).to(device)
        batch_action = FLOAT(batch.action).to(device)
        batch_reward = FLOAT(batch.reward).to(device)
        batch_next_state = FLOAT(batch.next_state).to(device)
        batch_mask = FLOAT(batch.mask).to(device)

        # update by SAC Alpha
        sac_alpha_step(self.policy_net, self.q_net_1, self.q_net_2, self.alpha, self.q_net_target_1,
                       self.q_net_target_2,
                       self.optimizer_p, self.optimizer_q_1, self.optimizer_q_2, self.optimizer_a, batch_state,
                       batch_action, batch_reward, batch_next_state, batch_mask, self.gamma, self.polyak,
                       self.target_entropy,
                       k_iter % self.target_update_delay == 0)

    def load(self, model_path):
        print(f"Loading Saved Model from {model_path}")
        self.policy_net, self.q_net_1, self.q_net_2, self.alpha = torch.load(model_path, map_location=device)

    def save(self, save_path):
        """save model"""
        os.makedirs(save_path, exist_ok=True)

        torch.save((self.policy_net, self.q_net_1, self.q_net_2, self.alpha), f"{save_path}/WebEye_sac_alpha.pt")
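
A minimal driver for the class above, sketched under assumptions: a Gym-style continuous-control environment with the classic 4-tuple step API, a TensorBoard SummaryWriter, and the project-level modules used in the snippet (Actor, Value, FixedMemory, FLOAT, device, sac_alpha_step) available on the import path. The environment name, loop bounds, and paths are illustrative placeholders.

import gym
from torch.utils.tensorboard import SummaryWriter

env = gym.make("Pendulum-v0")                        # placeholder continuous-action env
agent = SAC_Alpha(env, min_update_step=100, seed=1)
writer = SummaryWriter(log_dir="runs/sac_alpha")     # hypothetical log directory

for i_iter in range(1, 1001):
    # `step` is a running iteration counter; updates begin once it reaches min_update_step
    agent.learn(writer, i_iter, step=i_iter)
    if i_iter % 100 == 0:
        agent.eval(i_iter)
        agent.save("checkpoints")                    # hypothetical save directory
writer.close()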
Example #2
class DDPG:
    def __init__(
        self,
        env=None,
        render=False,
        num_process=1,
        memory_size=1000000,
        lr_p=1e-3,
        lr_v=1e-3,
        gamma=0.99,
        polyak=0.995,
        explore_size=10000,
        batch_size=100,
        min_update_step=1000,
        update_step=50,
        action_noise=0.1,
        seed=1,
    ):
        self.env = env
        self.render = render
        self.gamma = gamma
        self.polyak = polyak
        self.memory = FixedMemory(memory_size)
        self.explore_size = explore_size
        self.num_process = num_process
        self.lr_p = lr_p
        self.lr_v = lr_v
        self.batch_size = batch_size
        self.min_update_step = min_update_step
        self.update_step = update_step
        self.action_noise = action_noise
        self.seed = seed

        self._init_model()

    def _init_model(self):
        """init model from parameters"""
        self.num_states = self.env.observation_space.shape[0]
        self.num_actions = self.env.action_space.shape[0]

        self.action_low, self.action_high = self.env.action_space.low[0], self.env.action_space.high[0]
        # seeding
        np.random.seed(self.seed)
        torch.manual_seed(self.seed)
        self.env.seed(self.seed)

        self.policy_net = Actor(self.num_states, self.num_actions,
                                self.action_high).to(device)
        self.policy_net_target = Actor(self.num_states, self.num_actions,
                                       self.action_high).to(device)

        self.value_net = Value(self.num_states + self.num_actions).to(device)
        self.value_net_target = Value(self.num_states +
                                      self.num_actions).to(device)

        self.policy_net_target.load_state_dict(self.policy_net.state_dict())
        self.value_net_target.load_state_dict(self.value_net.state_dict())

        self.optimizer_p = optim.Adam(self.policy_net.parameters(),
                                      lr=self.lr_p)
        self.optimizer_v = optim.Adam(self.value_net.parameters(),
                                      lr=self.lr_v)

    def choose_action(self, state, noise_scale):
        """select action"""
        self.policy_net.eval()
        state = FLOAT(state).unsqueeze(0).to(device)
        with torch.no_grad():
            action = self.policy_net(state)
        self.policy_net.train()
        action = action.cpu().numpy()[0]
        # add noise
        noise = noise_scale * np.random.randn(self.num_actions)
        action += noise
        action = np.clip(action, self.action_low, self.action_high)
        return action

    def eval(self, i_iter, render=False):
        """evaluate model"""
        self.policy_net.eval()
        self.value_net.eval()

        state = self.env.reset()
        test_reward = 0
        while True:
            if render:
                self.env.render()
            action = self.choose_action(state, 0)
            state, reward, done, _ = self.env.step(action)

            test_reward += reward
            if done:
                break
        print(f"Iter: {i_iter}, test Reward: {test_reward}")
        self.env.close()

    def learn(self, writer, i_iter, step):
        """interact"""
        self.policy_net.train()
        self.value_net.train()

        state = self.env.reset()
        episode_reward = 0

        while True:
            if self.render:
                self.env.render()

            action = self.choose_action(state, self.action_noise)

            next_state, reward, done, _ = self.env.step(action)
            mask = 0 if done else 1
            # ('state', 'action', 'reward', 'next_state', 'mask')
            self.memory.push(state, action, reward, next_state, mask)

            episode_reward += reward

            if step >= self.min_update_step and step % self.update_step == 0:
                for _ in range(self.update_step):
                    batch = self.memory.sample(self.batch_size)  # random sample batch
                    self.update(batch)

            if done:
                break

            state = next_state

        self.env.close()

        print(f"Iter: {i_iter}, reward: {episode_reward}")

        # record reward information
        writer.add_scalar("ddpg/reward", episode_reward, i_iter)

    def update(self, batch):
        """learn model"""
        batch_state = FLOAT(batch.state).to(device)
        batch_action = FLOAT(batch.action).to(device)
        batch_reward = FLOAT(batch.reward).to(device)
        batch_next_state = FLOAT(batch.next_state).to(device)
        batch_mask = FLOAT(batch.mask).to(device)

        # update by DDPG
        ddpg_step(self.policy_net, self.policy_net_target, self.value_net,
                  self.value_net_target, self.optimizer_p, self.optimizer_v,
                  batch_state, batch_action, batch_reward, batch_next_state,
                  batch_mask, self.gamma, self.polyak)

    def load(self, model_path):
        print(f"Loading Saved Model from {model_path}")
        self.policy_net, self.value_net = torch.load(model_path,
                                                     map_location=device)

    def save(self, save_path):
        """save model"""
        os.makedirs(save_path, exist_ok=True)
        torch.save((self.policy_net, self.value_net),
                   f"{save_path}/WebEye_ddpg.pt")
Example #3
def train_v_upper_envelope(states,
                           actions,
                           returns,
                           state_dim,
                           device,
                           seed,
                           upper_learning_rate=3e-3,
                           weight_decay=0.02,
                           max_step_num=int(1e6),
                           consecutive_steps=4,
                           k=10000):

    states = torch.from_numpy(np.array(states))
    actions = torch.from_numpy(np.array(actions))
    returns = torch.from_numpy(np.array(returns))  # returns are the Monte-Carlo returns G_t

    use_gpu = device == "cuda:0"

    # seeding
    np.random.seed(seed)
    torch.manual_seed(seed)

    # init the upper-envelope networks (ReLU activation)
    upper_envelope = Value(state_dim, activation='relu')
    upper_envelope_retrain = Value(state_dim, activation='relu')

    optimizer_upper = torch.optim.Adam(upper_envelope.parameters(),
                                       lr=upper_learning_rate,
                                       weight_decay=weight_decay)
    optimizer_upper_retrain = torch.optim.Adam(
        upper_envelope_retrain.parameters(),
        lr=upper_learning_rate,
        weight_decay=weight_decay)

    if use_gpu:
        upper_envelope = upper_envelope.cuda()
        upper_envelope_retrain = upper_envelope_retrain.cuda()
        # keep the data on the same device as the networks
        states, actions, returns = states.cuda(), actions.cuda(), returns.cuda()

    # =========================== #
    # Split the data into training and test sets,
    # but make sure the highest-return point is in the training set.

    # pick out the highest data point
    highestR, indice = torch.max(returns, 0)
    highestR = highestR.view(-1, 1)
    highestS = states[indice]
    highestA = actions[indice]
    print("HighestR:", highestR)

    statesW = torch.cat((states[:indice], states[indice + 1:]))
    actionsW = torch.cat((actions[:indice], actions[indice + 1:]))
    returnsW = torch.cat((returns[:indice], returns[indice + 1:]))

    # shuffle the data
    perm = np.arange(statesW.shape[0])
    np.random.shuffle(perm)
    perm = LongTensor(perm).cuda() if use_gpu else LongTensor(perm)
    statesW, actionsW, returnsW = statesW[perm], actionsW[perm], returnsW[perm]

    # divide data into train/test
    divide = int(states.shape[0] * 0.8)
    train_states, train_actions, train_returns = statesW[:divide], actionsW[:divide], returnsW[:divide]
    test_states, test_actions, test_returns = statesW[divide:], actionsW[divide:], returnsW[divide:]

    # add the highest data into training
    print(train_states.size(), highestS.size())
    print(train_actions.size(), highestA.size())
    print(train_returns.size(), highestR.size())
    train_states = torch.cat((train_states.squeeze(), highestS.unsqueeze(0)))
    train_actions = torch.cat((train_actions.squeeze(), highestA.unsqueeze(0)))
    train_returns = torch.cat(
        (train_returns.squeeze(), highestR.squeeze().unsqueeze(0)))

    # train upper envelope

    epoch_n = 100
    batch_size = 64
    optim_iter_num = int(math.ceil(train_states.shape[0] / batch_size))

    num_increase = 0
    previous_loss = math.inf

    calculate_vali = 2  # number of training epochs between validation checks
    best_parameters = upper_envelope.state_dict()
    running_training_steps = 0
    best_training_steps = running_training_steps

    # Upper Envelope Training starts
    upper_envelope.train()

    while num_increase < consecutive_steps:
        # update theta for n steps, n = calculate_vali
        # train calculate_vali steps
        for _ in range(calculate_vali):
            train_loss = 0
            perm = np.arange(train_states.shape[0])
            np.random.shuffle(perm)
            perm = LongTensor(perm).cuda() if use_gpu else LongTensor(perm)

            train_states, train_actions, train_returns = train_states[perm], train_actions[perm], train_returns[perm]

            for i in range(optim_iter_num):
                ind = slice(i * batch_size,
                            min((i + 1) * batch_size, train_states.shape[0]))
                states_b, returns_b = train_states[ind], train_returns[ind]
                states_b = states_b.float()
                returns_b = returns_b.float()
                Vsi = upper_envelope(states_b)
                # loss = loss_fn(Vsi, returns_b)
                loss = L2PenaltyLoss(Vsi, returns_b, k_val=k)
                train_loss += loss.detach()
                upper_envelope.zero_grad()
                loss.backward()
                optimizer_upper.step()

        # early stopping

        running_training_steps += calculate_vali

        # calculate validation error
        test_iter = int(math.ceil(test_states.shape[0] / batch_size))
        validation_loss = 0
        for n in range(test_iter):
            ind = slice(n * batch_size,
                        min((n + 1) * batch_size, test_states.shape[0]))
            states_t, returns_t = test_states[ind], test_returns[ind]
            states_t = states_t.float()
            returns_t = returns_t.float()
            Vsi = upper_envelope(states_t)
            loss = L2PenaltyLoss(Vsi, returns_t, k_val=k)
            validation_loss += loss.detach()

        if validation_loss < previous_loss:
            best_training_steps = running_training_steps
            previous_loss = validation_loss
            best_parameters = upper_envelope.state_dict()
            num_increase = 0
        else:
            num_increase += 1

    print("best_training_steps:", best_training_steps)
    upper_envelope.load_state_dict(best_parameters)

    # retrain on the whole set
    upper_envelope_retrain.train()

    optim_iter_num = int(math.ceil(states.shape[0] / batch_size))
    for _ in range(best_training_steps):
        train_loss = 0
        perm = np.arange(states.shape[0])
        np.random.shuffle(perm)
        perm = LongTensor(perm).cuda() if use_gpu else LongTensor(perm)

        states, actions, returns = states[perm], actions[perm], returns[perm]

        for i in range(optim_iter_num):
            ind = slice(i * batch_size,
                        min((i + 1) * batch_size, states.shape[0]))
            states_b, returns_b = states[ind], returns[ind]
            states_b = states_b.float()
            returns_b = returns_b.float()
            Vsi = upper_envelope_retrain(states_b)
            # loss = loss_fn(Vsi, returns_b)
            loss = L2PenaltyLoss(Vsi, returns_b, k_val=k)
            train_loss += loss.detach()
            upper_envelope_retrain.zero_grad()
            loss.backward()
            optimizer_upper_retrain.step()

    upper_envelope.load_state_dict(upper_envelope_retrain.state_dict())
    print("Policy training is complete.")

    return upper_envelope
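
A hedged sketch of a call site for train_v_upper_envelope, assuming Value, LongTensor and L2PenaltyLoss come from this project's utilities; the dimensions and the random buffer below are placeholders, not real data.

import numpy as np
import torch

state_dim = 11                                       # placeholder dimensions
states = np.random.randn(5000, state_dim).astype(np.float32)
actions = np.random.randn(5000, 3).astype(np.float32)
returns = np.random.randn(5000).astype(np.float32)   # Monte-Carlo returns G_t

device = "cuda:0" if torch.cuda.is_available() else "cpu"
upper_v = train_v_upper_envelope(states, actions, returns,
                                 state_dim=state_dim, device=device, seed=1)

# query the trained envelope: V(s) is fit to upper-bound the observed returns
with torch.no_grad():
    query = torch.from_numpy(states[:10]).float()
    query = query.cuda() if device == "cuda:0" else query
    v_hat = upper_v(query)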