Example #1
    def __init__(
        self,
        d_state,
        d_action,
        device,
        gamma,
        tau,
        policy_lr,
        value_lr,
        value_loss,
        value_n_layers,
        value_n_units,
        value_activation,
        policy_n_layers,
        policy_n_units,
        policy_activation,
        grad_clip,
        policy_delay=2,
        policy_noise=0.2,
        noise_clip=0.5,
        expl_noise=0.1,
        tdg_error_weight=0,
        td_error_weight=1,
    ):
        super().__init__()

        self.actor = Actor(d_state, d_action, policy_n_layers, policy_n_units,
                           policy_activation).to(device)
        self.actor_target = copy.deepcopy(self.actor)
        self.actor_optimizer = RAdam(self.actor.parameters(), lr=policy_lr)

        self.critic = ActionValueFunction(d_state, d_action, value_n_layers,
                                          value_n_units,
                                          value_activation).to(device)
        self.critic_target = copy.deepcopy(self.critic)
        self.critic_optimizer = RAdam(self.critic.parameters(), lr=value_lr)

        self.discount = gamma
        self.tau = tau
        self.policy_delay = policy_delay
        self.policy_noise = policy_noise
        self.noise_clip = noise_clip
        self.expl_noise = expl_noise
        self.normalizer = None
        self.value_loss = value_loss
        self.grad_clip = grad_clip
        self.device = device
        self.last_actor_loss = 0

        self.tdg_error_weight = tdg_error_weight
        self.td_error_weight = td_error_weight
        self.step_counter = 0
Example #2
    def __init__(self, use_conv, nets, dimO, dimA, obs, obs2, is_training,
                 sess, scope='hyperactor'):
        self.actors = []
        with tf.variable_scope(scope):
            for i in range(FLAGS.num_options):
                actor = Actor(use_conv, nets, dimO, dimA, obs, obs2, is_training,
                              sess, scope='actor%d' % i)
                self.actors.append(actor)

        super(HyperOptionsActor, self).__init__(use_conv, nets, dimO, [FLAGS.num_options], obs, obs2,
                                                is_training, sess, scope)
Example #3
File: test.py Project: YuanyeMa/RL
def main():
    env = NormalizedEnv(gym.make('Pendulum-v0'))

    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]

    agent = Actor(state_dim, action_dim).to('cuda')

    agent.load_state_dict(torch.load('./Models/78.0_actor.pt'))

    episode = 0
    done = False
    episode_list = []
    while episode < 100:
        episode_reward = 0
        state = env.reset()
        state = (state - env.observation_space.low) / (
            env.observation_space.high - env.observation_space.low)
        state = to_tensor(state)
        while not done:
            action = agent.forward(state).detach().cpu().data.numpy()
            state_, reward, done, _ = env.step(action)
            state_ = (state_ - env.observation_space.low) / (
                env.observation_space.high - env.observation_space.low)
            env.render()
            state = to_tensor(state_)
            episode_reward += reward

        episode_list.append(episode_reward)
        episode += 1
        done = False
        print("{} : {}".format(episode, episode_reward))

    import matplotlib.pyplot as plt
    x = np.arange(100)
    y = np.array(episode_list)
    plt.plot(x, y)
    plt.savefig("./test_episode_reward.png")

    env.close()
Example #4
def get_policy(dump: str, action_spec):
    state_dict = torch.load(dump, map_location="cpu")
    policy = Actor(*state_dict["args"].tolist())
    policy.load_state_dict(state_dict)
    policy.eval()

    @torch.no_grad()
    def _policy(time_step):
        state = np.concatenate(list(time_step.observation.values()))
        state_tensor = torch.tensor(state, dtype=torch.float)
        p = policy(state_tensor).numpy()
        return np.clip(p, action_spec.minimum, action_spec.maximum)

    return _policy
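A hedged usage sketch for get_policy, assuming dm_control is installed, that a compatible checkpoint exists, and that "actor_dump.pt" and the cartpole task are placeholders only:

# Illustrative only: the checkpoint path and environment choice are assumptions.
from dm_control import suite

env = suite.load(domain_name="cartpole", task_name="swingup")
policy = get_policy("actor_dump.pt", env.action_spec())  # hypothetical checkpoint

time_step = env.reset()
total_reward = 0.0
while not time_step.last():
    time_step = env.step(policy(time_step))
    total_reward += time_step.reward or 0.0
print("episode return:", total_reward)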
Example #5
class Action():
    def __init__(self, state_dim, action_dim):
        self.actor = Actor(state_dim, action_dim)
        self.actor.eval()
        self.noise = OrnsteinUhlenbeckActionNoise(action_dim)
        self.to_tensor = util.to_tensor
        pass

    def choose_action(self, state, explore):
        if explore:
            a0 = self.get_exploration_action(state)
        else:
            a0 = self.get_exploitation_action(state)
        return a0

    def get_exploitation_action(self, state):
        '''Return the action computed by the actor network for the given state, without exploration.
            Args:
                state: numpy array
            Returns:
                action: numpy array
        '''
        action = self.actor.forward(self.to_tensor(state)).squeeze(0)
        action = action.cpu().data.numpy()
        return action

    def get_exploration_action(self, state):
        '''Return a noisy action computed by the actor network for the given state, to provide some exploration.
            Args:
                state: numpy array
            Returns:
                action: numpy array
        '''
        action = self.actor.forward(self.to_tensor(state)).squeeze(0)
        new_action = action.cpu().data.numpy() + (self.noise.sample())
        new_action = new_action.clip(min=-1, max=1)
        return new_action

    def load_param(self, source_model):
        self.actor.load_state_dict(source_model.state_dict())
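The OrnsteinUhlenbeckActionNoise used above comes from the project's own utilities and is not shown here; a minimal self-contained sketch of the same idea (parameter defaults are assumptions) looks like this:

import numpy as np

class SimpleOUNoise:
    """Temporally correlated noise for DDPG-style exploration."""
    def __init__(self, action_dim, mu=0.0, theta=0.15, sigma=0.2):
        self.action_dim = action_dim
        self.mu = mu
        self.theta = theta
        self.sigma = sigma
        self.x = np.ones(action_dim) * mu

    def sample(self):
        # Mean-reverting step plus a Gaussian perturbation.
        dx = self.theta * (self.mu - self.x) + self.sigma * np.random.randn(self.action_dim)
        self.x = self.x + dx
        return self.x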
Example #6
REPLACE_ITER_A = args.target_update_a
REPLACE_ITER_C = args.target_update_c
GAMMA = args.gamma

env = Env()
STATE_DIM = env.state_dim
ACTION_DIM = env.action_dim
ACTION_BOUND = env.action_bound

config = tf.ConfigProto()
config.gpu_options.allow_growth = True
config.log_device_placement = True
sess = tf.Session(config=config)

# Create actor and critic.
actor = Actor(sess, ACTION_DIM, ACTION_BOUND[1], LR_A, REPLACE_ITER_A)
critic = Critic(sess, STATE_DIM, ACTION_DIM, LR_C, GAMMA, REPLACE_ITER_C, actor.a, actor.a_)
actor.add_grad_to_graph(critic.a_grads)

M = Memory(MEMORY_CAPACITY, dims=2 * STATE_DIM + ACTION_DIM + 1)

saver = tf.train.Saver()
path = './checkpoints'

saver.restore(sess, tf.train.latest_checkpoint(path))

def eval():
    s = env.reset()
    while True:
        a = actor.choose_action(s)
        s_, r, done, collision = env.step(a)
Example #7
from memory import *

if __name__ == '__main__':

    env = gym.make('Pendulum-v0')
    env = env.unwrapped

    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    action_bound = env.action_space.high

    var = 3.

    with tf.Session() as sess:
        memory = Memory(32, 10000)
        actor = Actor(sess, state_dim, action_bound, lr=0.01, tau=0.01)
        critic = Critic(sess, state_dim, actor.s, actor.s_, actor.a, actor.a_, gamma=0.9, lr=0.001, tau=0.01)
        t = critic.get_gradients()

        actor.generate_gradients(t)

        sess.run(tf.global_variables_initializer())

        for i in range(1000):
            s = env.reset()
            r_episode = 0
            for j in range(200):
                a = actor.choose_action(s)
                a = np.clip(np.random.normal(a, var), -action_bound, action_bound)  # off-policy exploration noise
                s_, r, done, info = env.step(a)
Example #8

env = gym.make('Duckietown-udem1-v0')

# Wrappers

env = ResizeWrapper(env)
env = NormalizeWrapper(env)
env = ImgWrapper(env)  # to make the images from 160x120x3 into 3x160x120
env = ActionWrapper(env)
env = DtRewardWrapper(env)
state_size = env.observation_space.shape
action_size = env.action_space.shape[0]
max_action = float(env.action_space.high[0])

actor_agent = Actor(state_size, action_size, max_action)
actor_state_dict = torch.load(
    '/home/ivlabs/users/sharath/final_year_thesis/ddpg_models/checkpoint_13_actor.pth'
)
actor_agent.load_state_dict(actor_state_dict)
stack_size = 4
stacked_frames = deque(
    [np.zeros((120, 160), dtype=int) for i in range(stack_size)], maxlen=4)
state = env.reset()
with torch.no_grad():
    while True:
        state = env.reset()
        state, stacked_frames = stack_images(stacked_frames, state, True)
        rewards = []
        while True:
            state = torch.from_numpy(state).float()
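stack_images is a project helper that is not shown in this snippet; a minimal sketch of the usual frame-stacking pattern it appears to follow (frame shape and stacking axis are assumptions) is:

import numpy as np
from collections import deque

def stack_images(stacked_frames, frame, is_new_episode, stack_size=4):
    frame = np.asarray(frame)
    if is_new_episode:
        # Start a fresh stack by repeating the first frame.
        stacked_frames = deque([frame] * stack_size, maxlen=stack_size)
    else:
        stacked_frames.append(frame)
    # Stack along a new leading axis, e.g. (stack_size, C, H, W).
    return np.stack(stacked_frames, axis=0), stacked_frames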
Example #9
    def __init__(self,
                 env: callable,
                 state_shape: list,
                 action_size: int,
                 q_network_shape: tuple,
                 mu_network_shape: tuple,
                 buffer_size: int,
                 gamma: float,
                 tau: float,
                 noise_stddev: float,
                 save_dir: str,
                 actor_learning_rate: float,
                 critic_learning_rate: float,
                 batch_size: int,
                 episode: int,
                 train_epoch: int,
                 run_epoch: int = 100,
                 action_reshape: callable = None):
        self.env = env
        self.batch_size = batch_size
        self.state_shape = state_shape
        self.action_size = action_size
        self.q_network_shape = q_network_shape
        self.mu_network_shape = mu_network_shape
        self.buffer_size = buffer_size
        self.noise_stddev = noise_stddev
        self.episode = episode
        self.train_epoch = train_epoch
        self.dir = save_dir
        self.action_reshape = action_reshape
        self.run_epoch = run_epoch

        state_i = tf.placeholder("float32", [batch_size] + state_shape)
        state_i_next = tf.placeholder("float32", [batch_size] + state_shape)
        action_i = tf.placeholder("float32", [batch_size, action_size])
        mu_apo = Mu_Model("Mu_apo",
                          state_i_next,
                          action_size,
                          mu_network_shape,
                          batch_size,
                          trainable=False)
        q = Q_Model("Q_0", state_i, action_i, q_network_shape, batch_size)
        mu = Mu_Model("Mu_0",
                      state_i,
                      action_size,
                      mu_network_shape,
                      batch_size,
                      y_grads=q.a_grads)
        q_apo = Q_Model("Q_apo",
                        state_i_next,
                        mu_apo.a,
                        q_network_shape,
                        batch_size,
                        trainable=False)
        self._actor = Actor(mu, mu_apo, gamma, tau, actor_learning_rate)
        self._critic = Critic(q, q_apo, gamma, tau, batch_size,
                              critic_learning_rate)
        self._s_buf = ReplayBuf(buffer_size, self.state_shape)
        self._a_buf = ReplayBuf(buffer_size, [self.action_size])
        self._r_buf = ReplayBuf(buffer_size, [1])
        self._s_next_buf = ReplayBuf(buffer_size, self.state_shape)
        self._sess = None
        self._saver = tf.train.Saver()
Example #10
class Model(object):
    def __init__(self,
                 env: callable,
                 state_shape: list,
                 action_size: int,
                 q_network_shape: tuple,
                 mu_network_shape: tuple,
                 buffer_size: int,
                 gamma: float,
                 tau: float,
                 noise_stddev: float,
                 save_dir: str,
                 actor_learning_rate: float,
                 critic_learning_rate: float,
                 batch_size: int,
                 episode: int,
                 train_epoch: int,
                 run_epoch: int = 100,
                 action_reshape: callable = None):
        self.env = env
        self.batch_size = batch_size
        self.state_shape = state_shape
        self.action_size = action_size
        self.q_network_shape = q_network_shape
        self.mu_network_shape = mu_network_shape
        self.buffer_size = buffer_size
        self.noise_stddev = noise_stddev
        self.episode = episode
        self.train_epoch = train_epoch
        self.dir = save_dir
        self.action_reshape = action_reshape
        self.run_epoch = run_epoch

        state_i = tf.placeholder("float32", [batch_size] + state_shape)
        state_i_next = tf.placeholder("float32", [batch_size] + state_shape)
        action_i = tf.placeholder("float32", [batch_size, action_size])
        mu_apo = Mu_Model("Mu_apo",
                          state_i_next,
                          action_size,
                          mu_network_shape,
                          batch_size,
                          trainable=False)
        q = Q_Model("Q_0", state_i, action_i, q_network_shape, batch_size)
        mu = Mu_Model("Mu_0",
                      state_i,
                      action_size,
                      mu_network_shape,
                      batch_size,
                      y_grads=q.a_grads)
        q_apo = Q_Model("Q_apo",
                        state_i_next,
                        mu_apo.a,
                        q_network_shape,
                        batch_size,
                        trainable=False)
        self._actor = Actor(mu, mu_apo, gamma, tau, actor_learning_rate)
        self._critic = Critic(q, q_apo, gamma, tau, batch_size,
                              critic_learning_rate)
        self._s_buf = ReplayBuf(buffer_size, self.state_shape)
        self._a_buf = ReplayBuf(buffer_size, [self.action_size])
        self._r_buf = ReplayBuf(buffer_size, [1])
        self._s_next_buf = ReplayBuf(buffer_size, self.state_shape)
        self._sess = None
        self._saver = tf.train.Saver()

    def train(self):
        # input definition
        s_i = self._actor.s
        s_i_next = self._actor.s_apo
        a_i = self._critic.a
        r_i = self._critic.reward

        # data container definition
        data_s_i = np.zeros([self.batch_size] + self.state_shape)

        ck_pt = tf.train.get_checkpoint_state(self.dir)
        if ck_pt is not None:
            self._sess = tf.Session()
            # Restore weights from the latest checkpoint in the save directory.
            self._saver.restore(self._sess, ck_pt.model_checkpoint_path)
        else:
            self._sess = tf.Session()
            self._sess.run([
                tf.global_variables_initializer(),
                tf.local_variables_initializer()
            ])
            self._sess.run([
                self._actor.init_target_net(),
                self._critic.init_target_net()
            ])

        count = 0
        for _ in range(self.episode):
            end_state = self.env.reset()
            while True:
                try:
                    self.env.render()
                except:
                    pass
                start_state = end_state
                data_s_i[0] = start_state
                action = self._sess.run(self._actor.a, {
                    s_i: data_s_i
                })[0] + np.random.normal(
                    0, scale=self.noise_stddev,
                    size=self.action_size)  # get an action, a = Mu(s)+Noise
                if self.action_reshape is not None:
                    end_state, reward, _done, _ = self.env.step(
                        self.action_reshape(action))
                else:
                    end_state, reward, _done, _ = self.env.step(action)
                #print("Action: {}".format(action))
                #print("Reward: {}".format(reward))

                self._s_buf.append(start_state)
                self._a_buf.append(action)
                self._r_buf.append(np.array([reward]))
                self._s_next_buf.append(end_state)

                count += 1

                if _done:  # final state
                    break

                if len(self._s_buf) >= self.batch_size * 10 and count >= self.run_epoch:
                    print("Action: {}".format(action))
                    count = 0
                    loss = np.zeros([0])
                    q = np.zeros([0])

                    for i in range(self.train_epoch):
                        sample = list(range(len(self._s_buf)))
                        np.random.shuffle(sample)
                        data_s_i = self._s_buf.get_by_indexes(
                            sample[:self.batch_size])  # get batch
                        data_a_i = self._a_buf.get_by_indexes(
                            sample[:self.batch_size])
                        data_r_i = self._r_buf.get_by_indexes(
                            sample[:self.batch_size])
                        data_s_i_next = self._s_next_buf.get_by_indexes(
                            sample[:self.batch_size])

                        _, loss = self._sess.run(
                            [self._critic.minimize_loss(), self._critic.loss
                             ],  # minimize critic loss
                            {
                                s_i: data_s_i,
                                a_i: data_a_i,
                                s_i_next: data_s_i_next,
                                r_i: data_r_i
                            })

                        a = self._sess.run(self._actor.a, {s_i: data_s_i})
                        _, a = self._sess.run(
                            [self._actor.maximize_action_q(), self._actor.a
                             ],  # maximize actor-critic value
                            {
                                s_i: data_s_i,
                                a_i: a
                            })
                        q = self._sess.run(
                            self._critic.Q,  # calculate q value
                            {
                                s_i: data_s_i,
                                a_i: a
                            })
                        self._sess.run(self._critic.update_target_net()
                                       )  # update target network
                        self._sess.run(self._actor.update_target_net())

                    print("Average loss: {}".format(loss.mean()))
                    print("Average Q value: {}".format(q.mean()))
Example #11
    session = tf.Session()

    actors = []
    critics = []
    actors_noise = []
    memories = []
    # actors & critics
    for i in range(env.n):
        n_action = env.action_space[i].n
        state_size = env.observation_space[i].shape[0]
        state = tf.placeholder(tf.float32, shape=[None, state_size])
        reward = tf.placeholder(tf.float32, [None, 1])
        state_next = tf.placeholder(tf.float32, shape=[None, state_size])
        speed = 0.8 if env.agents[i].adversary else 1

        actors.append(Actor('actor' + str(i), session, n_action, speed,
                            state, state_next))
        critics.append(Critic('critic' + str(i), session, n_action,
                              actors[i].eval_actions, actors[i].target_actions,
                              state, state_next, reward))
        actors[i].add_gradients(critics[i].action_gradients)
        actors_noise.append(OrnsteinUhlenbeckActionNoise(
            mu=ou_mus[i],
            sigma=ou_sigma[i],
            theta=ou_theta[i],
            dt=ou_dt[i],
            x0=ou_x0[i]))
        memories.append(Memory(args.memory_size))

    session.run(tf.global_variables_initializer())
    saver = tf.train.Saver(max_to_keep=10000000)
Example #12
    def __init__(self, state_dim, action_dim):
        self.actor = Actor(state_dim, action_dim)
        self.actor.eval()
        self.noise = OrnsteinUhlenbeckActionNoise(action_dim)
        self.to_tensor = util.to_tensor
Example #13
class TD3(nn.Module):
    def __init__(
        self,
        d_state,
        d_action,
        device,
        gamma,
        tau,
        policy_lr,
        value_lr,
        value_loss,
        value_n_layers,
        value_n_units,
        value_activation,
        policy_n_layers,
        policy_n_units,
        policy_activation,
        grad_clip,
        policy_delay=2,
        policy_noise=0.2,
        noise_clip=0.5,
        expl_noise=0.1,
        tdg_error_weight=0,
        td_error_weight=1,
    ):
        super().__init__()

        self.actor = Actor(d_state, d_action, policy_n_layers, policy_n_units,
                           policy_activation).to(device)
        self.actor_target = copy.deepcopy(self.actor)
        self.actor_optimizer = RAdam(self.actor.parameters(), lr=policy_lr)

        self.critic = ActionValueFunction(d_state, d_action, value_n_layers,
                                          value_n_units,
                                          value_activation).to(device)
        self.critic_target = copy.deepcopy(self.critic)
        self.critic_optimizer = RAdam(self.critic.parameters(), lr=value_lr)

        self.discount = gamma
        self.tau = tau
        self.policy_delay = policy_delay
        self.policy_noise = policy_noise
        self.noise_clip = noise_clip
        self.expl_noise = expl_noise
        self.normalizer = None
        self.value_loss = value_loss
        self.grad_clip = grad_clip
        self.device = device
        self.last_actor_loss = 0

        self.tdg_error_weight = tdg_error_weight
        self.td_error_weight = td_error_weight
        self.step_counter = 0

    def setup_normalizer(self, normalizer):
        self.normalizer = copy.deepcopy(normalizer)

    def get_action(self, states, deterministic=False):
        states = states.to(self.device)
        with torch.no_grad():
            if self.normalizer is not None:
                states = self.normalizer.normalize_states(states)
            actions = self.actor(states)
            if not deterministic:
                actions += torch.randn_like(actions) * self.expl_noise
            return actions.clamp(-1, +1)

    def get_action_with_logp(self, states):
        states = states.to(self.device)
        if self.normalizer is not None:
            states = self.normalizer.normalize_states(states)
        a = self.actor(states)
        return a, torch.ones(
            a.shape[0], device=a.device) * np.inf  # inf: should not be used

    def get_action_value(self, states, actions):
        with torch.no_grad():
            states = states.to(self.device)
            actions = actions.to(self.device)
            return self.critic(states, actions)[0]  # just q1

    def update(self, states, actions, logps, rewards, next_states, masks):
        if self.normalizer is not None:
            states = self.normalizer.normalize_states(states)
            next_states = self.normalizer.normalize_states(next_states)
        self.step_counter += 1

        # Select action according to policy and add clipped noise
        noise = (torch.randn_like(actions) * self.policy_noise).clamp(
            -self.noise_clip, self.noise_clip)
        raw_next_actions = self.actor_target(next_states)
        next_actions = (raw_next_actions + noise).clamp(-1, 1)

        # Compute the target Q value
        next_Q1, next_Q2 = self.critic_target(next_states, next_actions)
        next_Q = torch.min(next_Q1, next_Q2)
        q_target = rewards.unsqueeze(
            1) + self.discount * masks.float().unsqueeze(1) * next_Q
        zero_targets = torch.zeros_like(q_target, device=self.device)

        # Get current Q estimates
        q1, q2 = self.critic(states, actions)
        q1_td_error, q2_td_error = q_target - q1, q_target - q2

        critic_loss = torch.tensor(0, device=self.device)
        standard_loss = torch.tensor(0, device=self.device)
        gradient_loss = torch.tensor(0, device=self.device)
        if self.td_error_weight != 0:
            # Compute standard critic loss
            if self.value_loss == 'huber':
                standard_loss = 0.5 * (
                    F.smooth_l1_loss(q1_td_error, zero_targets) +
                    F.smooth_l1_loss(q2_td_error, zero_targets))
            elif self.value_loss == 'mse':
                standard_loss = 0.5 * (F.mse_loss(q1_td_error, zero_targets) +
                                       F.mse_loss(q2_td_error, zero_targets))
            critic_loss = critic_loss + self.td_error_weight * standard_loss
        if self.tdg_error_weight != 0:
            # Compute gradient critic loss
            gradients_error_norms1 = torch.autograd.grad(
                outputs=q1_td_error,
                inputs=actions,
                grad_outputs=torch.ones(q1_td_error.size(),
                                        device=self.device),
                retain_graph=True,
                create_graph=True,
                only_inputs=True)[0].flatten(start_dim=1).norm(dim=1,
                                                               keepdim=True)
            gradients_error_norms2 = torch.autograd.grad(
                outputs=q2_td_error,
                inputs=actions,
                grad_outputs=torch.ones(q2_td_error.size(),
                                        device=self.device),
                retain_graph=True,
                create_graph=True,
                only_inputs=True)[0].flatten(start_dim=1).norm(dim=1,
                                                               keepdim=True)
            if self.value_loss == 'huber':
                gradient_loss = 0.5 * (
                    F.smooth_l1_loss(gradients_error_norms1, zero_targets) +
                    F.smooth_l1_loss(gradients_error_norms2, zero_targets))
            elif self.value_loss == 'mse':
                gradient_loss = 0.5 * (
                    F.mse_loss(gradients_error_norms1, zero_targets) +
                    F.mse_loss(gradients_error_norms2, zero_targets))
            critic_loss = critic_loss + self.tdg_error_weight * gradient_loss

        # Optimize the critic
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_value_(self.critic.parameters(),
                                        self.grad_clip)
        self.critic_optimizer.step()

        if self.step_counter % self.policy_delay == 0:
            # Compute actor loss
            q1, q2 = self.critic(
                states,
                self.actor(states))  # originally in TD3 we had here q1 only
            q_min = torch.min(q1, q2)
            actor_loss = -q_min.mean()
            self.last_actor_loss = actor_loss.item()

            # Optimize the actor
            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            torch.nn.utils.clip_grad_value_(self.actor.parameters(),
                                            self.grad_clip)
            self.actor_optimizer.step()

            # Update the frozen target policy
            for param, target_param in zip(self.actor.parameters(),
                                           self.actor_target.parameters()):
                target_param.data.copy_(self.tau * param.data +
                                        (1 - self.tau) * target_param.data)

        # Update the frozen target value function
        for param, target_param in zip(self.critic.parameters(),
                                       self.critic_target.parameters()):
            target_param.data.copy_(self.tau * param.data +
                                    (1 - self.tau) * target_param.data)

        return (raw_next_actions[0, 0].item(),
                self.td_error_weight * standard_loss.item(),
                self.tdg_error_weight * gradient_loss.item(),
                self.last_actor_loss)

    @staticmethod
    def catastrophic_divergence(q_loss, pi_loss):
        return q_loss > 1e2 or (pi_loss is not None and abs(pi_loss) > 1e5)
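
The target-network updates in this class are Polyak averaging, target = tau * online + (1 - tau) * target. A standalone helper with the same behavior, shown only for reference:

import torch

@torch.no_grad()
def soft_update(online: torch.nn.Module, target: torch.nn.Module, tau: float):
    # In-place Polyak update of every target parameter.
    for p, p_target in zip(online.parameters(), target.parameters()):
        p_target.data.mul_(1.0 - tau).add_(tau * p.data)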