Example #2
    def __init__(self, state_dim, action_dim, max_action):
        self.actor = Actor(state_dim, action_dim,
                           max_action).to(device)  # trained by gradient descent
        self.actor_target = Actor(state_dim, action_dim,
                                  max_action).to(device)  # updated by Polyak averaging
        self.actor_target.load_state_dict(
            self.actor.state_dict())  # initialize target with the actor's parameters
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters())
        self.critic = Critic(state_dim, action_dim).to(device)
        self.critic_target = Critic(state_dim, action_dim).to(device)
        self.critic_target.load_state_dict(self.critic.state_dict())
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters())
        self.max_action = max_action
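
This fragment (like Example #8 below) refers to a device object that is not defined in the excerpt. A minimal sketch of the usual definition, added here as an assumption rather than part of the original example:

import torch

# Assumed device selection; not shown in the original snippet.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")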
Example #3
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Reward monitoring
        self.best_total_reward = -np.inf

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.01  # for soft update of target parameters
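
Example #3 (and Example #5 below) constructs a ReplayBuffer(buffer_size, batch_size) and an OUNoise process that are not included in the excerpt. A minimal ReplayBuffer sketch consistent with how it is used later (add(), sample(), len(), and experience tuples exposing .state, .action, .reward, .next_state, .done) could look like this; the details are assumptions, not the original helper:

import random
from collections import deque, namedtuple


class ReplayBuffer:
    """Sketch of the assumed ReplayBuffer: fixed-size buffer of experience tuples."""

    def __init__(self, buffer_size, batch_size):
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.experience = namedtuple(
            "Experience",
            field_names=["state", "action", "reward", "next_state", "done"])

    def add(self, state, action, reward, next_state, done):
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self, batch_size=None):
        # Return a random batch of experience namedtuples
        return random.sample(self.memory, k=batch_size or self.batch_size)

    def __len__(self):
        return len(self.memory)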
Example #4
class DDPGagent:
    def __init__(self, hidden_size, env):
        self.num_states = env.observation_space.shape[0]
        self.num_actions = env.action_space.shape[0]

        self.Actor = Actor(input_size=self.num_states,
                           hidden_size=hidden_size,
                           output_size=self.num_actions).cuda()

        self.Actor_target = Actor(input_size=self.num_states,
                                  hidden_size=hidden_size,
                                  output_size=self.num_actions).cuda()

        self.Critic = Critic(input_size=self.num_states,
                             hidden_size=hidden_size,
                             output_size=self.num_actions).cuda()

        self.Critic_target = Critic(input_size=self.num_states,
                                    hidden_size=hidden_size,
                                    output_size=self.num_actions).cuda()

        for target_param, param in zip(self.Actor_target.parameters(),
                                       self.Actor.parameters()):
            target_param.data.copy_(param.data)  # hard copy of the actor weights into the target

        for target_param, param in zip(self.Critic_target.parameters(),
                                       self.Critic.parameters()):
            target_param.data.copy_(param.data)  # hard copy of the critic weights into the target

        self.Memory = Memory(30000)
        self.criterion = nn.MSELoss().cuda()
        self.actor_optimizer = torch.optim.Adam(self.Actor.parameters(),
                                                lr=1e-2)
        self.critic_optimizer = torch.optim.Adam(self.Critic.parameters(),
                                                 lr=1e-1)

    def get_action(self, state):
        # Batch the state, run the actor, and return the action as a NumPy array
        state = torch.from_numpy(state).float().unsqueeze(0).cuda()
        action = self.Actor(state)
        action = action.detach().cpu().numpy()
        return action

    def update(self, batch_size):
        states, actions, rewards, next_states, _ = self.Memory.sample(
            batch_size)
        states = torch.tensor(states, dtype=torch.float32).cuda()
        actions = torch.tensor(actions, dtype=torch.float32).cuda()
        rewards = torch.tensor(rewards, dtype=torch.float32).cuda()
        next_states = torch.tensor(next_states, dtype=torch.float32).cuda()

        # Critic loss: TD target built from the target networks
        # (terminal-state masking is omitted in this example)
        Q_value = self.Critic(states, actions)
        next_actions = self.Actor_target(next_states)
        next_Q = self.Critic_target(next_states, next_actions.detach())
        Q_prime = rewards + 0.99 * next_Q
        critic_loss = self.criterion(Q_value, Q_prime)

        # Actor loss: maximize the critic's value of the actor's actions
        policy_loss = -self.Critic(states, self.Actor(states)).mean()

        self.actor_optimizer.zero_grad()
        policy_loss.backward()
        self.actor_optimizer.step()

        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # Soft-update the target networks (coefficient 1e-2)
        for target_param, param in zip(self.Actor_target.parameters(),
                                       self.Actor.parameters()):
            target_param.data.copy_(param.data * 1e-2 + target_param.data *
                                    (1.0 - 1e-2))

        for target_param, param in zip(self.Critic_target.parameters(),
                                       self.Critic.parameters()):
            target_param.data.copy_(param.data * 1e-2 + target_param.data *
                                    (1.0 - 1e-2))
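
The DDPGagent class above constructs Memory(30000) and samples five fields from it in update(), but the Memory class itself is not shown. A minimal replay-buffer sketch consistent with that usage follows; the push() method name and signature are assumptions:

import random
from collections import deque

import numpy as np


class Memory:
    """Minimal replay-buffer sketch; push() is assumed, not from the original."""

    def __init__(self, max_size):
        self.buffer = deque(maxlen=max_size)

    def push(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        batch = random.sample(self.buffer, batch_size)
        states, actions, rewards, next_states, dones = map(np.stack, zip(*batch))
        # Return rewards/dones as column vectors so they broadcast against the critic output
        return states, actions, rewards.reshape(-1, 1), next_states, dones.reshape(-1, 1)

    def __len__(self):
        return len(self.buffer)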
Example #5
class DDPG():
    """Reinforcement Learning agent that learns using DDPG."""
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Reward monitoring
        self.best_total_reward = -np.inf

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.01  # for soft update of target parameters

    def reset_episode(self):
        self.total_reward = 0.0
        #self.count = 0
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        self.total_reward += reward

        if self.total_reward > self.best_total_reward:
            self.best_total_reward = self.total_reward

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state

    def act(self, state):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action +
                    self.noise.sample())  # add some noise for exploration

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences
                            if e is not None]).astype(np.float32).reshape(
                                -1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None
                            ]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences
                          if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions],
                                               y=Q_targets)

        # Train actor model (local); the trailing 0 passed below is the Keras
        # learning-phase flag (0 = inference) expected by the backend function
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients,
                                   1])  # custom training function (1 = training phase)

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(
            target_weights
        ), "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 -
                                                  self.tau) * target_weights
        target_model.set_weights(new_weights)
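
The OUNoise process used by Examples #3 and #5 is likewise not shown. A common Ornstein-Uhlenbeck implementation matching the constructor OUNoise(size, mu, theta, sigma) is sketched below; treat it as an assumption rather than the original helper:

import copy

import numpy as np


class OUNoise:
    """Ornstein-Uhlenbeck process sketch matching OUNoise(size, mu, theta, sigma)."""

    def __init__(self, size, mu, theta, sigma):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        """Reset the internal state to the mean."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Update the internal state and return it as a noise sample."""
        dx = self.theta * (self.mu - self.state) + self.sigma * np.random.randn(len(self.state))
        self.state = self.state + dx
        return self.state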
Example #6
import gym
import matplotlib.pyplot as plt
from Actor_Critic import Actor, Critic

GAME = 'CartPole-v0'
EPISODES = 1000
RENDER = False

env = gym.make(GAME)
env = env.unwrapped

n_states = env.observation_space.shape[0]
n_actions = env.action_space.n

actor = Actor(n_states, n_actions)
critic = Critic(n_states)


def run():
    plt.ion()
    total_r = 0
    avg_ep_r_hist = []
    for episode in range(EPISODES):
        ep_r = 0
        s = env.reset()
        while True:
            a = actor.choose_action(s)
            s_, r, done, info = env.step(a)
            r = -10 if done else r

            ep_r += r
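
The run() loop is cut off here. For reference, a typical continuation of this kind of actor-critic episode loop, assuming the Actor and Critic classes from Actor_Critic expose learn() methods (an assumption, since that module is not shown), is sketched below:

            # Sketch of a typical continuation (assumed API: critic.learn / actor.learn)
            td_error = critic.learn(s, r, s_)  # critic evaluates the transition, returns the TD error
            actor.learn(s, a, td_error)        # actor is updated in the direction of the TD error
            s = s_
            if done:
                break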
Example #7
                    list(zip(all_new_select_x,all_new_select_y)),all_new_select_x.shape[0], 1,shuffle=True)

                for batch in batches:
                    x_batch, y_batch = zip(*batch)
                    # print(x_batch, y_batch)
                    train_step(x_batch, y_batch)

                if episode % 2 == 0:  # evaluate on the dev set every two episodes
                    print("\nEvaluation:")
                    n2, p2, acc2 = dev_step(x_dev, y_dev, writer=dev_summary_writer)
                    print("acc_ori", acc_ori, "new_select_acc", acc2)

sess = tf.Session()

actor = Actor(sess, n_features=N_F, n_actions=N_A, lr=LR_A)
critic = Critic(sess, n_features=N_F, lr=LR_C)     # we need a good teacher, so the teacher should learn faster than the actor

sess.run(tf.global_variables_initializer())

if OUTPUT_GRAPH:
    tf.summary.FileWriter("logs/", sess.graph)

for i_episode in range(MAX_EPISODE):
    s = env.reset()
    t = 0
    track_r = []
    while True:
        if RENDER: env.render()

        a = actor.choose_action(s)
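
This fragment references env, N_F, N_A, LR_A, LR_C, MAX_EPISODE, RENDER and OUTPUT_GRAPH without defining them. A plausible preceding setup for the classic CartPole variant of this script, offered purely as an assumption with illustrative values, is:

import gym
import tensorflow as tf  # the snippet's tf.Session call requires the TensorFlow 1.x API

# Assumed constants; values are illustrative only.
OUTPUT_GRAPH = False
MAX_EPISODE = 3000
RENDER = False
LR_A = 0.001   # learning rate for the actor
LR_C = 0.01    # learning rate for the critic

env = gym.make('CartPole-v0')
env = env.unwrapped

N_F = env.observation_space.shape[0]  # number of state features
N_A = env.action_space.n              # number of discrete actions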
Example #8
class TD3(object):  # trains the agent over a certain number of time steps
    def __init__(self, state_dim, action_dim, max_action):
        self.actor = Actor(state_dim, action_dim,
                           max_action).to(device)  # trained by gradient descent
        self.actor_target = Actor(state_dim, action_dim,
                                  max_action).to(device)  # updated by Polyak averaging
        self.actor_target.load_state_dict(
            self.actor.state_dict())  # initialize target with the actor's parameters
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters())
        self.critic = Critic(state_dim, action_dim).to(device)
        self.critic_target = Critic(state_dim, action_dim).to(device)
        self.critic_target.load_state_dict(self.critic.state_dict())
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters())
        self.max_action = max_action

    def select_action(self, state):
        # Reshape the state to a batch of one row vector and move it to the device
        state = torch.Tensor(state.reshape(1, -1)).to(device)
        # Forward pass on the actor, then convert back to a flat NumPy array so
        # that exploration noise can be added and the action clipped outside
        return self.actor(state).cpu().data.numpy().flatten()

    # policy_noise: std of the Gaussian noise added to the target actions (target policy smoothing),
    # clipped to [-noise_clip, noise_clip]. policy_freq: frequency of the delayed policy updates;
    # the actor and both target networks are updated once every policy_freq iterations.
    def train(self,
              replay_buffer,
              iterations,
              batch_size=100,
              discount=0.99,
              tau=0.005,
              policy_noise=0.2,
              noise_clip=0.5,
              policy_freq=2):

        for it in range(iterations):
            # Sample a batch of transitions (s, a, r, s', done) from the replay
            # memory as five separate arrays:
            batch_states, batch_next_states, batch_actions, batch_rewards, batch_dones = replay_buffer.sample(
                batch_size)
            state = torch.Tensor(batch_states).to(device)
            next_state = torch.Tensor(batch_next_states).to(device)
            action = torch.Tensor(batch_actions).to(device)
            reward = torch.Tensor(batch_rewards).to(device)
            done = torch.Tensor(batch_dones).to(device)

            # From the next state s', the Actor target plays the next action a'
            next_action = self.actor_target(next_state)

            # add clipped Gaussian noise to this next action a' (target policy smoothing),
            # then clip the result to the action range supported by the environment
            noise = torch.Tensor(batch_actions).data.normal_(
                0, policy_noise).to(device)
            noise = noise.clamp(-noise_clip, noise_clip)
            next_action = (next_action + noise).clamp(-self.max_action,
                                                      self.max_action)

            # The two Critic targets take each the couple (s’, a’) as input and return two Q-values Qt1(s’,a’) and Qt2(s’,a’) as outputs:
            target_Q1, target_Q2 = self.critic_target(next_state, next_action)

            # keep the minimum of these two Q-values: min(Qt1, Qt2)
            target_Q = torch.min(target_Q1, target_Q2)

            # get the final target of the two Critic models, which is: Qt = r + γ * min(Qt1, Qt2)
            target_Q = reward + ((1 - done) * discount * target_Q).detach()

            # two Critic models take each the couple (s, a) as input and return two Q-values Q1(s,a) and Q2(s,a) as outputs
            current_Q1, current_Q2 = self.critic(state, action)

            # compute the loss coming from the two Critic models: Critic Loss = MSE_Loss(Q1(s,a), Qt) + MSE_Loss(Q2(s,a), Qt)
            critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(
                current_Q2, target_Q)

            # backpropagate this Critic loss and update the parameters of the two Critic models with the Adam optimizer
            self.critic_optimizer.zero_grad()
            critic_loss.backward()
            self.critic_optimizer.step()

            # every two iterations, we update our Actor model by performing gradient ascent on the output of the first Critic model
            if it % policy_freq == 0:
                actor_loss = -self.critic.Q1(state, self.actor(state)).mean()
                self.actor_optimizer.zero_grad()
                actor_loss.backward()
                self.actor_optimizer.step()

                # once every two iterations, we update the weights of the Actor target by polyak averaging
                for param, target_param in zip(self.actor.parameters(),
                                               self.actor_target.parameters()):
                    target_param.data.copy_(tau * param.data +
                                            (1 - tau) * target_param.data)

                # once every two iterations, we update the weights of the Critic target by polyak averaging
                for param, target_param in zip(
                        self.critic.parameters(),
                        self.critic_target.parameters()):
                    target_param.data.copy_(tau * param.data +
                                            (1 - tau) * target_param.data)

    # save method to save a trained model
    def save(self, filename, directory):
        torch.save(self.actor.state_dict(),
                   '%s/%s_actor.pth' % (directory, filename))
        torch.save(self.critic.state_dict(),
                   '%s/%s_critic.pth' % (directory, filename))

    # load method to load a pre-trained model
    def load(self, filename, directory):
        self.actor.load_state_dict(
            torch.load('%s/%s_actor.pth' % (directory, filename)))
        self.critic.load_state_dict(
            torch.load('%s/%s_critic.pth' % (directory, filename)))
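
For context, here is a minimal usage sketch for the TD3 class above. The environment name and the replay buffer are assumptions, since neither appears in the excerpt:

import gym
import torch

# Assumed setup; not part of the original example.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

env = gym.make("Pendulum-v0")  # any continuous-control Gym environment
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
max_action = float(env.action_space.high[0])

policy = TD3(state_dim, action_dim, max_action)

state = env.reset()
action = policy.select_action(state)  # greedy action from the current policy
# After collecting transitions into a replay_buffer (hypothetical object whose
# sample() returns the five batches in the order train() expects):
# policy.train(replay_buffer, iterations=1000, batch_size=100)
# policy.save("td3_pendulum", "./models")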