class DDPGAgent:
    def __init__(self, in_actor, hidden_in_actor, hidden_out_actor, out_actor, in_critic, hidden_in_critic, hidden_out_critic, lr_actor=1.0e-2, lr_critic=1.0e-2):
        super(DDPGAgent, self).__init__()

        self.actor = Network(in_actor, hidden_in_actor, hidden_out_actor, out_actor, actor=True).to(device)
        self.critic = Network(in_critic, hidden_in_critic, hidden_out_critic, 1).to(device)
        self.target_actor = Network(in_actor, hidden_in_actor, hidden_out_actor, out_actor, actor=True).to(device)
        self.target_critic = Network(in_critic, hidden_in_critic, hidden_out_critic, 1).to(device)

        self.noise = OUNoise(out_actor, scale=1.0)

        # initialize targets same as original networks
        hard_update(self.target_actor, self.actor)
        hard_update(self.target_critic, self.critic)

        self.actor_optimizer = Adam(self.actor.parameters(), lr=lr_actor)
        self.critic_optimizer = Adam(self.critic.parameters(), lr=lr_critic, weight_decay=1.e-5)

    def act(self, obs, noise=0.0):
        obs = obs.to(device)
        action = self.actor(obs) + noise*self.noise.noise()
        return action

    def target_act(self, obs, noise=0.0):
        obs = obs.to(device)
        action = self.target_actor(obs) + noise*self.noise.noise()
        return action
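These snippets lean on a few helpers that are not shown in this listing (`device`, `Adam`, `hard_update`, and an `OUNoise` whose `noise()` method returns a CPU torch tensor). A minimal sketch of what they are assumed to look like, for reference only:

import numpy as np
import torch
from torch.optim import Adam  # optimizer referenced as `Adam` in the snippets

# module-level device object assumed by the `.to(device)` calls
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


def hard_update(target, source):
    """Copy every parameter of `source` into `target` (assumed helper)."""
    for target_param, source_param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(source_param.data)


class OUNoise:
    """Ornstein-Uhlenbeck process; this assumed variant returns a CPU torch tensor from noise()."""
    def __init__(self, action_dimension, scale=0.1, mu=0.0, theta=0.15, sigma=0.2):
        self.action_dimension = action_dimension
        self.scale = scale
        self.mu = mu
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        self.state = np.ones(self.action_dimension) * self.mu

    def noise(self):
        # dx = theta * (mu - x) + sigma * N(0, 1), then advance the internal state
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.standard_normal(x.shape)
        self.state = x + dx
        return torch.tensor(self.state * self.scale).float()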
Example #2
 def __init__(self, action_size, state_size, params, device):
     self.batch_size = params.batch_size
     self.buffer_size = params.buffer_size
     self.tau = params.tau
     self.actor_lr = params.actor_lr
     self.critic_lr = params.critic_lr
     self.actor_weight_decay = params.actor_weight_decay
     self.critic_weight_decay = params.critic_weight_decay
     self.gamma = params.gamma
     self.params = params
     self.step_number = 0
     self.device = device

     self.action_size = action_size
     self.state_size = state_size

     self.max_score = 40
     self.current_score = 0

     self.seed = 4
     
     self.actor_local = ActorNetwork(self.state_size, self.action_size, self.seed).to(device)
     self.actor_target = ActorNetwork(self.state_size, self.action_size, self.seed).to(device)
     
     self.critic_local = CriticNetwork(state_size, action_size, self.seed, params).to(device)
     self.critic_target = CriticNetwork(state_size, action_size, self.seed, params).to(device)
     
     self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=self.actor_lr)
     self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=self.critic_lr, weight_decay=self.critic_weight_decay)
     
     self.memory_buffer = PrioritizedMemory(self.buffer_size, self.batch_size, device)
     
     self.noise = OUNoise((20,self.action_size), self.seed)
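The `params` argument above is only read for its attributes; a hypothetical way to assemble one (field names mirror the attribute accesses in this `__init__`, values are placeholders):

from types import SimpleNamespace

# hypothetical hyperparameter bundle for the agent above
params = SimpleNamespace(
    batch_size=128,
    buffer_size=int(1e6),
    tau=1e-3,
    actor_lr=1e-4,
    critic_lr=1e-3,
    actor_weight_decay=0.0,
    critic_weight_decay=1e-5,
    gamma=0.99,
)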
Example #3
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.001  # for soft update of target parameters
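The hard copy above uses `get_weights()`/`set_weights()`, and `tau` is declared for soft target updates; a minimal sketch of the matching soft update for this Keras-style agent (assuming the same `.model` attribute) could be:

def soft_update(local_model, target_model, tau):
    """Blend weights: theta_target = tau * theta_local + (1 - tau) * theta_target."""
    new_weights = [tau * lw + (1.0 - tau) * tw
                   for lw, tw in zip(local_model.model.get_weights(),
                                     target_model.model.get_weights())]
    target_model.model.set_weights(new_weights)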
Example #5
    def __init__(self,
                 in_actor,
                 hidden_in_actor,
                 hidden_out_actor,
                 out_actor,
                 in_critic_state,
                 hidden_in_critic,
                 hidden_out_critic,
                 critic_input_action_size,
                 lr_actor=1.0e-4,
                 lr_critic=3.0e-4):
        super(DDPGAgent, self).__init__()

        self.actor = Actor(in_actor, hidden_in_actor, hidden_out_actor,
                           out_actor).to(device)
        self.critic = Critic(in_critic_state, hidden_in_critic,
                             hidden_out_critic,
                             critic_input_action_size).to(device)
        self.target_actor = Actor(in_actor, hidden_in_actor, hidden_out_actor,
                                  out_actor).to(device)
        self.target_critic = Critic(in_critic_state, hidden_in_critic,
                                    hidden_out_critic,
                                    critic_input_action_size).to(device)
        self.action_size = out_actor
        self.noise = OUNoise(out_actor, scale=1.0)
        # initialize targets same as original networks one time in the initial step
        hard_update(self.target_actor, self.actor)
        hard_update(self.target_critic, self.critic)
        self.noise_reduction = 1.0
        self.actor_optimizer = Adam(self.actor.parameters(),
                                    lr=lr_actor,
                                    weight_decay=0)
        self.critic_optimizer = Adam(self.critic.parameters(),
                                     lr=lr_critic,
                                     weight_decay=0)
Example #6
    def __init__(self, state_size, action_size, random_seed):

        self.state_size = state_size
        self.action_size = action_size
        self.seed = random_seed

        # ------------------ actor ------------------ #
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)

        # ------------------ critic ----------------- #
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)

        # ------------------ optimizers ------------- #
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC)

        # ----------------------- initialize target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, 1)
        self.soft_update(self.actor_local, self.actor_target, 1)
        self.t_step = 0

        # Noise process
        self.noise = OUNoise(action_size, random_seed)
        # Replay Buffer
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   device, random_seed)
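`soft_update` is invoked above (with `tau=1` for the initial hard copy) but not defined in this snippet; Example #16 further down carries the usual PyTorch form, reproduced here as a sketch:

    def soft_update(self, local_model, target_model, tau):
        """theta_target = tau * theta_local + (1 - tau) * theta_target"""
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)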
Example #7
class DDPGAgent:
    def __init__(self, state_size, action_size, seed=0, lr_actor=1.0e-4, lr_critic=1.0e-3):
        super(DDPGAgent, self).__init__()

        self.actor = Actor(state_size, action_size).to(device)
        self.critic = Critic(state_size, action_size, seed=seed).to(device)
        self.target_actor = Actor(state_size, action_size).to(device)
        self.target_critic = Critic(state_size, action_size, seed=seed).to(device)

        self.noise = OUNoise(action_size, scale=1.0 )


        # initialize targets same as original networks
        hard_update(self.target_actor, self.actor)
        hard_update(self.target_critic, self.critic)

        self.actor_optimizer = Adam(self.actor.parameters(), lr=lr_actor)
        self.critic_optimizer = Adam(self.critic.parameters(), lr=lr_critic) #, weight_decay=1.e-5


    def act(self, obs, noise=0.0):
        obs = obs.to(device)
        self.actor.eval()
        with torch.no_grad():
            action = self.actor(obs).cpu().data.numpy() + noise*self.noise.noise()
        self.actor.train()
        return np.clip(action, -1, 1)

    def target_act(self, obs, noise=0.0):
        obs = obs.to(device)
        action = self.target_actor(obs).cpu().data.numpy() + noise*self.noise.noise()
        return np.clip(action, -1, 1)
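A hypothetical way to drive this agent for one step (the sizes and noise scale are placeholders; `Actor`, `Critic`, and `device` come from this snippet's own imports):

# hypothetical usage: 24-dimensional observations, 2-dimensional actions
agent = DDPGAgent(state_size=24, action_size=2, seed=0)
obs = torch.randn(1, 24)
action = agent.act(obs, noise=0.1)       # numpy array clipped to [-1, 1]
target_action = agent.target_act(obs)    # action from the target policy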
Example #9
 def __init__(self, in_actor, hidden_in_actor, hidden_out_actor, out_actor, in_critic, \
              hidden_in_critic, hidden_out_critic, seed = 0, lr_actor=1.0e-3, lr_critic=1.0e-3):
     self.actor = DDPGNet(in_actor,
                          hidden_in_actor,
                          hidden_out_actor,
                          out_actor,
                          actor=True).to(device)
     self.target_actor = DDPGNet(in_actor,
                                 hidden_in_actor,
                                 hidden_out_actor,
                                 out_actor,
                                 actor=True).to(device)
     self.critic = DDPGNet(in_critic,
                           hidden_in_critic,
                           hidden_out_critic,
                           1,
                           actor=False).to(device)
     self.target_critic = DDPGNet(in_critic,
                                  hidden_in_critic,
                                  hidden_out_critic,
                                  1,
                                  actor=False).to(device)
     self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=lr_actor)
     self.critic_optimizer = optim.Adam(self.critic.parameters(),
                                        lr=lr_critic)
     self.memory = ReplayBuffer(out_actor, BUFFER_SIZE, BATCH_SIZE, seed)
     self.noise = OUNoise(out_actor, scale=1.0)
     self.action_size = out_actor
     self.t_step = 0
     self.soft_update(self.actor, self.target_actor, 1.)
     self.soft_update(self.critic, self.target_critic, 1.)
Example #11
class DDPGAgent:
    def __init__(self,
                 in_actor,
                 out_actor,
                 in_critic,
                 lr_actor=1.0e-4,
                 lr_critic=1.0e-3):
        super(DDPGAgent, self).__init__()
        self.actor = Actor(in_actor, out_actor).to(device)
        self.critic = Critic(in_critic, out_actor * 2).to(device)
        self.target_actor = Actor(in_actor, out_actor).to(device)
        self.target_critic = Critic(in_critic, out_actor * 2).to(device)
        self.noise = OUNoise(out_actor, scale=1.0)

        # initialize targets same as original networks
        hard_update(self.target_actor, self.actor)
        hard_update(self.target_critic, self.critic)

        self.actor_optimizer = Adam(self.actor.parameters(), lr=lr_actor)
        self.critic_optimizer = Adam(self.critic.parameters(), lr=lr_critic)

    def act(self, obs, noise=0.0):
        obs = obs.to(device)
        self.actor.eval()
        with torch.no_grad():
            action = self.actor(
                obs).cpu().data.numpy() + noise * self.noise.noise()
        self.actor.train()
        return np.clip(action, -1, 1)

    def target_act(self, obs, noise=0.0):
        obs = obs.to(device)
        action = self.target_actor(
            obs).cpu().data.numpy() + noise * self.noise.noise()
        return np.clip(action, -1, 1)
    def __init__(self, state_size, action_size, n_agents, seed):
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        self.stacked_state_size = state_size * n_agents
        self.stacked_action_size = action_size * n_agents

        # Actor networks
        self.actor_local = ActorNetwork(state_size, action_size,
                                        seed).to(device)
        self.actor_target = ActorNetwork(state_size, action_size,
                                         seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=ACTOR_LR)

        # Critic networks
        self.critic_local = CriticNetwork(self.stacked_state_size,
                                          self.stacked_action_size,
                                          seed).to(device)
        self.critic_target = CriticNetwork(self.stacked_state_size,
                                           self.stacked_action_size,
                                           seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=CRITIC_LR)

        # OUNoise
        self.exploration_noise = OUNoise(action_size, seed)
Example #13
    def __init__(self, agent_id, state_size, action_size, n_agents, seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.n_agents = n_agents
        self.seed = random.seed(seed)
        self.agent_id = agent_id
        # DDPG-Network
        self.network = DDPGModel(n_agents, state_size, action_size, seed)
        self.actor_local = self.network.actor_local
        self.actor_target = self.network.actor_target
        self.optimizer_actor = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        self.critic_target = self.network.critic_target
        self.critic_local = self.network.critic_local
        self.optimizer_critic = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC)

        # set noise
        self.noise = OUNoise(action_size, seed)
        self.eps = EPS_START
        self.t_step = 0
        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
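The module-level constants referenced above (BUFFER_SIZE, BATCH_SIZE, LR_ACTOR, LR_CRITIC, EPS_START) are not part of this snippet; hypothetical values in line with the other examples:

# hypothetical hyperparameters assumed by the snippet above
BUFFER_SIZE = int(1e5)   # replay buffer size
BATCH_SIZE = 128         # minibatch size
LR_ACTOR = 1e-4          # actor learning rate
LR_CRITIC = 1e-3         # critic learning rate
EPS_START = 1.0          # initial scale for exploration noise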
Example #14
    def __init__(self, in_actor, actor_fc1_units, actor_fc2_units, out_actor,
                 in_critic, critic_fc1_units, critic_fc2_units, lr_actor,
                 lr_critic, weight_decay_actor, weight_decay_critic):
        super(DDPGAgent, self).__init__()

        self.actor = Actor(in_actor, actor_fc1_units, actor_fc2_units,
                           out_actor).to(device)
        self.critic = Critic(in_critic, critic_fc1_units, critic_fc2_units,
                             1).to(device)
        self.target_actor = Actor(in_actor, actor_fc1_units, actor_fc2_units,
                                  out_actor).to(device)
        self.target_critic = Critic(in_critic, critic_fc1_units,
                                    critic_fc2_units, 1).to(device)

        self.target_actor.eval()
        self.target_critic.eval()

        self.noise = OUNoise(out_actor)

        # initialize targets same as original networks
        hard_update(self.target_actor, self.actor)
        hard_update(self.target_critic, self.critic)

        self.actor_optimizer = Adam(self.actor.parameters(),
                                    lr=lr_actor,
                                    weight_decay=weight_decay_actor)
        self.critic_optimizer = Adam(self.critic.parameters(),
                                     lr=lr_critic,
                                     weight_decay=weight_decay_critic)
    def __init__(self,
                 in_actor,
                 hidden_in_actor,
                 hidden_out_actor,
                 out_actor,
                 lr_actor=1.0e-3):
        super(DDPGAgent, self).__init__()

        self.actor = Network(in_actor,
                             hidden_in_actor,
                             hidden_out_actor,
                             out_actor,
                             actor=True).to(device)
        self.target_actor = Network(in_actor,
                                    hidden_in_actor,
                                    hidden_out_actor,
                                    out_actor,
                                    actor=True).to(device)

        self.noise = OUNoise(out_actor, scale=1.0)

        # initialize targets same as original networks
        hard_update(self.target_actor, self.actor)

        self.actor_optimizer = Adam(self.actor.parameters(), lr=lr_actor)
Example #16
class DDPGAgent:
    def __init__(self, in_actor, out_actor, hidden_in_actor, hidden_out_actor, state_dim_in_critic, action_dim_inp_critic, hidden_in_critic, hidden_out_critic, lr_actor=1.0e-4, lr_critic=1.0e-3):
        super(DDPGAgent, self).__init__()

        self.actor = Actor(in_actor, out_actor, hidden_in_actor, hidden_out_actor).to(device)
        self.critic = Critic(state_dim_in_critic, action_dim_inp_critic, hidden_in_critic, hidden_out_critic).to(device)
        self.target_actor = Actor(in_actor, out_actor, hidden_in_actor, hidden_out_actor).to(device)
        self.target_critic = Critic(state_dim_in_critic, action_dim_inp_critic, hidden_in_critic, hidden_out_critic).to(device)

      
        self.noise = OUNoise(out_actor, scale=1.0 )
        
        self.tau = TAU
        
        # initialize targets same as original networks
        hard_update(self.target_actor, self.actor)
        hard_update(self.target_critic, self.critic)

        self.actor_optimizer = Adam(self.actor.parameters(), lr=lr_actor)
        self.critic_optimizer = Adam(self.critic.parameters(), lr=lr_critic, weight_decay=1.e-5)
        
    def act(self, obs, noise=0.0):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(obs).float().to(device).view(-1, 24)
        self.actor.eval()
        
        with torch.no_grad():
            action = self.actor(state).cpu().data.numpy()
        self.actor.train()
        add_noise = noise * self.noise.noise()
        action += add_noise.cpu().data.numpy()
        
        return np.clip(action, -1, 1).reshape(-1)

    def target_act(self, obs, noise=0.0):
        obs = obs.to(device)
        action = self.target_actor(obs) + noise*self.noise.noise()
        return action
    
    def reset(self):
        self.noise.reset()
        
        
    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
            
    def soft_update_all(self):
        self.soft_update(local_model=self.critic, target_model=self.target_critic, tau=self.tau)
        self.soft_update(local_model=self.actor, target_model=self.target_actor, tau=self.tau)
Example #17
class DDPGAgent:
    def __init__(
            self,
            in_actor,
            out_actor,
            n_filt_actor,
            kernel_size_actor,
            stride_actor,
            fc_units_actor,
            in_critic,
            n_filt_critic,
            kernel_size_critic,
            stride_critic,
            fc_units_critic,
            lr_actor=1.0e-3,
            lr_critic=1.0e-5):  # 1e-5 was getting to 0.4 score (sporadically)
        super(DDPGAgent, self).__init__()

        self.actor = Network(in_actor,
                             out_actor,
                             n_filt_actor,
                             kernel_size_actor,
                             stride_actor,
                             fc_units_actor,
                             actor=True).to(device)
        self.critic = Network(in_critic, 1, n_filt_critic, kernel_size_critic,
                              stride_critic, fc_units_critic).to(device)
        self.target_actor = Network(in_actor,
                                    out_actor,
                                    n_filt_actor,
                                    kernel_size_actor,
                                    stride_actor,
                                    fc_units_actor,
                                    actor=True).to(device)
        self.target_critic = Network(in_critic, 1, n_filt_critic,
                                     kernel_size_critic, stride_critic,
                                     fc_units_critic).to(device)

        self.noise = OUNoise(out_actor, scale=.1)

        # initialize targets same as original networks
        hard_update(self.target_actor, self.actor)
        hard_update(self.target_critic, self.critic)

        self.actor_optimizer = Adam(self.actor.parameters(), lr=lr_actor)
        self.critic_optimizer = Adam(self.critic.parameters(),
                                     lr=lr_critic,
                                     weight_decay=1e-3)

    def act(self, obs, noise=0.0):
        obs = obs.to(device)
        action = self.actor(obs) + noise * self.noise.noise()
        return action

    def target_act(self, obs, noise=0.0):
        obs = obs.to(device)
        action = self.target_actor(obs) + noise * self.noise.noise()
        return action
Example #18
class DDPGActor():
    def __init__(self,
                 num_agents,
                 local_obs_dim,
                 local_action_size,
                 global_obs_dim,
                 global_action_size,
                 lr_actor=1.0e-4,
                 random_seed=4,
                 device=device):
        super(DDPGActor, self).__init__()

        self.device = device

        # create actor/target_actor and critic/target_critic
        self.actor_local = Actor(local_obs_dim, local_action_size,
                                 random_seed).to(self.device)
        self.actor_target = Actor(local_obs_dim, local_action_size,
                                  random_seed).to(self.device)

        #noise
        self.action_noise = OUNoise(local_action_size,
                                    seed=random_seed,
                                    theta=0.15,
                                    sigma=0.2)
        #self.param_noise = ActorParamNoise(local_obs_dim, local_action_size,random_seed,stddev = 0.5).to(self.device)

        # copy parameters to target networks
        hard_update(self.actor_target, self.actor_local)

        # create optimizers
        self.actor_optimizer = Adam(self.actor_local.parameters(), lr=lr_actor)

    def act(self, local_obs, noise_coef, add_noise=True):
        state = torch.from_numpy(local_obs).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            # alternatives: self.sigma * np.random.randn(self.action_size), or self.noise.sample()[0] * self.noise_coef
            action += self.action_noise.sample() * noise_coef
        return np.clip(action, -1, 1)

    def target_act(self, local_obs, noise_coef=0, add_noise=False):
        #state = torch.from_numpy(local_obs).float().to(device)
        state = local_obs
        self.actor_target.eval()
        with torch.no_grad():
            action = self.actor_target(state).cpu().data.numpy()
        self.actor_target.train()
        if add_noise:
            # alternatives: self.sigma * np.random.randn(self.action_size), or self.noise.sample()[0] * self.noise_coef
            action += self.action_noise.sample() * noise_coef
        return np.clip(action, -1, 1)

    def update_target(self, tau):
        soft_update(self.actor_target, self.actor_local, tau)
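Unlike the `noise()`-style helper sketched near the top of this listing, this class calls `self.action_noise.sample()`; a minimal numpy sketch of that variant, with the constructor signature inferred from the call above and the internals assumed:

import copy
import numpy as np

class OUNoise:
    """Ornstein-Uhlenbeck process; this assumed variant returns a numpy array from sample()."""
    def __init__(self, size, seed=0, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.rng = np.random.RandomState(seed)
        self.reset()

    def reset(self):
        self.state = copy.copy(self.mu)

    def sample(self):
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * self.rng.standard_normal(self.mu.shape)
        self.state = x + dx
        return self.state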
Example #19
class DDPGAgent:
    """Interacts with and learns from the environment using DDPG method."""
    def __init__(self):
        """Initialize an DDPG Agent object."""
        super(DDPGAgent, self).__init__()
        self.config = Config.getInstance()
        self.actor = Actor(self.config.state_size, self.config.action_size,
                           self.config.seed).to(self.config.device)
        self.critic = Critic(self.config.num_agents * self.config.state_size,
                             self.config.num_agents * self.config.action_size,
                             self.config.seed).to(self.config.device)
        self.target_actor = Actor(self.config.state_size,
                                  self.config.action_size,
                                  self.config.seed).to(self.config.device)
        self.target_critic = Critic(
            self.config.num_agents * self.config.state_size,
            self.config.num_agents * self.config.action_size,
            self.config.seed).to(self.config.device)

        self.noise = OUNoise(self.config.action_size, scale=1.0)

        # initialize targets same as original networks
        hard_update(self.target_actor, self.actor)
        hard_update(self.target_critic, self.critic)

        self.actor_optimizer = Adam(self.actor.parameters(),
                                    lr=self.config.lr_actor)
        self.critic_optimizer = Adam(self.critic.parameters(),
                                     lr=self.config.lr_critic,
                                     weight_decay=self.config.weight_decay)

    def act(self, obs, noise_decay_parameter=0.0):
        """
        Returns actions for given state as per current policy for an agent.
        """
        obs = torch.from_numpy(obs).float().to(self.config.device)
        self.actor.eval()
        with torch.no_grad():
            action = self.actor(obs).cpu().data.numpy()
        self.actor.train()
        action += noise_decay_parameter * self.noise.sample()
        return action

    def target_act(self, obs, noise_decay_parameter=0.0):
        """
        Returns target network actions from an agent
        """
        obs = obs.to(self.config.device)
        action = self.target_actor(
            obs) + noise_decay_parameter * self.noise.sample()
        return action

    def reset(self):
        """Reset the internal state of noise mean(mu)"""
        self.noise.reset()
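`Config.getInstance()` is not shown anywhere in this listing; a minimal singleton sketch carrying the fields this agent reads (the values are placeholders for a two-agent task):

import torch

class Config:
    """Hypothetical configuration singleton assumed by the DDPGAgent above."""
    _instance = None

    @classmethod
    def getInstance(cls):
        if cls._instance is None:
            cls._instance = cls()
        return cls._instance

    def __init__(self):
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.seed = 0
        self.num_agents = 2
        self.state_size = 24
        self.action_size = 2
        self.lr_actor = 1e-4
        self.lr_critic = 1e-3
        self.weight_decay = 0.0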
Example #20
class DDPGAgent:
    def __init__(self,
                 state_size,
                 action_size,
                 num_agents,
                 seed=0,
                 lr_actor=1.0e-4,
                 lr_critic=1.0e-3):
        super(DDPGAgent, self).__init__()

        self.actor = networkforall.Actor(state_size, action_size).to(device)
        self.critic = networkforall.Critic(state_size,
                                           action_size,
                                           num_agents,
                                           seed=seed).to(device)

        self.target_actor = networkforall.Actor(state_size,
                                                action_size).to(device)
        self.target_critic = networkforall.Critic(state_size,
                                                  action_size,
                                                  num_agents,
                                                  seed=seed).to(device)

        self.noise = OUNoise(action_size, scale=1.0)

        # initialize targets same as original networks
        hard_update(self.target_actor, self.actor)
        hard_update(self.target_critic, self.critic)

        self.actor_optimizer = Adam(self.actor.parameters(), lr=lr_actor)
        self.critic_optimizer = Adam(self.critic.parameters(), lr=lr_critic)

    def act(self, obs, noise=0.0):
        obs = obs.to(device)
        # evaluation mode avoids the batchnorm error raised on single observations
        self.actor.eval()
        # move to cpu() since the noise is generated on the cpu
        action = self.actor(obs).cpu().data.numpy() + noise * self.noise.noise()
        # np.clip keeps the action in [-1, 1]
        return np.clip(action, -1, 1)

    def target_act(self, obs, noise=0.0):
        obs = obs.to(device)
        self.target_actor.eval()
        # move to cpu() since the noise is generated on the cpu
        action = self.target_actor(obs).cpu().data.numpy() + noise * self.noise.noise()
        # np.clip keeps the action in [-1, 1]
        return np.clip(action, -1, 1)
Example #21
    def __init__(self,
                 state_size,
                 action_size,
                 hidden_in_dim,
                 hidden_out_dim,
                 extrem_out=64,
                 num_agents=2,
                 lr_actor=1.0e-4,
                 lr_critic=1.0e-3):
        super(DDPGAgent, self).__init__()
        critic_state_size = state_size * num_agents
        critic_action_size = (action_size * (num_agents))
        self.actor = Network(state_size,
                             action_size,
                             hidden_in_dim,
                             hidden_out_dim,
                             hidden_extrem_out=extrem_out,
                             actor=True).to(device)
        self.critic = Network(critic_state_size,
                              critic_action_size,
                              hidden_in_dim,
                              hidden_out_dim,
                              hidden_extrem_out=extrem_out).to(device)
        self.target_actor = Network(state_size,
                                    action_size,
                                    hidden_in_dim,
                                    hidden_out_dim,
                                    hidden_extrem_out=extrem_out,
                                    actor=True).to(device)
        self.target_critic = Network(critic_state_size,
                                     critic_action_size,
                                     hidden_in_dim,
                                     hidden_out_dim,
                                     hidden_extrem_out=extrem_out).to(device)

        self.noise = OUNoise(action_size, scale=1.0)

        # initialize targets same as original networks
        hard_update(self.target_actor, self.actor)
        hard_update(self.target_critic, self.critic)

        self.actor_optimizer = Adam(self.actor.parameters(), lr=lr_actor)
        self.critic_optimizer = Adam(self.critic.parameters(),
                                     lr=lr_critic,
                                     weight_decay=0)

        print("critic", self.critic, self.target_critic, "optim",
              self.critic_optimizer)
        print("actor", self.actor, self.target_actor, "optim",
              self.actor_optimizer)
Example #22
    def __init__(self, state_size, action_size, params, n_agents=2):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            params (dict): all parameters
        """
        self.params = params
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(params["seed"])

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(
            state_size, action_size, params).to(device)
        self.actor_target = Actor(
            state_size, action_size, params).to(device)
        self.actor_optimizer = optim.Adam(
            self.actor_local.parameters(), lr=params["lr_actor"])

        # Critic Network (w/ Target Network)
        if params["type"] == "MADDPG":
            self.critic_local = Critic(
                n_agents * state_size, n_agents * action_size, params).to(device)
            self.critic_target = Critic(
                n_agents * state_size, n_agents * action_size, params).to(device)
            self.critic_optimizer = optim.Adam(
                self.critic_local.parameters(), lr=params["lr_critic"], weight_decay=params["weight_decay"])
        else:
            self.critic_local = Critic(
                state_size, action_size, params).to(device)
            self.critic_target = Critic(
                state_size, action_size, params).to(device)
            self.critic_optimizer = optim.Adam(
                self.critic_local.parameters(), lr=params["lr_critic"], weight_decay=params["weight_decay"])

        # initialize targets same as original networks
        hard_update(self.actor_target, self.actor_local)
        hard_update(self.critic_target, self.critic_local)

        # Noise process
        self.noise = OUNoise(action_size, params["seed"])

        # Replay memory
        self.memory = ReplayBuffer(
            action_size, self.params["buffer_size"], self.params["batch_size"], params["seed"])

        self.t_step = 0
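A hypothetical `params` dict covering the keys this `__init__` reads (the enclosing class name is not shown in the snippet):

# hypothetical parameter dict; keys mirror the lookups above
params = {
    "seed": 0,
    "type": "MADDPG",        # "MADDPG" -> joint-observation critic, anything else -> per-agent critic
    "lr_actor": 1e-4,
    "lr_critic": 1e-3,
    "weight_decay": 0.0,
    "buffer_size": int(1e6),
    "batch_size": 256,
}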
Example #23
    def __init__(self, config):
        """Initialize an Agent object.
        
        Params
        ======
            config : configuration given a variety of parameters
        """
        self.config = config

        # set parameter for ML
        self.set_parameters(config)
        # Q-Network
        self.create_networks()
        # Noise process
        self.noise = OUNoise(self.action_size, self.seed, sigma=self.sigma)
Example #24
class DDPGAgent:
    """ Implements the structure of the DDPG Reinforcement Learning algorithm"""
    def __init__(self, actor_layer_sizes=[24, 128,128,2], critic_layer_sizes=[24, 128,128,1], lr_actor=1e-3, lr_critic=1e-3, clamp_actions=True, logger=None, log_layers=False):
        super(DDPGAgent, self).__init__()

        # SET UP ACTOR AND CRITIC NETWORKS
        self.actor = Network(layer_sizes=actor_layer_sizes, actor=True, logger=logger, log_layers=log_layers).to(device)
        self.critic = Network(layer_sizes=critic_layer_sizes, logger=logger, log_layers=log_layers).to(device)
        self.target_actor = Network(layer_sizes=actor_layer_sizes, actor=True, logger=logger, log_layers=log_layers).to(device)
        self.target_critic = Network(layer_sizes=critic_layer_sizes, logger=logger, log_layers=log_layers).to(device)

        # INITIALIZE TARGET NETWORKS TO HAVE SAME WEIGHTS AS LOCAL NETWORKS
        hard_update(self.target_actor, self.actor)
        hard_update(self.target_critic, self.critic)

        # OPTIMIZERS
        self.actor_optimizer = Adam(self.actor.parameters(), lr=lr_actor)
        self.critic_optimizer = Adam(self.critic.parameters(), lr=lr_critic, weight_decay=0.0)

        # NOISE - for exploration of actions
        self.noise = OUNoise(actor_layer_sizes[-1], scale=1.0 )
        self.clamp_actions = clamp_actions

    def act(self, obs, noise=0.0):
        """ Given a tensor representing the states, it returns the predicted
            actions the agent should take using the LOCAL network.

            If `noise` is provided, it adds some random noise to the actions
            to make the agent explore.
        """
        obs = obs.to(device)
        action = self.actor(obs) + noise*self.noise.noise()
        if self.clamp_actions:
            action = torch.clamp(action, -1.0, 1.0)
        return action

    def target_act(self, obs, noise=0.0):
        """ Given a tensor representing the states, it returns the predicted
            actions the agent should take using the TARGET network.

            If `noise` is provided, it adds some random noise to the actions
            to make the agent explore.
        """
        obs = obs.to(device)
        action = self.target_actor(obs) + noise*self.noise.noise()
        if self.clamp_actions:
            action = torch.clamp(action, -1.0, 1.0)
        return action
Example #25
    def __init__(self,
                 in_actor,
                 out_actor,
                 in_critic,
                 lr_actor=1.0e-4,
                 lr_critic=1.0e-3):
        super(DDPGAgent, self).__init__()

        hidden_in_actor = 64
        hidden_out_actor = 128
        hidden_in_critic = hidden_in_actor
        hidden_out_critic = hidden_out_actor

        self.actor = Network(in_actor,
                             hidden_in_actor,
                             hidden_out_actor,
                             out_actor,
                             actor=True).to(device)
        self.critic = Network(in_critic,
                              hidden_in_critic,
                              hidden_out_critic,
                              1,
                              out_actor,
                              actor=False).to(device)
        self.target_actor = Network(in_actor,
                                    hidden_in_actor,
                                    hidden_out_actor,
                                    out_actor,
                                    actor=True).to(device)
        self.target_critic = Network(in_critic,
                                     hidden_in_critic,
                                     hidden_out_critic,
                                     1,
                                     out_actor,
                                     actor=False).to(device)

        self.noise = OUNoise(out_actor, scale=0.9)  #scale 1.0
        self.noise_shape = out_actor

        # initialize targets same as original networks
        hard_update(self.target_actor, self.actor)
        hard_update(self.target_critic, self.critic)

        self.actor_optimizer = Adam(self.actor.parameters(), lr=lr_actor)
        WD = 1e-5
        self.critic_optimizer = Adam(self.critic.parameters(),
                                     lr=lr_critic,
                                     weight_decay=WD)
Example #26
    def __init__(self, state_size, action_size, config):
        """ Initialize an agent object """

        self.state_size = state_size
        self.action_size = action_size
        self.config = config

        # retrieve number of agents
        self.num_agents = config["DDPG"]["num_agents"]

        # logging for this class
        self.logger = logging.getLogger(self.__class__.__name__)

        # gpu support
        self.device = pick_device(config, self.logger)

        ## Actor local and target networks
        self.actor_local = Actor(state_size, action_size,
                                 config).to(self.device)
        self.actor_target = Actor(state_size, action_size,
                                  config).to(self.device)
        self.actor_optimizer = getattr(
            optim, config["optimizer_actor"]["optimizer_type"])(
                self.actor_local.parameters(),
                betas=tuple(config["optimizer_actor"]["betas"]),
                **config["optimizer_actor"]["optimizer_params"])

        ## Critic local and target networks
        self.critic_local = Critic(state_size, action_size,
                                   config).to(self.device)
        self.critic_target = Critic(state_size, action_size,
                                    config).to(self.device)
        self.critic_optimizer = getattr(
            optim, config["optimizer_critic"]["optimizer_type"])(
                self.critic_local.parameters(),
                betas=tuple(config["optimizer_critic"]["betas"]),
                **config["optimizer_critic"]["optimizer_params"])

        ## Noise process
        self.noise = OUNoise((self.num_agents, action_size))

        ## Replay memory
        self.memory = ReplayBuffer(config=config,
                                   action_size=action_size,
                                   buffer_size=int(
                                       config["DDPG"]["buffer_size"]),
                                   batch_size=config["trainer"]["batch_size"])
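The `config` mapping drives optimizer construction through `getattr(optim, ...)`; a hypothetical fragment matching the keys read above (values are placeholders):

# hypothetical config fragment for the agent above
config = {
    "DDPG": {"num_agents": 20, "buffer_size": 1e6},
    "trainer": {"batch_size": 128},
    "optimizer_actor": {
        "optimizer_type": "Adam",
        "betas": [0.9, 0.999],
        "optimizer_params": {"lr": 1e-4},
    },
    "optimizer_critic": {
        "optimizer_type": "Adam",
        "betas": [0.9, 0.999],
        "optimizer_params": {"lr": 1e-3, "weight_decay": 0.0},
    },
}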
Example #27
    def __init__(self,
                 agent_id,
                 model,
                 action_size=2,
                 seed=0,
                 tau=1e-3,
                 lr_actor=1e-4,
                 lr_critic=1e-3,
                 weight_decay=0.0):
        """
        Params
        ======
            model: model object
            action_size (int): dimension of each action
            seed (int): Random seed
            tau (float): for soft update of target parameters
            lr_actor (float): learning rate for actor
            lr_critic (float): learning rate for critic
            weight_decay (float): L2 weight decay
        """
        random.seed(seed)
        self.id = agent_id
        self.action_size = action_size
        self.tau = tau
        self.lr_actor = lr_actor
        self.lr_critic = lr_critic

        # Actor Network
        self.actor_local = model.actor_local
        self.actor_target = model.actor_target
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=lr_actor)

        # Critic Network
        self.critic_local = model.critic_local
        self.critic_target = model.critic_target
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=lr_critic,
                                           weight_decay=weight_decay)

        # Initialize the target actor and critic weights to match their local networks
        self.hard_update(self.actor_target, self.actor_local)
        self.hard_update(self.critic_target, self.critic_local)

        # Noise process
        self.noise = OUNoise(action_size, seed)
Example #28
    def __init__(self, state_size, action_size, num_agents, random_seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            num_agents (int): number of agens
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        self.seed = random.seed(random_seed)
        self.eps = eps_start

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # initialize targets same as original networks
        #self.hard_update(self.actor_target, self.actor_local)
        #self.hard_update(self.critic_target, self.critic_local)

        # Noise process
        #self.noise = OUNoise(action_size, random_seed)
        self.noise = OUNoise((num_agents, action_size), random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
Example #29
    def __init__(self,
                 num_agents,
                 local_obs_dim,
                 local_action_size,
                 global_obs_dim,
                 global_action_size,
                 lr_actor=1.0e-4,
                 lr_critic=1.0e-4,
                 random_seed=4,
                 device=device,
                 weight_decay=0.0):
        super(DDPGAgent, self).__init__()

        self.device = device
        self.weight_decay = weight_decay

        # create actor/target_actor and critic/target_critic
        self.actor = Actor(local_obs_dim, local_action_size,
                           random_seed).to(self.device)
        self.critic = CentralizedCritic(global_obs_dim,
                                        global_action_size).to(self.device)
        self.target_actor = Actor(local_obs_dim, local_action_size,
                                  random_seed).to(self.device)
        self.target_critic = CentralizedCritic(
            global_obs_dim, global_action_size).to(self.device)

        #noise
        self.action_noise = OUNoise(local_action_size, scale=1.0, sigma=0.1)
        self.param_noise = ActorParamNoise(local_obs_dim,
                                           local_action_size,
                                           random_seed,
                                           stddev=0.5).to(self.device)
        #self.param_noise_rate = 0.999 # apply this rate to the param noise, gradually get rid of the noise
        #self.use_action_noise = use_action_noise
        #self.use_param_noise = use_param_noise

        # copy parameters to target networks
        hard_update(self.target_actor, self.actor)
        hard_update(self.target_critic, self.critic)

        # create optimizers
        self.actor_optimizer = Adam(self.actor.parameters(), lr=lr_actor)
        self.critic_optimizer = Adam(self.critic.parameters(),
                                     lr=lr_critic,
                                     weight_decay=self.weight_decay)
Example #30
    def __init__(self,
                 in_actor,
                 hidden_in_actor,
                 hidden_out_actor,
                 out_actor,
                 in_critic,
                 hidden_in_critic,
                 hidden_out_critic,
                 lr_actor=1.0e-2,
                 lr_critic=1.0e-2,
                 weight_decay=1.0e-5,
                 device='cuda:0'):
        super(DDPGAgent, self).__init__()

        hidden_gat_dim = 64
        self.actor = ActorNetwork(in_actor,
                                  hidden_in_actor,
                                  hidden_out_actor,
                                  out_actor,
                                  actor=True).to(device)
        self.critic = CriticNetwork(in_critic, hidden_gat_dim,
                                    hidden_in_critic, hidden_out_critic,
                                    1).to(device)
        # print("actor parameters are: " + str(self.count_parameters(self.actor)))
        # print("critic parameters are: " + str(self.count_parameters(self.critic)))
        self.target_actor = ActorNetwork(in_actor,
                                         hidden_in_actor,
                                         hidden_out_actor,
                                         out_actor,
                                         actor=True).to(device)
        self.target_critic = CriticNetwork(in_critic, hidden_gat_dim,
                                           hidden_in_critic, hidden_out_critic,
                                           1).to(device)

        self.noise = OUNoise(out_actor, scale=1.0)
        self.device = device

        # initialize targets same as original networks
        hard_update(self.target_actor, self.actor)
        hard_update(self.target_critic, self.critic)

        self.actor_optimizer = Adam(self.actor.parameters(), lr=lr_actor)
        self.critic_optimizer = Adam(self.critic.parameters(),
                                     lr=lr_critic,
                                     weight_decay=weight_decay)
Example #32
class Ddpg(object):
    def __init__(self,
                 state_dim,
                 action_dim,
                 p_learning_rate=0.0002,
                 q_learning_rate=0.001,
                 gamma=0.9,
                 eta=0.0003,
                 batch_size=64,
                 replay_buffer_size=1024 * 1024,
                 min_train_replays=1024 * 16,
                 logdir='',
                 save_path='',
                 *args,
                 **kwargs):
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.hl1_dim = 250  # hidden layer 1
        self.hl2_dim = 250  # hidden layer 2

        self.batch_size = batch_size
        self.replay_buffer_size = replay_buffer_size
        self.min_train_replays = min_train_replays

        self.noise = OUNoise(action_dim)
        self.time_step = 0
        self.replay_buffer = deque()

        self.gamma = gamma
        self.eta = eta
        self.alpha = self.initial_alpha = 1.0
        self.final_alpha = 0.01
        self.p_learning_rate = p_learning_rate
        self.q_learning_rate = q_learning_rate

        self.save_path = save_path

        self.create_network()
        self.session = tf.InteractiveSession()
        self.session.run(tf.initialize_all_variables())
        self.session.run(self.init_target_theta)
        # self.load()
        self.summary_writer = tf.train.SummaryWriter(logdir, self.session.graph)

    def theta_p(self):
        with tf.variable_scope("theta_p"):
            return [
                tf.Variable(random_init([self.state_dim, self.hl1_dim]), name="W1"),
                tf.Variable(random_init([self.hl1_dim]), name="b1"),
                tf.Variable(random_init([self.hl1_dim, self.hl2_dim]), name="W2"),
                tf.Variable(random_init([self.hl2_dim]), name="b2"),
                tf.Variable(random_init([self.hl2_dim, self.action_dim]), name="W3"),
                tf.Variable(random_init([self.action_dim]), name="b3")
            ]

    def theta_q(self):
        with tf.variable_scope("theta_q"):
            return [
                tf.Variable(random_init([self.state_dim, self.hl1_dim]), name='W1'),
                tf.Variable(random_init([self.hl1_dim]), name='b1'),
                tf.Variable(random_init([self.hl1_dim + self.action_dim, self.hl2_dim]), name='W2'),
                tf.Variable(random_init([self.hl2_dim]), name='b2'),
                tf.Variable(random_init([self.hl2_dim, 1]), name='W3'),
                tf.Variable(random_init([1]), name='b3')
            ]

    def create_policy_network(self, state, theta, name="policy_network"):
        with tf.variable_op_scope([state], name, name):
            h0 = tf.identity(state, "state")
            h1 = tf.nn.relu(tf.matmul(h0, theta[0]) + theta[1], name='h1')
            h2 = tf.nn.relu(tf.matmul(h1, theta[2]) + theta[3], name="h2")
            h3 = tf.identity(tf.matmul(h2, theta[4]) + theta[5], name='h3')
            action = tf.nn.tanh(h3, name='action')
            return action

    def create_q_network(self, state, action, theta, name='q_network'):
        with tf.variable_op_scope([state, action], name, name):
            h0 = tf.identity(state, name='state')
            h1_state = tf.nn.relu(tf.matmul(h0, theta[0]) + theta[1])
            # h1 = concat(h1_state,action)
            h1 = tf.concat(1, [h1_state, action], name="h1")
            h2 = tf.nn.relu(tf.matmul(h1, theta[2]) + theta[3], name="h2")
            h3 = tf.add(tf.matmul(h2, theta[4]), theta[5], name='h3')
            q = tf.squeeze(h3, [1], name='q')
            return q

    def create_network(self):

        theta_q, theta_p = self.theta_q(), self.theta_p()
        target_theta_q, target_theta_p = self.theta_q(), self.theta_p()

        # initialize target theta with the same values as theta
        init_target_theta_q = [
            target_theta_q[i].assign(theta_q[i].value()) for i in range(len(theta_q))
            ]

        init_target_theta_p = [
            target_theta_p[i].assign(theta_p[i].value()) for i in range(len(theta_p))
            ]
        self.init_target_theta = init_target_theta_q + init_target_theta_p

        self.state = tf.placeholder(tf.float32, [None, self.state_dim], 'state')
        self.action = tf.placeholder(tf.float32, [None, self.action_dim], 'action')
        self.next_state = tf.placeholder(tf.float32, [None, self.state_dim], 'next_state')
        self.reward = tf.placeholder(tf.float32, [None], 'reward')
        self.terminate = tf.placeholder(tf.bool, [None], 'terminate')

        #  q optimizer
        q = self.create_q_network(self.state, self.action, theta_q)
        next_action = self.create_policy_network(self.next_state, target_theta_p)
        next_q = self.create_q_network(self.next_state, next_action, target_theta_q)
        y_input = tf.stop_gradient(tf.select(self.terminate, self.reward, self.reward + self.gamma * next_q))
        q_error = tf.reduce_mean(tf.square(y_input - q))
        ## L2 weight regularization
        q_loss = q_error + tf.add_n([0.01 * tf.nn.l2_loss(var) for var in theta_q])
        q_optimizer = tf.train.AdamOptimizer(self.q_learning_rate)
        grads_and_vars_q = q_optimizer.compute_gradients(q_loss, var_list=theta_q)
        q_train = q_optimizer.apply_gradients(grads_and_vars_q)

        #  policy optimizer
        self.action_exploration = self.create_policy_network(self.state, theta_p)
        q1 = self.create_q_network(self.state, self.action_exploration, theta_q)
        p_error = - tf.reduce_mean(q1)
        ## L2 weight regularization
        p_loss = p_error + tf.add_n([0.01 * tf.nn.l2_loss(var) for var in theta_p])
        p_optimizer = tf.train.AdamOptimizer(self.p_learning_rate)
        grads_and_vars_p = p_optimizer.compute_gradients(p_loss, var_list=theta_p)
        p_train = p_optimizer.apply_gradients(grads_and_vars_p)

        # train q and update target_theta_q
        update_theta_q = [
            target_theta_q[i].assign(theta_q[i].value() * self.eta + target_theta_q[i].value() * (1 - self.eta)) for i
            in range(len(theta_q))]

        with tf.control_dependencies([q_train]):
            self.train_q = tf.group(*update_theta_q)

        # train p and update target_theta_p
        update_theta_p = [
            target_theta_p[i].assign(theta_p[i].value() * self.eta + target_theta_p[i].value() * (1 - self.eta)) for i
            in range(len(theta_p))]

        with tf.control_dependencies([p_train]):
            self.train_p = tf.group(*update_theta_p)


        # summary
        tf.scalar_summary('q_loss', q_loss)
        tf.scalar_summary('p_loss', p_loss)
        self.merged_op = tf.merge_all_summaries()

    def train(self):
        minibatch = random.sample(self.replay_buffer, self.batch_size)
        state_batch = [v[0] for v in minibatch]
        action_batch = [v[1] for v in minibatch]
        reward_batch = [v[2] for v in minibatch]
        next_state_batch = [v[3] for v in minibatch]
        terminate_batch = [v[4] for v in minibatch]

        _, _, summary_str = self.session.run([self.train_p, self.train_q, self.merged_op], feed_dict={
            self.state: state_batch,
            self.action: action_batch,
            self.reward: reward_batch,
            self.terminate: terminate_batch,
            self.next_state: next_state_batch
        })
        self.summary_writer.add_summary(summary_str, self.time_step)
        self.summary_writer.flush()

        if self.time_step % 1000 == 0:
            self.save(self.time_step)

    def observe_action(self, state, action, reward, next_state, terminate):
        self.time_step += 1
        self.replay_buffer.append((state, action, reward, next_state, terminate))
        if len(self.replay_buffer) > self.replay_buffer_size:
            self.replay_buffer.popleft()

        if self.time_step > self.min_train_replays:
            self.train()

        if terminate:
            self.noise.reset()

    def exploration(self, state):
        action = self.session.run(self.action_exploration, feed_dict={self.state: [state]})[0]
        return np.clip(action, -1, 1)

    def exploration_with_noise(self, state):
        action = self.session.run(self.action_exploration, feed_dict={self.state: [state]})[0]
        self.alpha -= (self.initial_alpha - self.final_alpha) / 100000
        self.alpha = max(self.alpha, 0.0)
        noise = self.noise.noise() * self.alpha
        return np.clip(action + noise, -1, 1)

    def save(self, step):
        saver = tf.train.Saver()
        saver.save(self.session, save_path=self.save_path, global_step=step)

    def load(self):
        saver = tf.train.Saver()
        checkpoint = tf.train.get_checkpoint_state(self.save_path)
        if checkpoint and checkpoint.model_checkpoint_path:
            saver.restore(self.session, checkpoint.model_checkpoint_path)
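A minimal sketch of how this TensorFlow agent's public methods might be driven, assuming a Gym-style environment object `env` (the class targets the legacy TF 0.x API, so this is illustrative only):

# hypothetical interaction loop for the Ddpg class above
agent = Ddpg(state_dim=3, action_dim=1, logdir="./logs", save_path="./checkpoints/ddpg")
state = env.reset()                      # `env` is an assumed Gym-style environment
for step in range(100000):
    action = agent.exploration_with_noise(state)
    next_state, reward, done, info = env.step(action)
    agent.observe_action(state, action, reward, next_state, done)
    state = env.reset() if done else next_state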