    def __init__(self, action_size, action_type, state_size, hidden_in_size,
                 hidden_out_size, num_atoms, lr_actor, lr_critic, l2_decay,
                 noise_type, OU_mu, OU_theta, OU_sigma):
        super(DDPGAgent, self).__init__()

        # create the actors, critics and their targets using the specified layer sizes; note that the critics assume 2 agents
        self.actor = Actor(action_size, state_size, hidden_in_size,
                           hidden_out_size, action_type).to(device)
        self.critic = Critic(2 * action_size, 2 * state_size, hidden_in_size,
                             hidden_out_size, num_atoms).to(device)
        self.target_actor = Actor(action_size, state_size, hidden_in_size,
                                  hidden_out_size, action_type).to(device)
        self.target_critic = Critic(2 * action_size, 2 * state_size,
                                    hidden_in_size, hidden_out_size,
                                    num_atoms).to(device)
        self.noise_type = noise_type
        self.action_type = action_type

        if noise_type == 'OUNoise':  # if we're using OUNoise it needs to be initialised as it is an autocorrelated process
            self.noise = OUNoise(action_size, OU_mu, OU_theta, OU_sigma)

        # initialize targets same as original networks
        hard_update(self.target_actor, self.actor)
        hard_update(self.target_critic, self.critic)

        # initialize optimisers using the specified learning rates
        self.actor_optimizer = Adam(self.actor.parameters(),
                                    lr=lr_actor,
                                    weight_decay=l2_decay)
        self.critic_optimizer = Adam(self.critic.parameters(),
                                     lr=lr_critic,
                                     weight_decay=l2_decay)
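# `hard_update` is used above but not defined in this snippet; a minimal sketch that
# matches the (target, source) argument order of the calls above:
def hard_update(target, source):
    # copy every parameter of the source network into the target network
    for target_param, source_param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(source_param.data)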
Example #2
    def __init__(self, state_size, action_size, params):
        """
        Build the model and the random process and initialize them.
        """
        torch.manual_seed(params['SEED'])
        self.random_process = OUNoise(action_size, params)

        self.local_actor = Actor(state_size, action_size, params['SEED'],
                                 params['FC1'], params['FC2']).to(device)
        self.target_actor = Actor(state_size, action_size, params['SEED'],
                                  params['FC1'], params['FC2']).to(device)
        # Initialize target network weights from the local network
        self.hard_copy(self.local_actor, self.target_actor)
        # Optimizer for local actor networks
        self.actor_optimizer = torch.optim.Adam(self.local_actor.parameters(),
                                                params['LR_ACTOR'])

        self.local_critic = Critic(state_size, action_size, params['SEED'],
                                   params['FC1'], params['FC2']).to(device)
        self.target_critic = Critic(state_size, action_size, params['SEED'],
                                    params['FC1'], params['FC2']).to(device)
        # Initialize target network weights from the local network
        self.hard_copy(self.local_critic, self.target_critic)
        # Optimizer for local critic networks
        self.critic_optimizer = torch.optim.Adam(
            self.local_critic.parameters(), params['LR_CRITIC'])
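# The OUNoise class constructed above is not shown, and its constructor signature varies
# across these examples (size plus mu/theta/sigma, a params dict, or a seed). A generic
# Ornstein-Uhlenbeck sketch with assumed default coefficients, for illustration only:
import numpy as np

class OUNoise:
    """Temporally correlated exploration noise (illustrative sketch)."""
    def __init__(self, size, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        # return the internal state to the long-run mean
        self.state = np.copy(self.mu)

    def sample(self):
        # dx = theta * (mu - x) + sigma * N(0, 1); the noise drifts back towards mu
        dx = self.theta * (self.mu - self.state) + self.sigma * np.random.standard_normal(self.state.shape)
        self.state = self.state + dx
        return self.state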
Example #3
 def __init__(self,action_dim,state_dim,agentParam,useLaw,useCenCritc,num_agent,CNN=False, width=None, height=None, channel=None):
     self.CNN = CNN
     self.device = agentParam["device"]
     if CNN:
         self.CNN_preprocessA = CNN_preprocess(width,height,channel)
         self.CNN_preprocessC = CNN_preprocess(width,height,channel)
         state_dim = self.CNN_preprocessA.get_state_dim()
     #if agentParam["ifload"]:
         #self.actor = torch.load(agentParam["filename"]+"actor_"+agentParam["id"]+".pth",map_location = torch.device('cuda'))
         #self.critic = torch.load(agentParam["filename"]+"critic_"+agentParam["id"]+".pth",map_location = torch.device('cuda'))
     #else:
     if useLaw:
         self.actor = ActorLaw(action_dim,state_dim).to(self.device)
     else:
         self.actor = Actor(action_dim,state_dim).to(self.device)
     if useCenCritc:
         self.critic = Centralised_Critic(state_dim,num_agent).to(self.device)
     else:
         self.critic = Critic(state_dim).to(self.device)
     self.action_dim = action_dim
     self.state_dim = state_dim
     self.noise_epsilon = 0.99
     self.constant_decay = 0.1
     self.optimizerA = torch.optim.Adam(self.actor.parameters(), lr = 0.001)
     self.optimizerC = torch.optim.Adam(self.critic.parameters(), lr = 0.001)
     self.lr_scheduler = {"optA":torch.optim.lr_scheduler.StepLR(self.optimizerA,step_size=1000,gamma=0.9,last_epoch=-1),
                          "optC":torch.optim.lr_scheduler.StepLR(self.optimizerC,step_size=1000,gamma=0.9,last_epoch=-1)}
     if CNN:
         # self.CNN_preprocessA = CNN_preprocess(width,height,channel)
         # self.CNN_preprocessC = CNN_preprocess
         self.optimizerA = torch.optim.Adam(itertools.chain(self.CNN_preprocessA.parameters(),self.actor.parameters()),lr=0.0001)
         self.optimizerC = torch.optim.Adam(itertools.chain(self.CNN_preprocessC.parameters(),self.critic.parameters()),lr=0.001)
         self.lr_scheduler = {"optA": torch.optim.lr_scheduler.StepLR(self.optimizerA, step_size=10000, gamma=0.9, last_epoch=-1),
                              "optC": torch.optim.lr_scheduler.StepLR(self.optimizerC, step_size=10000, gamma=0.9, last_epoch=-1)}
Example #4
    def __init__(self, state_dim, action_dim, max_action, agent_n, logger):
        # neural networks kept on the GPU
        self.actor = Actor(state_dim, action_dim,
                           max_action).to(self.device)  # origin_network
        self.actor_target = Actor(state_dim, action_dim,
                                  max_action).to(self.device)  # target_network
        self.actor_target.load_state_dict(
            self.actor.state_dict())  # initialize actor_target with the actor's parameters
        # PyTorch tensors have requires_grad=False by default, so they do not take part in
        # gradient propagation; the model parameters handed to the optimizer do get optimized
        self.actor_optimizer = optim.Adam(
            self.actor.parameters(),
            pdata.LEARNING_RATE)  # optimize the actor's parameters at the learning rate pdata.LEARNING_RATE

        self.critic = CriticCentral(agent_n).to(self.device)
        self.critic_target = CriticCentral(agent_n).to(self.device)
        self.critic_target.load_state_dict(self.critic.state_dict())
        self.critic_optimizer = optim.Adam(self.critic.parameters(),
                                           pdata.LEARNING_RATE)
        # self.replay_buffer removed: each agent no longer keeps its own replay buffer
        self.writer = SummaryWriter(pdata.DIRECTORY + 'runs')
        self.num_critic_update_iteration = 0
        self.num_actor_update_iteration = 0
        self.num_training = 0

        self.logger = logger
Example #5
    def __init__(self, state_size=24, action_size=2, seed=1, num_agents=2):
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.num_agents = num_agents

        # DDPG specific configuration
        hidden_size = 512
        self.CHECKPOINT_FOLDER = './'

        # Defining networks
        self.actor = Actor(state_size, hidden_size, action_size).to(device)
        self.actor_target = Actor(state_size, hidden_size, action_size).to(device)

        self.critic = Critic(state_size, self.action_size, hidden_size, 1).to(device)
        self.critic_target = Critic(state_size, self.action_size, hidden_size, 1).to(device)

        self.optimizer_actor = optim.Adam(self.actor.parameters(), lr=ACTOR_LR)
        self.optimizer_critic = optim.Adam(self.critic.parameters(), lr=CRITIC_LR)

        # Noise
        self.noises = OUNoise((num_agents, action_size), seed)

        # Initialize replay buffer
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
Example #6
    def __init__(self, state_size, action_size, random_seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.t_step = 0
        
        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)
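# The ReplayBuffer used above is constructed as ReplayBuffer(action_size, buffer_size,
# batch_size, seed) but never shown; a typical uniform-sampling buffer under that
# assumed interface could look like this:
import random
from collections import deque, namedtuple

import numpy as np
import torch

Experience = namedtuple("Experience", ["state", "action", "reward", "next_state", "done"])

class ReplayBuffer:
    def __init__(self, action_size, buffer_size, batch_size, seed):
        self.action_size = action_size
        self.memory = deque(maxlen=buffer_size)     # drop the oldest experience when full
        self.batch_size = batch_size
        random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        self.memory.append(Experience(state, action, reward, next_state, done))

    def sample(self):
        # uniformly sample a minibatch and stack it into float tensors
        experiences = random.sample(self.memory, k=self.batch_size)
        states = torch.from_numpy(np.vstack([e.state for e in experiences])).float()
        actions = torch.from_numpy(np.vstack([e.action for e in experiences])).float()
        rewards = torch.from_numpy(np.vstack([e.reward for e in experiences])).float()
        next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences])).float()
        dones = torch.from_numpy(np.vstack([e.done for e in experiences]).astype(np.uint8)).float()
        return states, actions, rewards, next_states, dones

    def __len__(self):
        return len(self.memory)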
Example #7
    def __init__(self, s_dim, a_dim, num_agent, **kwargs):
        self.s_dim = s_dim
        self.a_dim = a_dim
        self.config = kwargs['config']
        self.num_agent = num_agent

        self.actor = Actor(s_dim, a_dim)
        self.actor_target = Actor(s_dim, a_dim)
        self.critic = Critic(s_dim, a_dim, num_agent)
        self.critic_target = Critic(s_dim, a_dim, num_agent)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(),
                                                lr=self.config.a_lr)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(),
                                                 lr=self.config.c_lr)
        self.a_loss = 0
        self.c_loss = 0

        if self.config.use_cuda:
            self.actor.cuda()
            self.actor_target.cuda()
            self.critic.cuda()
            self.critic_target.cuda()

        hard_update(self.actor, self.actor_target)
        hard_update(self.critic, self.critic_target)

        self.random_process = OrnsteinUhlenbeckProcess(
            size=self.a_dim,
            theta=self.config.ou_theta,
            mu=self.config.ou_mu,
            sigma=self.config.ou_sigma)
Example #8
    def __init__(self, s_dim, a_dim, **kwargs):
        self.s_dim = s_dim
        self.a_dim = a_dim
        self.config = kwargs['config']
        self.device = 'cuda' if self.config.use_cuda else 'cpu'

        self.actor = Actor(s_dim, a_dim)
        self.actor_target = Actor(s_dim, a_dim)
        self.critic = Critic(s_dim, a_dim, 1)
        self.critic_target = Critic(s_dim, a_dim, 1)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(),
                                                lr=self.config.a_lr)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(),
                                                 lr=self.config.c_lr)
        self.c_loss = 0
        self.a_loss = 0

        if self.config.use_cuda:
            self.actor.cuda()
            self.actor_target.cuda()
            self.critic.cuda()
            self.critic_target.cuda()

        hard_update(self.actor, self.actor_target)
        hard_update(self.critic, self.critic_target)

        self.random_process = OrnsteinUhlenbeckProcess(
            size=self.a_dim,
            theta=self.config.ou_theta,
            mu=self.config.ou_mu,
            sigma=self.config.ou_sigma)
        self.replay_buffer = list()
        self.epsilon = 1.
        self.depsilon = self.epsilon / self.config.epsilon_decay
Example #9
class DDPG():
    """
    Individual Agent.
    """
    def __init__(self, state_size, action_size, params):
        """
        Build the model and the random process and initialize them.
        """
        torch.manual_seed(params['SEED'])
        self.random_process = OUNoise(action_size, params)

        self.local_actor = Actor(state_size, action_size, params['SEED'],
                                 params['FC1'], params['FC2']).to(device)
        self.target_actor = Actor(state_size, action_size, params['SEED'],
                                  params['FC1'], params['FC2']).to(device)
        # Initialize target network weights from the local network
        self.hard_copy(self.local_actor, self.target_actor)
        # Optimizer for local actor networks
        self.actor_optimizer = torch.optim.Adam(self.local_actor.parameters(),
                                                params['LR_ACTOR'])

        self.local_critic = Critic(state_size, action_size, params['SEED'],
                                   params['FC1'], params['FC2']).to(device)
        self.target_critic = Critic(state_size, action_size, params['SEED'],
                                    params['FC1'], params['FC2']).to(device)
        # Initialize target network weights from the local network
        self.hard_copy(self.local_critic, self.target_critic)
        # Optimizer for local critic networks
        self.critic_optimizer = torch.optim.Adam(
            self.local_critic.parameters(), params['LR_CRITIC'])

    def reset_noise(self):
        """
        Reset the noise state every episode
        """
        self.random_process.reset()

    def hard_copy(self, local, target):
        """
        hard copy the weights of the local network to the target network
        """
        for local_param, target_param in zip(local.parameters(),
                                             target.parameters()):
            target_param.data.copy_(local_param.data)

    def soft_copy(self, tau):
        """
        soft update target network
        𝜃_target = 𝜏*𝜃_local + (1 - 𝜏)*𝜃_target 
        """
        for local_param, target_param in zip(self.local_actor.parameters(),
                                             self.target_actor.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1 - tau) * target_param.data)

        for local_param, target_param in zip(self.local_critic.parameters(),
                                             self.target_critic.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1 - tau) * target_param.data)
Example #10
 def __init__(self,
              action_dim,
              state_dim,
              CNN=False,
              width=None,
              height=None,
              channel=None,
              device='cpu'):
     self.CNN = CNN
     if CNN:
         self.CNN_preprocessA = CNN_preprocess(width, height, channel)
         self.CNN_preprocessC = CNN_preprocess(width, height, channel)
         state_dim = self.CNN_preprocessA.get_state_dim()
     self.device = device
     self.actor = Actor(action_dim, state_dim)
     self.critic = Critic(state_dim)
     self.action_dim = action_dim
     self.state_dim = state_dim
     self.noise_epsilon = 0.999
     self.constant_decay = 1
     self.optimizerA = torch.optim.Adam(self.actor.parameters(), lr=0.00001)
     self.optimizerC = torch.optim.Adam(self.critic.parameters(), lr=0.01)
     self.lr_scheduler = {
         "optA":
         torch.optim.lr_scheduler.StepLR(self.optimizerA,
                                         step_size=1000,
                                         gamma=1,
                                         last_epoch=-1),
         "optC":
         torch.optim.lr_scheduler.StepLR(self.optimizerC,
                                         step_size=1000,
                                         gamma=0.9,
                                         last_epoch=-1)
     }
     if CNN:
         # self.CNN_preprocessA = CNN_preprocess(width,height,channel)
         # self.CNN_preprocessC = CNN_preprocess
         self.optimizerA = torch.optim.Adam(itertools.chain(
             self.CNN_preprocessA.parameters(), self.actor.parameters()),
                                            lr=0.0001)
         self.optimizerC = torch.optim.Adam(itertools.chain(
             self.CNN_preprocessC.parameters(), self.critic.parameters()),
                                            lr=0.001)
         self.lr_scheduler = {
             "optA":
             torch.optim.lr_scheduler.StepLR(self.optimizerA,
                                             step_size=10000,
                                             gamma=1,
                                             last_epoch=-1),
             "optC":
             torch.optim.lr_scheduler.StepLR(self.optimizerC,
                                             step_size=10000,
                                             gamma=0.9,
                                             last_epoch=-1)
         }
Example #11
    def build_agent(self):
        # build the actor-critic network and also their target networks
        self.actor = Actor(self.state_dim, self.action_dim, self.l1_dim, self.l2_dim,self.alpha)
        self.target_actor = copy.deepcopy(self.actor)
        self.critic = Critic(self.state_dim, self.action_dim, self.l1_dim, self.l2_dim,self.beta)
        self.target_critic = copy.deepcopy(self.critic)

        # build the replaybuffer
        self.replaybuffer = ReplayBuffer(self.max_replay_size, self.state_dim, self.action_dim)
        # build the OUNoise for action selection 
        self.noise = OUNoise(self.action_dim)
Example #12
def main():
    ip_port = ('127.0.0.1', 9999)
    s = socket.socket()
    s.bind(ip_port)
    s.listen()
    stuck = 0
    time = 0
    pre = preprocess()

    OUTPUT_GRAPH = False
    MAX_EPISODE = 500
    MAX_EP_STEPS = 2000  # maximum time step in one episode
    RENDER = False  # rendering wastes time
    GAMMA = 0.9  # reward discount in TD error
    LR_A = 0.001  # learning rate for actor
    LR_C = 0.01  # learning rate for critic
    N_F = 26
    N_A = 4

    sess = tf.Session()
    actor = Actor(sess, observation_dim=N_F, action_dim=N_A, lr=LR_A)
    critic = Critic(sess, observation_dim=N_F, lr=LR_C)
    sess.run(tf.global_variables_initializer())
    while (True):
        conn, addr = s.accept()
        # epsilon -= 1.0 / explore
        recv_data = conn.recv(1024)
        if not recv_data:
            break
        pos_info = unpack('51f', recv_data)
        r, info, stuck = pre.function(pos_info, time, stuck)

        state = list(info)
        state.append(stuck)
        state = np.array(state)
        while (True):
            # try:
            action = actor.act(state)
            conn.send(pack('4f', action, 1, 0, 1.0))

            recv_data = conn.recv(1024)
            if not recv_data:
                break
            pos_info = unpack('51f', recv_data)
            r, info, stuck = pre.function(pos_info, time, stuck)

            #steer, acc,
            state_ = list(info)
            state_.append(stuck)
            state_ = np.array(state_)

            state = state_
Example #13
class Predator:
    def __init__(self, s_dim, a_dim, num_agent, **kwargs):
        self.s_dim = s_dim
        self.a_dim = a_dim
        self.config = kwargs['config']
        self.num_agent = num_agent

        self.actor = Actor(s_dim, a_dim)
        self.actor_target = Actor(s_dim, a_dim)
        self.critic = Critic(s_dim, a_dim, num_agent)
        self.critic_target = Critic(s_dim, a_dim, num_agent)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(),
                                                lr=self.config.a_lr)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(),
                                                 lr=self.config.c_lr)
        self.a_loss = 0
        self.c_loss = 0

        if self.config.use_cuda:
            self.actor.cuda()
            self.actor_target.cuda()
            self.critic.cuda()
            self.critic_target.cuda()

        hard_update(self.actor, self.actor_target)
        hard_update(self.critic, self.critic_target)

        self.random_process = OrnsteinUhlenbeckProcess(
            size=self.a_dim,
            theta=self.config.ou_theta,
            mu=self.config.ou_mu,
            sigma=self.config.ou_sigma)

    def get_batches(self):
        experiences = random.sample(self.replay_buffer, self.batch_size)

        state_batches = np.array([_[0] for _ in experiences])
        action_batches = np.array([_[1] for _ in experiences])
        reward_batches = np.array([_[2] for _ in experiences])
        next_state_batches = np.array([_[3] for _ in experiences])
        done_batches = np.array([_[4] for _ in experiences])

        return state_batches, action_batches, reward_batches, next_state_batches, done_batches

    def random_action(self):
        action = np.random.uniform(low=-1.,
                                   high=1.,
                                   size=(self.num_agent, self.a_dim))
        return action

    def reset(self):
        self.random_process.reset_states()
Example #14
File: ddpg.py  Project: juandd18/Tenis
    def __init__(self, num_in_pol, num_out_pol, num_in_critic, hidden_dim_actor=120,
                 hidden_dim_critic=64, lr_actor=0.01, lr_critic=0.01, batch_size=64,
                 max_episode_len=100, tau=0.02, gamma=0.99, agent_name='one',
                 discrete_action=False):
        """
        Inputs:
            num_in_pol (int): number of dimensions for policy input
            num_out_pol (int): number of dimensions for policy output
            num_in_critic (int): number of dimensions for critic input
        """
        self.policy = Actor(num_in_pol, num_out_pol,
                                 hidden_dim=hidden_dim_actor,
                                 discrete_action=discrete_action)
        self.critic = Critic(num_in_pol, 1,num_out_pol,
                                 hidden_dim=hidden_dim_critic)
        self.target_policy = Actor(num_in_pol, num_out_pol,
                                        hidden_dim=hidden_dim_actor,
                                        discrete_action=discrete_action)
        self.target_critic = Critic(num_in_pol, 1,num_out_pol,
                                        hidden_dim=hidden_dim_critic)
        hard_update(self.target_policy, self.policy)
        hard_update(self.target_critic, self.critic)
        self.policy_optimizer = Adam(self.policy.parameters(), lr=lr_actor)
        self.critic_optimizer = Adam(self.critic.parameters(), lr=lr_critic,weight_decay=0)
        
        self.policy = self.policy.float()
        self.critic = self.critic.float()
        self.target_policy = self.target_policy.float()
        self.target_critic = self.target_critic.float()

        self.agent_name = agent_name
        self.gamma = gamma
        self.tau = tau
        self.batch_size = batch_size
        #self.replay_buffer = ReplayBuffer(1e7)
        self.replay_buffer = ReplayBufferOption(500000,self.batch_size,12)
        self.max_replay_buffer_len = batch_size * max_episode_len
        self.replay_sample_index = None
        self.niter = 0
        self.eps = 5.0
        self.eps_decay = 1/(250*5)

        self.exploration = OUNoise(num_out_pol)
        self.discrete_action = discrete_action

        self.num_history = 2
        self.states = []
        self.actions = []
        self.rewards = []
        self.next_states = []
        self.dones = []
Example #15
    def __init__(self, action_size, state_size, shared_replay_buffer, memory):
        optimizer_fn = lambda params: torch.optim.Adam(params, lr=1e-4)
        noise_fn = lambda: OUNoise(action_size, SEED)
        memory_fn = lambda: ReplayBuffer(action_size, int(1e6), BATCH_SIZE, SEED, DEVICE)
        actor_network_fn = lambda: Actor(action_size, state_size, (256,128), SEED).to(DEVICE)
        critic_network_fn = lambda: Critic(action_size, state_size, (256,128), SEED).to(DEVICE)
        
        self.seed = SEED
        
        self.actor_local = actor_network_fn()
        self.actor_target = actor_network_fn()
        self.actor_optimizer = optimizer_fn(self.actor_local.parameters())
        
        self.critic_local = critic_network_fn()
        self.critic_target = critic_network_fn()
        self.critic_optimizer = optimizer_fn(self.critic_local.parameters())
        
        self.soft_update(self.critic_local, self.critic_target, 1)
        self.soft_update(self.actor_local, self.actor_target, 1)

        self.noise = noise_fn()
        if shared_replay_buffer:
            self.memory = memory
        else:
            self.memory = memory_fn()
Example #16
class IAC():
    def __init__(self, action_dim, state_dim):
        self.actor = Actor(action_dim, state_dim)
        self.critic = Critic(state_dim)
        self.action_dim = action_dim
        self.state_dim = state_dim
        self.optimizerA = torch.optim.Adam(self.actor.parameters(), lr=0.001)
        self.optimizerC = torch.optim.Adam(self.critic.parameters(), lr=0.01)
        self.lr_scheduler = {
            "optA":
            torch.optim.lr_scheduler.StepLR(self.optimizerA,
                                            step_size=1000,
                                            gamma=0.9,
                                            last_epoch=-1),
            "optC":
            torch.optim.lr_scheduler.StepLR(self.optimizerC,
                                            step_size=1000,
                                            gamma=0.9,
                                            last_epoch=-1)
        }
        # self.act_prob
        # self.act_log_prob

    def choose_action(self, s):
        s = torch.Tensor(s).squeeze(0)
        self.act_prob = self.actor(s) + 0.00001
        m = torch.distributions.Categorical(self.act_prob)
        # self.act_log_prob = m.log_prob(m.sample())
        temp = m.sample()
        return temp

    def cal_tderr(self, s, r, s_):
        s = torch.Tensor(s).unsqueeze(0)
        s_ = torch.Tensor(s_).unsqueeze(0)
        v_ = self.critic(s_).detach()
        v = self.critic(s)
        return r + 0.9 * v_ - v

    def learnCritic(self, s, r, s_):
        td_err = self.cal_tderr(s, r, s_)
        loss = torch.mul(td_err, td_err)  #torch.square(td_err)
        self.optimizerC.zero_grad()
        loss.backward()
        self.optimizerC.step()
        self.lr_scheduler["optC"].step()

    def learnActor(self, s, r, s_, a):
        td_err = self.cal_tderr(s, r, s_)
        m = torch.log(self.act_prob[a])
        temp = m * td_err.detach()
        loss = -torch.mean(temp)
        self.optimizerA.zero_grad()
        loss.backward()
        self.optimizerA.step()
        self.lr_scheduler["optA"].step()

    def update(self, s, r, s_, a):
        self.learnCritic(s, r, s_)
        self.learnActor(s, r, s_, a)
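# Hedged usage sketch for the IAC agent above. The gym environment, the old 4-tuple
# step API and the episode budget are assumptions, not part of the original snippet.
import gym

env = gym.make('CartPole-v1')
agent = IAC(action_dim=env.action_space.n, state_dim=env.observation_space.shape[0])

for episode in range(500):
    s = env.reset()                         # old gym API assumed: reset() returns the state
    done = False
    while not done:
        a = agent.choose_action(s)          # sample an action from the softmax policy
        s_, r, done, _ = env.step(int(a))   # old gym API assumed: 4-tuple return
        agent.update(s, r, s_, int(a))      # one TD critic step + one policy-gradient step
        s = s_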
Example #17
    def __init__(self,
                 gamma,
                 tau,
                 num_inputs,
                 action_space,
                 replay_size,
                 normalize_obs=True,
                 normalize_returns=False,
                 critic_l2_reg=1e-2,
                 num_outputs=1,
                 entropy_coeff=0.1,
                 action_coeff=0.1):

        super(DDPG, self).__init__(gamma=gamma,
                                   tau=tau,
                                   num_inputs=num_inputs,
                                   action_space=action_space,
                                   replay_size=replay_size,
                                   normalize_obs=normalize_obs,
                                   normalize_returns=normalize_returns)

        self.num_outputs = num_outputs
        self.entropy_coeff = entropy_coeff
        self.action_coeff = action_coeff
        self.critic_l2_reg = critic_l2_reg

        self.actor = Actor(self.num_inputs, self.action_space,
                           self.num_outputs).to(self.device)
        self.actor_target = Actor(self.num_inputs, self.action_space,
                                  self.num_outputs).to(self.device)
        self.actor_perturbed = Actor(self.num_inputs, self.action_space,
                                     self.num_outputs).to(self.device)
        self.actor_optim = Adam(self.actor.parameters(), lr=1e-4)

        self.critic = Critic(self.num_inputs + self.action_space.shape[0]).to(
            self.device)
        self.critic_target = Critic(self.num_inputs +
                                    self.action_space.shape[0]).to(self.device)
        self.critic_optim = Adam(self.critic.parameters(),
                                 lr=1e-3,
                                 weight_decay=critic_l2_reg)

        hard_update(self.actor_target,
                    self.actor)  # make sure the target starts with the same weights
        hard_update(self.critic_target, self.critic)
Example #18
class DDPGAgent:
    def __init__(self, action_size, action_type, state_size, hidden_in_size,
                 hidden_out_size, num_atoms, lr_actor, lr_critic, l2_decay,
                 noise_type, OU_mu, OU_theta, OU_sigma):
        super(DDPGAgent, self).__init__()

        # create the actors, critics and their targets using the specified layer sizes; note that the critics assume 2 agents
        self.actor = Actor(action_size, state_size, hidden_in_size,
                           hidden_out_size, action_type).to(device)
        self.critic = Critic(2 * action_size, 2 * state_size, hidden_in_size,
                             hidden_out_size, num_atoms).to(device)
        self.target_actor = Actor(action_size, state_size, hidden_in_size,
                                  hidden_out_size, action_type).to(device)
        self.target_critic = Critic(2 * action_size, 2 * state_size,
                                    hidden_in_size, hidden_out_size,
                                    num_atoms).to(device)
        self.noise_type = noise_type
        self.action_type = action_type

        if noise_type == 'OUNoise':  # if we're using OUNoise it needs to be initialised as it is an autocorrelated process
            self.noise = OUNoise(action_size, OU_mu, OU_theta, OU_sigma)

        # initialize targets same as original networks
        hard_update(self.target_actor, self.actor)
        hard_update(self.target_critic, self.critic)

        # initialize optimisers using the specified learning rates
        self.actor_optimizer = Adam(self.actor.parameters(),
                                    lr=lr_actor,
                                    weight_decay=l2_decay)
        self.critic_optimizer = Adam(self.critic.parameters(),
                                     lr=lr_critic,
                                     weight_decay=l2_decay)

    def act(self, obs, noise_scale=0.0):
        obs = obs.to(device)
        action = self.actor(obs)

        if noise_scale == 0.0:  # if no noise then just return action as is
            pass
        elif self.noise_type == 'OUNoise':
            noise = 1.5 * self.noise.noise()
            action += noise_scale * (noise - action)
            action = torch.clamp(action, -1, 1)
        elif self.noise_type == 'BetaNoise':
            action = BetaNoise(action, noise_scale)
        elif self.noise_type == 'GaussNoise':
            action = GaussNoise(action, noise_scale)
        elif self.noise_type == 'WeightedNoise':
            action = WeightedNoise(action, noise_scale, self.action_type)
        return action

    # the target actor is only used for updates, not exploration, so no noise is added
    def target_act(self, obs):
        obs = obs.to(device)
        action = self.target_actor(obs)
        return action
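# Hedged usage sketch for DDPGAgent; every constructor argument is an assumed example
# value, and Actor, Critic, OUNoise and device come from the surrounding project.
import torch

agent = DDPGAgent(action_size=2, action_type='continuous', state_size=24,
                  hidden_in_size=256, hidden_out_size=128, num_atoms=51,
                  lr_actor=1e-4, lr_critic=1e-3, l2_decay=0.0,
                  noise_type='OUNoise', OU_mu=0.0, OU_theta=0.15, OU_sigma=0.2)

obs = torch.randn(1, 24)                          # dummy observation of the assumed state size
noisy_action = agent.act(obs, noise_scale=0.3)    # exploration: OU noise blended into the action
greedy_action = agent.act(obs, noise_scale=0.0)   # evaluation: deterministic policy output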
Example #19
 def __init__(self, action_dim, state_dim):
     self.actor = Actor(action_dim, state_dim)
     self.critic = Critic(state_dim)
     self.action_dim = action_dim
     self.state_dim = state_dim
     self.optimizerA = torch.optim.Adam(self.actor.parameters(), lr=0.001)
     self.optimizerC = torch.optim.Adam(self.critic.parameters(), lr=0.01)
     self.lr_scheduler = {
         "optA":
         torch.optim.lr_scheduler.StepLR(self.optimizerA,
                                         step_size=1000,
                                         gamma=0.9,
                                         last_epoch=-1),
         "optC":
         torch.optim.lr_scheduler.StepLR(self.optimizerC,
                                         step_size=1000,
                                         gamma=0.9,
                                         last_epoch=-1)
     }
Example #20
    def __init__(self, n_states, n_actions):
        # hyper parameters
        self.replay_size = 1000000
        self.experience_replay = deque(maxlen=self.replay_size)
        self.n_actions = n_actions
        self.n_states = n_states
        self.lr = 0.0003
        self.batch_size = 128
        self.gamma = 0.99
        self.H = -2
        self.Tau = 0.01

        # actor network
        self.actor = Actor(n_states=n_states, n_actions=n_actions).to(DEVICE)

        # dual critic network, with corresponding targets
        self.critic = Critic(n_states=n_states, n_actions=n_actions).to(DEVICE)
        self.critic2 = Critic(n_states=n_states,
                              n_actions=n_actions).to(DEVICE)
        self.target_critic = Critic(n_states=n_states,
                                    n_actions=n_actions).to(DEVICE)
        self.target_critic2 = Critic(n_states=n_states,
                                     n_actions=n_actions).to(DEVICE)

        # make the target critics start off the same as the main networks
        for target_param, local_param in zip(self.target_critic.parameters(),
                                             self.critic.parameters()):
            target_param.data.copy_(local_param)

        for target_param, local_param in zip(self.target_critic2.parameters(),
                                             self.critic2.parameters()):
            target_param.data.copy_(local_param)

        # temperature variable
        self.log_alpha = torch.tensor(0.0, device=DEVICE, requires_grad=True)
        self.optim_alpha = Adam(params=[self.log_alpha], lr=self.lr)
        self.alpha = 0.2

        self.optim_actor = Adam(params=self.actor.parameters(), lr=self.lr)
        self.optim_critic = Adam(params=self.critic.parameters(), lr=self.lr)
        self.optim_critic_2 = Adam(params=self.critic2.parameters(),
                                   lr=self.lr)
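# Hedged sketch of the temperature (alpha) update that log_alpha, optim_alpha and the
# target entropy H above are typically used for in soft actor-critic. `agent` is an
# instance of the class above; `log_probs` (the policy log-probabilities of a sampled
# batch of actions) is an assumption, not shown in the original snippet.
def update_temperature(agent, log_probs):
    alpha_loss = -(agent.log_alpha * (log_probs + agent.H).detach()).mean()
    agent.optim_alpha.zero_grad()
    alpha_loss.backward()
    agent.optim_alpha.step()
    agent.alpha = agent.log_alpha.exp().item()    # use the tuned temperature in the actor/critic losses
    return alpha_loss.item()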
Example #21
    def __init__(self, state_size, action_size, num_agents, random_seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            num_agents (int): number of agents
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        self.seed = random.seed(random_seed)
        self.eps = eps_start
        self.eps_decay = 1 / (eps_p * LEARN_NUM)  # set decay rate based on the epsilon end target

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise((num_agents, action_size), random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)
Example #22
    def __init__(self, state_dim, action_dim, hidden_dim=64, learning_rate=3e-4,
                 entropy_coef=1e-2, critic_coef=0.5, gamma=0.99, lmbda=0.95,
                 eps_clip=0.2, K_epoch=10, minibatch_size=64, device='cpu'):
        super(PPO,self).__init__()
        
        self.entropy_coef = entropy_coef
        self.critic_coef = critic_coef
        self.gamma = gamma
        self.lmbda = lmbda
        self.eps_clip = eps_clip
        self.K_epoch = K_epoch
        self.minibatch_size = minibatch_size
        self.max_grad_norm = 0.5
        
        self.data = Rollouts()
        
        self.actor = Actor(state_dim,action_dim,hidden_dim)
        self.critic = Critic(state_dim,hidden_dim)
        
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=learning_rate)
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=learning_rate)

        self.device = device
Example #23
    def __init__(self, 
        state_size, action_size, replay_memory, random_seed=0, nb_agent = 20, bs = 128,
        gamma=0.99, tau=1e-3, lr_actor=1e-4, lr_critic=1e-4, wd_actor=0, wd_critic=0,
        clip_actor = None, clip_critic=None, update_interval = 20, update_times = 10): 

        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.nb_agent = nb_agent
        self.bs = bs
        self.update_interval = update_interval
        self.update_times = update_times
        self.timestep = 0

        self.gamma = gamma
        self.tau = tau
        self.lr_actor = lr_actor
        self.lr_critic = lr_critic
        self.wd_critic = wd_critic
        self.wd_actor = wd_actor
        self.clip_critic=clip_critic
        self.clip_actor = clip_actor
        self.actor_losses = []
        self.critic_losses = []

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=self.lr_actor,weight_decay=self.wd_actor)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=self.lr_critic,weight_decay=self.wd_critic)

        # Noise process
        self.noise = OUNoise((self.nb_agent, action_size), random_seed)

        # Replay memory
        self.memory = replay_memory
Example #24
    def __init__(self, state_size, action_size, random_seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)

        self.noise = OUNoise((action_size), random_seed)

        # Make sure the targets are initialized with the same weights as the sources (reported on Slack to make a big difference)
        self.hard_update(self.actor_target, self.actor_local)
        self.hard_update(self.critic_target, self.critic_local)
Example #25
    def __init__(self, s_dim, a_dim, n_agents, **kwargs):
        self.s_dim = s_dim
        self.a_dim = a_dim
        self.config = kwargs['config']
        self.n_agents = n_agents
        self.device = 'cuda' if self.config.use_cuda else 'cpu'
        # Networks
        self.policy = Actor(s_dim, a_dim, n_agents)
        self.policy_target = Actor(s_dim, a_dim, n_agents)
        self.critic = Critic(s_dim, a_dim, n_agents)
        self.critic_target = Critic(s_dim, a_dim, n_agents)

        if self.config.use_cuda:
            self.policy.cuda()
            self.policy_target.cuda()
            self.critic.cuda()
            self.critic_target.cuda()

        self.policy_optimizer = torch.optim.Adam(self.policy.parameters(),
                                                 lr=self.config.a_lr)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(),
                                                 lr=self.config.c_lr)

        hard_update(self.policy, self.policy_target)
        hard_update(self.critic, self.critic_target)

        self.random_process = OrnsteinUhlenbeckProcess(
            size=self.a_dim,
            theta=self.config.ou_theta,
            mu=self.config.ou_mu,
            sigma=self.config.ou_sigma)
        self.replay_buffer = list()
        self.epsilon = 1.
        self.depsilon = self.epsilon / self.config.epsilon_decay

        self.c_loss = None
        self.a_loss = None
        self.action_log = list()
Example #26
    def __init__(self,
                 state_size,
                 action_size,
                 aid=0,
                 num_agents=2,
                 seed=1234):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, seed=seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  seed=seed).to(device)
        self.actor_optimizer = Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size,
                                   action_size,
                                   num_agents=num_agents,
                                   seed=seed).to(device)
        self.critic_target = Critic(state_size,
                                    action_size,
                                    num_agents=num_agents,
                                    seed=seed).to(device)
        self.critic_optimizer = Adam(self.critic_local.parameters(),
                                     lr=LR_CRITIC,
                                     weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, seed=seed)
Example #27
    def __init__(self, replay_buffer, noise, state_dim, action_dim, seed, fc1_units = 256, fc2_units = 128,
                 device="cpu", lr_actor=1e-4, lr_critic=1e-3, batch_size=128, discount=0.99, tau=1e-3):
        torch.manual_seed(seed)

        self.actor_local = Actor(state_dim, action_dim, fc1_units, fc2_units, seed).to(device)
        self.critic_local = Critic(state_dim, action_dim, fc1_units, fc2_units, seed).to(device)
        
        self.actor_optimizer = optim.Adam(params=self.actor_local.parameters(), lr=lr_actor)
        self.critic_optimizer = optim.Adam(params=self.critic_local.parameters(), lr=lr_critic)
        
        self.actor_target = Actor(state_dim, action_dim, fc1_units, fc2_units, seed).to(device)
        self.critic_target = Critic(state_dim, action_dim, fc1_units, fc2_units, seed).to(device)

        self.buffer = replay_buffer
        self.noise = noise
        self.device = device
        self.batch_size = batch_size
        self.discount = discount

        self.tau = tau

        Agent.hard_update(model_local=self.actor_local, model_target=self.actor_target)
        Agent.hard_update(model_local=self.critic_local, model_target=self.critic_target)
Example #28
    def __init__(self,
                 env,
                 lr=3e-4,
                 gamma=0.99,
                 polyak=5e-3,
                 alpha=0.2,
                 reward_scale=1.0,
                 cuda=True,
                 writer=None):
        state_size = env.observation_space.shape[0]
        action_size = env.action_space.shape[0]
        self.actor = Actor(state_size, action_size)
        self.critic = Critic(state_size, action_size)
        self.target_critic = Critic(state_size, action_size).eval()
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=lr)
        self.q1_optimizer = optim.Adam(self.critic.q1.parameters(), lr=lr)
        self.q2_optimizer = optim.Adam(self.critic.q2.parameters(), lr=lr)

        self.target_critic.load_state_dict(self.critic.state_dict())
        for param in self.target_critic.parameters():
            param.requires_grad = False

        self.memory = ReplayMemory()

        self.gamma = gamma
        self.alpha = alpha
        self.polyak = polyak  # Always between 0 and 1, usually close to 1
        self.reward_scale = reward_scale

        self.writer = writer

        self.cuda = cuda
        if cuda:
            self.actor = self.actor.to('cuda')
            self.critic = self.critic.to('cuda')
            self.target_critic = self.target_critic.to('cuda')
Example #29
class BiCNet():
    def __init__(self, s_dim, a_dim, n_agents, **kwargs):
        self.s_dim = s_dim
        self.a_dim = a_dim
        self.config = kwargs['config']
        self.n_agents = n_agents
        self.device = 'cuda' if self.config.use_cuda else 'cpu'
        # Networks
        self.policy = Actor(s_dim, a_dim, n_agents)
        self.policy_target = Actor(s_dim, a_dim, n_agents)
        self.critic = Critic(s_dim, a_dim, n_agents)
        self.critic_target = Critic(s_dim, a_dim, n_agents)

        if self.config.use_cuda:
            self.policy.cuda()
            self.policy_target.cuda()
            self.critic.cuda()
            self.critic_target.cuda()

        self.policy_optimizer = torch.optim.Adam(self.policy.parameters(),
                                                 lr=self.config.a_lr)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(),
                                                 lr=self.config.c_lr)

        hard_update(self.policy, self.policy_target)
        hard_update(self.critic, self.critic_target)

        self.random_process = OrnsteinUhlenbeckProcess(
            size=self.a_dim,
            theta=self.config.ou_theta,
            mu=self.config.ou_mu,
            sigma=self.config.ou_sigma)
        self.replay_buffer = list()
        self.epsilon = 1.
        self.depsilon = self.epsilon / self.config.epsilon_decay

        self.c_loss = None
        self.a_loss = None
        self.action_log = list()

    def choose_action(self, obs, noisy=True):
        obs = torch.Tensor([obs]).to(self.device)

        action = self.policy(obs).cpu().detach().numpy()[0]
        self.action_log.append(action)

        if noisy:
            for agent_idx in range(self.n_agents):
                pass
                # action[agent_idx] += self.epsilon * self.random_process.sample()
            self.epsilon -= self.depsilon
            self.epsilon = max(self.epsilon, 0.001)
        action = np.clip(action, -1., 1.)

        return action

    def reset(self):
        self.random_process.reset_states()
        self.action_log.clear()

    def prep_train(self):
        self.policy.train()
        self.critic.train()
        self.policy_target.train()
        self.critic_target.train()

    def prep_eval(self):
        self.policy.eval()
        self.critic.eval()
        self.policy_target.eval()
        self.critic_target.eval()

    def random_action(self):
        return np.random.uniform(low=-1, high=1, size=(self.n_agents, 2))

    def memory(self, s, a, r, s_, done):
        self.replay_buffer.append((s, a, r, s_, done))

        if len(self.replay_buffer) >= self.config.memory_length:
            self.replay_buffer.pop(0)

    def get_batches(self):
        experiences = random.sample(self.replay_buffer, self.config.batch_size)

        state_batches = np.array([_[0] for _ in experiences])
        action_batches = np.array([_[1] for _ in experiences])
        reward_batches = np.array([_[2] for _ in experiences])
        next_state_batches = np.array([_[3] for _ in experiences])
        done_batches = np.array([_[4] for _ in experiences])

        return state_batches, action_batches, reward_batches, next_state_batches, done_batches

    def train(self):

        state_batches, action_batches, reward_batches, next_state_batches, done_batches = self.get_batches(
        )

        state_batches = torch.Tensor(state_batches).to(self.device)
        action_batches = torch.Tensor(action_batches).to(self.device)
        reward_batches = torch.Tensor(reward_batches).reshape(
            self.config.batch_size, self.n_agents, 1).to(self.device)
        next_state_batches = torch.Tensor(next_state_batches).to(self.device)
        done_batches = torch.Tensor(
            (done_batches == False) * 1).reshape(self.config.batch_size,
                                                 self.n_agents,
                                                 1).to(self.device)

        target_next_actions = self.policy_target.forward(next_state_batches)
        target_next_q = self.critic_target.forward(next_state_batches,
                                                   target_next_actions)
        main_q = self.critic(state_batches, action_batches)
        '''
        How to concat each agent's Q value?
        '''
        #target_next_q = target_next_q
        #main_q = main_q.mean(dim=1)
        '''
        Reward Norm
        '''
        # reward_batches = (reward_batches - reward_batches.mean(dim=0)) / reward_batches.std(dim=0) / 1024

        # Critic Loss
        self.critic.zero_grad()
        baselines = reward_batches + done_batches * self.config.gamma * target_next_q
        loss_critic = torch.nn.MSELoss()(main_q, baselines.detach())
        loss_critic.backward()
        torch.nn.utils.clip_grad_norm_(self.critic.parameters(), 0.5)
        self.critic_optimizer.step()

        # Actor Loss
        self.policy.zero_grad()
        clear_action_batches = self.policy.forward(state_batches)
        loss_actor = -self.critic.forward(state_batches,
                                          clear_action_batches).mean()
        loss_actor += (clear_action_batches**2).mean() * 1e-3
        loss_actor.backward()
        torch.nn.utils.clip_grad_norm_(self.policy.parameters(), 0.5)
        self.policy_optimizer.step()

        # This is for logging
        self.c_loss = loss_critic.item()
        self.a_loss = loss_actor.item()

        soft_update(self.policy, self.policy_target, self.config.tau)
        soft_update(self.critic, self.critic_target, self.config.tau)

    def get_loss(self):
        return self.c_loss, self.a_loss

    def get_action_std(self):
        return np.array(self.action_log).std(axis=-1).mean()
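# `hard_update` and `soft_update` are called above but not defined in this snippet; a
# minimal sketch matching the (source, target) argument order used by BiCNet:
def soft_update(source, target, tau):
    # target <- tau * source + (1 - tau) * target
    for src_param, tgt_param in zip(source.parameters(), target.parameters()):
        tgt_param.data.copy_(tau * src_param.data + (1.0 - tau) * tgt_param.data)

def hard_update(source, target):
    soft_update(source, target, tau=1.0)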
Example #30
    def __init__(self, state_size: int, action_size: int, seed: int,
                 n_agent: int):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.n_agent = n_agent
        self.seed = random.seed(seed)
        self.global_step = 0
        self.update_step = 0

        # Initialize actor and critic local and target networks
        self.actor = Actor(state_size,
                           action_size,
                           seed,
                           ACTOR_NETWORK_LINEAR_SIZES,
                           batch_normalization=ACTOR_BATCH_NORM).to(device)
        self.actor_target = Actor(
            state_size,
            action_size,
            seed,
            ACTOR_NETWORK_LINEAR_SIZES,
            batch_normalization=ACTOR_BATCH_NORM).to(device)
        self.critic = Critic(state_size,
                             action_size,
                             seed,
                             CRITIC_NETWORK_LINEAR_SIZES,
                             batch_normalization=CRITIC_BATCH_NORM).to(device)
        self.critic_second = Critic(
            state_size,
            action_size,
            seed,
            CRITIC_SECOND_NETWORK_LINEAR_SIZES,
            batch_normalization=CRITIC_BATCH_NORM).to(device)
        self.critic_second_target = Critic(
            state_size,
            action_size,
            seed,
            CRITIC_SECOND_NETWORK_LINEAR_SIZES,
            batch_normalization=CRITIC_BATCH_NORM).to(device)
        self.critic_target = Critic(
            state_size,
            action_size,
            seed,
            CRITIC_NETWORK_LINEAR_SIZES,
            batch_normalization=CRITIC_BATCH_NORM).to(device)
        self.actor_optimizer = optim.Adam(self.actor.parameters(),
                                          lr=ACTOR_LEARNING_RATE)
        self.critic_optimizer = optim.Adam(self.critic.parameters(),
                                           lr=CRITIC_LEARNING_RATE)
        self.critic_second_optimizer = optim.Adam(
            self.critic_second.parameters(), lr=CRITIC_LEARNING_RATE)
        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = [0] * n_agent
        self.noise = OUNoise(action_size, seed, decay_period=50)

        # Copy parameters from local network to target network
        for target_param, param in zip(self.actor_target.parameters(),
                                       self.actor.parameters()):
            target_param.data.copy_(param.data)
        for target_param, param in zip(self.critic_target.parameters(),
                                       self.critic.parameters()):
            target_param.data.copy_(param.data)
        for target_param, param in zip(self.critic_second_target.parameters(),
                                       self.critic_second.parameters()):
            target_param.data.copy_(param.data)