class DDPGAgent:
    def __init__(self, in_actor, hidden_in_actor, hidden_out_actor, out_actor,
                 in_critic, hidden_in_critic, hidden_out_critic,
                 lr_actor=1.0e-2, lr_critic=1.0e-2):
        super(DDPGAgent, self).__init__()
        self.actor = Network(in_actor, hidden_in_actor, hidden_out_actor, out_actor, actor=True).to(device)
        self.critic = Network(in_critic, hidden_in_critic, hidden_out_critic, 1).to(device)
        self.target_actor = Network(in_actor, hidden_in_actor, hidden_out_actor, out_actor, actor=True).to(device)
        self.target_critic = Network(in_critic, hidden_in_critic, hidden_out_critic, 1).to(device)

        self.noise = OUNoise(out_actor, scale=1.0)

        # initialize targets same as original networks
        hard_update(self.target_actor, self.actor)
        hard_update(self.target_critic, self.critic)

        self.actor_optimizer = Adam(self.actor.parameters(), lr=lr_actor)
        self.critic_optimizer = Adam(self.critic.parameters(), lr=lr_critic, weight_decay=1.e-5)

    def act(self, obs, noise=0.0):
        obs = obs.to(device)
        action = self.actor(obs) + noise * self.noise.noise()
        return action

    def target_act(self, obs, noise=0.0):
        obs = obs.to(device)
        action = self.target_actor(obs) + noise * self.noise.noise()
        return action
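# Several snippets in this collection call hard_update / soft_update without
# defining them. A minimal sketch of these helpers, assuming the convention
# used above (target network passed first, source second); this is illustrative,
# not the helper module of any one repository here.
import torch

def hard_update(target, source):
    """Copy source parameters into target (used once, at initialization)."""
    for target_param, source_param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(source_param.data)

def soft_update(target, source, tau):
    """Polyak averaging: theta_target = tau * theta_source + (1 - tau) * theta_target."""
    for target_param, source_param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(tau * source_param.data + (1.0 - tau) * target_param.data)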
def __init__(self, action_size, state_size, params, device):
    self.batch_size = params.batch_size
    self.buffer_size = params.buffer_size
    self.tau = params.tau
    self.actor_lr = params.actor_lr
    self.critic_lr = params.critic_lr
    self.actor_weight_decay = params.actor_weight_decay
    self.critic_weight_decay = params.critic_weight_decay
    self.gamma = params.gamma
    self.params = params
    self.step_number = 0
    self.device = device
    self.action_size = action_size
    self.state_size = state_size
    self.max_score = 40
    self.current_score = 0
    self.seed = 4

    self.actor_local = ActorNetwork(self.state_size, self.action_size, self.seed).to(device)
    self.actor_target = ActorNetwork(self.state_size, self.action_size, self.seed).to(device)
    self.critic_local = CriticNetwork(state_size, action_size, self.seed, params).to(device)
    self.critic_target = CriticNetwork(state_size, action_size, self.seed, params).to(device)

    self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=self.actor_lr)
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=self.critic_lr,
                                       weight_decay=self.critic_weight_decay)

    self.memory_buffer = PrioritizedMemory(self.buffer_size, self.batch_size, device)
    self.noise = OUNoise((20, self.action_size), self.seed)
def __init__(self, task):
    self.task = task
    self.state_size = task.state_size
    self.action_size = task.action_size
    self.action_low = task.action_low
    self.action_high = task.action_high

    # Actor (Policy) Model
    self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high)
    self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high)

    # Critic (Value) Model
    self.critic_local = Critic(self.state_size, self.action_size)
    self.critic_target = Critic(self.state_size, self.action_size)

    # Initialize target model parameters with local model parameters
    self.critic_target.model.set_weights(self.critic_local.model.get_weights())
    self.actor_target.model.set_weights(self.actor_local.model.get_weights())

    # Noise process
    self.exploration_mu = 0
    self.exploration_theta = 0.15
    self.exploration_sigma = 0.2
    self.noise = OUNoise(self.action_size, self.exploration_mu,
                         self.exploration_theta, self.exploration_sigma)

    # Replay memory
    self.buffer_size = 100000
    self.batch_size = 64
    self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

    # Algorithm parameters
    self.gamma = 0.99  # discount factor
    self.tau = 0.001   # for soft update of target parameters
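# None of the snippets define OUNoise. A minimal Ornstein-Uhlenbeck process
# sketch matching the (size, mu, theta, sigma) signature used directly above;
# note that other snippets in this collection construct it with different
# signatures (e.g. scale=, seed=) and call .noise() instead of .sample().
import copy
import numpy as np

class OUNoise:
    def __init__(self, size, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        """Reset the internal state to the mean."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """dx = theta * (mu - x) + sigma * N(0, 1): temporally correlated noise."""
        dx = self.theta * (self.mu - self.state) + self.sigma * np.random.standard_normal(len(self.state))
        self.state = self.state + dx
        return self.state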
def __init__(self): """Initialize an DDPG Agent object.""" super(DDPGAgent, self).__init__() self.config = Config.getInstance() self.actor = Actor(self.config.state_size, self.config.action_size, self.config.seed).to(self.config.device) self.critic = Critic(self.config.num_agents * self.config.state_size, self.config.num_agents * self.config.action_size, self.config.seed).to(self.config.device) self.target_actor = Actor(self.config.state_size, self.config.action_size, self.config.seed).to(self.config.device) self.target_critic = Critic( self.config.num_agents * self.config.state_size, self.config.num_agents * self.config.action_size, self.config.seed).to(self.config.device) self.noise = OUNoise(self.config.action_size, scale=1.0) # initialize targets same as original networks hard_update(self.target_actor, self.actor) hard_update(self.target_critic, self.critic) self.actor_optimizer = Adam(self.actor.parameters(), lr=self.config.lr_actor) self.critic_optimizer = Adam(self.critic.parameters(), lr=self.config.lr_critic, weight_decay=self.config.weight_decay)
def __init__(self, in_actor, hidden_in_actor, hidden_out_actor, out_actor,
             in_critic_state, hidden_in_critic, hidden_out_critic, critic_input_action_size,
             lr_actor=1.0e-4, lr_critic=3.0e-4):
    super(DDPGAgent, self).__init__()
    self.actor = Actor(in_actor, hidden_in_actor, hidden_out_actor, out_actor).to(device)
    self.critic = Critic(in_critic_state, hidden_in_critic, hidden_out_critic, critic_input_action_size).to(device)
    self.target_actor = Actor(in_actor, hidden_in_actor, hidden_out_actor, out_actor).to(device)
    self.target_critic = Critic(in_critic_state, hidden_in_critic, hidden_out_critic, critic_input_action_size).to(device)

    self.action_size = out_actor
    self.noise = OUNoise(out_actor, scale=1.0)

    # initialize targets same as original networks, one time at the initial step
    hard_update(self.target_actor, self.actor)
    hard_update(self.target_critic, self.critic)

    self.noise_reduction = 1.0

    self.actor_optimizer = Adam(self.actor.parameters(), lr=lr_actor, weight_decay=0)
    self.critic_optimizer = Adam(self.critic.parameters(), lr=lr_critic, weight_decay=0)
def __init__(self, state_size, action_size, random_seed):
    self.state_size = state_size
    self.action_size = action_size
    self.seed = random_seed

    # ------------------ actor ------------------ #
    self.actor_local = Actor(state_size, action_size, random_seed).to(device)
    self.actor_target = Actor(state_size, action_size, random_seed).to(device)

    # ------------------ critic ----------------- #
    self.critic_local = Critic(state_size, action_size, random_seed).to(device)
    self.critic_target = Critic(state_size, action_size, random_seed).to(device)

    # ------------------ optimizers ------------- #
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC)

    # ----------------------- initialize target networks ----------------------- #
    self.soft_update(self.critic_local, self.critic_target, 1)
    self.soft_update(self.actor_local, self.actor_target, 1)

    self.t_step = 0

    # Noise process
    self.noise = OUNoise(action_size, random_seed)

    # Replay Buffer
    self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, device, random_seed)
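# The replay buffer is likewise used but never defined. A minimal sketch,
# assuming the (action_size, buffer_size, batch_size, device, seed) signature
# from the snippet above and the common (state, action, reward, next_state,
# done) experience layout; the actual buffers in these repositories may differ.
import random
from collections import deque, namedtuple
import numpy as np
import torch

class ReplayBuffer:
    def __init__(self, action_size, buffer_size, batch_size, device, seed):
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.device = device
        self.experience = namedtuple("Experience", ["state", "action", "reward", "next_state", "done"])
        random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        """Store one experience tuple."""
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self):
        """Uniformly sample a minibatch and stack it into device tensors."""
        experiences = random.sample(self.memory, k=self.batch_size)
        states = torch.from_numpy(np.vstack([e.state for e in experiences])).float().to(self.device)
        actions = torch.from_numpy(np.vstack([e.action for e in experiences])).float().to(self.device)
        rewards = torch.from_numpy(np.vstack([e.reward for e in experiences])).float().to(self.device)
        next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences])).float().to(self.device)
        dones = torch.from_numpy(np.vstack([e.done for e in experiences]).astype(np.uint8)).float().to(self.device)
        return (states, actions, rewards, next_states, dones)

    def __len__(self):
        return len(self.memory)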
class DDPGAgent:
    def __init__(self, state_size, action_size, seed=0, lr_actor=1.0e-4, lr_critic=1.0e-3):
        super(DDPGAgent, self).__init__()
        self.actor = Actor(state_size, action_size).to(device)
        self.critic = Critic(state_size, action_size, seed=seed).to(device)
        self.target_actor = Actor(state_size, action_size).to(device)
        self.target_critic = Critic(state_size, action_size, seed=seed).to(device)

        self.noise = OUNoise(action_size, scale=1.0)

        # initialize targets same as original networks
        hard_update(self.target_actor, self.actor)
        hard_update(self.target_critic, self.critic)

        self.actor_optimizer = Adam(self.actor.parameters(), lr=lr_actor)
        self.critic_optimizer = Adam(self.critic.parameters(), lr=lr_critic)  # , weight_decay=1.e-5

    def act(self, obs, noise=0.0):
        obs = obs.to(device)
        self.actor.eval()
        action = self.actor(obs).cpu().data.numpy() + noise * self.noise.noise()
        return np.clip(action, -1, 1)

    def target_act(self, obs, noise=0.0):
        obs = obs.to(device)
        action = self.target_actor(obs).cpu().data.numpy() + noise * self.noise.noise()
        return np.clip(action, -1, 1)
def __init__(self, in_actor, hidden_in_actor, hidden_out_actor, out_actor, in_critic,
             hidden_in_critic, hidden_out_critic, seed=0, lr_actor=1.0e-3, lr_critic=1.0e-3):
    self.actor = DDPGNet(in_actor, hidden_in_actor, hidden_out_actor, out_actor, actor=True).to(device)
    self.target_actor = DDPGNet(in_actor, hidden_in_actor, hidden_out_actor, out_actor, actor=True).to(device)
    self.critic = DDPGNet(in_critic, hidden_in_critic, hidden_out_critic, 1, actor=False).to(device)
    self.target_critic = DDPGNet(in_critic, hidden_in_critic, hidden_out_critic, 1, actor=False).to(device)

    self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=lr_actor)
    self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=lr_critic)

    self.memory = ReplayBuffer(out_actor, BUFFER_SIZE, BATCH_SIZE, seed)
    self.noise = OUNoise(out_actor, scale=1.0)
    self.action_size = out_actor
    self.t_step = 0

    self.soft_update(self.actor, self.target_actor, 1.)
    self.soft_update(self.critic, self.target_critic, 1.)
class DDPGAgent:
    def __init__(self, in_actor, out_actor, in_critic, lr_actor=1.0e-4, lr_critic=1.0e-3):
        super(DDPGAgent, self).__init__()
        self.actor = Actor(in_actor, out_actor).to(device)
        self.critic = Critic(in_critic, out_actor * 2).to(device)
        self.target_actor = Actor(in_actor, out_actor).to(device)
        self.target_critic = Critic(in_critic, out_actor * 2).to(device)

        self.noise = OUNoise(out_actor, scale=1.0)

        # initialize targets same as original networks
        hard_update(self.target_actor, self.actor)
        hard_update(self.target_critic, self.critic)

        self.actor_optimizer = Adam(self.actor.parameters(), lr=lr_actor)
        self.critic_optimizer = Adam(self.critic.parameters(), lr=lr_critic)

    def act(self, obs, noise=0.0):
        obs = obs.to(device)
        self.actor.eval()
        with torch.no_grad():
            action = self.actor(obs).cpu().data.numpy() + noise * self.noise.noise()
        self.actor.train()
        return np.clip(action, -1, 1)

    def target_act(self, obs, noise=0.0):
        obs = obs.to(device)
        action = self.target_actor(obs).cpu().data.numpy() + noise * self.noise.noise()
        return np.clip(action, -1, 1)
def __init__(self, state_size, action_size, n_agents, seed):
    self.state_size = state_size
    self.action_size = action_size
    self.seed = random.seed(seed)
    self.stacked_state_size = state_size * n_agents
    self.stacked_action_size = action_size * n_agents

    # Actor networks
    self.actor_local = ActorNetwork(state_size, action_size, seed).to(device)
    self.actor_target = ActorNetwork(state_size, action_size, seed).to(device)
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=ACTOR_LR)

    # Critic networks
    self.critic_local = CriticNetwork(self.stacked_state_size, self.stacked_action_size, seed).to(device)
    self.critic_target = CriticNetwork(self.stacked_state_size, self.stacked_action_size, seed).to(device)
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=CRITIC_LR)

    # OUNoise
    self.exploration_noise = OUNoise(action_size, seed)
def __init__(self, agent_id, state_size, action_size, n_agents, seed):
    """Initialize an Agent object.

    Params
    ======
        agent_id (int): id of this agent
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        n_agents (int): number of agents
        seed (int): random seed
    """
    self.state_size = state_size
    self.action_size = action_size
    self.n_agents = n_agents
    self.seed = random.seed(seed)
    self.agent_id = agent_id

    # DDPG-Network
    self.network = DDPGModel(n_agents, state_size, action_size, seed)
    self.actor_local = self.network.actor_local
    self.actor_target = self.network.actor_target
    self.optimizer_actor = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)
    self.critic_target = self.network.critic_target
    self.critic_local = self.network.critic_local
    self.optimizer_critic = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC)

    # set noise
    self.noise = OUNoise(action_size, seed)
    self.eps = EPS_START
    self.t_step = 0

    # Replay memory
    self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
def __init__(self, in_actor, actor_fc1_units, actor_fc2_units, out_actor,
             in_critic, critic_fc1_units, critic_fc2_units,
             lr_actor, lr_critic, weight_decay_actor, weight_decay_critic):
    super(DDPGAgent, self).__init__()
    self.actor = Actor(in_actor, actor_fc1_units, actor_fc2_units, out_actor).to(device)
    self.critic = Critic(in_critic, critic_fc1_units, critic_fc2_units, 1).to(device)
    self.target_actor = Actor(in_actor, actor_fc1_units, actor_fc2_units, out_actor).to(device)
    self.target_critic = Critic(in_critic, critic_fc1_units, critic_fc2_units, 1).to(device)
    self.target_actor.eval()
    self.target_critic.eval()

    self.noise = OUNoise(out_actor)

    # initialize targets same as original networks
    hard_update(self.target_actor, self.actor)
    hard_update(self.target_critic, self.critic)

    self.actor_optimizer = Adam(self.actor.parameters(), lr=lr_actor, weight_decay=weight_decay_actor)
    self.critic_optimizer = Adam(self.critic.parameters(), lr=lr_critic, weight_decay=weight_decay_critic)
def __init__(self, in_actor, hidden_in_actor, hidden_out_actor, out_actor, lr_actor=1.0e-3):
    super(DDPGAgent, self).__init__()
    self.actor = Network(in_actor, hidden_in_actor, hidden_out_actor, out_actor, actor=True).to(device)
    self.target_actor = Network(in_actor, hidden_in_actor, hidden_out_actor, out_actor, actor=True).to(device)

    self.noise = OUNoise(out_actor, scale=1.0)

    # initialize targets same as original networks
    hard_update(self.target_actor, self.actor)

    self.actor_optimizer = Adam(self.actor.parameters(), lr=lr_actor)
class DDPGAgent:
    def __init__(self, in_actor, out_actor, hidden_in_actor, hidden_out_actor,
                 state_dim_in_critic, action_dim_inp_critic, hidden_in_critic, hidden_out_critic,
                 lr_actor=1.0e-4, lr_critic=1.0e-3):
        super(DDPGAgent, self).__init__()
        self.actor = Actor(in_actor, out_actor, hidden_in_actor, hidden_out_actor).to(device)
        self.critic = Critic(state_dim_in_critic, action_dim_inp_critic, hidden_in_critic, hidden_out_critic).to(device)
        self.target_actor = Actor(in_actor, out_actor, hidden_in_actor, hidden_out_actor).to(device)
        self.target_critic = Critic(state_dim_in_critic, action_dim_inp_critic, hidden_in_critic, hidden_out_critic).to(device)

        self.noise = OUNoise(out_actor, scale=1.0)
        self.tau = TAU

        # initialize targets same as original networks
        hard_update(self.target_actor, self.actor)
        hard_update(self.target_critic, self.critic)

        self.actor_optimizer = Adam(self.actor.parameters(), lr=lr_actor)
        self.critic_optimizer = Adam(self.critic.parameters(), lr=lr_critic, weight_decay=1.e-5)

    def act(self, obs, noise=0.0):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(obs).float().to(device).view(-1, 24)
        self.actor.eval()
        with torch.no_grad():
            action = self.actor(state).cpu().data.numpy()
        self.actor.train()
        add_noise = noise * self.noise.noise()
        action += add_noise.cpu().data.numpy()
        return np.clip(action, -1, 1).reshape(-1)

    def target_act(self, obs, noise=0.0):
        obs = obs.to(device)
        action = self.target_actor(obs) + noise * self.noise.noise()
        return action

    def reset(self):
        self.noise.reset()

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)

    def soft_update_all(self):
        self.soft_update(local_model=self.critic, target_model=self.target_critic, tau=self.tau)
        self.soft_update(local_model=self.actor, target_model=self.target_actor, tau=self.tau)
class DDPGAgent:
    def __init__(self, in_actor, out_actor, n_filt_actor, kernel_size_actor, stride_actor, fc_units_actor,
                 in_critic, n_filt_critic, kernel_size_critic, stride_critic, fc_units_critic,
                 lr_actor=1.0e-3, lr_critic=1.0e-5):  # 1e-5 was getting to 0.4 score (sporadically)
        super(DDPGAgent, self).__init__()
        self.actor = Network(in_actor, out_actor, n_filt_actor, kernel_size_actor, stride_actor,
                             fc_units_actor, actor=True).to(device)
        self.critic = Network(in_critic, 1, n_filt_critic, kernel_size_critic, stride_critic,
                              fc_units_critic).to(device)
        self.target_actor = Network(in_actor, out_actor, n_filt_actor, kernel_size_actor, stride_actor,
                                    fc_units_actor, actor=True).to(device)
        self.target_critic = Network(in_critic, 1, n_filt_critic, kernel_size_critic, stride_critic,
                                     fc_units_critic).to(device)

        self.noise = OUNoise(out_actor, scale=.1)

        # initialize targets same as original networks
        hard_update(self.target_actor, self.actor)
        hard_update(self.target_critic, self.critic)

        self.actor_optimizer = Adam(self.actor.parameters(), lr=lr_actor)
        self.critic_optimizer = Adam(self.critic.parameters(), lr=lr_critic, weight_decay=1e-3)

    def act(self, obs, noise=0.0):
        obs = obs.to(device)
        action = self.actor(obs) + noise * self.noise.noise()
        return action

    def target_act(self, obs, noise=0.0):
        obs = obs.to(device)
        action = self.target_actor(obs) + noise * self.noise.noise()
        return action
class DDPGActor():
    def __init__(self, num_agents, local_obs_dim, local_action_size, global_obs_dim, global_action_size,
                 lr_actor=1.0e-4, random_seed=4, device=device):
        super(DDPGActor, self).__init__()
        self.device = device

        # create actor/target_actor
        self.actor_local = Actor(local_obs_dim, local_action_size, random_seed).to(self.device)
        self.actor_target = Actor(local_obs_dim, local_action_size, random_seed).to(self.device)

        # noise
        self.action_noise = OUNoise(local_action_size, seed=random_seed, theta=0.15, sigma=0.2)
        # self.param_noise = ActorParamNoise(local_obs_dim, local_action_size, random_seed, stddev=0.5).to(self.device)

        # copy parameters to target networks
        hard_update(self.actor_target, self.actor_local)

        # create optimizers
        self.actor_optimizer = Adam(self.actor_local.parameters(), lr=lr_actor)

    def act(self, local_obs, noise_coef, add_noise=True):
        state = torch.from_numpy(local_obs).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.action_noise.sample() * noise_coef
        return np.clip(action, -1, 1)

    def target_act(self, local_obs, noise_coef=0, add_noise=False):
        # local_obs is already a tensor here, unlike in act()
        state = local_obs
        self.actor_target.eval()
        with torch.no_grad():
            action = self.actor_target(state).cpu().data.numpy()
        self.actor_target.train()
        if add_noise:
            action += self.action_noise.sample() * noise_coef
        return np.clip(action, -1, 1)

    def update_target(self, tau):
        soft_update(self.actor_target, self.actor_local, tau)
class DDPGAgent: """Interacts with and learns from the environment using DDPG method.""" def __init__(self): """Initialize an DDPG Agent object.""" super(DDPGAgent, self).__init__() self.config = Config.getInstance() self.actor = Actor(self.config.state_size, self.config.action_size, self.config.seed).to(self.config.device) self.critic = Critic(self.config.num_agents * self.config.state_size, self.config.num_agents * self.config.action_size, self.config.seed).to(self.config.device) self.target_actor = Actor(self.config.state_size, self.config.action_size, self.config.seed).to(self.config.device) self.target_critic = Critic( self.config.num_agents * self.config.state_size, self.config.num_agents * self.config.action_size, self.config.seed).to(self.config.device) self.noise = OUNoise(self.config.action_size, scale=1.0) # initialize targets same as original networks hard_update(self.target_actor, self.actor) hard_update(self.target_critic, self.critic) self.actor_optimizer = Adam(self.actor.parameters(), lr=self.config.lr_actor) self.critic_optimizer = Adam(self.critic.parameters(), lr=self.config.lr_critic, weight_decay=self.config.weight_decay) def act(self, obs, noise_decay_parameter=0.0): """ Returns actions for given state as per current policy for an agent. """ obs = torch.from_numpy(obs).float().to(self.config.device) self.actor.eval() with torch.no_grad(): action = self.actor(obs).cpu().data.numpy() self.actor.train() action += noise_decay_parameter * self.noise.sample() return action def target_act(self, obs, noise_decay_parameter=0.0): """ Returns target network actions from an agent """ obs = obs.to(self.config.device) action = self.target_actor( obs) + noise_decay_parameter * self.noise.sample() return action def reset(self): """Reset the internal state of noise mean(mu)""" self.noise.reset()
class DDPGAgent:
    def __init__(self, state_size, action_size, num_agents, seed=0, lr_actor=1.0e-4, lr_critic=1.0e-3):
        super(DDPGAgent, self).__init__()
        self.actor = networkforall.Actor(state_size, action_size).to(device)
        self.critic = networkforall.Critic(state_size, action_size, num_agents, seed=seed).to(device)
        self.target_actor = networkforall.Actor(state_size, action_size).to(device)
        self.target_critic = networkforall.Critic(state_size, action_size, num_agents, seed=seed).to(device)

        self.noise = OUNoise(action_size, scale=1.0)

        # initialize targets same as original networks
        hard_update(self.target_actor, self.actor)
        hard_update(self.target_critic, self.critic)

        self.actor_optimizer = Adam(self.actor.parameters(), lr=lr_actor)
        self.critic_optimizer = Adam(self.critic.parameters(), lr=lr_critic)

    def act(self, obs, noise=0.0):
        obs = obs.to(device)
        # evaluation mode; avoids the batchnorm error raised on single observations
        self.actor.eval()
        # convert to cpu() since the noise lives on the cpu
        action = self.actor(obs).cpu().data.numpy() + noise * self.noise.noise()
        # np.clip keeps the action between -1 and 1
        return np.clip(action, -1, 1)

    def target_act(self, obs, noise=0.0):
        obs = obs.to(device)
        self.target_actor.eval()
        # convert to cpu() since the noise lives on the cpu
        action = self.target_actor(obs).cpu().data.numpy() + noise * self.noise.noise()
        # np.clip keeps the action between -1 and 1
        return np.clip(action, -1, 1)
def __init__(self, state_size, action_size, hidden_in_dim, hidden_out_dim, extrem_out=64,
             num_agents=2, lr_actor=1.0e-4, lr_critic=1.0e-3):
    super(DDPGAgent, self).__init__()
    critic_state_size = state_size * num_agents
    critic_action_size = action_size * num_agents

    self.actor = Network(state_size, action_size, hidden_in_dim, hidden_out_dim,
                         hidden_extrem_out=extrem_out, actor=True).to(device)
    self.critic = Network(critic_state_size, critic_action_size, hidden_in_dim, hidden_out_dim,
                          hidden_extrem_out=extrem_out).to(device)
    self.target_actor = Network(state_size, action_size, hidden_in_dim, hidden_out_dim,
                                hidden_extrem_out=extrem_out, actor=True).to(device)
    self.target_critic = Network(critic_state_size, critic_action_size, hidden_in_dim, hidden_out_dim,
                                 hidden_extrem_out=extrem_out).to(device)

    self.noise = OUNoise(action_size, scale=1.0)

    # initialize targets same as original networks
    hard_update(self.target_actor, self.actor)
    hard_update(self.target_critic, self.critic)

    self.actor_optimizer = Adam(self.actor.parameters(), lr=lr_actor)
    self.critic_optimizer = Adam(self.critic.parameters(), lr=lr_critic, weight_decay=0)

    print("critic", self.critic, self.target_critic, "optim", self.critic_optimizer)
    print("actor", self.actor, self.target_actor, "optim", self.actor_optimizer)
def __init__(self, state_size, action_size, params, n_agents=2):
    """Initialize an Agent object.

    Params
    ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        params (dict): all parameters
        n_agents (int): number of agents
    """
    self.params = params
    self.state_size = state_size
    self.action_size = action_size
    self.seed = random.seed(params["seed"])

    # Actor Network (w/ Target Network)
    self.actor_local = Actor(state_size, action_size, params).to(device)
    self.actor_target = Actor(state_size, action_size, params).to(device)
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=params["lr_actor"])

    # Critic Network (w/ Target Network)
    # MADDPG uses a centralized critic over all agents' states and actions
    if params["type"] == "MADDPG":
        self.critic_local = Critic(n_agents * state_size, n_agents * action_size, params).to(device)
        self.critic_target = Critic(n_agents * state_size, n_agents * action_size, params).to(device)
    else:
        self.critic_local = Critic(state_size, action_size, params).to(device)
        self.critic_target = Critic(state_size, action_size, params).to(device)
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=params["lr_critic"],
                                       weight_decay=params["weight_decay"])

    # initialize targets same as original networks
    hard_update(self.actor_target, self.actor_local)
    hard_update(self.critic_target, self.critic_local)

    # Noise process
    self.noise = OUNoise(action_size, params["seed"])

    # Replay memory
    self.memory = ReplayBuffer(action_size, self.params["buffer_size"],
                               self.params["batch_size"], params["seed"])
    self.t_step = 0
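# Worked example of the MADDPG critic sizing above, assuming the two-agent
# Tennis-style setup that several snippets in this collection appear to target
# (2 agents, 24-dim observations, 2-dim actions; these dimensions are an
# assumption, not stated by this snippet):
#   critic state input  = n_agents * state_size  = 2 * 24 = 48
#   critic action input = n_agents * action_size = 2 * 2  = 4
# while each decentralized actor still maps its own 24-dim observation to a
# 2-dim action.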
def __init__(self, config):
    """Initialize an Agent object.

    Params
    ======
        config : configuration given a variety of parameters
    """
    self.config = config
    # set parameters for ML
    self.set_parameters(config)
    # actor and critic networks
    self.create_networks()
    # Noise process
    self.noise = OUNoise(self.action_size, self.seed, sigma=self.sigma)
class DDPGAgent: """ Implements the structure of the DDPG Reinforcement Learning algorithm""" def __init__(self, actor_layer_sizes=[24, 128,128,2], critic_layer_sizes=[24, 128,128,1], lr_actor=1e-3, lr_critic=1e-3, clamp_actions=True, logger=None, log_layers=False): super(DDPGAgent, self).__init__() # SET UP ACTOR AND CRITIC NETWORKS self.actor = Network(layer_sizes=actor_layer_sizes, actor=True, logger=logger, log_layers=log_layers).to(device) self.critic = Network(layer_sizes=critic_layer_sizes, logger=logger, log_layers=log_layers).to(device) self.target_actor = Network(layer_sizes=actor_layer_sizes, actor=True, logger=logger, log_layers=log_layers).to(device) self.target_critic = Network(layer_sizes=critic_layer_sizes, logger=logger, log_layers=log_layers).to(device) # INITIALIZE TARGET NETWORKS TO HAVE SAME WEIGHTS AS LOCAL NETWORKS hard_update(self.target_actor, self.actor) hard_update(self.target_critic, self.critic) # OPTIMIZERS self.actor_optimizer = Adam(self.actor.parameters(), lr=lr_actor) self.critic_optimizer = Adam(self.critic.parameters(), lr=lr_critic, weight_decay=0.0) # NOISE - for exploration of actions self.noise = OUNoise(actor_layer_sizes[-1], scale=1.0 ) self.clamp_actions = clamp_actions def act(self, obs, noise=0.0): """ Given a tensor representing the states, it returns the predicted actions the agent should take using the LOCAL network. If `noise` is provided, it adds some random noise to the actions to make the agent explore. """ obs = obs.to(device) action = self.actor(obs) + noise*self.noise.noise() if self.clamp_actions: action = torch.clamp(action, -1.0, 1.0) return action def target_act(self, obs, noise=0.0): """ Given a tensor representing the states, it returns the predicted actions the agent should take using the LOCAL network. If `noise` is provided, it adds some random noise to the actions to make the agent explore. """ obs = obs.to(device) action = self.target_actor(obs) + noise*self.noise.noise() if self.clamp_actions: action = torch.clamp(action, -1.0, 1.0) return action
def __init__(self, in_actor, out_actor, in_critic, lr_actor=1.0e-4, lr_critic=1.0e-3):
    super(DDPGAgent, self).__init__()
    hidden_in_actor = 64
    hidden_out_actor = 128
    hidden_in_critic = hidden_in_actor
    hidden_out_critic = hidden_out_actor

    self.actor = Network(in_actor, hidden_in_actor, hidden_out_actor, out_actor, actor=True).to(device)
    self.critic = Network(in_critic, hidden_in_critic, hidden_out_critic, 1, out_actor, actor=False).to(device)
    self.target_actor = Network(in_actor, hidden_in_actor, hidden_out_actor, out_actor, actor=True).to(device)
    self.target_critic = Network(in_critic, hidden_in_critic, hidden_out_critic, 1, out_actor, actor=False).to(device)

    self.noise = OUNoise(out_actor, scale=0.9)  # scale 1.0
    self.noise_shape = out_actor

    # initialize targets same as original networks
    hard_update(self.target_actor, self.actor)
    hard_update(self.target_critic, self.critic)

    self.actor_optimizer = Adam(self.actor.parameters(), lr=lr_actor)
    WD = 1e-5
    self.critic_optimizer = Adam(self.critic.parameters(), lr=lr_critic, weight_decay=WD)
def __init__(self, state_size, action_size, config):
    """Initialize an agent object."""
    self.state_size = state_size
    self.action_size = action_size
    self.config = config

    # retrieve number of agents
    self.num_agents = config["DDPG"]["num_agents"]

    # logging for this class
    self.logger = logging.getLogger(self.__class__.__name__)

    # gpu support
    self.device = pick_device(config, self.logger)

    ## Actor local and target networks
    self.actor_local = Actor(state_size, action_size, config).to(self.device)
    self.actor_target = Actor(state_size, action_size, config).to(self.device)
    self.actor_optimizer = getattr(optim, config["optimizer_actor"]["optimizer_type"])(
        self.actor_local.parameters(),
        betas=tuple(config["optimizer_actor"]["betas"]),
        **config["optimizer_actor"]["optimizer_params"])

    ## Critic local and target networks
    self.critic_local = Critic(state_size, action_size, config).to(self.device)
    self.critic_target = Critic(state_size, action_size, config).to(self.device)
    self.critic_optimizer = getattr(optim, config["optimizer_critic"]["optimizer_type"])(
        self.critic_local.parameters(),
        betas=tuple(config["optimizer_critic"]["betas"]),
        **config["optimizer_critic"]["optimizer_params"])

    ## Noise process
    self.noise = OUNoise((self.num_agents, action_size))

    ## Replay memory
    self.memory = ReplayBuffer(config=config,
                               action_size=action_size,
                               buffer_size=int(config["DDPG"]["buffer_size"]),
                               batch_size=config["trainer"]["batch_size"])
def __init__(self, agent_id, model, action_size=2, seed=0, tau=1e-3,
             lr_actor=1e-4, lr_critic=1e-3, weight_decay=0.0):
    """
    Params
    ======
        agent_id (int): id of this agent
        model: model object
        action_size (int): dimension of each action
        seed (int): random seed
        tau (float): for soft update of target parameters
        lr_actor (float): learning rate for actor
        lr_critic (float): learning rate for critic
        weight_decay (float): L2 weight decay
    """
    random.seed(seed)
    self.id = agent_id
    self.action_size = action_size
    self.tau = tau
    self.lr_actor = lr_actor
    self.lr_critic = lr_critic

    # Actor Network
    self.actor_local = model.actor_local
    self.actor_target = model.actor_target
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=lr_actor)

    # Critic Network
    self.critic_local = model.critic_local
    self.critic_target = model.critic_target
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=lr_critic,
                                       weight_decay=weight_decay)

    # Set the target actor and critic weights equal to the local ones
    self.hard_update(self.actor_target, self.actor_local)
    self.hard_update(self.critic_target, self.critic_local)

    # Noise process
    self.noise = OUNoise(action_size, seed)
def __init__(self, state_size, action_size, num_agents, random_seed):
    """Initialize an Agent object.

    Params
    ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        num_agents (int): number of agents
        random_seed (int): random seed
    """
    self.state_size = state_size
    self.action_size = action_size
    self.num_agents = num_agents
    self.seed = random.seed(random_seed)
    self.eps = eps_start

    # Actor Network (w/ Target Network)
    self.actor_local = Actor(state_size, action_size, random_seed).to(device)
    self.actor_target = Actor(state_size, action_size, random_seed).to(device)
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

    # Critic Network (w/ Target Network)
    self.critic_local = Critic(state_size, action_size, random_seed).to(device)
    self.critic_target = Critic(state_size, action_size, random_seed).to(device)
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC,
                                       weight_decay=WEIGHT_DECAY)

    # initialize targets same as original networks
    # self.hard_update(self.actor_target, self.actor_local)
    # self.hard_update(self.critic_target, self.critic_local)

    # Noise process
    # self.noise = OUNoise(action_size, random_seed)
    self.noise = OUNoise((num_agents, action_size), random_seed)

    # Replay memory
    self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)

    # Initialize time step (for updating every UPDATE_EVERY steps)
    self.t_step = 0
def __init__(self, num_agents, local_obs_dim, local_action_size, global_obs_dim, global_action_size,
             lr_actor=1.0e-4, lr_critic=1.0e-4, random_seed=4, device=device, weight_decay=0.0):
    super(DDPGAgent, self).__init__()
    self.device = device
    self.weight_decay = weight_decay

    # create actor/target_actor and critic/target_critic
    self.actor = Actor(local_obs_dim, local_action_size, random_seed).to(self.device)
    self.critic = CentralizedCritic(global_obs_dim, global_action_size).to(self.device)
    self.target_actor = Actor(local_obs_dim, local_action_size, random_seed).to(self.device)
    self.target_critic = CentralizedCritic(global_obs_dim, global_action_size).to(self.device)

    # noise
    self.action_noise = OUNoise(local_action_size, scale=1.0, sigma=0.1)
    self.param_noise = ActorParamNoise(local_obs_dim, local_action_size, random_seed, stddev=0.5).to(self.device)
    # self.param_noise_rate = 0.999  # apply this rate to the param noise, gradually get rid of the noise
    # self.use_action_noise = use_action_noise
    # self.use_param_noise = use_param_noise

    # copy parameters to target networks
    hard_update(self.target_actor, self.actor)
    hard_update(self.target_critic, self.critic)

    # create optimizers
    self.actor_optimizer = Adam(self.actor.parameters(), lr=lr_actor)
    self.critic_optimizer = Adam(self.critic.parameters(), lr=lr_critic, weight_decay=self.weight_decay)
def __init__(self, in_actor, hidden_in_actor, hidden_out_actor, out_actor,
             in_critic, hidden_in_critic, hidden_out_critic,
             lr_actor=1.0e-2, lr_critic=1.0e-2, weight_decay=1.0e-5, device='cuda:0'):
    super(DDPGAgent, self).__init__()
    hidden_gat_dim = 64
    self.actor = ActorNetwork(in_actor, hidden_in_actor, hidden_out_actor, out_actor, actor=True).to(device)
    self.critic = CriticNetwork(in_critic, hidden_gat_dim, hidden_in_critic, hidden_out_critic, 1).to(device)
    # print("actor parameters are: " + str(self.count_parameters(self.actor)))
    # print("critic parameters are: " + str(self.count_parameters(self.critic)))
    self.target_actor = ActorNetwork(in_actor, hidden_in_actor, hidden_out_actor, out_actor, actor=True).to(device)
    self.target_critic = CriticNetwork(in_critic, hidden_gat_dim, hidden_in_critic, hidden_out_critic, 1).to(device)

    self.noise = OUNoise(out_actor, scale=1.0)
    self.device = device

    # initialize targets same as original networks
    hard_update(self.target_actor, self.actor)
    hard_update(self.target_critic, self.critic)

    self.actor_optimizer = Adam(self.actor.parameters(), lr=lr_actor)
    self.critic_optimizer = Adam(self.critic.parameters(), lr=lr_critic, weight_decay=weight_decay)
class Ddpg(object):
    def __init__(self, state_dim, action_dim, p_learning_rate=0.0002, q_learning_rate=0.001,
                 gamma=0.9, eta=0.0003, batch_size=64, replay_buffer_size=1024 * 1024,
                 min_train_replays=1024 * 16, logdir='', save_path='', *args, **kwargs):
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.hl1_dim = 250  # hidden layer 1
        self.hl2_dim = 250  # hidden layer 2
        self.batch_size = batch_size
        self.replay_buffer_size = replay_buffer_size
        self.min_train_replays = min_train_replays
        self.noise = OUNoise(action_dim)
        self.time_step = 0
        self.replay_buffer = deque()
        self.gamma = gamma
        self.eta = eta
        self.alpha = self.initial_alpha = 1.0
        self.final_alpha = 0.01
        self.p_learning_rate = p_learning_rate
        self.q_learning_rate = q_learning_rate
        self.save_path = save_path
        self.create_network()
        self.session = tf.InteractiveSession()
        self.session.run(tf.initialize_all_variables())
        self.session.run(self.init_target_theta)
        # self.load()
        self.summary_writer = tf.train.SummaryWriter(logdir, self.session.graph)

    def theta_p(self):
        with tf.variable_scope("theta_p"):
            return [
                tf.Variable(random_init([self.state_dim, self.hl1_dim]), name="W1"),
                tf.Variable(random_init([self.hl1_dim]), name="b1"),
                tf.Variable(random_init([self.hl1_dim, self.hl2_dim]), name="W2"),
                tf.Variable(random_init([self.hl2_dim]), name="b2"),
                tf.Variable(random_init([self.hl2_dim, self.action_dim]), name="W3"),
                tf.Variable(random_init([self.action_dim]), name="b3")
            ]

    def theta_q(self):
        with tf.variable_scope("theta_q"):
            return [
                tf.Variable(random_init([self.state_dim, self.hl1_dim]), name='W1'),
                tf.Variable(random_init([self.hl1_dim]), name='b1'),
                tf.Variable(random_init([self.hl1_dim + self.action_dim, self.hl2_dim]), name='W2'),
                tf.Variable(random_init([self.hl2_dim]), name='b2'),
                tf.Variable(random_init([self.hl2_dim, 1]), name='W3'),
                tf.Variable(random_init([1]), name='b3')
            ]

    def create_policy_network(self, state, theta, name="policy_network"):
        with tf.variable_op_scope([state], name, name):
            h0 = tf.identity(state, "state")
            h1 = tf.nn.relu(tf.matmul(h0, theta[0]) + theta[1], name='h1')
            h2 = tf.nn.relu(tf.matmul(h1, theta[2]) + theta[3], name="h2")
            h3 = tf.identity(tf.matmul(h2, theta[4]) + theta[5], name='h3')
            action = tf.nn.tanh(h3, name='action')
            return action

    def create_q_network(self, state, action, theta, name='q_network'):
        with tf.variable_op_scope([state, action], name, name):
            h0 = tf.identity(state, name='state')
            h1_state = tf.nn.relu(tf.matmul(h0, theta[0]) + theta[1])
            # h1 = concat(h1_state, action)
            h1 = tf.concat(1, [h1_state, action], name="h1")
            h2 = tf.nn.relu(tf.matmul(h1, theta[2]) + theta[3], name="h2")
            h3 = tf.add(tf.matmul(h2, theta[4]), theta[5], name='h3')
            q = tf.squeeze(h3, [1], name='q')
            return q

    def create_network(self):
        theta_q, theta_p = self.theta_q(), self.theta_p()
        target_theta_q, target_theta_p = self.theta_q(), self.theta_p()

        # init target theta with the same values as theta
        init_target_theta_q = [
            target_theta_q[i].assign(theta_q[i].value())
            for i in range(len(theta_q))
        ]
        init_target_theta_p = [
            target_theta_p[i].assign(theta_p[i].value())
            for i in range(len(theta_p))
        ]
        self.init_target_theta = init_target_theta_q + init_target_theta_p

        self.state = tf.placeholder(tf.float32, [None, self.state_dim], 'state')
        self.action = tf.placeholder(tf.float32, [None, self.action_dim], 'action')
        self.next_state = tf.placeholder(tf.float32, [None, self.state_dim], 'next_state')
        self.reward = tf.placeholder(tf.float32, [None], 'reward')
        self.terminate = tf.placeholder(tf.bool, [None], 'terminate')

        # q optimizer
        q = self.create_q_network(self.state, self.action, theta_q)
        next_action = self.create_policy_network(self.next_state, target_theta_p)
        next_q = self.create_q_network(self.next_state, next_action, target_theta_q)
        y_input = tf.stop_gradient(tf.select(self.terminate, self.reward,
                                             self.reward + self.gamma * next_q))
        q_error = tf.reduce_mean(tf.square(y_input - q))
        ## regularize
        q_loss = q_error + tf.add_n([0.01 * tf.nn.l2_loss(var) for var in theta_q])
        q_optimizer = tf.train.AdamOptimizer(self.q_learning_rate)
        grads_and_vars_q = q_optimizer.compute_gradients(q_loss, var_list=theta_q)
        q_train = q_optimizer.apply_gradients(grads_and_vars_q)

        # policy optimizer
        self.action_exploration = self.create_policy_network(self.state, theta_p)
        q1 = self.create_q_network(self.state, self.action_exploration, theta_q)
        p_error = -tf.reduce_mean(q1)
        ## regularize
        p_loss = p_error + tf.add_n([0.01 * tf.nn.l2_loss(var) for var in theta_p])
        p_optimizer = tf.train.AdamOptimizer(self.p_learning_rate)
        grads_and_vars_p = p_optimizer.compute_gradients(p_loss, var_list=theta_p)
        p_train = p_optimizer.apply_gradients(grads_and_vars_p)

        # train q and update target_theta_q
        update_theta_q = [
            target_theta_q[i].assign(theta_q[i].value() * self.eta +
                                     target_theta_q[i].value() * (1 - self.eta))
            for i in range(len(theta_q))
        ]
        with tf.control_dependencies([q_train]):
            self.train_q = tf.group(*update_theta_q)

        # train p and update target_theta_p
        update_theta_p = [
            target_theta_p[i].assign(theta_p[i].value() * self.eta +
                                     target_theta_p[i].value() * (1 - self.eta))
            for i in range(len(theta_p))
        ]
        with tf.control_dependencies([p_train]):
            self.train_p = tf.group(*update_theta_p)

        # summary
        tf.scalar_summary('q_loss', q_loss)
        tf.scalar_summary('p_loss', p_loss)
        self.merged_op = tf.merge_all_summaries()

    def train(self):
        minibatch = random.sample(self.replay_buffer, self.batch_size)
        state_batch = [v[0] for v in minibatch]
        action_batch = [v[1] for v in minibatch]
        reward_batch = [v[2] for v in minibatch]
        next_state_batch = [v[3] for v in minibatch]
        terminate_batch = [v[4] for v in minibatch]
        _, _, summary_str = self.session.run(
            [self.train_p, self.train_q, self.merged_op],
            feed_dict={
                self.state: state_batch,
                self.action: action_batch,
                self.reward: reward_batch,
                self.terminate: terminate_batch,
                self.next_state: next_state_batch
            })
        self.summary_writer.add_summary(summary_str, self.time_step)
        self.summary_writer.flush()
        if self.time_step % 1000 == 0:
            self.save(self.time_step)

    def observe_action(self, state, action, reward, next_state, terminate):
        self.time_step += 1
        self.replay_buffer.append((state, action, reward, next_state, terminate))
        if len(self.replay_buffer) > self.replay_buffer_size:
            self.replay_buffer.popleft()
        if self.time_step > self.min_train_replays:
            self.train()
        if terminate:
            self.noise.reset()

    def exploration(self, state):
        action = self.session.run(self.action_exploration, feed_dict={self.state: [state]})[0]
        return np.clip(action, -1, 1)

    def exploration_with_noise(self, state):
        action = self.session.run(self.action_exploration, feed_dict={self.state: [state]})[0]
        self.alpha -= (self.initial_alpha - self.final_alpha) / 100000
        self.alpha = max(self.alpha, 0.0)
        noise = self.noise.noise() * self.alpha
        return np.clip(action + noise, -1, 1)

    def save(self, step):
        saver = tf.train.Saver()
        saver.save(self.session, save_path=self.save_path, global_step=step)

    def load(self):
        saver = tf.train.Saver()
        checkpoint = tf.train.get_checkpoint_state(self.save_path)
        if checkpoint and checkpoint.model_checkpoint_path:
            saver.restore(self.session, checkpoint.model_checkpoint_path)
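# For comparison with the TensorFlow graph above: the same DDPG update step
# (Bellman target, critic regression, deterministic policy gradient, Polyak
# averaging), sketched in PyTorch using the actor_local/actor_target naming of
# the earlier snippets and the soft_update helper sketched near the top. This
# is an illustrative composite under those assumptions, not the learn() method
# of any one agent in this collection.
import torch
import torch.nn.functional as F

def learn(agent, experiences, gamma=0.99, tau=1e-3):
    states, actions, rewards, next_states, dones = experiences

    # ---- critic update: y = r + gamma * (1 - done) * Q'(s', mu'(s')) ----
    with torch.no_grad():
        next_actions = agent.actor_target(next_states)
        q_targets_next = agent.critic_target(next_states, next_actions)
        q_targets = rewards + gamma * (1 - dones) * q_targets_next
    q_expected = agent.critic_local(states, actions)
    critic_loss = F.mse_loss(q_expected, q_targets)
    agent.critic_optimizer.zero_grad()
    critic_loss.backward()
    agent.critic_optimizer.step()

    # ---- actor update: maximize Q(s, mu(s)) by minimizing its negative ----
    actions_pred = agent.actor_local(states)
    actor_loss = -agent.critic_local(states, actions_pred).mean()
    agent.actor_optimizer.zero_grad()
    actor_loss.backward()
    agent.actor_optimizer.step()

    # ---- Polyak averaging of the target networks (soft_update from above) ----
    soft_update(agent.critic_target, agent.critic_local, tau)
    soft_update(agent.actor_target, agent.actor_local, tau)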