def __init__(self, action_dim, state_dim, agentParam, useLaw, useCenCritc, num_agent, CNN=False, width=None, height=None, channel=None):
    self.CNN = CNN
    self.device = agentParam["device"]
    if CNN:
        self.CNN_preprocessA = CNN_preprocess(width, height, channel)
        self.CNN_preprocessC = CNN_preprocess(width, height, channel)
        state_dim = self.CNN_preprocessA.get_state_dim()
    # if agentParam["ifload"]:
    #     self.actor = torch.load(agentParam["filename"]+"actor_"+agentParam["id"]+".pth", map_location=torch.device('cuda'))
    #     self.critic = torch.load(agentParam["filename"]+"critic_"+agentParam["id"]+".pth", map_location=torch.device('cuda'))
    # else:
    if useLaw:
        self.actor = ActorLaw(action_dim, state_dim).to(self.device)
    else:
        self.actor = Actor(action_dim, state_dim).to(self.device)
    if useCenCritc:
        self.critic = Centralised_Critic(state_dim, num_agent).to(self.device)
    else:
        self.critic = Critic(state_dim).to(self.device)
    self.action_dim = action_dim
    self.state_dim = state_dim
    self.noise_epsilon = 0.99
    self.constant_decay = 0.1
    self.optimizerA = torch.optim.Adam(self.actor.parameters(), lr=0.001)
    self.optimizerC = torch.optim.Adam(self.critic.parameters(), lr=0.001)
    self.lr_scheduler = {"optA": torch.optim.lr_scheduler.StepLR(self.optimizerA, step_size=1000, gamma=0.9, last_epoch=-1),
                         "optC": torch.optim.lr_scheduler.StepLR(self.optimizerC, step_size=1000, gamma=0.9, last_epoch=-1)}
    if CNN:
        # self.CNN_preprocessA = CNN_preprocess(width,height,channel)
        # self.CNN_preprocessC = CNN_preprocess
        self.optimizerA = torch.optim.Adam(itertools.chain(self.CNN_preprocessA.parameters(), self.actor.parameters()), lr=0.0001)
        self.optimizerC = torch.optim.Adam(itertools.chain(self.CNN_preprocessC.parameters(), self.critic.parameters()), lr=0.001)
        self.lr_scheduler = {"optA": torch.optim.lr_scheduler.StepLR(self.optimizerA, step_size=10000, gamma=0.9, last_epoch=-1),
                             "optC": torch.optim.lr_scheduler.StepLR(self.optimizerC, step_size=10000, gamma=0.9, last_epoch=-1)}
def __init__(self, state_dim, action_dim, max_action, agent_n, logger):
    # Neural networks kept on the GPU
    self.actor = Actor(state_dim, action_dim, max_action).to(self.device)  # origin_network
    self.actor_target = Actor(state_dim, action_dim, max_action).to(self.device)  # target_network
    self.actor_target.load_state_dict(self.actor.state_dict())  # initiate actor_target with actor's parameters
    # In PyTorch a tensor's requires_grad attribute defaults to False, i.e. it takes no part in gradient
    # propagation; in particular, the model parameters handed to an optimizer are the ones updated by it.
    self.actor_optimizer = optim.Adam(self.actor.parameters(), pdata.LEARNING_RATE)  # optimize the actor's parameters with learning rate pdata.LEARNING_RATE

    self.critic = CriticCentral(agent_n).to(self.device)
    self.critic_target = CriticCentral(agent_n).to(self.device)
    self.critic_target.load_state_dict(self.critic.state_dict())
    self.critic_optimizer = optim.Adam(self.critic.parameters(), pdata.LEARNING_RATE)

    # self.replay_buffer removed: each agent no longer keeps its own replay buffer
    self.writer = SummaryWriter(pdata.DIRECTORY + 'runs')
    self.num_critic_update_iteration = 0
    self.num_actor_update_iteration = 0
    self.num_training = 0
    self.logger = logger
def __init__(self, state_size=24, action_size=2, seed=1, num_agents=2):
    self.state_size = state_size
    self.action_size = action_size
    self.seed = random.seed(seed)
    self.num_agents = num_agents

    # DDPG specific configuration
    hidden_size = 512
    self.CHECKPOINT_FOLDER = './'

    # Defining networks
    self.actor = Actor(state_size, hidden_size, action_size).to(device)
    self.actor_target = Actor(state_size, hidden_size, action_size).to(device)
    self.critic = Critic(state_size, self.action_size, hidden_size, 1).to(device)
    self.critic_target = Critic(state_size, self.action_size, hidden_size, 1).to(device)
    self.optimizer_actor = optim.Adam(self.actor.parameters(), lr=ACTOR_LR)
    self.optimizer_critic = optim.Adam(self.critic.parameters(), lr=CRITIC_LR)

    # Noise
    self.noises = OUNoise((num_agents, action_size), seed)

    # Initialize replay buffer
    self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
def __init__(self, state_size, action_size, random_seed):
    """Initialize an Agent object.

    Params
    ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        random_seed (int): random seed
    """
    self.state_size = state_size
    self.action_size = action_size
    self.seed = random.seed(random_seed)
    self.t_step = 0

    # Actor Network (w/ Target Network)
    self.actor_local = Actor(state_size, action_size, random_seed).to(device)
    self.actor_target = Actor(state_size, action_size, random_seed).to(device)
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

    # Critic Network (w/ Target Network)
    self.critic_local = Critic(state_size, action_size, random_seed).to(device)
    self.critic_target = Critic(state_size, action_size, random_seed).to(device)
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

    # Noise process
    self.noise = OUNoise(action_size, random_seed)

    # Replay memory
    self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)
def __init__(self, s_dim, a_dim, **kwargs):
    self.s_dim = s_dim
    self.a_dim = a_dim
    self.config = kwargs['config']
    self.device = 'cuda' if self.config.use_cuda else 'cpu'
    self.actor = Actor(s_dim, a_dim)
    self.actor_target = Actor(s_dim, a_dim)
    self.critic = Critic(s_dim, a_dim, 1)
    self.critic_target = Critic(s_dim, a_dim, 1)
    self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=self.config.a_lr)
    self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=self.config.c_lr)
    self.c_loss = 0
    self.a_loss = 0

    if self.config.use_cuda:
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic.cuda()
        self.critic_target.cuda()

    hard_update(self.actor, self.actor_target)
    hard_update(self.critic, self.critic_target)

    self.random_process = OrnsteinUhlenbeckProcess(
        size=self.a_dim, theta=self.config.ou_theta, mu=self.config.ou_mu, sigma=self.config.ou_sigma)
    self.replay_buffer = list()
    self.epsilon = 1.
    self.depsilon = self.epsilon / self.config.epsilon_decay
class DDPG():
    """ Individual Agent. """

    def __init__(self, state_size, action_size, params):
        """ Build the models and the random process, and initialize them. """
        torch.manual_seed(params['SEED'])
        self.random_process = OUNoise(action_size, params)

        self.local_actor = Actor(state_size, action_size, params['SEED'], params['FC1'], params['FC2']).to(device)
        self.target_actor = Actor(state_size, action_size, params['SEED'], params['FC1'], params['FC2']).to(device)
        # Initialize target network weights with the local network
        self.hard_copy(self.local_actor, self.target_actor)
        # Optimizer for the local actor network
        self.actor_optimizer = torch.optim.Adam(self.local_actor.parameters(), params['LR_ACTOR'])

        self.local_critic = Critic(state_size, action_size, params['SEED'], params['FC1'], params['FC2']).to(device)
        self.target_critic = Critic(state_size, action_size, params['SEED'], params['FC1'], params['FC2']).to(device)
        # Initialize target network weights with the local network
        self.hard_copy(self.local_critic, self.target_critic)
        # Optimizer for the local critic network
        self.critic_optimizer = torch.optim.Adam(self.local_critic.parameters(), params['LR_CRITIC'])

    def reset_noise(self):
        """ Reset the noise state every episode. """
        self.random_process.reset()

    def hard_copy(self, local, target):
        """ Hard-copy the weights of the local network into the target network. """
        for local_param, target_param in zip(local.parameters(), target.parameters()):
            target_param.data.copy_(local_param.data)

    def soft_copy(self, tau):
        """ Soft-update the target networks: θ_target = τ*θ_local + (1 - τ)*θ_target """
        for local_param, target_param in zip(self.local_actor.parameters(), self.target_actor.parameters()):
            target_param.data.copy_(tau * local_param.data + (1 - tau) * target_param.data)
        for local_param, target_param in zip(self.local_critic.parameters(), self.target_critic.parameters()):
            target_param.data.copy_(tau * local_param.data + (1 - tau) * target_param.data)
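# Note: OUNoise is referenced by most of the agents in this section but is not defined here. The
# sketch below is a minimal, typical Ornstein-Uhlenbeck noise helper added for reference; the class
# name, constructor signature and defaults (mu, theta, sigma) are assumptions and may differ from
# the actual implementation these agents import.
import copy

import numpy as np


class OUNoiseSketch:
    """Ornstein-Uhlenbeck process: temporally correlated exploration noise."""

    def __init__(self, size, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        """Reset the internal state to the long-running mean."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """dx = theta * (mu - x) + sigma * N(0, 1); returns the updated state."""
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.standard_normal(self.mu.shape)
        self.state = x + dx
        return self.state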
def __init__(self, action_dim, state_dim, CNN=False, width=None, height=None, channel=None, device='cpu'):
    self.CNN = CNN
    if CNN:
        self.CNN_preprocessA = CNN_preprocess(width, height, channel)
        self.CNN_preprocessC = CNN_preprocess(width, height, channel)
        state_dim = self.CNN_preprocessA.get_state_dim()
    self.device = device
    self.actor = Actor(action_dim, state_dim)
    self.critic = Critic(state_dim)
    self.action_dim = action_dim
    self.state_dim = state_dim
    self.noise_epsilon = 0.999
    self.constant_decay = 1
    self.optimizerA = torch.optim.Adam(self.actor.parameters(), lr=0.00001)
    self.optimizerC = torch.optim.Adam(self.critic.parameters(), lr=0.01)
    self.lr_scheduler = {
        "optA": torch.optim.lr_scheduler.StepLR(self.optimizerA, step_size=1000, gamma=1, last_epoch=-1),
        "optC": torch.optim.lr_scheduler.StepLR(self.optimizerC, step_size=1000, gamma=0.9, last_epoch=-1)
    }
    if CNN:
        # self.CNN_preprocessA = CNN_preprocess(width,height,channel)
        # self.CNN_preprocessC = CNN_preprocess
        self.optimizerA = torch.optim.Adam(itertools.chain(
            self.CNN_preprocessA.parameters(), self.actor.parameters()), lr=0.0001)
        self.optimizerC = torch.optim.Adam(itertools.chain(
            self.CNN_preprocessC.parameters(), self.critic.parameters()), lr=0.001)
        self.lr_scheduler = {
            "optA": torch.optim.lr_scheduler.StepLR(self.optimizerA, step_size=10000, gamma=1, last_epoch=-1),
            "optC": torch.optim.lr_scheduler.StepLR(self.optimizerC, step_size=10000, gamma=0.9, last_epoch=-1)
        }
def build_agent(self):
    # build the actor-critic networks and their target networks
    self.actor = Actor(self.state_dim, self.action_dim, self.l1_dim, self.l2_dim, self.alpha)
    self.target_actor = copy.deepcopy(self.actor)
    self.critic = Critic(self.state_dim, self.action_dim, self.l1_dim, self.l2_dim, self.beta)
    self.target_critic = copy.deepcopy(self.critic)

    # build the replay buffer
    self.replaybuffer = ReplayBuffer(self.max_replay_size, self.state_dim, self.action_dim)

    # build the OUNoise used for action selection
    self.noise = OUNoise(self.action_dim)
def main():
    ip_port = ('127.0.0.1', 9999)
    s = socket.socket()
    s.bind(ip_port)
    s.listen()

    stuck = 0
    time = 0
    pre = preprocess()

    OUTPUT_GRAPH = False
    MAX_EPISODE = 500
    MAX_EP_STEPS = 2000   # maximum time steps in one episode
    RENDER = False        # rendering wastes time
    GAMMA = 0.9           # reward discount in TD error
    LR_A = 0.001          # learning rate for actor
    LR_C = 0.01           # learning rate for critic
    N_F = 26
    N_A = 4

    sess = tf.Session()
    actor = Actor(sess, observation_dim=N_F, action_dim=N_A, lr=LR_A)
    critic = Critic(sess, observation_dim=N_F, lr=LR_C)
    sess.run(tf.global_variables_initializer())

    while True:
        conn, addr = s.accept()
        # epsilon -= 1.0 / explore
        recv_data = conn.recv(1024)
        if not recv_data:
            break
        pos_info = unpack('51f', recv_data)
        r, info, stuck = pre.function(pos_info, time, stuck)
        state = list(info)
        state.append(stuck)
        state = np.array(state)

        while True:
            # try:
            action = actor.act(state)
            conn.send(pack('4f', action, 1, 0, 1.0))
            recv_data = conn.recv(1024)
            if not recv_data:
                break
            pos_info = unpack('51f', recv_data)
            r, info, stuck = pre.function(pos_info, time, stuck)  # steer, acc
            state_ = list(info)
            state_.append(stuck)
            state_ = np.array(state_)
            state = state_
class Predator:
    def __init__(self, s_dim, a_dim, num_agent, **kwargs):
        self.s_dim = s_dim
        self.a_dim = a_dim
        self.config = kwargs['config']
        self.num_agent = num_agent
        self.actor = Actor(s_dim, a_dim)
        self.actor_target = Actor(s_dim, a_dim)
        self.critic = Critic(s_dim, a_dim, num_agent)
        self.critic_target = Critic(s_dim, a_dim, num_agent)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=self.config.a_lr)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=self.config.c_lr)
        self.a_loss = 0
        self.c_loss = 0

        if self.config.use_cuda:
            self.actor.cuda()
            self.actor_target.cuda()
            self.critic.cuda()
            self.critic_target.cuda()

        hard_update(self.actor, self.actor_target)
        hard_update(self.critic, self.critic_target)

        self.random_process = OrnsteinUhlenbeckProcess(
            size=self.a_dim, theta=self.config.ou_theta, mu=self.config.ou_mu, sigma=self.config.ou_sigma)

    def get_batches(self):
        experiences = random.sample(self.replay_buffer, self.batch_size)

        state_batches = np.array([_[0] for _ in experiences])
        action_batches = np.array([_[1] for _ in experiences])
        reward_batches = np.array([_[2] for _ in experiences])
        next_state_batches = np.array([_[3] for _ in experiences])
        done_batches = np.array([_[4] for _ in experiences])

        return state_batches, action_batches, reward_batches, next_state_batches, done_batches

    def random_action(self):
        action = np.random.uniform(low=-1., high=1., size=(self.num_agent, self.a_dim))
        return action

    def reset(self):
        self.random_process.reset_states()
def __init__(self, num_in_pol, num_out_pol, num_in_critic, hidden_dim_actor=120, hidden_dim_critic=64,
             lr_actor=0.01, lr_critic=0.01, batch_size=64, max_episode_len=100, tau=0.02, gamma=0.99,
             agent_name='one', discrete_action=False):
    """
    Inputs:
        num_in_pol (int): number of dimensions for policy input
        num_out_pol (int): number of dimensions for policy output
        num_in_critic (int): number of dimensions for critic input
    """
    self.policy = Actor(num_in_pol, num_out_pol, hidden_dim=hidden_dim_actor, discrete_action=discrete_action)
    self.critic = Critic(num_in_pol, 1, num_out_pol, hidden_dim=hidden_dim_critic)
    self.target_policy = Actor(num_in_pol, num_out_pol, hidden_dim=hidden_dim_actor, discrete_action=discrete_action)
    self.target_critic = Critic(num_in_pol, 1, num_out_pol, hidden_dim=hidden_dim_critic)
    hard_update(self.target_policy, self.policy)
    hard_update(self.target_critic, self.critic)
    self.policy_optimizer = Adam(self.policy.parameters(), lr=lr_actor)
    self.critic_optimizer = Adam(self.critic.parameters(), lr=lr_critic, weight_decay=0)
    self.policy = self.policy.float()
    self.critic = self.critic.float()
    self.target_policy = self.target_policy.float()
    self.target_critic = self.target_critic.float()
    self.agent_name = agent_name
    self.gamma = gamma
    self.tau = tau
    self.batch_size = batch_size
    # self.replay_buffer = ReplayBuffer(1e7)
    self.replay_buffer = ReplayBufferOption(500000, self.batch_size, 12)
    self.max_replay_buffer_len = batch_size * max_episode_len
    self.replay_sample_index = None
    self.niter = 0
    self.eps = 5.0
    self.eps_decay = 1 / (250 * 5)
    self.exploration = OUNoise(num_out_pol)
    self.discrete_action = discrete_action
    self.num_history = 2
    self.states = []
    self.actions = []
    self.rewards = []
    self.next_states = []
    self.dones = []
def __init__(self, action_size, state_size, shared_replay_buffer, memory):
    optimizer_fn = lambda params: torch.optim.Adam(params, lr=1e-4)
    noise_fn = lambda: OUNoise(action_size, SEED)
    memory_fn = lambda: ReplayBuffer(action_size, int(1e6), BATCH_SIZE, SEED, DEVICE)
    actor_network_fn = lambda: Actor(action_size, state_size, (256, 128), SEED).to(DEVICE)
    critic_network_fn = lambda: Critic(action_size, state_size, (256, 128), SEED).to(DEVICE)

    self.seed = SEED
    self.actor_local = actor_network_fn()
    self.actor_target = actor_network_fn()
    self.actor_optimizer = optimizer_fn(self.actor_local.parameters())
    self.critic_local = critic_network_fn()
    self.critic_target = critic_network_fn()
    self.critic_optimizer = optimizer_fn(self.critic_local.parameters())

    # Start the targets as exact copies of the local networks (soft update with tau=1)
    self.soft_update(self.critic_local, self.critic_target, 1)
    self.soft_update(self.actor_local, self.actor_target, 1)

    self.noise = noise_fn()
    if shared_replay_buffer:
        self.memory = memory
    else:
        self.memory = memory_fn()
class IAC():
    def __init__(self, action_dim, state_dim):
        self.actor = Actor(action_dim, state_dim)
        self.critic = Critic(state_dim)
        self.action_dim = action_dim
        self.state_dim = state_dim
        self.optimizerA = torch.optim.Adam(self.actor.parameters(), lr=0.001)
        self.optimizerC = torch.optim.Adam(self.critic.parameters(), lr=0.01)
        self.lr_scheduler = {
            "optA": torch.optim.lr_scheduler.StepLR(self.optimizerA, step_size=1000, gamma=0.9, last_epoch=-1),
            "optC": torch.optim.lr_scheduler.StepLR(self.optimizerC, step_size=1000, gamma=0.9, last_epoch=-1)
        }
        # self.act_prob
        # self.act_log_prob

    def choose_action(self, s):
        s = torch.Tensor(s).squeeze(0)
        self.act_prob = self.actor(s) + 0.00001
        m = torch.distributions.Categorical(self.act_prob)
        # self.act_log_prob = m.log_prob(m.sample())
        temp = m.sample()
        return temp

    def cal_tderr(self, s, r, s_):
        s = torch.Tensor(s).unsqueeze(0)
        s_ = torch.Tensor(s_).unsqueeze(0)
        v_ = self.critic(s_).detach()
        v = self.critic(s)
        return r + 0.9 * v_ - v

    def learnCritic(self, s, r, s_):
        td_err = self.cal_tderr(s, r, s_)
        loss = torch.mul(td_err, td_err)  # torch.square(td_err)
        self.optimizerC.zero_grad()
        loss.backward()
        self.optimizerC.step()
        self.lr_scheduler["optC"].step()

    def learnActor(self, s, r, s_, a):
        td_err = self.cal_tderr(s, r, s_)
        m = torch.log(self.act_prob[a])
        temp = m * td_err.detach()
        loss = -torch.mean(temp)
        self.optimizerA.zero_grad()
        loss.backward()
        self.optimizerA.step()
        self.lr_scheduler["optA"].step()

    def update(self, s, r, s_, a):
        self.learnCritic(s, r, s_)
        self.learnActor(s, r, s_, a)
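# Note: a minimal usage sketch for the IAC agent above, assuming a classic-gym environment with a
# discrete action space (4-tuple step return), an Actor that outputs a probability vector over
# actions and a Critic that outputs a scalar state value. The environment name is a placeholder.
import gym

env = gym.make("CartPole-v1")
agent = IAC(action_dim=env.action_space.n, state_dim=env.observation_space.shape[0])

s = env.reset()
for t in range(200):
    a = agent.choose_action(s).item()   # sample an action index from the current policy
    s_, r, done, _ = env.step(a)
    agent.update(s, r, s_, a)           # one critic step (TD error) followed by one actor step
    s = s_
    if done:
        break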
def __init__(self, gamma, tau, num_inputs, action_space, replay_size, normalize_obs=True,
             normalize_returns=False, critic_l2_reg=1e-2, num_outputs=1, entropy_coeff=0.1,
             action_coeff=0.1):
    super(DDPG, self).__init__(gamma=gamma, tau=tau, num_inputs=num_inputs, action_space=action_space,
                               replay_size=replay_size, normalize_obs=normalize_obs,
                               normalize_returns=normalize_returns)
    self.num_outputs = num_outputs
    self.entropy_coeff = entropy_coeff
    self.action_coeff = action_coeff
    self.critic_l2_reg = critic_l2_reg

    self.actor = Actor(self.num_inputs, self.action_space, self.num_outputs).to(self.device)
    self.actor_target = Actor(self.num_inputs, self.action_space, self.num_outputs).to(self.device)
    self.actor_perturbed = Actor(self.num_inputs, self.action_space, self.num_outputs).to(self.device)
    self.actor_optim = Adam(self.actor.parameters(), lr=1e-4)

    self.critic = Critic(self.num_inputs + self.action_space.shape[0]).to(self.device)
    self.critic_target = Critic(self.num_inputs + self.action_space.shape[0]).to(self.device)
    self.critic_optim = Adam(self.critic.parameters(), lr=1e-3, weight_decay=critic_l2_reg)

    hard_update(self.actor_target, self.actor)  # Make sure the target starts with the same weights
    hard_update(self.critic_target, self.critic)
class DDPGAgent:
    def __init__(self, action_size, action_type, state_size, hidden_in_size, hidden_out_size, num_atoms,
                 lr_actor, lr_critic, l2_decay, noise_type, OU_mu, OU_theta, OU_sigma):
        super(DDPGAgent, self).__init__()
        # creating actors, critics and targets using the specified layer sizes.
        # Note for the critics we assume 2 agents
        self.actor = Actor(action_size, state_size, hidden_in_size, hidden_out_size, action_type).to(device)
        self.critic = Critic(2 * action_size, 2 * state_size, hidden_in_size, hidden_out_size, num_atoms).to(device)
        self.target_actor = Actor(action_size, state_size, hidden_in_size, hidden_out_size, action_type).to(device)
        self.target_critic = Critic(2 * action_size, 2 * state_size, hidden_in_size, hidden_out_size, num_atoms).to(device)
        self.noise_type = noise_type
        self.action_type = action_type
        if noise_type == 'OUNoise':
            # if we're using OUNoise it needs to be initialised as it is an autocorrelated process
            self.noise = OUNoise(action_size, OU_mu, OU_theta, OU_sigma)

        # initialize targets same as original networks
        hard_update(self.target_actor, self.actor)
        hard_update(self.target_critic, self.critic)

        # initialize optimisers using specified learning rates
        self.actor_optimizer = Adam(self.actor.parameters(), lr=lr_actor, weight_decay=l2_decay)
        self.critic_optimizer = Adam(self.critic.parameters(), lr=lr_critic, weight_decay=l2_decay)

    def act(self, obs, noise_scale=0.0):
        obs = obs.to(device)
        action = self.actor(obs)
        if noise_scale == 0.0:
            # if no noise then just return action as is
            pass
        elif self.noise_type == 'OUNoise':
            noise = 1.5 * self.noise.noise()
            action += noise_scale * (noise - action)
            action = torch.clamp(action, -1, 1)
        elif self.noise_type == 'BetaNoise':
            action = BetaNoise(action, noise_scale)
        elif self.noise_type == 'GaussNoise':
            action = GaussNoise(action, noise_scale)
        elif self.noise_type == 'WeightedNoise':
            action = WeightedNoise(action, noise_scale, self.action_type)
        return action

    # target actor is only used for updates, not exploration, and so has no noise
    def target_act(self, obs):
        obs = obs.to(device)
        action = self.target_actor(obs)
        return action
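# Note: hard_update and soft_update are called by several of the agents in this section but are not
# defined here. The sketch below shows the usual form of these helpers; it assumes the
# (target, source) argument order, while some callers above appear to pass (source, target), so the
# real helper's signature should be checked against its call sites.
def hard_update_sketch(target, source):
    """Copy every parameter of `source` into `target`."""
    for target_param, source_param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(source_param.data)


def soft_update_sketch(target, source, tau):
    """Polyak averaging: theta_target <- tau * theta_source + (1 - tau) * theta_target."""
    for target_param, source_param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(tau * source_param.data + (1.0 - tau) * target_param.data)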
def __init__(self, n_states, n_actions):
    # hyperparameters
    self.replay_size = 1000000
    self.experience_replay = deque(maxlen=self.replay_size)
    self.n_actions = n_actions
    self.n_states = n_states
    self.lr = 0.0003
    self.batch_size = 128
    self.gamma = 0.99
    self.H = -2
    self.Tau = 0.01

    # actor network
    self.actor = Actor(n_states=n_states, n_actions=n_actions).to(DEVICE)

    # dual critic networks, with corresponding targets
    self.critic = Critic(n_states=n_states, n_actions=n_actions).to(DEVICE)
    self.critic2 = Critic(n_states=n_states, n_actions=n_actions).to(DEVICE)
    self.target_critic = Critic(n_states=n_states, n_actions=n_actions).to(DEVICE)
    self.target_critic2 = Critic(n_states=n_states, n_actions=n_actions).to(DEVICE)

    # make the target critics start off the same as the main networks
    for target_param, local_param in zip(self.target_critic.parameters(), self.critic.parameters()):
        target_param.data.copy_(local_param)
    for target_param, local_param in zip(self.target_critic2.parameters(), self.critic2.parameters()):
        target_param.data.copy_(local_param)

    # temperature variable
    self.log_alpha = torch.tensor(0.0, device=DEVICE, requires_grad=True)
    self.optim_alpha = Adam(params=[self.log_alpha], lr=self.lr)
    self.alpha = 0.2

    self.optim_actor = Adam(params=self.actor.parameters(), lr=self.lr)
    self.optim_critic = Adam(params=self.critic.parameters(), lr=self.lr)
    self.optim_critic_2 = Adam(params=self.critic2.parameters(), lr=self.lr)
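# Note: the constructor above only sets up the learnable temperature (log_alpha), the entropy target
# H = -2 and their optimizer; the corresponding update is not part of this snippet. The usual SAC
# automatic-temperature step looks roughly like the sketch below, where `log_prob` is the
# log-probability of the action freshly sampled from the actor. This is an assumption about how the
# rest of the agent uses these fields, not its actual implementation.
def update_temperature_sketch(agent, log_prob):
    # alpha_loss = -E[ log_alpha * (log_pi(a|s) + target_entropy) ]
    alpha_loss = -(agent.log_alpha * (log_prob + agent.H).detach()).mean()
    agent.optim_alpha.zero_grad()
    alpha_loss.backward()
    agent.optim_alpha.step()
    agent.alpha = agent.log_alpha.exp().item()   # use the updated temperature in the actor/critic losses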
def __init__(self, state_size, action_size, num_agents, random_seed):
    """Initialize an Agent object.

    Params
    ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        num_agents (int): number of agents
        random_seed (int): random seed
    """
    self.state_size = state_size
    self.action_size = action_size
    self.num_agents = num_agents
    self.seed = random.seed(random_seed)
    self.eps = eps_start
    self.eps_decay = 1 / (eps_p * LEARN_NUM)  # set decay rate based on epsilon end target

    # Actor Network (w/ Target Network)
    self.actor_local = Actor(state_size, action_size, random_seed).to(device)
    self.actor_target = Actor(state_size, action_size, random_seed).to(device)
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

    # Critic Network (w/ Target Network)
    self.critic_local = Critic(state_size, action_size, random_seed).to(device)
    self.critic_target = Critic(state_size, action_size, random_seed).to(device)
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

    # Noise process
    self.noise = OUNoise((num_agents, action_size), random_seed)

    # Replay memory
    self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)
def __init__(self, state_dim, action_dim, hidden_dim=64, learning_rate=3e-4, entropy_coef=1e-2,
             critic_coef=0.5, gamma=0.99, lmbda=0.95, eps_clip=0.2, K_epoch=10, minibatch_size=64,
             device='cpu'):
    super(PPO, self).__init__()
    self.entropy_coef = entropy_coef
    self.critic_coef = critic_coef
    self.gamma = gamma
    self.lmbda = lmbda
    self.eps_clip = eps_clip
    self.K_epoch = K_epoch
    self.minibatch_size = minibatch_size
    self.max_grad_norm = 0.5
    self.data = Rollouts()

    self.actor = Actor(state_dim, action_dim, hidden_dim)
    self.critic = Critic(state_dim, hidden_dim)
    self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=learning_rate)
    self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=learning_rate)
    self.device = device
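# Note: the PPO constructor above only stores the hyperparameters; the update is not part of this
# snippet. The clipped surrogate objective that eps_clip, critic_coef and entropy_coef typically
# feed into looks roughly like the sketch below; the tensors (dist, actions, old_log_probs,
# advantages, values, returns) are assumptions, not attributes of the class above.
import torch


def ppo_loss_sketch(agent, dist, actions, old_log_probs, advantages, values, returns):
    # ratio = pi_new(a|s) / pi_old(a|s), computed in log space for numerical stability
    log_probs = dist.log_prob(actions)
    ratio = torch.exp(log_probs - old_log_probs)

    # clipped surrogate policy loss
    surr1 = ratio * advantages
    surr2 = torch.clamp(ratio, 1.0 - agent.eps_clip, 1.0 + agent.eps_clip) * advantages
    policy_loss = -torch.min(surr1, surr2).mean()

    # value loss and entropy bonus, weighted by the coefficients stored in __init__
    value_loss = agent.critic_coef * (returns - values).pow(2).mean()
    entropy_bonus = agent.entropy_coef * dist.entropy().mean()
    return policy_loss + value_loss - entropy_bonus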
def __init__(self, state_size, action_size, replay_memory, random_seed=0, nb_agent=20, bs=128,
             gamma=0.99, tau=1e-3, lr_actor=1e-4, lr_critic=1e-4, wd_actor=0, wd_critic=0,
             clip_actor=None, clip_critic=None, update_interval=20, update_times=10):
    self.state_size = state_size
    self.action_size = action_size
    self.seed = random.seed(random_seed)
    self.nb_agent = nb_agent
    self.bs = bs
    self.update_interval = update_interval
    self.update_times = update_times
    self.timestep = 0
    self.gamma = gamma
    self.tau = tau
    self.lr_actor = lr_actor
    self.lr_critic = lr_critic
    self.wd_critic = wd_critic
    self.wd_actor = wd_actor
    self.clip_critic = clip_critic
    self.clip_actor = clip_actor
    self.actor_losses = []
    self.critic_losses = []

    # Actor Network (w/ Target Network)
    self.actor_local = Actor(state_size, action_size, random_seed).to(device)
    self.actor_target = Actor(state_size, action_size, random_seed).to(device)
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=self.lr_actor, weight_decay=self.wd_actor)

    # Critic Network (w/ Target Network)
    self.critic_local = Critic(state_size, action_size, random_seed).to(device)
    self.critic_target = Critic(state_size, action_size, random_seed).to(device)
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=self.lr_critic, weight_decay=self.wd_critic)

    # Noise process
    self.noise = OUNoise((self.nb_agent, action_size), random_seed)

    # Replay memory
    self.memory = replay_memory
def __init__(self, state_size, action_size, random_seed):
    """Initialize an Agent object.

    Params
    ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        random_seed (int): random seed
    """
    self.state_size = state_size
    self.action_size = action_size
    self.seed = random.seed(random_seed)

    # Actor Network (w/ Target Network)
    self.actor_local = Actor(state_size, action_size, random_seed).to(device)
    self.actor_target = Actor(state_size, action_size, random_seed).to(device)
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

    # Critic Network (w/ Target Network)
    self.critic_local = Critic(state_size, action_size, random_seed).to(device)
    self.critic_target = Critic(state_size, action_size, random_seed).to(device)
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC)

    # Replay memory
    self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)
    self.noise = OUNoise((action_size), random_seed)

    # Make sure target is initialized with the same weight as the source (found on slack to make big difference)
    self.hard_update(self.actor_target, self.actor_local)
    self.hard_update(self.critic_target, self.critic_local)
def __init__(self, state_size, action_size, aid=0, num_agents=2, seed=1234):
    """Initialize an Agent object.

    Params
    ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        seed (int): random seed
    """
    self.state_size = state_size
    self.action_size = action_size

    # Actor Network (w/ Target Network)
    self.actor_local = Actor(state_size, action_size, seed=seed).to(device)
    self.actor_target = Actor(state_size, action_size, seed=seed).to(device)
    self.actor_optimizer = Adam(self.actor_local.parameters(), lr=LR_ACTOR)

    # Critic Network (w/ Target Network)
    self.critic_local = Critic(state_size, action_size, num_agents=num_agents, seed=seed).to(device)
    self.critic_target = Critic(state_size, action_size, num_agents=num_agents, seed=seed).to(device)
    self.critic_optimizer = Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

    # Noise process
    self.noise = OUNoise(action_size, seed=seed)
def __init__(self, replay_buffer, noise, state_dim, action_dim, seed, fc1_units=256, fc2_units=128,
             device="cpu", lr_actor=1e-4, lr_critic=1e-3, batch_size=128, discount=0.99, tau=1e-3):
    torch.manual_seed(seed)
    self.actor_local = Actor(state_dim, action_dim, fc1_units, fc2_units, seed).to(device)
    self.critic_local = Critic(state_dim, action_dim, fc1_units, fc2_units, seed).to(device)
    self.actor_optimizer = optim.Adam(params=self.actor_local.parameters(), lr=lr_actor)
    self.critic_optimizer = optim.Adam(params=self.critic_local.parameters(), lr=lr_critic)
    self.actor_target = Actor(state_dim, action_dim, fc1_units, fc2_units, seed).to(device)
    self.critic_target = Critic(state_dim, action_dim, fc1_units, fc2_units, seed).to(device)
    self.buffer = replay_buffer
    self.noise = noise
    self.device = device
    self.batch_size = batch_size
    self.discount = discount
    self.tau = tau

    Agent.hard_update(model_local=self.actor_local, model_target=self.actor_target)
    Agent.hard_update(model_local=self.critic_local, model_target=self.critic_target)
def __init__(self, env, lr=3e-4, gamma=0.99, polyak=5e-3, alpha=0.2, reward_scale=1.0, cuda=True, writer=None):
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.shape[0]

    self.actor = Actor(state_size, action_size)
    self.critic = Critic(state_size, action_size)
    self.target_critic = Critic(state_size, action_size).eval()

    self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=lr)
    self.q1_optimizer = optim.Adam(self.critic.q1.parameters(), lr=lr)
    self.q2_optimizer = optim.Adam(self.critic.q2.parameters(), lr=lr)

    self.target_critic.load_state_dict(self.critic.state_dict())
    for param in self.target_critic.parameters():
        param.requires_grad = False

    self.memory = ReplayMemory()
    self.gamma = gamma
    self.alpha = alpha
    self.polyak = polyak  # Always between 0 and 1, usually close to 1
    self.reward_scale = reward_scale
    self.writer = writer

    self.cuda = cuda
    if cuda:
        self.actor = self.actor.to('cuda')
        self.critic = self.critic.to('cuda')
        self.target_critic = self.target_critic.to('cuda')
class BiCNet():
    def __init__(self, s_dim, a_dim, n_agents, **kwargs):
        self.s_dim = s_dim
        self.a_dim = a_dim
        self.config = kwargs['config']
        self.n_agents = n_agents
        self.device = 'cuda' if self.config.use_cuda else 'cpu'

        # Networks
        self.policy = Actor(s_dim, a_dim, n_agents)
        self.policy_target = Actor(s_dim, a_dim, n_agents)
        self.critic = Critic(s_dim, a_dim, n_agents)
        self.critic_target = Critic(s_dim, a_dim, n_agents)

        if self.config.use_cuda:
            self.policy.cuda()
            self.policy_target.cuda()
            self.critic.cuda()
            self.critic_target.cuda()

        self.policy_optimizer = torch.optim.Adam(self.policy.parameters(), lr=self.config.a_lr)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=self.config.c_lr)

        hard_update(self.policy, self.policy_target)
        hard_update(self.critic, self.critic_target)

        self.random_process = OrnsteinUhlenbeckProcess(
            size=self.a_dim, theta=self.config.ou_theta, mu=self.config.ou_mu, sigma=self.config.ou_sigma)
        self.replay_buffer = list()
        self.epsilon = 1.
        self.depsilon = self.epsilon / self.config.epsilon_decay

        self.c_loss = None
        self.a_loss = None
        self.action_log = list()

    def choose_action(self, obs, noisy=True):
        obs = torch.Tensor([obs]).to(self.device)
        action = self.policy(obs).cpu().detach().numpy()[0]
        self.action_log.append(action)

        if noisy:
            for agent_idx in range(self.n_agents):
                pass
                # action[agent_idx] += self.epsilon * self.random_process.sample()
            self.epsilon -= self.depsilon
            self.epsilon = max(self.epsilon, 0.001)

        action = np.clip(action, -1., 1.)  # np.clip returns a new array, so assign the result
        return action

    def reset(self):
        self.random_process.reset_states()
        self.action_log.clear()

    def prep_train(self):
        self.policy.train()
        self.critic.train()
        self.policy_target.train()
        self.critic_target.train()

    def prep_eval(self):
        self.policy.eval()
        self.critic.eval()
        self.policy_target.eval()
        self.critic_target.eval()

    def random_action(self):
        return np.random.uniform(low=-1, high=1, size=(self.n_agents, 2))

    def memory(self, s, a, r, s_, done):
        self.replay_buffer.append((s, a, r, s_, done))
        if len(self.replay_buffer) >= self.config.memory_length:
            self.replay_buffer.pop(0)

    def get_batches(self):
        experiences = random.sample(self.replay_buffer, self.config.batch_size)

        state_batches = np.array([_[0] for _ in experiences])
        action_batches = np.array([_[1] for _ in experiences])
        reward_batches = np.array([_[2] for _ in experiences])
        next_state_batches = np.array([_[3] for _ in experiences])
        done_batches = np.array([_[4] for _ in experiences])

        return state_batches, action_batches, reward_batches, next_state_batches, done_batches

    def train(self):
        state_batches, action_batches, reward_batches, next_state_batches, done_batches = self.get_batches()

        state_batches = torch.Tensor(state_batches).to(self.device)
        action_batches = torch.Tensor(action_batches).to(self.device)
        reward_batches = torch.Tensor(reward_batches).reshape(self.config.batch_size, self.n_agents, 1).to(self.device)
        next_state_batches = torch.Tensor(next_state_batches).to(self.device)
        done_batches = torch.Tensor((done_batches == False) * 1).reshape(self.config.batch_size, self.n_agents, 1).to(self.device)

        target_next_actions = self.policy_target.forward(next_state_batches)
        target_next_q = self.critic_target.forward(next_state_batches, target_next_actions)
        main_q = self.critic(state_batches, action_batches)

        ''' How to concat each agent's Q value? '''
        # target_next_q = target_next_q
        # main_q = main_q.mean(dim=1)

        ''' Reward Norm '''
        # reward_batches = (reward_batches - reward_batches.mean(dim=0)) / reward_batches.std(dim=0) / 1024

        # Critic Loss
        self.critic.zero_grad()
        baselines = reward_batches + done_batches * self.config.gamma * target_next_q
        loss_critic = torch.nn.MSELoss()(main_q, baselines.detach())
        loss_critic.backward()
        torch.nn.utils.clip_grad_norm_(self.critic.parameters(), 0.5)
        self.critic_optimizer.step()

        # Actor Loss
        self.policy.zero_grad()
        clear_action_batches = self.policy.forward(state_batches)
        loss_actor = -self.critic.forward(state_batches, clear_action_batches).mean()
        loss_actor += (clear_action_batches ** 2).mean() * 1e-3
        loss_actor.backward()
        torch.nn.utils.clip_grad_norm_(self.policy.parameters(), 0.5)
        self.policy_optimizer.step()

        # For logging
        self.c_loss = loss_critic.item()
        self.a_loss = loss_actor.item()

        soft_update(self.policy, self.policy_target, self.config.tau)
        soft_update(self.critic, self.critic_target, self.config.tau)

    def get_loss(self):
        return self.c_loss, self.a_loss

    def get_action_std(self):
        return np.array(self.action_log).std(axis=-1).mean()
def __init__(self, state_size: int, action_size: int, seed: int, n_agent: int):
    """Initialize an Agent object.

    Params
    ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        seed (int): random seed
        n_agent (int): number of agents
    """
    self.state_size = state_size
    self.action_size = action_size
    self.n_agent = n_agent
    self.seed = random.seed(seed)
    self.global_step = 0
    self.update_step = 0

    # Initialize actor and critic local and target networks
    self.actor = Actor(state_size, action_size, seed, ACTOR_NETWORK_LINEAR_SIZES,
                       batch_normalization=ACTOR_BATCH_NORM).to(device)
    self.actor_target = Actor(state_size, action_size, seed, ACTOR_NETWORK_LINEAR_SIZES,
                              batch_normalization=ACTOR_BATCH_NORM).to(device)
    self.critic = Critic(state_size, action_size, seed, CRITIC_NETWORK_LINEAR_SIZES,
                         batch_normalization=CRITIC_BATCH_NORM).to(device)
    self.critic_second = Critic(state_size, action_size, seed, CRITIC_SECOND_NETWORK_LINEAR_SIZES,
                                batch_normalization=CRITIC_BATCH_NORM).to(device)
    self.critic_second_target = Critic(state_size, action_size, seed, CRITIC_SECOND_NETWORK_LINEAR_SIZES,
                                       batch_normalization=CRITIC_BATCH_NORM).to(device)
    self.critic_target = Critic(state_size, action_size, seed, CRITIC_NETWORK_LINEAR_SIZES,
                                batch_normalization=CRITIC_BATCH_NORM).to(device)

    self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=ACTOR_LEARNING_RATE)
    self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=CRITIC_LEARNING_RATE)
    self.critic_second_optimizer = optim.Adam(self.critic_second.parameters(), lr=CRITIC_LEARNING_RATE)

    # Replay memory
    self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)

    # Initialize time step (for updating every UPDATE_EVERY steps)
    self.t_step = [0] * n_agent
    self.noise = OUNoise(action_size, seed, decay_period=50)

    # Copy parameters from local networks to target networks
    for target_param, param in zip(self.actor_target.parameters(), self.actor.parameters()):
        target_param.data.copy_(param.data)
    for target_param, param in zip(self.critic_target.parameters(), self.critic.parameters()):
        target_param.data.copy_(param.data)
    for target_param, param in zip(self.critic_second_target.parameters(), self.critic_second.parameters()):
        target_param.data.copy_(param.data)