def __init__(self, state_size, action_size, agent_id):
    self.state_size = state_size
    self.action_size = action_size
    self.seed = args['seed']
    self.device = args['device']
    # self.args = args

    # Actor Network (w/ Target Network); `args` is a module-level hyperparameter dict
    self.actor_network = ActorNetwork(state_size, action_size).to(self.device)
    self.actor_target = ActorNetwork(state_size, action_size).to(self.device)
    self.actor_optimizer = optim.Adam(self.actor_network.parameters(), lr=args['LR_ACTOR'])

    # Model takes too long to run --> load model weights from previous run
    # (took > 24 hours on my machine)
    # if not agent_id:
    #     self.actor_network.load_state_dict(torch.load(args['agent_p0_path']), strict=False)
    #     self.actor_target.load_state_dict(torch.load(args['agent_p0_path']), strict=False)
    # else:
    #     self.actor_network.load_state_dict(torch.load(args['agent_p1_path']), strict=False)
    #     self.actor_target.load_state_dict(torch.load(args['agent_p1_path']), strict=False)

    # Replay memory
    self.memory = ReplayBuffer(action_size, args['BUFFER_SIZE'], args['BATCH_SIZE'], self.device, self.seed)

    # Noise process
    self.noise = OUNoise(action_size, self.seed)

    # Initialize time step (for updating every UPDATE_EVERY steps)
    self.t_step = 0
    self.mCriticLoss = 0
    self.actorLoss = 0
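# --------------------------------------------------------------------------
# OUNoise is constructed by nearly every variant in this file but never
# defined here. Below is a minimal sketch of a typical Ornstein-Uhlenbeck
# noise process, not the original class: constructor signatures and method
# names vary across the variants (extra mu/theta/sigma/scale arguments;
# some call .sample(), others .noise()), so treat the defaults here as
# assumptions.
# --------------------------------------------------------------------------
import copy
import random

import numpy as np


class OUNoise:
    """Temporally correlated exploration noise for continuous actions."""

    def __init__(self, size, seed=0, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)   # long-run mean of the process
        self.theta = theta             # pull strength back towards mu
        self.sigma = sigma             # scale of the random perturbation
        random.seed(seed)
        self.reset()

    def reset(self):
        """Restart the process at its mean."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Advance one step: dx = theta * (mu - x) + sigma * N(0, 1)."""
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.standard_normal(self.mu.shape)
        self.state = x + dx
        return self.state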
def __init__(self, state_size, action_size, state_size_full, action_size_full, random_seed):
    self.state_size = state_size
    self.action_size = action_size
    self.seed = random_seed
    random.seed(random_seed)

    # Actor Network (w/ Target Network)
    self.actor = Actor(state_size, action_size, random_seed).to(device)
    self.actor_target = Actor(state_size, action_size, random_seed).to(device)
    self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=LR_ACTOR)

    # Critic Network (w/ Target Network); takes the full (joint) state and action
    self.critic = Critic(state_size_full, action_size_full, random_seed).to(device)
    self.critic_target = Critic(state_size_full, action_size_full, random_seed).to(device)
    self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

    # initialize targets same as original networks
    self.hard_update(self.actor_target, self.actor)
    self.hard_update(self.critic_target, self.critic)

    # Noise process
    self.noise = OUNoise(action_size, random_seed)
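# --------------------------------------------------------------------------
# hard_update is called above (sometimes as a method, sometimes as a free
# function such as hard_update_A_from_B later in this file) but not shown.
# A minimal sketch, assuming both arguments are torch.nn.Module instances
# with identical architectures:
# --------------------------------------------------------------------------
def hard_update(target, source):
    """Copy the source network's weights into the target network verbatim."""
    for target_param, source_param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(source_param.data)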
def __init__(self, state_size, action_size, random_seed):
    """Initialize an Agent object.

    Params
    ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        random_seed (int): random seed
    """
    self.state_size = state_size
    self.action_size = action_size

    # Actor Networks, both Local and Target
    self.actor_local = Actor(state_size, action_size, random_seed).to(device)
    self.actor_target = Actor(state_size, action_size, random_seed).to(device)
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

    # Critic Networks, both Local and Target
    self.critic_local = Critic(state_size, action_size, random_seed).to(device)
    self.critic_target = Critic(state_size, action_size, random_seed).to(device)
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC)

    # Noise process, with multiplicative decay of its amplitude
    self.noise = OUNoise(action_size, random_seed)
    self.noise_modulation = 1
    self.noise_decay = NOISE_DECAY

    # Replay memory
    self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)

    # Count number of steps
    self.n_steps = 0
    self.update_every = UPDATE_EVERY
def __init__(self, state_space, action_space, buffer_size, batch_size, learning_rate_actor,
             learning_rate_critic, update_rate, gamma, tau, device, seed, num_agents,
             epsilon, epsilon_decay, epsilon_min):
    self.num_agents = num_agents
    self.action_space = action_space
    self.state_space = state_space
    self.buffer_size = buffer_size
    self.batch_size = batch_size
    self.step_count = 0
    self.update_rate = update_rate
    self.tau = tau
    self.seed = seed
    self.device = device
    self.gamma = gamma

    self.actor_local_network = ActorNetwork(state_space, action_space, device, seed).to(device)
    self.actor_target_network = ActorNetwork(state_space, action_space, device, seed).to(device)
    self.critic_local_network = CriticNetwork(state_space, action_space, device, seed).to(device)
    self.critic_target_network = CriticNetwork(state_space, action_space, device, seed).to(device)

    self.actor_optimizer = torch.optim.Adam(self.actor_local_network.parameters(), lr=learning_rate_actor)
    self.critic_optimizer = torch.optim.Adam(self.critic_local_network.parameters(), lr=learning_rate_critic)

    self.noise = OUNoise(action_space, seed)
    self.memory = ReplayBuffer(buffer_size=self.buffer_size, batch_size=self.batch_size, device=device, seed=seed)

    # epsilon schedule scales the exploration noise over time
    self.epsilon = epsilon
    self.epsilon_decay = epsilon_decay
    self.epsilon_min = epsilon_min
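# --------------------------------------------------------------------------
# The constructor above stores an epsilon schedule (epsilon, epsilon_decay,
# epsilon_min) for exploration, but its act() method is not shown. A minimal
# sketch (an assumption, not the original code) of how such an agent would
# scale the OU noise by epsilon and decay it towards the floor; torch and
# numpy are assumed imported as elsewhere in this file:
# --------------------------------------------------------------------------
def act(self, state, add_noise=True):
    """Return a clipped action from the local actor, with epsilon-scaled noise."""
    state = torch.from_numpy(state).float().to(self.device)
    self.actor_local_network.eval()
    with torch.no_grad():
        action = self.actor_local_network(state).cpu().data.numpy()
    self.actor_local_network.train()
    if add_noise:
        action += self.epsilon * self.noise.sample()
        self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)
    return np.clip(action, -1.0, 1.0)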
def __init__(self, state_size, action_size):
    # Constants
    self.buffer_size = int(1e6)
    self.batch_size = 128
    self.learning_rate = 1e-4
    self.learn_every = 2
    self.learning_rounds = 4
    self.gamma = 0.99
    self.tau = 1e-3
    self.t = 0
    self.state_size = state_size
    self.action_size = action_size
    self.eps = 5.0
    self.eps_decay = 1 / (300 * self.learning_rounds)

    self.actor_local = Actor(state_size, action_size).to(device)
    self.actor_target = Actor(state_size, action_size).to(device)
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=self.learning_rate)

    self.critic_local = Critic(state_size, action_size).to(device)
    self.critic_target = Critic(state_size, action_size).to(device)
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=self.learning_rate)

    self.noise = OUNoise((1, action_size))
    self.memory = ReplayBuffer(action_size, self.buffer_size, self.batch_size)
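# --------------------------------------------------------------------------
# ReplayBuffer is referenced throughout but not defined in this section.
# A minimal sketch of the usual uniform-sampling experience buffer; the
# device/seed arguments some variants pass are simplified away, so this is
# an assumption about the interface rather than the original implementation.
# --------------------------------------------------------------------------
import random
from collections import deque, namedtuple

import numpy as np
import torch

Experience = namedtuple("Experience", ["state", "action", "reward", "next_state", "done"])


class ReplayBuffer:
    """Fixed-size buffer that stores experience tuples and samples uniformly."""

    def __init__(self, action_size, buffer_size, batch_size, seed=0):
        self.action_size = action_size
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        self.memory.append(Experience(state, action, reward, next_state, done))

    def sample(self):
        experiences = random.sample(self.memory, k=self.batch_size)
        states = torch.from_numpy(np.vstack([e.state for e in experiences])).float()
        actions = torch.from_numpy(np.vstack([e.action for e in experiences])).float()
        rewards = torch.from_numpy(np.vstack([e.reward for e in experiences])).float()
        next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences])).float()
        dones = torch.from_numpy(np.vstack([e.done for e in experiences]).astype(np.uint8)).float()
        return states, actions, rewards, next_states, dones

    def __len__(self):
        return len(self.memory)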
def __init__(self, config):
    self.config = config
    self.state_size = config.state_size
    self.action_size = config.action_size

    self.actor_local = Actor(self.state_size, self.action_size, 2).to(device)
    self.actor_target = Actor(self.state_size, self.action_size, 2).to(device)
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=config.LR_ACTOR)

    self.critic_local = Critic(self.state_size, self.action_size, 2).to(device)
    self.critic_target = Critic(self.state_size, self.action_size, 2).to(device)
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=config.LR_CRITIC)

    self.memory = ReplayBuffer(config.random_seed, config.BUFFER_SIZE)
    self.noise = OUNoise(self.action_size, config.random_seed)
    self.t_step = 0

    # a soft update with tau=1 is a hard copy: start targets equal to locals
    self.soft_update(self.critic_local, self.critic_target, 1)
    self.soft_update(self.actor_local, self.actor_target, 1)
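# --------------------------------------------------------------------------
# The soft_update(..., tau=1) calls above perform a hard copy at
# initialization. A minimal sketch of the standard Polyak-averaging helper
# these agents assume:
#     theta_target <- tau * theta_local + (1 - tau) * theta_target
# --------------------------------------------------------------------------
def soft_update(local_model, target_model, tau):
    """Blend local weights into the target network; tau=1 copies them outright."""
    for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
        target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)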
def __init__(self, state_size, action_size, seed):
    self.state_size = state_size
    self.action_size = action_size
    self.seed = seed
    random.seed(seed)
    self.t_step = 0  # counter for activating learning every few steps
    self.running_c_loss = 0
    self.running_a_loss = 0
    self.training_cnt = 0

    # Actor network (w/ target network)
    self.actor_local = Actor(state_size, action_size, seed).to(device)
    self.actor_target = Actor(state_size, action_size, seed).to(device)
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

    # Critic network (w/ target network)
    self.critic_local = Critic(state_size, action_size, seed).to(device)
    self.critic_target = Critic(state_size, action_size, seed).to(device)
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

    # Noise process
    self.noise = OUNoise(action_size, seed)

    # Prioritized replay memory
    self.prioritized_memory = PrioritizedMemory(BATCH_SIZE, BUFFER_SIZE, seed)
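# --------------------------------------------------------------------------
# PrioritizedMemory above is a prioritized replay buffer whose implementation
# is not shown. A minimal sketch of proportional prioritization (Schaul et
# al., 2015) under an assumed interface -- the original class may use a
# sum-tree and different method names:
# --------------------------------------------------------------------------
import random
from collections import deque

import numpy as np


class PrioritizedMemory:
    """Replay buffer that samples experiences proportionally to their TD error."""

    def __init__(self, batch_size, buffer_size, seed, alpha=0.6):
        random.seed(seed)
        self.batch_size = batch_size
        self.buffer = deque(maxlen=buffer_size)
        self.priorities = deque(maxlen=buffer_size)
        self.alpha = alpha  # 0 = uniform sampling, 1 = fully proportional

    def add(self, experience):
        # new experiences get the current max priority so each is sampled at least once
        self.buffer.append(experience)
        self.priorities.append(max(self.priorities, default=1.0))

    def sample(self, beta=0.4):
        probs = np.array(self.priorities) ** self.alpha
        probs /= probs.sum()
        idxs = np.random.choice(len(self.buffer), self.batch_size, p=probs)
        # importance-sampling weights correct the bias of non-uniform sampling
        weights = (len(self.buffer) * probs[idxs]) ** (-beta)
        weights /= weights.max()
        return [self.buffer[i] for i in idxs], idxs, weights

    def update_priorities(self, idxs, td_errors, eps=1e-5):
        for i, err in zip(idxs, td_errors):
            self.priorities[i] = abs(float(err)) + eps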
def __init__(self, state_size, action_size, num_agents, actor_network_units, critic_network_units,
             optimizer_learning_rate_actor=1e-3, optimizer_learning_rate_critic=1e-3,
             actor_weight_decay=0, critic_weight_decay=0,
             noise_scale=0.1, noise_theta=0.2, noise_sigma=0.2, device=None):
    """
    Initializes the training instance for a single agent.

    :param state_size: (int) Space size for state observations per agent
    :param action_size: (int) Space size for actions per agent
    :param num_agents: (int) Number of agents used in problem
    :param actor_network_units: (list of ints) Network topology for actor networks
    :param critic_network_units: (list of ints) Network topology for critic networks
    :param optimizer_learning_rate_actor: (float) Learning rate for actor loss optimizer
    :param optimizer_learning_rate_critic: (float) Learning rate for critic loss optimizer
    :param actor_weight_decay: (float) Weight decay for actor loss optimizer
    :param critic_weight_decay: (float) Weight decay for critic loss optimizer
    :param noise_scale: (float) Scale for noise process
    :param noise_theta: (float) Theta parameter for noise process
    :param noise_sigma: (float) Sigma parameter for noise process
    :param device: (torch.device) Object representing the device where to allocate tensors
    """
    if device is None:
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    self.actor = Actor(state_size, action_size, actor_network_units).to(device)
    self.target_actor = Actor(state_size, action_size, actor_network_units).to(device)

    # the critic sees the joint state and action of all agents
    self.critic = Critic(state_size * num_agents, action_size * num_agents, critic_network_units).to(device)
    self.target_critic = Critic(state_size * num_agents, action_size * num_agents, critic_network_units).to(device)

    self.noise = OUNoise(device, action_size, scale=noise_scale, mu=0, theta=noise_theta, sigma=noise_sigma)

    self.actor_optimizer = Adam(self.actor.parameters(), lr=optimizer_learning_rate_actor,
                                weight_decay=actor_weight_decay)
    self.critic_optimizer = Adam(self.critic.parameters(), lr=optimizer_learning_rate_critic,
                                 weight_decay=critic_weight_decay)

    self.hard_update()
def __init__(self, engine):
    self.task = engine
    self.width = engine.width
    self.height = engine.height
    self.state_size = engine.state_size
    self.action_size = engine.action_size
    self.action_low = engine.action_low
    self.action_high = engine.action_high

    # Actor (Policy) Model
    self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high,
                             self.width, self.height)
    self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high,
                              self.width, self.height)

    # Critic (Value) Model
    self.critic_local = Critic(self.state_size, self.action_size, self.width, self.height)
    self.critic_target = Critic(self.state_size, self.action_size, self.width, self.height)

    # Initialize target model parameters with local model parameters
    self.critic_target.model.set_weights(self.critic_local.model.get_weights())
    self.actor_target.model.set_weights(self.actor_local.model.get_weights())

    # Noise process
    self.exploration_mu = 0
    self.exploration_theta = 0.15
    self.exploration_sigma = 0.2
    self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma)

    # Replay memory
    self.buffer_size = 100000
    self.batch_size = 64
    self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

    # Algorithm parameters
    self.gamma = 0.99  # discount factor
    self.tau = 0.01    # for soft update of target parameters
def __init__(self, action_size, action_type, state_size, hidden_in_size, hidden_out_size, num_atoms,
             lr_actor, lr_critic, l2_decay, noise_type, OU_mu, OU_theta, OU_sigma):
    super(DDPGAgent, self).__init__()

    # Create actors, critics and targets using the specified layer sizes.
    # Note: for the critics we assume 2 agents, hence the doubled input sizes.
    self.actor = Actor(action_size, state_size, hidden_in_size, hidden_out_size, action_type).to(device)
    self.critic = Critic(2 * action_size, 2 * state_size, hidden_in_size, hidden_out_size, num_atoms).to(device)
    self.target_actor = Actor(action_size, state_size, hidden_in_size, hidden_out_size, action_type).to(device)
    self.target_critic = Critic(2 * action_size, 2 * state_size, hidden_in_size, hidden_out_size, num_atoms).to(device)

    self.noise_type = noise_type
    self.action_type = action_type
    if noise_type == 'OUNoise':
        # OU noise must be initialised here because it is an autocorrelated process
        self.noise = OUNoise(action_size, OU_mu, OU_theta, OU_sigma)

    # initialize targets same as original networks
    hard_update(self.target_actor, self.actor)
    hard_update(self.target_critic, self.critic)

    # initialize optimisers using the specified learning rates
    self.actor_optimizer = Adam(self.actor.parameters(), lr=lr_actor, weight_decay=l2_decay)
    self.critic_optimizer = Adam(self.critic.parameters(), lr=lr_critic, weight_decay=l2_decay)
def __init__(self, state_size, action_size): """ Initializes Agent object. @Param: 1. state_size: dimension of each state. 2. action_size: number of actions. """ self.state_size = state_size self.action_size = action_size #Actor network self.actor_local = Actor(self.state_size, self.action_size).to(device) #local model self.actor_target = Actor(self.state_size, self.action_size).to(device) #target model, TD-target self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR) #initialize optimizer using Adam as regularizer for Actor network. #Critic network self.critic_local = Critic(self.state_size, self.action_size).to(device) #local model self.critic_target = Critic(self.state_size, self.action_size).to(device) #target model, TD-target self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY) #initialize optimizer using Adam as regularizer for Critic network. #Noise proccess self.noise = OUNoise(action_size) #define Ornstein-Uhlenbeck process #Replay memory self.memory = ReplayBuffer(self.action_size, BUFFER_SIZE, MINI_BATCH) #define experience replay buffer object
def __init__(self, name, state_size, action_size, joint_state_size, joint_action_size, actor_lr, critic_lr, device):
    self.name = name
    self.device = device
    self.noise = OUNoise(action_size, sigma=0.1)

    self.actor_local = Actor(state_size, action_size, fc1=64, fc2=64).to(device)
    self.actor_target = Actor(state_size, action_size, fc1=64, fc2=64).to(device)
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=actor_lr)

    self.critic_local = Critic(joint_state_size, joint_action_size, fc1=64, fc2=64).to(device)
    self.critic_target = Critic(joint_state_size, joint_action_size, fc1=64, fc2=64).to(device)
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=critic_lr)

    self.hard_copy_weights(self.actor_target, self.actor_local)
    self.hard_copy_weights(self.critic_target, self.critic_local)
def __init__(self, state_size, action_size, random_seed, num_agents, device, hps):
    self.noise = OUNoise(action_size, random_seed)
    self.state_size = state_size
    self.action_size = action_size
    self.num_agents = num_agents
    self.count = 0

    # setting the hyperparameters
    self.batch_size = hps.batch_size
    self.tau = hps.tau
    self.lr_actor = hps.lr_actor
    self.lr_critic = hps.lr_critic
    self.update_every = hps.update_every

    # shared replay buffer
    self.memory = ReplayBuffer(BUFFER_SIZE, self.batch_size, random_seed)

    # Critic networks - 1 network (local + target) per agent
    self.critics = [
        Critic(state_size, action_size, random_seed, self.lr_critic, WEIGHT_DECAY, device)
        for i in range(num_agents)
    ]

    # Actor networks - 1 network (local + target) per agent
    self.actors = [
        Actor(state_size, action_size, random_seed, self.lr_actor, self.noise, device)
        for i in range(num_agents)
    ]
def __init__(self, state_size, action_size, random_seed):
    """Initialize an Agent object.

    Params
    ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        random_seed (int): random seed
    """
    self.state_size = state_size
    self.action_size = action_size
    self.seed = random_seed
    random.seed(random_seed)

    # Actor Network (w/ Target Network)
    self.actor_local = Actor(state_size, action_size, random_seed).to(device)
    self.actor_target = Actor(state_size, action_size, random_seed).to(device)
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

    # Critic Network (w/ Target Network)
    self.critic_local = Critic(state_size, action_size, random_seed).to(device)
    self.critic_target = Critic(state_size, action_size, random_seed).to(device)
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

    # Noise process
    self.noise = OUNoise(action_size, random_seed)

    # Count number of steps
    self.n_steps = 0
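# --------------------------------------------------------------------------
# None of these constructors show the update step that motivates the
# local/target pairs. A minimal sketch of the standard DDPG learn() under
# the naming conventions used above; gamma arrives as an argument, TAU is
# assumed to be a module-level constant, and soft_update is as sketched
# earlier in this file.
# --------------------------------------------------------------------------
import torch.nn.functional as F


def learn(self, experiences, gamma):
    """One DDPG update from a sampled batch of (s, a, r, s', done) tensors."""
    states, actions, rewards, next_states, dones = experiences

    # --- critic update: regress Q(s, a) onto the bootstrapped TD target ---
    next_actions = self.actor_target(next_states)
    q_targets_next = self.critic_target(next_states, next_actions)
    q_targets = rewards + gamma * q_targets_next * (1 - dones)
    q_expected = self.critic_local(states, actions)
    critic_loss = F.mse_loss(q_expected, q_targets)
    self.critic_optimizer.zero_grad()
    critic_loss.backward()
    self.critic_optimizer.step()

    # --- actor update: ascend the critic's valuation of the actor's actions ---
    actions_pred = self.actor_local(states)
    actor_loss = -self.critic_local(states, actions_pred).mean()
    self.actor_optimizer.zero_grad()
    actor_loss.backward()
    self.actor_optimizer.step()

    # --- slowly track the learned networks with the targets ---
    soft_update(self.critic_local, self.critic_target, TAU)
    soft_update(self.actor_local, self.actor_target, TAU)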
def __init__(self, action_size, state_size, buffer_size, batch_size, actor_lr, critic_lr, device,
             weight_decay, tau, shared_memory, noise, share_memory_flag, seed=0):
    self.state_size = state_size
    self.action_size = action_size
    self.buffer_size = buffer_size
    self.batch_size = batch_size
    self.actor_lr = actor_lr
    self.weight_decay = weight_decay
    self.device = device
    self.seed = seed
    self.actor_loss = []
    # self.critic_loss = []
    torch.manual_seed(seed)
    np.random.seed(seed)
    self.tau = tau
    self.noise = OUNoise(self.action_size, self.seed)
    # self.noise = noise

    self.share_memory_flag = share_memory_flag
    if self.share_memory_flag:
        self.memory = shared_memory
    else:
        self.memory = ReplayBuffer(action_size, buffer_size, batch_size, self.device)

    ## Actor
    self.actor_local = ActorNN(self.state_size, self.action_size).to(self.device)
    self.actor_target = ActorNN(self.state_size, self.action_size).to(self.device)
    self.actor_optimizer = Adam(self.actor_local.parameters(), lr=self.actor_lr)

    ## Critic
    # self.critic_local = Critic(self.state_size, self.action_size).to(self.device)
    # self.critic_target = Critic(self.state_size, self.action_size).to(self.device)
    # self.critic_optimizer = Adam(self.critic_local.parameters(), lr=self.critic_lr, weight_decay=self.weight_decay)

    # initialize target same as local network
    self.hard_update(self.actor_target, self.actor_local)
def __init__(self, config): """Initialize an Agent object. Args: param1: (config) """ self.state_size = config.state_dim self.action_size = config.action_dim self.seed = np.random.seed(config.seed) self.n_agents = config.n_agents self.batch_size = config.batch_size self.tau = config.tau self.gamma = config.gamma self.device = config.device # Actor Network (w/ Target Network) self.actor_local = Actor(config).to(config.device) self.actor_target = Actor(config).to(config.device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=config.lr_actor) # Critic Network (w/ Target Network) self.critic_local = Critic(config).to(config.device) self.critic_target = Critic(config).to(config.device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=config.lr_critic) # Noise process self.noise = OUNoise(config) # Replay memory self.memory = ReplayBuffer(config)
def __init__(self, state_size, action_size, agent_id, random_seed):
    """Initialize a ddpg_agent object.

    Params
    ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        agent_id (int): identifier for this agent
        random_seed (int): random seed
    """
    self.state_size = state_size
    self.action_size = action_size
    self.seed = random_seed
    random.seed(random_seed)
    self.agent_id = agent_id

    self.actor_local = Actor(state_size, action_size).to(device)
    self.actor_target = Actor(state_size, action_size).to(device)
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

    self.critic_local = Critic(state_size, action_size).to(device)
    self.critic_target = Critic(state_size, action_size).to(device)
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

    # Make sure that the target-local model pairs are initialized to the same weights
    self.hard_update(self.actor_local, self.actor_target)
    self.hard_update(self.critic_local, self.critic_target)

    self.noise = OUNoise(action_size, random_seed)
    self.noise_amplification = NOISE_AMPLIFICATION
    self.noise_amplification_decay = NOISE_AMPLIFICATION_DECAY
def __init__(self, state_size, action_size, memory, device='cpu', params=None):
    """Initialize an Agent object.

    Params
    ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        memory (obj): memory buffer to sample
        device (str): device string, e.g. 'cuda:0' or 'cpu'
        params (dict): hyper-parameters
    """
    self.state_size = state_size
    self.action_size = action_size
    self.device = device
    self.step_t = 0
    self.update_every = params['update_every']

    # Set parameters
    self.gamma = params['gamma']
    self.tau = params['tau']
    self.seed = params['seed']
    random.seed(params['seed'])

    # Actor Network (w/ Target Network)
    self.actor_local = Actor(state_size, action_size, params['seed'],
                             params['actor_units'][0], params['actor_units'][1]).to(device)
    self.actor_target = Actor(state_size, action_size, params['seed'],
                              params['actor_units'][0], params['actor_units'][1]).to(device)
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=params['lr_actor'])

    # Critic Network (w/ Target Network)
    self.critic_local = Critic(state_size, action_size, params['seed'],
                               params['critic_units'][0], params['critic_units'][1]).to(device)
    self.critic_target = Critic(state_size, action_size, params['seed'],
                                params['critic_units'][0], params['critic_units'][1]).to(device)
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=params['lr_critic'],
                                       weight_decay=params['weight_decay'])

    # Noise process
    self.noise = OUNoise(action_size, params['seed'], theta=params['noise_theta'], sigma=params['noise_sigma'])

    # Replay memory
    self.memory = memory
def __init__(self, num_agents, state_size, action_size, gamma, tau, learning_rate_actor,
             learning_rate_critic, weight_decay, device, random_seed=42):
    """Initialize an Agent object (used by MultiAgent for MADDPG).

    Params
    ======
        num_agents (int): number of agents acting in the environment
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        gamma (float): discount factor
        tau (float): used for soft update of target parameters
        learning_rate_actor (float): learning rate for the actor
        learning_rate_critic (float): learning rate for the critic
        weight_decay (float): weight decay for the optimizers
        device (torch.Device): pytorch device
        random_seed (int): random seed
    """
    self.gamma = gamma
    self.tau = tau
    self.device = device
    self.seed = random_seed
    random.seed(random_seed)

    # Actor Network (w/ Target Network)
    self.actor_local = Actor(state_size, action_size, random_seed).to(device)
    self.actor_target = Actor(state_size, action_size, random_seed).to(device)
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=learning_rate_actor,
                                      weight_decay=weight_decay)

    # Critic Network (w/ Target Network)
    self.critic_local = Critic(num_agents, state_size, action_size, random_seed).to(device)
    self.critic_target = Critic(num_agents, state_size, action_size, random_seed).to(device)
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=learning_rate_critic,
                                       weight_decay=weight_decay)

    # Noise process
    self.noise = OUNoise(size=action_size, seed=random_seed)
    self.timestep = 0
def __init__(self, state_size, action_size, random_seed, num_agents):
    self.num_agents = num_agents
    self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)
    self.noise = OUNoise(action_size, random_seed)

    # one actor per agent, all sharing the same replay buffer and noise process
    self.actors = [
        ActorAgent(i, state_size, action_size, random_seed, LR_ACTOR, self.noise, self.memory)
        for i in range(num_agents)
    ]

    # a single critic shared by all agents
    self.critic = CriticAgent(state_size, action_size, random_seed, LR_CRITIC, WEIGHT_DECAY, TAU)
    self.count = 0
class ActorAgent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, num_agents, noise, learning_rate, memory, random_seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            num_agents (int): number of agents
            noise (obj): noise process (superseded by the OUNoise created below)
            learning_rate (float): learning rate for the actor optimizer
            memory (obj): shared replay buffer
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        self.learning_rate = learning_rate
        self.seed = random_seed
        random.seed(random_seed)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=self.learning_rate)

        # Noise process (overrides the `noise` argument)
        self.noise = OUNoise(action_size, seed=random_seed)

        # Replay memory (shared across agents)
        # self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)
        self.memory = memory

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory."""
        self.memory.add(state, action, reward, next_state, done)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()
def __init__(self, state_size, action_size, device, actor_args={}, critic_args={}):
    """Initializes the DDPG agent.

    Args:
        state_size (int): Dimension of each state
        action_size (int): Dimension of each action
        device (torch.device): Device to use for calculations
        actor_args (dict): Arguments describing the actor network
        critic_args (dict): Arguments describing the critic network
    """
    self.state_size = state_size
    """Dimension of each state"""

    self.action_size = action_size
    """Dimension of each action"""

    self.device = device
    """Device to use for calculations"""

    self.t_step = 0
    """Timestep between training updates"""

    # Actor network
    self.actor_local = Actor(state_size, action_size, **actor_args).to(device)
    self.actor_target = Actor(state_size, action_size, **actor_args).to(device)
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

    # Critic network
    self.critic_local = Critic(state_size, action_size, **critic_args).to(device)
    self.critic_target = Critic(state_size, action_size, **critic_args).to(device)
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

    # Noise process for exploration
    self.noise = OUNoise(action_size, sigma=NOISE_SD)

    # Replay memory
    self.memory = ReplayBuffer(BUFFER_SIZE, BATCH_SIZE, self.device)
class DDPG:
    def __init__(self, in_actor, out_actor,
                 in_critic,  # e.g. = n_agent * (state_size + action_size)
                 lr_actor=1e-4,
                 lr_critic=1e-3,  # the critic usually learns faster than the actor
                 random_seed=2):
        self.state_size = in_actor
        self.action_size = out_actor
        self.seed = random_seed
        random.seed(random_seed)
        self.params = {"lr_actor": lr_actor, "lr_critic": lr_critic, "optimizer": "adam"}

        self.local_actor = Actor(in_shape=in_actor, out_shape=out_actor).to(device)
        self.target_actor = Actor(in_shape=in_actor, out_shape=out_actor).to(device)
        self.actor_optimizer = optim.Adam(self.local_actor.parameters(), lr=lr_actor)

        # For a single agent, the critic takes global observations as input and
        # outputs the action-value Q, e.g. global_states = all_states + all_actions.
        self.local_critic = Critic(in_shape=in_critic).to(device)
        self.target_critic = Critic(in_shape=in_critic).to(device)
        self.critic_optimizer = optim.Adam(self.local_critic.parameters(), lr=lr_critic)

        # Q: should local/target start with the same weights? They would be
        #    synchronized after the first copy anyway.
        # A: better to hard copy at the beginning.
        hard_update_A_from_B(self.target_actor, self.local_actor)
        hard_update_A_from_B(self.target_critic, self.local_critic)

        # Noise process
        self.noise = OUNoise(out_actor, scale=1.0)

    def act(self, obs, noise_scale=0.0):
        obs = obs.to(device)
        # debug noise
        # noise = torch.from_numpy(noise_scale*0.5*np.random.randn(1, self.action_size)).float().to(device)
        # action = self.local_actor(obs) + noise
        action = self.local_actor(obs) + noise_scale * self.noise.noise().to(device)
        return action

    def target_act(self, obs, noise_scale=0.0):
        obs = obs.to(device)
        # noise = torch.from_numpy(noise_scale*0.5 * np.random.randn(1, self.action_size)).float().to(device)
        # action = self.target_actor(obs) + noise_scale * noise
        action = self.target_actor(obs) + noise_scale * self.noise.noise().to(device)
        return action

    def reset(self):
        self.noise.reset()
def __init__(self, state_size, action_size, random_seed, mnoise=True, split_state=True):
    """Initialize an Agent object.

    Params
    ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        random_seed (int): random seed
    """
    self.state_size = state_size
    self.action_size = action_size
    self.seed = random_seed
    random.seed(random_seed)
    self.mnoise = mnoise
    self.split_state = split_state

    # Actor Network (w/ Target Network)
    self.actor_local = Actor(state_size, action_size, random_seed).to(device)
    self.actor_target = Actor(state_size, action_size, random_seed).to(device)
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

    # Critic Network (w/ Target Network)
    self.critic_local = Critic(state_size, action_size, random_seed).to(device)
    self.critic_target = Critic(state_size, action_size, random_seed).to(device)
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

    # initialize targets same as original networks
    self.hard_update(self.actor_target, self.actor_local)
    self.hard_update(self.critic_target, self.critic_local)

    # Noise process: one row of noise per agent, or a single shared process
    if self.mnoise:
        self.noise = OUNoise((2, action_size), random_seed)
    else:
        self.noise = OUNoise(action_size, random_seed)

    # Replay memory
    self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)
def __init__(self, env, gamma, tau, buffer_maxlen, batch_size, critic_learning_rate,
             actor_learning_rate, update_per_step, seed):
    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # hyperparameters
    self.num_replay_updates_per_step = update_per_step
    self.batch_size = batch_size
    self.gamma = gamma
    self.tau = tau

    # initialize actor and critic networks
    self.critic = Critic(env.observation_space.shape[0], env.action_space.shape[0], seed).to(self.device)
    self.critic_target = Critic(env.observation_space.shape[0], env.action_space.shape[0], seed).to(self.device)
    self.actor = Actor(env.observation_space.shape[0], env.action_space.shape[0], seed).to(self.device)
    self.actor_target = Actor(env.observation_space.shape[0], env.action_space.shape[0], seed).to(self.device)

    # optimizers
    self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=critic_learning_rate)
    self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=actor_learning_rate)

    self.buffer = ReplayBuffer(buffer_maxlen, batch_size, seed)
    self.noise = OUNoise(env.action_space.shape[0])
def __init__(self, state_size, action_size, n_agents, lr_actor=0.01, lr_critic=0.01):
    super(DDPGAgent, self).__init__()

    self.actor = Actor(state_size, action_size, seed=0).to(device)
    self.critic = Critic(state_size, action_size, n_agents).to(device)
    self.target_actor = Actor(state_size, action_size, seed=0).to(device)
    self.target_critic = Critic(state_size, action_size, n_agents).to(device)

    self.noise = OUNoise(action_size, scale=1.0)

    # initialize targets same as original networks
    hard_update(self.target_actor, self.actor)
    hard_update(self.target_critic, self.critic)

    self.actor_optimizer = Adam(self.actor.parameters(), lr=lr_actor)
    self.critic_optimizer = Adam(self.critic.parameters(), lr=lr_critic, weight_decay=1e-5)
def __init__(self, state_size, action_size, random_seed):
    """Initialize an Agent object.

    Params
    ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        random_seed (int): random seed
    """
    self.state_size = state_size
    self.action_size = action_size
    self.seed = random_seed
    random.seed(random_seed)

    # Actor Network (w/ Target Network)
    self.actor_local = Actor(state_size, action_size, random_seed).to(device)
    self.actor_target = Actor(state_size, action_size, random_seed).to(device)
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

    # Critic Network (w/ Target Network)
    self.critic_local = Critic(state_size, action_size, random_seed).to(device)
    self.critic_target = Critic(state_size, action_size, random_seed).to(device)
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

    # Noise process: one independent process per agent
    self.noise = [OUNoise(action_size, random_seed) for i in range(NUM_AGENTS)]

    # Replay memory
    self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)

    # Initialize time step (for updating every UPDATE_EVERY steps)
    self.t_step = 0
def __init__(self, env):
    self.env = env
    # self.stateDim = obs2state(env.reset().observation).size()[1]
    # self.actionDim = env.action_spec().shape[0]
    self.stateDim = env.observation_space.shape[0]
    self.actionDim = env.action_space.shape[0]

    self.actor = Actor(self.env)
    self.critic = Critic(self.env)
    # start targets equal to the local networks (deepcopy of fresh instances
    # would give the targets different random weights)
    self.targetActor = deepcopy(self.actor)
    self.targetCritic = deepcopy(self.critic)

    self.actorOptim = optim.Adam(self.actor.parameters(), lr=ACTOR_LR)
    self.criticOptim = optim.Adam(self.critic.parameters(), lr=CRITIC_LR)
    self.criticLoss = nn.MSELoss()

    self.noise = OUNoise(mu=np.zeros(self.actionDim), sigma=SIGMA)
    self.replayBuffer = Buffer(BUFFER_SIZE)
    self.batchSize = MINIBATCH_SIZE

    self.checkpoint_dir = CHECKPOINT_DIR
    self.discount = DISCOUNT
    self.warmup = WARMUP
    self.epsilon = EPSILON
    self.epsilon_decay = EPSILON_DECAY

    self.rewardgraph = []
    self.stepgraph = []
    self.start = 0
    self.end = NUM_EPISODES