def __init__(self, state_size, action_size, random_seed):
    """Initialize an Agent object.

    Params
    ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        random_seed (int): random seed
    """
    self.state_size = state_size
    self.action_size = action_size
    self.seed = random_seed  # note: random.seed() returns None, so store the seed itself
    random.seed(random_seed)

    # Actor Network (w/ Target Network)
    self.actor_local = Actor(state_size, action_size, random_seed).to(device)
    self.actor_target = Actor(state_size, action_size, random_seed).to(device)
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

    # Critic Network (w/ Target Network)
    self.critic_local = Critic(state_size, action_size, random_seed).to(device)
    self.critic_target = Critic(state_size, action_size, random_seed).to(device)
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC,
                                       weight_decay=WEIGHT_DECAY)

    # Targets start as exact copies of the local networks
    hard_update(self.actor_target, self.actor_local)
    hard_update(self.critic_target, self.critic_local)

    # Noise process
    self.noise = OUNoise(action_size, random_seed)
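The snippet above (and several below) assumes a module-level `hard_update(target, source)` helper that overwrites the target network's parameters with the source's. The helper itself is not shown in these snippets; a minimal sketch, assuming PyTorch modules:

def hard_update(target, source):
    """Copy each source parameter into the corresponding target parameter."""
    for target_param, source_param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(source_param.data)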
def __init__(self, action_size, discount_factor=0.95, tau=0.02):
    super(MADDPG, self).__init__()

    # Create the multi-agent system as a list of DDPG agents
    self.maddpg_agents = [AgentDDPG(24, 2, 0), AgentDDPG(24, 2, 0)]
    self.discount_factor = discount_factor
    self.tau = tau
    self.iter = 0
    self.count = 0
    self.update_every = 1
    self.batch_size = 128
    self.agent_number = len(self.maddpg_agents)
    self.t_step = 0

    # Initialize the replay memory
    self.buffer_size = 1000000
    self.memory = ReplayBuffer(self.buffer_size, self.batch_size)
    self.action_size = action_size

    # One running reward per agent
    self.total_reward = np.zeros((1, 2))

    # Initialize the Ornstein-Uhlenbeck noise process
    self.exploration_mu = 0
    self.exploration_theta = 0.15
    self.exploration_sigma = 0.2
    self.noise = OUNoise(self.action_size, self.exploration_mu,
                         self.exploration_theta, self.exploration_sigma)
def __init__(self, task):
    self.task = task
    self.state_size = task.state_size
    self.action_size = task.action_size
    self.action_low = task.action_low
    self.action_high = task.action_high

    # Actor (Policy) Model
    self.actor_local = Actor(self.state_size, self.action_size,
                             self.action_low, self.action_high)
    self.actor_target = Actor(self.state_size, self.action_size,
                              self.action_low, self.action_high)

    # Critic (Value) Model
    self.critic_local = Critic(self.state_size, self.action_size)
    self.critic_target = Critic(self.state_size, self.action_size)

    # Initialize target model parameters with local model parameters
    self.critic_target.model.set_weights(self.critic_local.model.get_weights())
    self.actor_target.model.set_weights(self.actor_local.model.get_weights())

    # Noise process
    self.exploration_mu = 0
    self.exploration_theta = 0.15
    self.exploration_sigma = 0.2
    self.noise = OUNoise(self.action_size, self.exploration_mu,
                         self.exploration_theta, self.exploration_sigma)

    # Replay memory
    self.buffer_size = 100000
    self.batch_size = 64
    self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

    # Algorithm parameters
    self.gamma = 0.99  # discount factor
    self.tau = 0.01    # for soft update of target parameters
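Several of these constructors call `OUNoise(size, mu, theta, sigma)`. The class itself is not shown here, so the following is an assumption consistent with that signature: a minimal sketch of a mean-reverting Ornstein-Uhlenbeck process.

import copy
import numpy as np

class OUNoise:
    """Ornstein-Uhlenbeck process: temporally correlated exploration noise."""

    def __init__(self, size, mu, theta, sigma):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        """Reset the internal state to the mean."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Drift toward the mean, add Gaussian noise, and return the new state."""
        dx = self.theta * (self.mu - self.state) + self.sigma * np.random.randn(len(self.state))
        self.state = self.state + dx
        return self.state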
def __init__(self, task, actor_params={}, critic_params={}, noise_params={},
             replay_memory_params={}, algo_params={}):
    # Default params
    default_actor_params = {'lr': .001}
    default_critic_params = {'lr': .001}
    default_noise_params = {'mu': 0, 'theta': .15, 'sigma': .2}
    default_replay_memory_params = {'buffer_size': 100000, 'batch_size': 64}
    default_algo_params = {'gamma': .99, 'tau': .1}

    # Final params: caller-supplied values override the defaults
    final_actor_params = {**default_actor_params, **actor_params}
    final_critic_params = {**default_critic_params, **critic_params}
    final_noise_params = {**default_noise_params, **noise_params}
    final_replay_memory_params = {**default_replay_memory_params, **replay_memory_params}
    final_algo_params = {**default_algo_params, **algo_params}

    self.task = task
    self.state_size = task.state_size
    self.action_size = task.action_size
    self.action_low = task.action_low
    self.action_high = task.action_high

    # Actor (Policy) Model
    self.actor_local = Actor(self.state_size, self.action_size,
                             self.action_low, self.action_high, final_actor_params)
    self.actor_target = Actor(self.state_size, self.action_size,
                              self.action_low, self.action_high, final_actor_params)

    # Critic (Value) Model
    self.critic_local = Critic(self.state_size, self.action_size, final_critic_params)
    self.critic_target = Critic(self.state_size, self.action_size, final_critic_params)

    # Initialize target model parameters with local model parameters
    self.critic_target.model.set_weights(self.critic_local.model.get_weights())
    self.actor_target.model.set_weights(self.actor_local.model.get_weights())

    # Noise process
    self.noise = OUNoise(self.action_size, final_noise_params['mu'],
                         final_noise_params['theta'], final_noise_params['sigma'])

    # Replay memory
    self.batch_size = final_replay_memory_params['batch_size']
    self.memory = ReplayBuffer(final_replay_memory_params['buffer_size'],
                               final_replay_memory_params['batch_size'])

    # Algorithm parameters
    self.gamma = final_algo_params['gamma']  # discount factor
    self.tau = final_algo_params['tau']      # for soft update of target parameters
def __init__(self, in_actor, in_critic, action_size, num_agents, random_seed):
    """Init the agent."""
    super(DDPG_agent, self).__init__()
    self.action_size = action_size
    self.seed = random_seed

    # Fully connected actor network
    self.actor_local = Actor(in_actor, self.action_size, self.seed).to(device)
    self.actor_target = Actor(in_actor, self.action_size, self.seed).to(device)
    self.actor_optimizer = Adam(self.actor_local.parameters(), lr=LR_ACTOR)

    # Fully connected critic network (takes all agents' actions as input)
    self.critic_local = Critic(in_critic, num_agents * self.action_size, self.seed).to(device)
    self.critic_target = Critic(in_critic, num_agents * self.action_size, self.seed).to(device)
    self.critic_optimizer = Adam(self.critic_local.parameters(), lr=LR_CRITIC,
                                 weight_decay=WEIGHT_DECAY)

    # Ornstein-Uhlenbeck noise process for exploration
    self.noise = OUNoise(action_size, random_seed)
def __init__(self, ob_sp, act_sp, alow, ahigh, writer, args):
    self.args = args
    self.alow = alow
    self.ahigh = ahigh

    # Policy (actor) and Q (critic) networks with their targets
    self.policy = Policy_net(ob_sp, act_sp)
    self.policy_targ = Policy_net(ob_sp, act_sp)
    self.qnet = Q_net(ob_sp, act_sp)
    self.qnet_targ = Q_net(ob_sp, act_sp)
    self.policy.to(device)
    self.qnet.to(device)
    self.policy_targ.to(device)
    self.qnet_targ.to(device)

    self.MSE_loss = nn.MSELoss()
    self.noise = OUNoise(1, 1)

    # Targets start as exact copies of the online networks
    hard_update(self.policy_targ, self.policy)
    hard_update(self.qnet_targ, self.qnet)

    self.p_optimizer = optim.Adam(self.policy.parameters(), lr=LR)
    self.q_optimizer = optim.Adam(self.qnet.parameters(), lr=LR)
    self.memory = ReplayMemory(int(1e6))
    self.epsilon_scheduler = LinearSchedule(E_GREEDY_STEPS, FINAL_STD, INITIAL_STD,
                                            warmup_steps=WARMUP_STEPS)
    self.n_steps = 0
    self.n_updates = 0
    self.writer = writer
def __init__(self, state_size=24, action_size=2, random_seed=0):
    """Initialize an Agent object.

    @Param:
        1. state_size: dimension of each state.
        2. action_size: number of actions.
        3. random_seed: random seed.
    """
    self.state_size = state_size
    self.action_size = action_size
    self.seed = random_seed  # note: random.seed() returns None, so store the seed itself
    random.seed(random_seed)

    # Actor network
    self.actor_local = Actor(self.state_size, self.action_size, random_seed).to(device)
    self.actor_target = Actor(self.state_size, self.action_size, random_seed).to(device)
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

    # Critic network
    self.critic_local = Critic(self.state_size, self.action_size, random_seed).to(device)
    self.critic_target = Critic(self.state_size, self.action_size, random_seed).to(device)
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC)

    # Noise process (Ornstein-Uhlenbeck)
    self.noise = OUNoise(action_size, random_seed)

    # Replay memory (experience replay buffer)
    self.memory = ReplayBuffer(self.action_size, BUFFER_SIZE, MINI_BATCH, random_seed)
def __init__(self, state_size, action_size, random_seed):
    """Initialize an Agent object.

    @Param:
        1. state_size: dimension of each state.
        2. action_size: number of actions.
        3. random_seed: random seed.
    """
    self.state_size = state_size
    self.action_size = action_size
    self.seed = random_seed  # note: random.seed() returns None, so store the seed itself
    random.seed(random_seed)

    # Actor network
    self.actor_local = Actor(state_size, action_size, random_seed).to(device)
    self.actor_target = Actor(state_size, action_size, random_seed).to(device)
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

    # Critic network
    self.critic_local = Critic(state_size, action_size, random_seed).to(device)
    self.critic_target = Critic(state_size, action_size, random_seed).to(device)
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC,
                                       weight_decay=WEIGHT_DECAY)

    # Hard-copy local weights into the target networks
    self.hard_copy_weights(self.actor_target, self.actor_local)
    self.hard_copy_weights(self.critic_target, self.critic_local)

    # Noise process (Ornstein-Uhlenbeck)
    self.noise = OUNoise(action_size, random_seed)
def __init__(self, state_size, action_size, random_seed):
    """
    Args
    ======
        state_size (int): state dimension
        action_size (int): action dimension
        random_seed (int): random seed
    """
    self.state_size = state_size
    self.action_size = action_size
    self.seed = random_seed  # note: random.seed() returns None, so store the seed itself
    random.seed(random_seed)

    # Actor net initialization
    self.actor_local = Actor(state_size, action_size, random_seed).to(device)
    self.actor_target = Actor(state_size, action_size, random_seed).to(device)
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

    # Critic net initialization
    self.critic_local = Critic(state_size, action_size, random_seed).to(device)
    self.critic_target = Critic(state_size, action_size, random_seed).to(device)
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC,
                                       weight_decay=WEIGHT_DECAY)

    # Ornstein-Uhlenbeck exploration noise process
    self.noise = OUNoise(action_space=action_size, seed=random_seed)

    # Replay memory init
    self.memory = Memory(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)
def __init__(self, state_size: int, action_size: int, agent_no: int, params: dict):
    """Initialize an Agent object.

    Args:
        state_size: dimension of each state
        action_size: dimension of each action
        agent_no: agent id
        params: architecture and hyperparameters
    """
    self.state_size = state_size
    self.action_size = action_size
    self.seed = params['agent_seed']
    self.batch_size = params['batch_size']
    self.lr_actor = params['lr_actor']
    self.lr_critic = params['lr_critic']
    self.critic_weight_decay = params['critic_weight_decay']
    self.gamma = params['gamma']
    self.tau = params['tau']
    self.update_step = params['update_step']
    self.num_agents = params['num_agents']
    random.seed(self.seed)
    self.t_step = 0
    self.agent_no = agent_no

    # Actor Network (w/ Target Network)
    self.actor_local = Actor(state_size, action_size,
                             params['first_hidden_units'],
                             params['second_hidden_units'], self.seed).to(device)
    self.actor_target = Actor(state_size, action_size,
                              params['first_hidden_units'],
                              params['second_hidden_units'], self.seed).to(device)

    # Critic Network (w/ Target Network); the critic sees all agents' states and actions
    self.critic_local = Critic(state_size * self.num_agents,
                               action_size * self.num_agents,
                               params['first_hidden_units'],
                               params['second_hidden_units'], self.seed).to(device)
    self.critic_target = Critic(state_size * self.num_agents,
                                action_size * self.num_agents,
                                params['first_hidden_units'],
                                params['second_hidden_units'], self.seed).to(device)

    self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=self.lr_actor)
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                       lr=self.lr_critic,
                                       weight_decay=self.critic_weight_decay)

    # Noise process
    self.noise = OUNoise(action_size, self.seed, sigma=params['noise_sigma'])
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, random_seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random_seed  # note: random.seed() returns None, so store the seed itself
        random.seed(random_seed)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Targets start as exact copies of the local networks
        hard_update(self.actor_target, self.actor_local)
        hard_update(self.critic_target, self.critic_local)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

    def act(self, state, add_noise=True):
        """Returns actions for the given state as per the current (local) policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def target_act(self, state, add_noise=True):
        """Returns actions for the given state as per the target policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_target.eval()
        with torch.no_grad():
            action = self.actor_target(state).cpu().data.numpy()
        self.actor_target.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()
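A hypothetical usage sketch for the `Agent` class above; `env` and its return values are illustrative stand-ins, not from the source:

# Hypothetical usage; `env` stands in for any environment with a 24-dim state
# and 2-dim continuous actions in [-1, 1].
agent = Agent(state_size=24, action_size=2, random_seed=0)
state = env.reset()
agent.reset()  # reset the OU noise at episode start
for t in range(1000):
    action = agent.act(state, add_noise=True)
    state, reward, done = env.step(action)
    if done:
        break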
def __init__(self, state_size: int, action_size: int, num_agents: int,
             epsilon, random_seed: int):
    """Initialize a DDPG Agent object.

    :param state_size: dimension of state (input)
    :param action_size: dimension of action (output)
    :param num_agents: number of concurrent agents in the environment
    :param epsilon: initial value of epsilon for exploration
    :param random_seed: random seed
    """
    self.state_size = state_size
    self.action_size = action_size
    self.num_agents = num_agents
    self.seed = random_seed  # note: random.seed() returns None, so store the seed itself
    random.seed(random_seed)
    self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    self.t_step = 0

    # Hyperparameters
    self.buffer_size = 1000000
    self.batch_size = 128
    self.update_every = 10
    self.num_updates = 10
    self.gamma = 0.99
    self.tau = 0.001
    self.lr_actor = 0.0001
    self.lr_critic = 0.001
    self.weight_decay = 0
    self.epsilon = epsilon
    self.epsilon_decay = 0.97
    self.epsilon_min = 0.005

    # Networks (Actor: State -> Action, Critic: (State, Action) -> Value)
    self.actor_local = Actor(self.state_size, self.action_size, random_seed).to(self.device)
    self.actor_target = Actor(self.state_size, self.action_size, random_seed).to(self.device)
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=self.lr_actor)

    self.critic_local = Critic(self.state_size, self.action_size, random_seed).to(self.device)
    self.critic_target = Critic(self.state_size, self.action_size, random_seed).to(self.device)
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=self.lr_critic,
                                       weight_decay=self.weight_decay)

    # Initialize target networks with the same parameters as the local ones
    self.soft_update(self.actor_local, self.actor_target, tau=1)
    self.soft_update(self.critic_local, self.critic_target, tau=1)

    # Noise setup
    self.noise = OUNoise(self.action_size, random_seed)

    # Replay buffer setup
    self.memory = ReplayBuffer(self.buffer_size, self.batch_size)
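The snippet above calls `self.soft_update(local, target, tau)`, with `tau=1` serving as a hard copy at initialization. The method body is not shown; a minimal sketch of the usual soft-update rule, theta_target <- tau * theta_local + (1 - tau) * theta_target, assuming PyTorch modules:

def soft_update(self, local_model, target_model, tau):
    """Blend local parameters into target parameters; tau=1 is a full copy."""
    for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
        target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)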
def __init__(self, state_size, action_size, cfg, num_agents=1, agent_id=0):
    """Initialize an Agent object.

    Params
    ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        cfg (config object): main configuration with other passed settings
        num_agents (int): optional (default: 1). If >1, multiplies state and
            action space sizes for the critic. Used with MADDPG.
        agent_id (int): optional (default: 0). Sets the agent id for MADDPG.
    """
    print("Initializing single DDPG agent!")
    self.state_size = state_size
    self.action_size = action_size
    self.seed = cfg.random_seed  # note: random.seed() returns None, so store the seed itself
    random.seed(cfg.random_seed)
    self.n_agents = num_agents
    self.agent_id = agent_id
    self.cfg = cfg

    # Actor Network (w/ Target Network)
    self.actor_local = Actor(state_size, action_size, cfg.random_seed,
                             cfg.dense_layers_actor).to(device)
    self.actor_target = Actor(state_size, action_size, cfg.random_seed,
                              cfg.dense_layers_actor).to(device)
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=cfg.lr_actor)

    # Critic Network (w/ Target Network)
    self.critic_local = Critic(state_size * num_agents, action_size * num_agents,
                               cfg.random_seed, cfg.dense_layers_critic).to(device)
    self.critic_target = Critic(state_size * num_agents, action_size * num_agents,
                                cfg.random_seed, cfg.dense_layers_critic).to(device)
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=cfg.lr_critic,
                                       weight_decay=cfg.weight_decay)

    self.hard_copy_weights(self.critic_local, self.critic_target)
    self.hard_copy_weights(self.actor_local, self.actor_target)

    self.t_step = 0

    # Noise process
    self.noise = OUNoise(action_size, cfg.random_seed,
                         theta=cfg.theta_ou, sigma=cfg.sigma_ou)

    # Replay memory
    self.memory = ReplayBuffer(action_size, cfg.buffer_size, cfg.batch_size,
                               cfg.random_seed, cfg)
def __init__(self, env):
    """
    :param env: (class instance) Instructions about the goal and reward
    """
    self.env = env
    self.state_size = env.observation_space.shape[0]
    self.action_size = env.action_space.shape[0]
    self.action_low = env.action_space.low
    self.action_high = env.action_space.high
    self.score = 0.0
    self.best = 0.0

    # Instances of the policy function (actor) and the value function (critic)
    # Actor: local and target
    self.actor_local = Actor(self.state_size, self.action_size,
                             self.action_low, self.action_high)

    # Save the actor model architecture for future use
    actor_local_model_yaml = self.actor_local.model.to_yaml()
    with open("actor_local_model.yaml", "w") as yaml_file:
        yaml_file.write(actor_local_model_yaml)

    self.actor_target = Actor(self.state_size, self.action_size,
                              self.action_low, self.action_high)

    # Critic: local and target
    self.critic_local = Critic(self.state_size, self.action_size)
    self.critic_target = Critic(self.state_size, self.action_size)

    # Initialize target models with local model weights
    self.critic_target.model.set_weights(self.critic_local.model.get_weights())
    self.actor_target.model.set_weights(self.actor_local.model.get_weights())

    # Initialize the Ornstein-Uhlenbeck noise process
    self.exploration_mu = 0
    self.exploration_theta = 0.15
    self.exploration_sigma = 0.2
    self.noise = OUNoise(self.action_size, self.exploration_mu,
                         self.exploration_theta, self.exploration_sigma)

    # Initialize the replay memory
    self.buffer_size = 100000
    self.batch_size = 64
    self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

    # Parameters for the algorithm
    self.gamma = 0.99  # discount factor
    self.tau = 0.01    # soft update of target parameters
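Many snippets here construct `ReplayBuffer(buffer_size, batch_size)`. The class is not shown; a minimal sketch consistent with that two-argument signature (an assumption; the real classes may differ in what `add()` takes and `sample()` returns):

import random
from collections import deque, namedtuple

Experience = namedtuple('Experience', ['state', 'action', 'reward', 'next_state', 'done'])

class ReplayBuffer:
    """Fixed-size buffer of experience tuples with uniform random sampling."""

    def __init__(self, buffer_size, batch_size):
        self.memory = deque(maxlen=buffer_size)  # oldest experiences are evicted first
        self.batch_size = batch_size

    def add(self, state, action, reward, next_state, done):
        self.memory.append(Experience(state, action, reward, next_state, done))

    def sample(self):
        return random.sample(self.memory, k=self.batch_size)

    def __len__(self):
        return len(self.memory)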
def __init__(self, num_in_pol, num_out_pol, num_in_critic,
             hidden_dim_actor=120, hidden_dim_critic=64,
             lr_actor=0.01, lr_critic=0.01, batch_size=64,
             max_episode_len=100, tau=0.02, gamma=0.99,
             agent_name='one', discrete_action=False):
    """
    Inputs:
        num_in_pol (int): number of dimensions for policy input
        num_out_pol (int): number of dimensions for policy output
        num_in_critic (int): number of dimensions for critic input
    """
    self.policy = Actor(num_in_pol, num_out_pol,
                        hidden_dim=hidden_dim_actor,
                        discrete_action=discrete_action)
    self.critic = Critic(num_in_pol, 1, num_out_pol,
                         hidden_dim=hidden_dim_critic)
    self.target_policy = Actor(num_in_pol, num_out_pol,
                               hidden_dim=hidden_dim_actor,
                               discrete_action=discrete_action)
    self.target_critic = Critic(num_in_pol, 1, num_out_pol,
                                hidden_dim=hidden_dim_critic)

    # Targets start as exact copies of the online networks
    hard_update(self.target_policy, self.policy)
    hard_update(self.target_critic, self.critic)

    self.policy_optimizer = Adam(self.policy.parameters(), lr=lr_actor)
    self.critic_optimizer = Adam(self.critic.parameters(), lr=lr_critic, weight_decay=0)

    self.policy = self.policy.float()
    self.critic = self.critic.float()
    self.target_policy = self.target_policy.float()
    self.target_critic = self.target_critic.float()

    self.agent_name = agent_name
    self.gamma = gamma
    self.tau = tau
    self.batch_size = batch_size

    # self.replay_buffer = ReplayBuffer(1e7)
    self.replay_buffer = ReplayBufferOption(500000, self.batch_size, 12)
    self.max_replay_buffer_len = batch_size * max_episode_len
    self.replay_sample_index = None
    self.niter = 0

    # Epsilon-style scaling for the exploration noise
    self.eps = 5.0
    self.eps_decay = 1 / (250 * 5)
    self.exploration = OUNoise(num_out_pol)
    self.discrete_action = discrete_action

    # Short rollout history buffers
    self.num_history = 2
    self.states = []
    self.actions = []
    self.rewards = []
    self.next_states = []
    self.dones = []
def __init__(self, index, config, filenames=None):
    random.seed(config.general.seed)
    np.random.seed(config.general.seed)

    self.noise = OUNoise(config)
    self.index = index
    self.action_size = config.environment.action_size
    self.tau = config.hyperparameters.tau

    self.actor_local = Network(config.actor, config.general.seed)
    self.actor_target = Network(config.actor, config.general.seed)
    self.actor_optimizer = Adam(self.actor_local.parameters(), lr=config.actor.lr)

    self.critic_local = Network(config.critic, config.general.seed)
    self.critic_target = Network(config.critic, config.general.seed)
    self.critic_optimizer = Adam(self.critic_local.parameters(), lr=config.critic.lr,
                                 weight_decay=config.hyperparameters.weight_decay)
class DDPG_agent(nn.Module):
    def __init__(self, in_actor, in_critic, action_size, num_agents, random_seed):
        """Init the agent."""
        super(DDPG_agent, self).__init__()
        self.action_size = action_size
        self.seed = random_seed

        # Fully connected actor network
        self.actor_local = Actor(in_actor, self.action_size, self.seed).to(device)
        self.actor_target = Actor(in_actor, self.action_size, self.seed).to(device)
        self.actor_optimizer = Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Fully connected critic network (takes all agents' actions as input)
        self.critic_local = Critic(in_critic, num_agents * self.action_size, self.seed).to(device)
        self.critic_target = Critic(in_critic, num_agents * self.action_size, self.seed).to(device)
        self.critic_optimizer = Adam(self.critic_local.parameters(), lr=LR_CRITIC,
                                     weight_decay=WEIGHT_DECAY)

        # Ornstein-Uhlenbeck noise process for exploration
        self.noise = OUNoise(action_size, random_seed)

    def act(self, state, add_noise=True):
        """Returns actions for the given state as per the current (local) policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def target_act(self, state, add_noise=True):
        """Returns actions for the given state as per the target policy.

        Note: add_noise is accepted for interface symmetry but unused here;
        the target policy acts deterministically.
        """
        action = self.actor_target(state)
        return action

    def reset(self):
        """Resets the noise process."""
        self.noise.reset()
def __init__(self, state_shape, action_shape, batch_size=128, gamma=0.995, tau=0.005,
             actor_lr=0.0001, critic_lr=0.001, use_layer_norm=True):
    self.state_shape = state_shape
    self.action_shape = action_shape
    self.num_actions = np.prod(self.action_shape)

    # Replay memory
    self.buffer_size = 100000
    self.batch_size = batch_size
    self.memory = ReplayBuffer(self.buffer_size, self.action_shape, self.state_shape)

    # Noise process
    self.noise = OUNoise(self.num_actions)

    # Algorithm parameters
    self.gamma = gamma  # discount factor
    self.tau = tau      # soft-update rate
    self.actor_lr = actor_lr
    self.critic_lr = critic_lr

    # Initialize the actor/critic models
    self.models = Models(self.state_shape, self.action_shape,
                         actor_lr=self.actor_lr, critic_lr=self.critic_lr,
                         gamma=self.gamma, use_layer_norm=use_layer_norm)
    self.initialize()
    self.saver = tf.train.Saver()
    self.current_path = os.getcwd()

    # Initial episode variables
    self.last_state = None
    self.last_action = None
    self.total_reward = 0.0
    self.count = 0
    self.episode_num = 0
def __init__(self, in_actor, hidden_in_actor, hidden_out_actor, out_actor,
             in_critic, hidden_in_critic, hidden_out_critic,
             lr_actor=1.0e-3, lr_critic=1.0e-3,
             noise_dist: str = 'normal', checkpoint_path=None) -> None:
    super(DDPGAgent, self).__init__()

    self.actor = Network(in_actor, hidden_in_actor, hidden_out_actor, out_actor,
                         actor=True).to(device)
    self.critic = Network(in_critic, hidden_in_critic, hidden_out_critic, 1).to(device)
    self.target_actor = Network(in_actor, hidden_in_actor, hidden_out_actor, out_actor,
                                actor=True).to(device)
    self.target_critic = Network(in_critic, hidden_in_critic, hidden_out_critic, 1).to(device)

    self.noise = OUNoise(out_actor, scale=1.0, noise_dist=noise_dist)

    # Initialize targets the same as the original networks
    hard_update(self.target_actor, self.actor)
    hard_update(self.target_critic, self.critic)

    self.actor_optimizer = Adam(self.actor.parameters(), lr=lr_actor)
    self.critic_optimizer = Adam(self.critic.parameters(), lr=lr_critic, weight_decay=1.e-5)

    # Optionally restore both networks (and their targets) from a checkpoint
    if checkpoint_path:
        checkpoint = torch.load(checkpoint_path)
        self.actor.load_state_dict(checkpoint[0]['actor_params'])
        self.target_actor.load_state_dict(checkpoint[0]['actor_params'])
        self.critic.load_state_dict(checkpoint[0]['critic_params'])
        self.target_critic.load_state_dict(checkpoint[0]['critic_params'])
def __init__(self, state_size, action_size, num_agents, cfg):
    """Initialize a MADDPG Agent object.

    Params
    ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        num_agents (int): number of agents in the environment
        cfg (config object): main configuration with other settings
    """
    print("Initializing MADDPG agent with {:d} agents!".format(num_agents))
    self.state_size = state_size
    self.action_size = action_size
    self.num_agents = num_agents
    self.seed = cfg.random_seed  # note: random.seed() returns None, so store the seed itself
    random.seed(cfg.random_seed)
    self.cfg = cfg

    # Initialize the list of single agents (2 for Tennis)
    self.agents = []
    for aid in range(num_agents):
        agent = SingleDDPGAgent(state_size, action_size, cfg,
                                num_agents=num_agents, agent_id=aid)
        self.agents.append(agent)

    self.t_step = 0

    # Noise process
    self.noise_scale = self.cfg.noise_scale
    self.noise = OUNoise(action_size, cfg.random_seed,
                         theta=cfg.theta_ou, sigma=cfg.sigma_ou)

    # While active, fill the replay buffer with random experiences; no learning yet
    self.prefetching = True

    # Replay memory for experiences shared by all agents
    self.memory = ReplayBuffer(action_size, cfg.buffer_size, cfg.batch_size,
                               cfg.random_seed, cfg)
def __init__(self, params, name, task):
    super(Twin_DDPG, self).__init__(params, name, task)
    self.aPars = params['actPars']
    self.aTrain = params['actTrain']

    if self.trainMode:
        # Twin critics, as in TD3
        self.values = [Network(self.vPars, self.vTrain),
                       Network(self.vPars, self.vTrain)]
        self.policyNet = TD3Network(self.aPars, self.aTrain)
        self.tarPolicy = TD3Network(self.aPars, self.aTrain)

        if self.load:
            self.load_nets()

        # Targets start as exact copies of the online networks
        self.tarPolicy.load_state_dict(self.policyNet.state_dict())
        self.tar = [Network(self.vPars, self.vTrain),
                    Network(self.vPars, self.vTrain)]
        for i in range(len(self.values)):
            self.tar[i].load_state_dict(self.values[i].state_dict())
    else:
        self.policyNet = Network(self.aPars, self.aTrain)
        self.policyNet.load_state_dict(torch.load(
            "/home/austinnguyen517/Documents/Research/BML/MultiRobot/AN_Bridging/TD3_goal_policy2.txt"))

    self.base = self.vTrain['baseExplore']
    self.step = self.vTrain['decay']
    self.expSize = self.vTrain['buffer']
    self.exp = Replay(self.expSize)
    self.a = self.vTrain['a']
    self.tau = self.vPars['tau']
    self.smooth = self.vTrain['smooth']
    self.clip = self.vTrain['clip']
    self.delay = self.vTrain['policy_delay']
    self.mean_range = self.aPars['mean_range']
    self.noise = OUNoise(self.out_n, mu=0, theta=.15,
                         max_sigma=self.explore, min_sigma=self.base,
                         decay=self.step)

    self.valueLoss = []
    self.actorLoss = []
    self.avgLoss = 0
    self.avgActLoss = 0

    task.initAgent(self)

    # Busy-wait until the task signals completion
    while not self.stop:
        pass
    task.postTraining()
def __init__(self, state_size, action_size, num_agents, random_seed):
    """Initialize an Agent object.

    Params
    ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        num_agents (int): number of agents (for MADDPG)
        random_seed (int): random seed
    """
    self.state_size = state_size
    self.action_size = action_size
    self.seed = random_seed  # note: random.seed() returns None, so store the seed itself
    random.seed(random_seed)

    # For MADDPG
    self.num_agents = num_agents

    # Actor Network (w/ Target Network)
    self.actor_local = Actor(state_size, action_size, random_seed).to(device)
    self.actor_target = Actor(state_size, action_size, random_seed).to(device)
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

    # Critic Network (w/ Target Network)
    self.critic_local = Critic(state_size, action_size, random_seed).to(device)
    self.critic_target = Critic(state_size, action_size, random_seed).to(device)
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC,
                                       weight_decay=WEIGHT_DECAY)

    # Noise process (one noise row per agent)
    self.noise = OUNoise((num_agents, action_size), random_seed)
    self.eps = EPS_START
    self.eps_decay = 1 / (EPS_EP_END * LEARN_NUM)  # linear decay over learning steps
    self.timestep = 0

    # Replay memory
    self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)
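The `eps`/`eps_decay` fields above suggest exploration noise that is annealed over learning steps. A hypothetical `act` consistent with those fields (the actual method is not shown in this snippet):

# Hypothetical companion to the constructor above: scale OU noise by eps.
# After each learning step, `self.eps = max(self.eps - self.eps_decay, 0)`
# would complete the linear decay.
def act(self, states, add_noise=True):
    states = torch.from_numpy(states).float().to(device)
    self.actor_local.eval()
    with torch.no_grad():
        actions = self.actor_local(states).cpu().data.numpy()
    self.actor_local.train()
    if add_noise:
        actions += self.eps * self.noise.sample()  # one noise row per agent
    return np.clip(actions, -1, 1)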
def build_agent(self):
    # Build the actor-critic networks and their targets
    self.actor = Actor(self.state_dim, self.action_dim, self.l1_dim, self.l2_dim, self.alpha)
    self.target_actor = copy.deepcopy(self.actor)
    self.critic = Critic(self.state_dim, self.action_dim, self.l1_dim, self.l2_dim, self.beta)
    self.target_critic = copy.deepcopy(self.critic)

    # Build the replay buffer
    self.replaybuffer = ReplayBuffer(self.max_replay_size, self.state_dim, self.action_dim)

    # Build the OU noise process for action selection
    self.noise = OUNoise(self.action_dim)
class DDPGAgent():
    def __init__(self, index, config, filenames=None):
        random.seed(config.general.seed)
        np.random.seed(config.general.seed)

        self.noise = OUNoise(config)
        self.index = index
        self.action_size = config.environment.action_size
        self.tau = config.hyperparameters.tau

        self.actor_local = Network(config.actor, config.general.seed)
        self.actor_target = Network(config.actor, config.general.seed)
        self.actor_optimizer = Adam(self.actor_local.parameters(), lr=config.actor.lr)

        self.critic_local = Network(config.critic, config.general.seed)
        self.critic_target = Network(config.critic, config.general.seed)
        self.critic_optimizer = Adam(self.critic_local.parameters(), lr=config.critic.lr,
                                     weight_decay=config.hyperparameters.weight_decay)

    def act(self, state, noise, random):
        """Select an action, optionally blending in scaled OU noise and a uniform
        random action. Note: the `random` parameter shadows the `random` module
        inside this method; here it is a blend weight in [0, 1] or None."""
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(torch.from_numpy(state).float().to(device)).cpu().data.numpy()
        self.actor_local.train()
        if noise is not None:
            action += self.noise.sample() * noise
        if random is not None:
            # Interpolate between the policy action and a uniform random action in [-1, 1]
            action = (1 - random) * action + random * (np.random.rand(self.action_size) - 0.5) * 2.0
        return np.clip(action, -1, 1)

    def learn(self, index, experiences, gamma, all_next_actions, all_actions):
        """MADDPG update: the critic sees all agents' states and actions."""
        states, actions, rewards, next_states, dones = experiences

        # Critic update
        self.critic_optimizer.zero_grad()
        index = torch.tensor([index]).to(device)
        actions_next = torch.cat(all_next_actions, dim=1).to(device)
        with torch.no_grad():
            q_next = self.critic_target(critic_input(next_states, actions_next))
        q_exp = self.critic_local(critic_input(states, actions))
        q_t = rewards.index_select(1, index) + (gamma * q_next * (1 - dones.index_select(1, index)))
        F.mse_loss(q_exp, q_t.detach()).backward()
        self.critic_optimizer.step()

        # Actor update: only this agent's actions carry gradients, others are detached
        self.actor_optimizer.zero_grad()
        actions_pred = [actions if i == self.index else actions.detach()
                        for i, actions in enumerate(all_actions)]
        actions_pred = torch.cat(actions_pred, dim=1).to(device)
        actor_loss = -self.critic_local(critic_input(states, actions_pred)).mean()
        actor_loss.backward()
        self.actor_optimizer.step()

        # Soft-update the target networks
        self.actor_target.soft_update(self.actor_local, self.tau)
        self.critic_target.soft_update(self.critic_local, self.tau)
def __init__(self, state_size, action_size, replay_memory, random_seed=0,
             nb_agent=20, bs=128, gamma=0.99, tau=1e-3,
             lr_actor=1e-4, lr_critic=1e-4, wd_actor=0, wd_critic=0,
             clip_actor=None, clip_critic=None,
             update_interval=20, update_times=10):
    self.state_size = state_size
    self.action_size = action_size
    self.seed = random_seed  # note: random.seed() returns None, so store the seed itself
    random.seed(random_seed)
    self.nb_agent = nb_agent
    self.bs = bs
    self.update_interval = update_interval
    self.update_times = update_times
    self.timestep = 0
    self.gamma = gamma
    self.tau = tau
    self.lr_actor = lr_actor
    self.lr_critic = lr_critic
    self.wd_critic = wd_critic
    self.wd_actor = wd_actor
    self.clip_critic = clip_critic
    self.clip_actor = clip_actor
    self.actor_losses = []
    self.critic_losses = []

    # Actor Network (w/ Target Network)
    self.actor_local = Actor(state_size, action_size, random_seed).to(device)
    self.actor_target = Actor(state_size, action_size, random_seed).to(device)
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                      lr=self.lr_actor, weight_decay=self.wd_actor)

    # Critic Network (w/ Target Network)
    self.critic_local = Critic(state_size, action_size, random_seed).to(device)
    self.critic_target = Critic(state_size, action_size, random_seed).to(device)
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                       lr=self.lr_critic, weight_decay=self.wd_critic)

    # Noise process (one noise row per agent)
    self.noise = OUNoise((self.nb_agent, action_size), random_seed)

    # Replay memory (shared buffer, passed in)
    self.memory = replay_memory
def __init__(self, params, name, task):
    self.name = name
    self.task = task
    self.vPars = params['valPars']
    self.vTrain = params['valTrain']
    self.mPars = params['mPars']
    self.mTrain = params['mTrain']
    self.wPars = params['actPars']
    self.wTrain = params['actTrain']
    self.w_vPars = params['w_vPars']
    self.w_vTrain = params['w_vTrain']
    self.agents = params['agents']

    # One ROS publisher per agent; subscribe to the "finished" signal
    self.pubs = {}
    for key in self.agents.keys():
        bot = self.agents[key]
        self.pubs[key] = rospy.Publisher(bot['pub'], Vector3, queue_size=1)
    rospy.Subscriber("/finished", Int8, self.receiveDone, queue_size=1)

    self.valueLoss = []

    # Manager and worker networks with their critics (hierarchical setup)
    self.manager = Network(self.mPars, self.mTrain)
    self.m_critic = Network(self.vPars, self.vTrain)
    self.m_critic_target = Network(self.vPars, self.vTrain)
    self.worker = Network(self.wPars, self.wTrain)
    self.w_critic = Network(self.w_vPars, self.w_vTrain)
    self.w_critic_target = Network(self.w_vPars, self.w_vTrain)

    self.m_discount = self.vTrain['m_gamma']
    self.w_discount = self.vTrain['w_gamma']
    self.lr = self.vTrain['lr']
    self.trainMode = self.vPars['trainMode']
    self.step = self.vTrain['step']
    self.stop = False
    self.c = self.mTrain['c']
    self.tau = .005
    self.noise = Noise(self.manager.neurons[-1], theta=.4,
                       max_sigma=.2, min_sigma=0, decay=1)
    self.exp = Memory()
    self.temp = []
    self.totalSteps = 0
    self.soft = nn.Softmax(dim=1)
    self.reset()

    task.initAgent(self)

    # Busy-wait until the task signals completion
    while not self.stop:
        pass
    task.postTraining()
def __init__(self, model_name, state_size, action_size, random_seed=0):
    """Initialize an Agent object.

    Params
    ======
        model_name (str): name used to identify/save the model
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        random_seed (int): random seed
    """
    self.model_name = model_name
    self.state_size = state_size
    self.action_size = action_size
    self.seed = random_seed  # note: random.seed() returns None, so store the seed itself
    random.seed(random_seed)
    self.rewards = list()
    self.losses = deque(maxlen=100)

    # Actor Network (w/ Target Network)
    self.actor_local = Actor(state_size, action_size, random_seed).to(device)
    self.actor_target = Actor(state_size, action_size, random_seed).to(device)
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

    # Critic Network (w/ Target Network)
    self.critic_local = Critic(state_size, action_size, random_seed).to(device)
    self.critic_target = Critic(state_size, action_size, random_seed).to(device)
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC,
                                       weight_decay=WEIGHT_DECAY)

    # Noise process
    self.noise = OUNoise(action_size, random_seed)

    # Replay memory
    self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)
def __init__(self, env):
    """Class initialization."""
    self.env = env
    self.state_size = env.observation_space.shape[0]
    self.action_size = env.action_space.shape[0]
    # Fixed: low/high were swapped in the original
    self.action_low = env.action_space.low[0]
    self.action_high = env.action_space.high[0]

    # Actor (Policy) Model
    self.actor_local = Actor(self.state_size, self.action_size,
                             self.action_low, self.action_high)
    self.actor_target = Actor(self.state_size, self.action_size,
                              self.action_low, self.action_high)

    # Critic (Value) Model
    self.critic_local = Critic(self.state_size, self.action_size)
    self.critic_target = Critic(self.state_size, self.action_size)

    # Initialize target model parameters with local model parameters
    self.critic_target.model.set_weights(self.critic_local.model.get_weights())
    self.actor_target.model.set_weights(self.actor_local.model.get_weights())

    # Noise process
    self.exploration_mu = 0
    self.exploration_theta = 0.15
    self.exploration_sigma = 0.2
    self.noise = OUNoise(self.action_size, self.exploration_mu,
                         self.exploration_theta, self.exploration_sigma)

    # Replay memory
    self.buffer_size = 100000
    self.batch_size = 64
    self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

    # Algorithm parameters
    self.gamma = 0.99  # discount factor
    self.tau = 0.001   # for soft update of target parameters
def __init__(self, state_size, action_size, seed=0, lr_actor=LR_ACTOR, lr_critic=LR_CRITIC,
             gamma=GAMMA, checkpoint_path='./checkpoints/', pretrained=False):
    """Initialize an Agent object.

    Params
    ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        seed (int): random seed
    """
    self.state_size = state_size
    self.action_size = action_size
    self.seed = seed  # note: random.seed() returns None, so store the seed itself
    random.seed(seed)
    self.gamma = gamma
    self.checkpoint_path = checkpoint_path

    # Actor Network (w/ Target Network)
    self.actor_local = Actor(state_size, action_size, seed).to(device)
    self.actor_target = Actor(state_size, action_size, seed).to(device)
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=lr_actor)

    # Critic Network (w/ Target Network)
    self.critic_local = Critic(state_size, action_size, seed).to(device)
    self.critic_target = Critic(state_size, action_size, seed).to(device)
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=lr_critic)

    # If pretrained, load saved weights into both the local and target networks
    if pretrained:
        actor_dict = torch.load(os.path.join(self.checkpoint_path, 'checkpoint_actor.pth'))
        critic_dict = torch.load(os.path.join(self.checkpoint_path, 'checkpoint_critic.pth'))
        self.actor_local.load_state_dict(actor_dict)
        self.actor_target.load_state_dict(actor_dict)
        self.critic_local.load_state_dict(critic_dict)
        self.critic_target.load_state_dict(critic_dict)

    # Noise process
    self.noise = OUNoise(action_size, seed)

    # Replay memory
    self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed, device)
def __init__(self, num_agents=2, obs_size=24, act_size=2, gamma=0.99, tau=1e-3,
             lr_actor=1.0e-4, lr_critic=1.0e-3,
             weight_decay_actor=1e-5, weight_decay_critic=1e-4, clip_grad=1.0):
    super(MADDPGAgent, self).__init__()

    # Store parameters
    self.num_agents = num_agents
    self.gamma = gamma
    self.tau = tau
    self.clip_grad = clip_grad

    # Create all the networks
    self.actor = ActorNetwork(obs_size, act_size).to(device)
    self.critic = CriticNetwork(num_agents, obs_size, act_size).to(device)
    self.target_actor = ActorNetwork(obs_size, act_size).to(device)
    self.target_critic = CriticNetwork(num_agents, obs_size, act_size).to(device)

    # Copy initial network parameters to the target networks
    hard_update(self.target_actor, self.actor)
    hard_update(self.target_critic, self.critic)

    # Initialize training optimizers and OU noise
    self.noise = OUNoise(act_size, scale=1.0)
    self.actor_optimizer = Adam(self.actor.parameters(), lr=lr_actor,
                                weight_decay=weight_decay_actor)
    self.critic_optimizer = Adam(self.critic.parameters(), lr=lr_critic,
                                 weight_decay=weight_decay_critic)
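`clip_grad` above is stored but only applied at update time, which is not shown here. A hedged sketch of how it would typically be used in the critic update, via PyTorch's `torch.nn.utils.clip_grad_norm_`; `q_expected` and `q_targets` are hypothetical names for the usual TD quantities:

# Hypothetical fragment of a learn() step using the fields defined above
# (assumes `import torch.nn.functional as F`).
critic_loss = F.mse_loss(q_expected, q_targets.detach())
self.critic_optimizer.zero_grad()
critic_loss.backward()
torch.nn.utils.clip_grad_norm_(self.critic.parameters(), self.clip_grad)  # cap gradient norm
self.critic_optimizer.step()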