def __init__(self, config):
    self.config = config
    self.state_size = config.state_size
    self.action_size = config.action_size

    self.actor_local = Actor(self.state_size, self.action_size, 2).to(device)
    self.actor_target = Actor(self.state_size, self.action_size, 2).to(device)
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=config.LR_ACTOR)

    self.critic_local = Critic(self.state_size, self.action_size, 2).to(device)
    self.critic_target = Critic(self.state_size, self.action_size, 2).to(device)
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=config.LR_CRITIC)

    self.memory = ReplayBuffer(config.random_seed, config.BUFFER_SIZE)
    self.noise = OUNoise(self.action_size, config.random_seed)
    self.t_step = 0

    # A soft update with tau=1 copies the local weights into the targets (hard update).
    self.soft_update(self.critic_local, self.critic_target, 1)
    self.soft_update(self.actor_local, self.actor_target, 1)
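# soft_update is not defined in the snippet above; a minimal sketch of the usual
# Polyak-averaging helper it presumably refers to (the (local, target, tau)
# signature is inferred from the call sites, not taken from the original source):
def soft_update(self, local_model, target_model, tau):
    """Polyak-average target weights toward local weights:
    theta_target = tau * theta_local + (1 - tau) * theta_target.
    With tau=1 this reduces to a hard copy, as used in __init__ above."""
    for target_param, local_param in zip(target_model.parameters(),
                                         local_model.parameters()):
        target_param.data.copy_(tau * local_param.data +
                                (1.0 - tau) * target_param.data)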
def __init__(self, state_shape, action_shape, stats):
    # self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    self.device = torch.device("cpu")
    self.state_shape = state_shape
    self.action_shape = action_shape
    self.stats = stats

    # hyperparameters
    self.learn_rate = 3e-4
    self.num_epochs = 8
    self.entropy_weight = 0.001
    self.kl_clip = 0.1
    self.deterministic_test_mode = False
    self.hidden_state_size = 16

    # recurrent feature extractor shared by actor and critic
    self.lstm = LSTM(self.state_shape, self.hidden_state_size)
    self.actor = Actor(self.hidden_state_size, self.action_shape).to(self.device)
    self.critic = Critic(self.hidden_state_size).to(self.device)
    # note: the LSTM's parameters are not included in this optimizer
    self.optimizer = torch.optim.Adam(
        list(self.actor.parameters()) + list(self.critic.parameters()),
        lr=self.learn_rate)
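# A hypothetical forward pass showing how the pieces above presumably fit
# together; the method name, the LSTM interface, and the assumption that the
# actor returns a torch distribution are all illustrative, not from the source:
def act(self, state, hidden):
    """Encode the observation with the LSTM, then query actor and critic."""
    features, hidden = self.lstm(state, hidden)
    dist = self.actor(features)        # assumed: action distribution
    value = self.critic(features)      # state-value estimate
    action = dist.mean if self.deterministic_test_mode else dist.sample()
    return action, value, hidden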
def __init__(self, state_size, action_size, agent_num, random_seed):
    """Initialize an Agent object.

    :param state_size (int): dimension of each state
    :param action_size (int): dimension of each action
    :param agent_num (int): number of agents the critic conditions on
    :param random_seed (int): random seed
    """
    # Actor Networks
    self.actor_local = Actor(state_size, action_size, random_seed).to(device)
    self.actor_target = Actor(state_size, action_size, random_seed).to(device)
    self.actor_optimizer = Adam(self.actor_local.parameters(), lr=LR_ACTOR)

    # Critic Networks
    self.critic_local = Critic(state_size, action_size, agent_num, random_seed).to(device)
    self.critic_target = Critic(state_size, action_size, agent_num, random_seed).to(device)
    self.critic_optimizer = Adam(self.critic_local.parameters(), lr=LR_CRITIC,
                                 weight_decay=WEIGHT_DECAY)

    # Noise process
    self.noise = OUNoise(action_size, random_seed, scale=0.1)
def __init__(self, idx, params):
    """Initialize an Agent object.

    Params
    ======
        idx (int): index of this agent
        params (dict-like): dictionary of parameters for the agent
    """
    super().__init__(params)
    self.idx = idx
    self.params = params
    self.update_every = params['update_every']
    self.gamma = params['gamma']
    self.num_agents = params['num_agents']

    # Actor Network (w/ Target Network)
    self.actor_local = Actor(params['actor_params']).to(device)
    self.actor_target = Actor(params['actor_params']).to(device)
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                      lr=params['actor_params']['lr'])

    # Critic Network (w/ Target Network)
    self.critic_local = Critic(params['critic_params']).to(device)
    self.critic_target = Critic(params['critic_params']).to(device)
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                       lr=params['critic_params']['lr'],
                                       weight_decay=params['critic_params']['weight_decay'])

    # Noise process
    self.noise = OUNoise(self.params['noise_params'])

    # Replay memory (shared, passed in via params)
    self.memory = params['experience_replay']
def __init__(self, state_size, action_size, random_seed):
    """Initialize an Agent object.

    Params
    ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        random_seed (int): random seed
    """
    self.state_size = state_size
    self.action_size = action_size
    self.seed = random.seed(random_seed)

    # Actor Network (w/ Target Network)
    self.actor_local = Actor(state_size, action_size, random_seed).to(device)
    self.actor_target = Actor(state_size, action_size, random_seed).to(device)
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

    # Critic Network (w/ Target Network)
    self.critic_local = Critic(state_size, action_size, random_seed).to(device)
    self.critic_target = Critic(state_size, action_size, random_seed).to(device)
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC,
                                       weight_decay=WEIGHT_DECAY)

    # Noise process
    self.noise = OUNoise(action_size, random_seed)

    # Replay memory
    self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)
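# ReplayBuffer is external to these snippets; a minimal sketch of a
# uniform-sampling buffer matching the (action_size, buffer_size, batch_size,
# seed) constructor used above, as an assumption rather than the original code:
import random
from collections import deque, namedtuple

class ReplayBuffer:
    """Fixed-size buffer of experience tuples, sampled uniformly at random."""

    def __init__(self, action_size, buffer_size, batch_size, seed):
        self.action_size = action_size
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.experience = namedtuple(
            "Experience", ["state", "action", "reward", "next_state", "done"])
        random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        """Store one transition."""
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self):
        """Return a random minibatch of stored transitions."""
        return random.sample(self.memory, k=self.batch_size)

    def __len__(self):
        return len(self.memory)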
def __init__(self, state_size, action_size):
    self.discount = 0.99
    self.target_mix = 5e-3

    self.online_actor = Actor(state_size, action_size, fc1_units=256, fc2_units=128).to(DEVICE)
    self.target_actor = Actor(state_size, action_size, fc1_units=256, fc2_units=128).to(DEVICE)
    self.actor_opt = optim.Adam(self.online_actor.parameters(), lr=3e-3)

    self.online_critic = Critic(state_size, action_size, fc1_units=256, fc2_units=128).to(DEVICE)
    self.target_critic = Critic(state_size, action_size, fc1_units=256, fc2_units=128).to(DEVICE)
    self.critic_opt = optim.Adam(self.online_critic.parameters(), lr=3e-3)

    self.noise = OrnsteinUhlenbeck(action_size, mu=0., theta=0.15, sigma=0.05)
    self.replay = Replay(action_size, buffer_size=int(1e6), batch_size=128)
def __init__(self, config: ac_parm, device, random_seed):
    """Initialize an Agent object.

    Params
    ======
        config (ac_parm): actor/critic hyperparameter configuration
        device: torch device on which to run the networks
        random_seed (int): random seed
    """
    self.config = config
    self.seed = random.seed(random_seed)
    self.name = config.name
    self.device = device

    # Actor Network (w/ Target Network)
    self.actor_local = Actor(config, random_seed).to(device)
    self.actor_target = Actor(config, random_seed).to(device)
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=config.lr_actor)

    # Critic Network (w/ Target Network)
    self.critic_local = Critic(config, random_seed).to(device)
    self.critic_target = Critic(config, random_seed).to(device)
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                       lr=config.lr_critic,
                                       weight_decay=config.weight_decay)

    # Noise process
    self.noise = OUNoise(config.action_size, random_seed)

    # Replay memory
    self.memory = ReplayBuffer(config, device, random_seed)
    self.step_number = 0
def __init__(self, params):
    action_size = params['action_size']
    state_size = params['state_size']
    buf_params = params['buf_params']
    nn_params = params['nn_params']

    # wire the layer sizes to the environment dimensions
    nn_params['nn_actor']['l1'][0] = state_size
    nn_params['nn_actor']['l3'][1] = action_size
    nn_params['nn_critic']['l1'][0] = state_size + action_size

    self.__actor_local = Actor(nn_params['nn_actor']).to(device)
    self.__actor_target = Actor(nn_params['nn_actor']).to(device)
    self.__critic_local = Critic(nn_params['nn_critic']).to(device)
    self.__critic_target = Critic(nn_params['nn_critic']).to(device)

    self.__action_size = action_size
    self.__state_size = state_size
    self.__memory = ReplayBuffer(buf_params)
    self.__t = 0

    self.gamma = params['gamma']
    self.learning_rate_actor = params['learning_rate_actor']
    self.learning_rate_critic = params['learning_rate_critic']
    self.tau = params['tau']

    self.__optimiser_actor = optim.Adam(self.__actor_local.parameters(),
                                        self.learning_rate_actor)
    self.__optimiser_critic = optim.Adam(self.__critic_local.parameters(),
                                         self.learning_rate_critic)
    self.__uo_process = UOProcess()

    # other parameters
    self.agent_loss = 0.0
def __init__(self, config):
    super(PPO, self).__init__()
    self.config = config
    torch.manual_seed(self.config['seed'])
    np.random.seed(self.config['seed'])

    if self.config['experiment']['orthogonal_initialization_and_layer_scaling']:
        weight_init_scheme = 'orthogonal'
    else:
        weight_init_scheme = 'normal'

    self.actor = Actor(
        device=self.config['device'],
        input_dim=self.config['env']['nS'],
        output_dim=self.config['env']['nA'],
        hidden_dims=self.config['model']['actor']['hidden_dims'],
        hidden_activation_fn=self.config['model']['actor']['hidden_acivation_fn'],
        weight_init_scheme=weight_init_scheme)
    self.actor_optimizer = optim.Adam(
        self.actor.parameters(),
        lr=self.config['model']['actor']['lr'],
        betas=self.config['model']['actor']['betas'])

    self.critic = Critic(
        device=self.config['device'],
        input_dim=self.config['env']['nS'],
        hidden_dims=self.config['model']['critic']['hidden_dims'],
        hidden_activation_fn=self.config['model']['critic']['hidden_acivation_fn'],
        weight_init_scheme=weight_init_scheme)
    self.critic_optimizer = optim.Adam(
        self.critic.parameters(),
        lr=self.config['model']['critic']['lr'],
        betas=self.config['model']['critic']['betas'])

    if self.config['train']['gail']:
        self.discriminator = Discriminator(
            device=self.config['device'],
            state_dim=self.config['env']['nS'],
            action_dim=self.config['env']['nA'],
            hidden_dims=self.config['model']['discriminator']['hidden_dims'],
            hidden_activation_fn=self.config['model']['discriminator']['hidden_acivation_fn'],
            weight_init_scheme=weight_init_scheme)
        self.discriminator_optimizer = optim.Adam(
            self.discriminator.parameters(),
            lr=self.config['model']['discriminator']['lr'],
            betas=self.config['model']['discriminator']['betas'])

    # [EXPERIMENT] - reward scaler: r / rs.std()
    if self.config['experiment']['reward_standardization']:
        self.reward_scaler = RewardScaler(gamma=self.config['train']['gamma'])

    # [EXPERIMENT] - observation scaler: (ob - ob.mean()) / (ob.std())
    if self.config['experiment']['observation_normalization']:
        self.observation_scaler = ObservationScaler()
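# RewardScaler above is external; a minimal sketch of the reward-standardization
# trick its comment names (divide each reward by the running std of the
# discounted return). The online-statistics bookkeeping here is an assumption:
import numpy as np

class RewardScaler:
    """Scale rewards by the running std of the discounted return: r / std(R)."""

    def __init__(self, gamma):
        self.gamma = gamma
        self.ret = 0.0    # running discounted return
        self.count = 0
        self.mean = 0.0
        self.m2 = 0.0     # sum of squared deviations (Welford's algorithm)

    def __call__(self, reward):
        self.ret = self.gamma * self.ret + reward
        # Welford's online update of the return statistics
        self.count += 1
        delta = self.ret - self.mean
        self.mean += delta / self.count
        self.m2 += delta * (self.ret - self.mean)
        std = np.sqrt(self.m2 / self.count) if self.count > 1 else 1.0
        return reward / (std + 1e-8)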
def __init__(self, env, gamma, tau, buffer_maxlen, batch_size,
             critic_learning_rate, actor_learning_rate, update_per_step, seed):
    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # hyperparameters
    self.num_replay_updates_per_step = update_per_step
    self.batch_size = batch_size
    self.gamma = gamma
    self.tau = tau

    # initialize actor and critic networks
    self.critic = Critic(env.observation_space.shape[0],
                         env.action_space.shape[0], seed).to(self.device)
    self.critic_target = Critic(env.observation_space.shape[0],
                                env.action_space.shape[0], seed).to(self.device)
    self.actor = Actor(env.observation_space.shape[0],
                       env.action_space.shape[0], seed).to(self.device)
    self.actor_target = Actor(env.observation_space.shape[0],
                              env.action_space.shape[0], seed).to(self.device)

    # optimizers
    self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=critic_learning_rate)
    self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=actor_learning_rate)

    self.buffer = ReplayBuffer(buffer_maxlen, batch_size, seed)
    self.noise = OUNoise(env.action_space.shape[0])
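# OUNoise appears throughout these snippets with varying constructor signatures;
# a minimal sketch of the standard Ornstein-Uhlenbeck process matching the
# single-argument call above (the theta/sigma defaults are assumptions):
import numpy as np

class OUNoise:
    """Ornstein-Uhlenbeck process: temporally correlated exploration noise."""

    def __init__(self, size, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        """Reset the internal state to the mean."""
        self.state = self.mu.copy()

    def sample(self):
        """dx = theta * (mu - x) + sigma * N(0, 1); returns the updated state."""
        dx = self.theta * (self.mu - self.state) + \
             self.sigma * np.random.standard_normal(len(self.state))
        self.state = self.state + dx
        return self.state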
def __init__(self, env, env_params, args, models=None,
             record_episodes=[0, .1, .25, .5, .75, 1.]):
    self.env = env
    self.env_params = env_params
    self.args = args

    # networks
    if models is None:
        self.actor = Actor(self.env_params).double()
        self.critic = Critic(self.env_params).double()
    else:
        self.actor, self.critic = self.LoadModels()

    # target networks, used to predict the actions and values for the TD targets
    self.actor_target = Actor(self.env_params).double()
    self.critic_target = Critic(self.env_params).double()
    self.actor_target.load_state_dict(self.actor.state_dict())
    self.critic_target.load_state_dict(self.critic.state_dict())

    if self.args.cuda:
        self.actor.cuda()
        self.critic.cuda()
        self.actor_target.cuda()
        self.critic_target.cuda()

    self.actor_optim = torch.optim.Adam(self.actor.parameters(), lr=0.001)
    self.critic_optim = torch.optim.Adam(self.critic.parameters(), lr=0.001)

    self.normalize = Normalizer(env_params, self.args.gamma)
    self.buffer = ReplayBuffer(1_000_000, self.env_params)
    self.tensorboard = ModifiedTensorBoard(log_dir="logs")
    self.record_episodes = [int(eps * self.args.n_epochs) for eps in record_episodes]
def __init__(self, state_dim, action_dim, max_action, memory, args):
    # misc
    self.criterion = nn.MSELoss()
    self.state_dim = state_dim
    self.action_dim = action_dim
    self.max_action = max_action
    self.memory = memory
    self.n = args.n_actor

    # actors
    self.actors = [Actor(state_dim, action_dim, max_action,
                         layer_norm=args.layer_norm) for i in range(self.n)]
    self.actors_target = [Actor(state_dim, action_dim, max_action,
                                layer_norm=args.layer_norm) for i in range(self.n)]
    self.actors_optimizer = [torch.optim.Adam(self.actors[i].parameters(),
                                              lr=args.actor_lr) for i in range(self.n)]
    for i in range(self.n):
        self.actors_target[i].load_state_dict(self.actors[i].state_dict())

    # critic
    self.critic = Critic(state_dim, action_dim, layer_norm=args.layer_norm)
    self.critic_target = Critic(state_dim, action_dim, layer_norm=args.layer_norm)
    self.critic_target.load_state_dict(self.critic.state_dict())
    self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=args.critic_lr)

    # cuda
    if torch.cuda.is_available():
        for i in range(self.n):
            self.actors[i] = self.actors[i].cuda()
            self.actors_target[i] = self.actors_target[i].cuda()
        self.critic = self.critic.cuda()
        self.critic_target = self.critic_target.cuda()

    # shared memory, so worker processes can read the same parameters
    for i in range(self.n):
        self.actors[i].share_memory()
        self.actors_target[i].share_memory()
    self.critic.share_memory()
    self.critic_target.share_memory()

    # hyper-parameters
    self.tau = args.tau
    self.discount = args.discount
    self.batch_size = args.batch_size
    self.reward_scale = args.reward_scale
def __init__(self, rows, columns, num_actions, l_rate=1e-4, gamma=0.99, lam=0.95,
             policy_kl_range=0.0008, policy_params=20, value_clip=1.0,
             loss_coefficient=1.0, entropy_coefficient=0.05):
    self.rows = rows
    self.columns = columns
    self.num_actions = num_actions

    self.actor = Actor(self.num_actions)
    self.critic = Critic()
    self.actor_old = Actor(self.num_actions)
    self.critic_old = Critic()
    self.optimizer = tf.keras.optimizers.Adam(l_rate)

    self.gamma = gamma
    self.lam = lam
    self.policy_kl_range = policy_kl_range
    self.policy_params = policy_params
    self.value_clip = value_clip
    self.loss_coefficient = loss_coefficient
    self.entropy_coefficient = entropy_coefficient
def __init__(self, n_s, n_a, a_bound, gamma=0.99, memory_size=10000, tau=0.01,
             lr_a=0.001, lr_c=0.002, batch_size=64, var=3, var_decay=0.9995):
    self.n_s = n_s
    self.n_a = n_a
    self.a_bound = a_bound
    self.gamma = gamma
    self.memory_size = memory_size
    self.tau = tau
    self.batch_size = batch_size
    self.var = var                # exploration noise scale
    self.var_decay = var_decay    # multiplicative decay applied to the noise scale

    # memory
    self.replay_buffer = ReplayBuffer(n_s, n_a, memory_size)

    # actor
    self.eval_actor = Actor(n_s, n_a, a_bound)
    self.target_actor = deepcopy(self.eval_actor)
    self.actor_optim = torch.optim.Adam(self.eval_actor.parameters(), lr=lr_a)

    # critic
    self.eval_critic = Critic(n_s, n_a)
    self.target_critic = deepcopy(self.eval_critic)
    self.critic_optim = torch.optim.Adam(self.eval_critic.parameters(), lr=lr_c)
def __init__(self):
    self.discount_factor = 0.99
    self.num_steps = 1000000
    self.reset_interval = 100
    self.update_interval = 1
    self.batch_size = 100
    self.replay_buffer_length = 1000
    self.replay_buffer = []

    self.env = PendulumEnv()
    self.noise = SmoothNoise((1,))
    self.draw_env = False

    self.actor = Actor()
    self.critic = Critic()
    self.actor_target = Actor()
    self.critic_target = Critic()
    self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=0.0001)
    self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=0.001)
    self.actor_criterion = nn.MSELoss()
    self.critic_criterion = nn.MSELoss()

    # tau=1.0 performs a hard copy into the freshly created target networks
    self.update_target(self.actor_target, self.actor, 1.0)
    self.update_target(self.critic_target, self.critic, 1.0)
    self.env_reset()
def __init__(self, state_size, action_size, num_agents, seed):
    self.state_size = state_size
    self.action_size = action_size
    self.seed = seed
    self.num_agents = num_agents

    self.actor = Actor(self.state_size, self.action_size, self.seed).to(device)
    self.actor_target = Actor(self.state_size, self.action_size, self.seed).to(device)
    self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=LR_ACTOR)

    self.critic = Critic(self.state_size, self.action_size, self.seed).to(device)
    self.critic_target = Critic(self.state_size, self.action_size, self.seed).to(device)
    self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=LR_CRITIC)

    self.copy_init_weights(self.actor, self.actor_target)
    self.copy_init_weights(self.critic, self.critic_target)

    self.noise = OUNoise((num_agents, action_size), seed)
    self.memory = ReplayBuffer(BUFFER_SIZE, BATCH_SIZE, seed)
def __init__(self, seed, nA, nS, L2, index):
    self.seed = seed
    self.nA = nA
    self.nS = nS
    self.nO = 52  # 24 * 2 state space + 2 * 2 action space
    self.L2 = L2
    self.index = index
    self.noise = OUnoise(nA, seed)
    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    self.local_critic = Critic(seed, self.nO, nA).to(self.device)
    self.target_critic = Critic(seed, self.nO, nA).to(self.device)
    self.local_actor = Actor(seed, nS, nA).to(self.device)
    self.target_actor = Actor(seed, nS, nA).to(self.device)

    # Copy the weights from local to target
    hard_update(self.local_critic, self.target_critic)
    hard_update(self.local_actor, self.target_actor)

    self.critic_optimizer = optim.Adam(self.local_critic.parameters(),
                                       lr=1e-3, weight_decay=self.L2)
    self.actor_optimizer = optim.Adam(self.local_actor.parameters(), lr=1e-4)
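# hard_update is assumed to be the usual verbatim weight copy; the
# (source, target) argument order is inferred from the call sites above:
def hard_update(source, target):
    """Copy all parameters from source into target (a tau=1 soft update)."""
    for target_param, source_param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(source_param.data)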
def __init__(self, agent_id, state_size, action_size, rand_seed, meta_agent):
    """Creates a new DDPG Agent."""
    self.agent_id = agent_id
    self.action_size = action_size

    # Defines the Actor Networks
    self.actor_local = Actor(state_size, action_size, rand_seed).to(device)
    self.actor_target = Actor(state_size, action_size, rand_seed).to(device)
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

    # Defines the Critic Networks
    self.critic_local = Critic(state_size, action_size,
                               meta_agent.agents_qty, rand_seed).to(device)
    self.critic_target = Critic(state_size, action_size,
                                meta_agent.agents_qty, rand_seed).to(device)
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                       lr=LR_CRITIC)  # , weight_decay=WEIGHT_DECAY)

    self.noise = OUNoise(action_size, rand_seed)

    # Refers to the multi-agent (meta) agent's shared memory
    self.memory = meta_agent.memory
    self.t_step = 0
def __init__(self, gamma, tau, num_inputs, env, device, results_path=None):
    self.gamma = gamma
    self.tau = tau
    self.min_action, self.max_action = env.action_range()
    self.device = device
    self.num_actions = env.action_space()
    self.noise_stddev = 0.3

    self.results_path = results_path
    self.checkpoint_path = os.path.join(self.results_path, 'checkpoint/')
    os.makedirs(self.checkpoint_path, exist_ok=True)

    # Define the actor
    self.actor = Actor(num_inputs, self.num_actions).to(device)
    self.actor_target = Actor(num_inputs, self.num_actions).to(device)

    # Define the critic
    self.critic = Critic(num_inputs, self.num_actions).to(device)
    self.critic_target = Critic(num_inputs, self.num_actions).to(device)

    # Define the optimizers for both networks
    self.actor_optimizer = Adam(self.actor.parameters(), lr=1e-4)
    self.critic_optimizer = Adam(self.critic.parameters(), lr=1e-4, weight_decay=0.002)

    # synchronise the target networks with the current networks
    self.hard_swap()

    self.ou_noise = OrnsteinUhlenbeckActionNoise(
        mu=np.zeros(self.num_actions),
        sigma=float(self.noise_stddev) * np.ones(self.num_actions))
    self.ou_noise.reset()
def __init__(self, env, gamma=0.99, tau=1e-3, pol_lr=1e-4, q_lr=5e-3,
             batch_size=64, buffer_size=10000, target_noise=0.2,
             action_noise=0.1, clip_range=0.5, update_delay=2):
    # environment stuff
    self.env = env
    self.num_act = env.action_space.shape[0]
    self.num_obs = env.observation_space.shape[0]
    self.eval_env = copy.deepcopy(env)

    # hyper parameters
    self.gamma = gamma
    self.tau = tau
    self.pol_lr = pol_lr
    self.q_lr = q_lr
    self.batch_size = batch_size
    self.buffer_size = buffer_size
    self.target_noise = target_noise
    self.action_noise = action_noise
    self.clip_range = clip_range
    self.update_delay = update_delay  # use the argument instead of a hard-coded 2

    # networks
    self.pol = Actor(self.num_obs, self.num_act, [400, 300]).double()
    self.q1 = Critic(self.num_obs, self.num_act, [400, 300]).double()
    self.q2 = Critic(self.num_obs, self.num_act, [400, 300]).double()
    self.pol.init_weights()
    self.q1.init_weights()
    self.q2.init_weights()
    self.target_pol = copy.deepcopy(self.pol).double()
    self.target_q1 = copy.deepcopy(self.q1).double()
    self.target_q2 = copy.deepcopy(self.q2).double()

    # optimizers, buffer
    self.pol_opt = torch.optim.Adam(self.pol.parameters(), lr=self.pol_lr)
    self.q1_opt = torch.optim.Adam(self.q1.parameters(), lr=self.q_lr)
    self.q2_opt = torch.optim.Adam(self.q2.parameters(), lr=self.q_lr)
    self.buffer = ReplayBuffer(self.buffer_size, 1000)
    self.mse_loss = torch.nn.MSELoss()

    self.cum_q1_loss = 0
    self.cum_q2_loss = 0
    self.cum_obj = 0
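# The target_noise / clip_range / update_delay trio marks the snippet above as
# TD3. A minimal sketch of the target-policy-smoothing step those parameters
# feed; the function name and action bounds are illustrative, not from the source:
import torch

def smoothed_target_action(target_pol, next_states, target_noise, clip_range,
                           act_low=-1.0, act_high=1.0):
    """TD3 target policy smoothing: add clipped Gaussian noise to the target
    actor's action before evaluating the target critics."""
    with torch.no_grad():
        actions = target_pol(next_states)
        noise = (torch.randn_like(actions) * target_noise).clamp(-clip_range,
                                                                 clip_range)
        return (actions + noise).clamp(act_low, act_high)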
def __init__(self, state_dim, action_dim, max_action, device, memory_capacity=10000,
             discount=0.99, update_freq=2, tau=0.005, policy_noise_std=0.2,
             policy_noise_clip=0.5, actor_lr=1e-3, critic_lr=1e-3, train_mode=True):
    self.train_mode = train_mode    # whether the agent is in training or testing mode
    self.state_dim = state_dim      # dimension of the state space
    self.action_dim = action_dim    # dimension of the action space
    self.device = device            # cuda or cpu device on which the networks run
    self.discount = discount        # gamma in the Q-value computation
    self.update_freq = update_freq  # how frequently the actor and targets are updated
    self.tau = tau                  # Polyak-averaging factor for soft target updates
    self.max_action = max_action    # max of the (assumed symmetric) action range
    self.policy_noise_clip = policy_noise_clip  # clip range for target-policy-smoothing noise
    self.policy_noise_std = policy_noise_std    # sigma of that Gaussian noise

    # create an instance of the replay buffer
    self.memory = ReplayMemory(memory_capacity)

    # networks for the actor and the two critics (the Critic class encapsulates
    # both copies of the network used by TD3)
    self.actor = Actor(state_dim, action_dim, max_action, actor_lr)
    self.critic = Critic(state_dim, action_dim, critic_lr)

    # target networks, initialised to the same weights as the current networks
    self.target_actor = Actor(state_dim, action_dim, max_action, actor_lr)
    self.target_critic = Critic(state_dim, action_dim, critic_lr)
    self.target_actor.load_state_dict(self.actor.state_dict())
    self.target_critic.load_state_dict(self.critic.state_dict())

    # the target networks are never trained directly
    self.target_actor.eval()
    self.target_critic.eval()

    # for test mode
    if not self.train_mode:
        self.actor.eval()
        self.critic.eval()

    self.actor.to(self.device)
    self.critic.to(self.device)
    self.target_actor.to(self.device)
    self.target_critic.to(self.device)
def __init__(self, state_dim, action_dim, max_action, args):
    self.actor = Actor(state_dim, action_dim, max_action).to(args.device)
    self.actor_target = Actor(state_dim, action_dim, max_action).to(args.device)
    self.actor_target.load_state_dict(self.actor.state_dict())
    self.actor_optimizer = torch.optim.Adam(self.actor.parameters())

    self.critic = Critic(state_dim, action_dim).to(args.device)
    self.critic_optimizer = torch.optim.Adam(self.critic.parameters())

    # create the different target critics, each initialised from the critic
    self.list_target_critic = []
    for c in range(args.num_q_target):
        critic_target = Critic(state_dim, action_dim).to(args.device)
        critic_target.load_state_dict(self.critic.state_dict())
        self.list_target_critic.append(critic_target)
    self.target_critic = Critic(state_dim, action_dim).to(args.device)
    self.target_critic.load_state_dict(self.critic.state_dict())

    self.max_action = max_action
    self.num_q_target = args.num_q_target
    self.batch_size = args.batch_size
    self.discount = args.discount
    self.tau = args.tau
    self.policy_noise = args.policy_noise
    self.noise_clip = args.noise_clip
    self.policy_freq = args.policy_freq
    self.device = args.device
    self.update_counter = 0
    self.step = 0
    self.currentQNet = 0
def __init__(self, state_size, action_size, cfg, num_agents=1, agent_id=0):
    """Initialize an Agent object.

    Params
    ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        cfg (config object): main configuration with other passed settings
        num_agents (int): optional (default: 1). If >1, multiplies the state and
            action space sizes for the critic. Used for MADDPG.
        agent_id (int): optional (default: 0). Sets the agent id for MADDPG.
    """
    print("Initializing single DDPG agent!")
    self.state_size = state_size
    self.action_size = action_size
    self.seed = random.seed(cfg.random_seed)
    self.n_agents = num_agents
    self.agent_id = agent_id
    self.cfg = cfg

    # Actor Network (w/ Target Network)
    self.actor_local = Actor(state_size, action_size, cfg.random_seed,
                             cfg.dense_layers_actor).to(device)
    self.actor_target = Actor(state_size, action_size, cfg.random_seed,
                              cfg.dense_layers_actor).to(device)
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=cfg.lr_actor)

    # Critic Network (w/ Target Network)
    self.critic_local = Critic(state_size * num_agents, action_size * num_agents,
                               cfg.random_seed, cfg.dense_layers_critic).to(device)
    self.critic_target = Critic(state_size * num_agents, action_size * num_agents,
                                cfg.random_seed, cfg.dense_layers_critic).to(device)
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                       lr=cfg.lr_critic,
                                       weight_decay=cfg.weight_decay)

    self.hard_copy_weights(self.critic_local, self.critic_target)
    self.hard_copy_weights(self.actor_local, self.actor_target)
    self.t_step = 0

    # Noise process
    self.noise = OUNoise(action_size, cfg.random_seed,
                         theta=cfg.theta_ou, sigma=cfg.sigma_ou)

    # Replay memory
    self.memory = ReplayBuffer(action_size, cfg.buffer_size, cfg.batch_size,
                               cfg.random_seed, cfg)
def __init__(self, n_states, n_actions, n_goals, action_bounds, capacity, env,
             k_future, batch_size, action_size=1, tau=0.05, actor_lr=1e-3,
             critic_lr=1e-3, gamma=0.98):
    self.device = device("cpu")
    self.n_states = n_states
    self.n_actions = n_actions
    self.n_goals = n_goals
    self.k_future = k_future
    self.action_bounds = action_bounds
    self.action_size = action_size
    self.env = env

    self.actor = Actor(self.n_states, n_actions=self.n_actions,
                       n_goals=self.n_goals).to(self.device)
    self.critic = Critic(self.n_states, action_size=self.action_size,
                         n_goals=self.n_goals).to(self.device)
    self.sync_networks(self.actor)
    self.sync_networks(self.critic)
    self.actor_target = Actor(self.n_states, n_actions=self.n_actions,
                              n_goals=self.n_goals).to(self.device)
    self.critic_target = Critic(self.n_states, action_size=self.action_size,
                                n_goals=self.n_goals).to(self.device)
    self.init_target_networks()

    self.tau = tau
    self.gamma = gamma

    self.capacity = capacity
    self.memory = Memory(self.capacity, self.k_future, self.env)

    self.batch_size = batch_size
    self.actor_lr = actor_lr
    self.critic_lr = critic_lr
    self.actor_optim = Adam(self.actor.parameters(), self.actor_lr)
    self.critic_optim = Adam(self.critic.parameters(), self.critic_lr)

    self.state_normalizer = Normalizer(self.n_states[0], default_clip_range=5)
    self.goal_normalizer = Normalizer(self.n_goals, default_clip_range=5)
def __init__(self, act_dim, env_dim, act_range, buffer_size=20000,
             gamma=0.99, lr=0.00005, tau=0.001):
    """Initialization"""
    # Environment and A2C parameters
    self.act_dim = act_dim
    self.act_range = act_range
    self.env_dim = env_dim
    self.gamma = gamma
    self.lr = lr

    # Create actor and critic networks
    self.actor = Actor(self.env_dim, act_dim, act_range, 0.1 * lr, tau)
    self.critic = Critic(self.env_dim, act_dim, lr, tau)
    self.buffer = MemoryBuffer(buffer_size)
def __init__(self, state_size, action_size, seed, policy_lr=LR, critic_lr=LR):
    """Initialize an Agent object.

    Params
    ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        seed (int): random seed
    """
    self.state_size = state_size
    self.action_size = action_size
    self.epsilon = EPSILON

    # Noise process
    self.noise = OUNoise(action_size, seed)
    random.seed(seed)

    # Networks
    self.policy_local = Actor(self.state_size, self.action_size, seed)
    self.policy_target = Actor(self.state_size, self.action_size, seed)
    self.critic_local = Critic(self.state_size + self.action_size,
                               self.action_size, seed)
    self.critic_target = Critic(self.state_size + self.action_size,
                                self.action_size, seed)

    # initialize target network weights to match the local networks
    for target_param, param in zip(self.policy_target.parameters(),
                                   self.policy_local.parameters()):
        target_param.data.copy_(param.data)
    for target_param, param in zip(self.critic_target.parameters(),
                                   self.critic_local.parameters()):
        target_param.data.copy_(param.data)

    # optimizers
    self.policy_optimizer = optim.Adam(self.policy_local.parameters(), lr=policy_lr)
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=critic_lr)

    # Replay memory
    self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)

    # Initialize time step (for updating every UPDATE_EVERY steps)
    self.t_step = 0
def __init__(self, state_size, hidden_size, action_size,
             actor_learning_rate=1e-4, critic_learning_rate=1e-3,
             gamma=0.99, tau=1e-2, use_cuda=False,
             actor_path=None, critic_path=None):
    # Params
    self.state_size, self.hidden_size, self.action_size = state_size, hidden_size, action_size
    self.gamma, self.tau = gamma, tau
    self.use_cuda = use_cuda

    # Networks
    self.actor = Actor(state_size, hidden_size, action_size)
    self.actor_target = Actor(state_size, hidden_size, action_size)
    self.critic = Critic(state_size + action_size, hidden_size, action_size)
    self.critic_target = Critic(state_size + action_size, hidden_size, action_size)

    # Load model state_dicts from saved files, if given
    if actor_path and path.exists(actor_path):
        self.actor.load_state_dict(torch.load(actor_path))
    if critic_path and path.exists(critic_path):
        self.critic.load_state_dict(torch.load(critic_path))

    # Hard copy params from original networks to target networks
    copy_params(self.actor, self.actor_target)
    copy_params(self.critic, self.critic_target)

    if self.use_cuda:
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic.cuda()
        self.critic_target.cuda()

    # Create replay buffer for storing experience
    self.replay_buffer = ReplayBuffer(cache_size=int(1e6))

    # Training
    self.critic_criterion = nn.MSELoss()
    self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=actor_learning_rate)
    self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=critic_learning_rate)
def __init__(self, M, eh1, eh2, dh2, ci, lr_ac=0.001, lr_cr=0.001):
    ## Network initializations
    # Actor: a VAE with M inputs, encoder hidden layers of eh1 and eh2 units,
    # and a decoder hidden layer of dh2 units
    self.actor = VAE(M, eh1, eh2, dh2)
    # Critic: ci is the length of its input feature vector
    self.critic = Critic(ci)

    # Optimizers
    self.optim_actor = torch.optim.Adam(self.actor.parameters(), lr=lr_ac)
    self.optim_critic = torch.optim.Adam(self.critic.parameters(), lr=lr_cr)
    self.mse = torch.nn.MSELoss()
def __init__(self, state_size, action_size, random_seed, **kwargs):
    """Initialize an Agent object.

    Params
    ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        random_seed (int): random seed
        **kwargs: extra attributes (device, learning rates, buffer sizes, ...)
            set directly on the agent before the networks are built
    """
    for key, value in kwargs.items():
        setattr(self, key, value)

    self.state_size = state_size
    self.action_size = action_size
    self.seed = random.seed(random_seed)

    # Actor Network (w/ Target Network)
    self.actor_local = Actor(state_size, action_size, random_seed,
                             self.nhidden_actor).to(self.device)
    self.actor_target = getattr(
        self, "actor_target",
        Actor(state_size, action_size, random_seed,
              self.nhidden_actor).to(self.device))
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=self.lr_actor)

    # Critic Network (w/ Target Network)
    self.critic_local = Critic(state_size, action_size, random_seed,
                               self.nhidden_critic).to(self.device)
    self.critic_target = Critic(state_size, action_size, random_seed,
                                self.nhidden_critic).to(self.device)
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                       lr=self.lr_critic,
                                       weight_decay=self.weight_decay)

    # Copy weights from the target to the source networks
    self.copy_weights(self.actor_target, self.actor_local)
    self.copy_weights(self.critic_target, self.critic_local)

    # Noise process
    self.noise = OUNoise(action_size, random_seed)

    # Replay memory (reused if one was already supplied via kwargs)
    self.memory = getattr(
        self, "memory",
        ReplayBuffer(action_size, self.buffer_size, self.batch_size,
                     random_seed, self.device))
    self.nsteps = 0
def __init__(self, config: DefaultMunch):
    self.config = config
    self.memory = self.config.memory
    self.n_agents = self.config.n_agents
    self.action_size = self.config.action_size
    self.state_size = self.config.state_size

    # shared critic, sized by the number of agents it conditions on
    self.critic_local = Critic(self.state_size, self.config.action_size,
                               self.config.n_agents).to(self.config.device)
    self.critic_target = Critic(self.state_size, self.config.action_size,
                                self.config.n_agents).to(self.config.device)
    self.critic_target.load_state_dict(self.critic_local.state_dict())
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                       lr=self.config.lr_critic)

    # one sub-agent per player; each receives a reference to this wrapper
    self.agents = [Agent(self.config, self) for i in range(self.n_agents)]
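# A hypothetical way to build the DefaultMunch config the wrapper above reads.
# DefaultMunch comes from the `munch` package; the field names are taken from
# __init__ above, while the values and the wrapper class name are illustrative:
from munch import DefaultMunch
import torch

config = DefaultMunch()
config.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
config.n_agents = 2        # illustrative: two cooperating agents
config.state_size = 24     # illustrative per-agent observation size
config.action_size = 2     # illustrative per-agent action size
config.lr_critic = 1e-3
config.memory = None       # supply the project's replay buffer instance here
# (the per-agent Agent class will read further fields from the same config)
# wrapper = MultiAgentWrapper(config)  # hypothetical name for the class above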