Example #1
    def __init__(self, config):
        self.config = config
        self.state_size = config.state_size
        self.action_size = config.action_size

        self.actor_local = Actor(self.state_size, self.action_size,
                                 2).to(device)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  2).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=config.LR_ACTOR)

        self.critic_local = Critic(self.state_size, self.action_size,
                                   2).to(device)
        self.critic_target = Critic(self.state_size, self.action_size,
                                    2).to(device)
        self.critic_optimizer = optim.Adam(
            self.critic_local.parameters(),
            lr=config.LR_CRITIC,
        )

        self.memory = ReplayBuffer(config.random_seed, config.BUFFER_SIZE)
        self.noise = OUNoise(self.action_size, config.random_seed)

        self.t_step = 0

        self.soft_update(self.critic_local, self.critic_target, 1)
        self.soft_update(self.actor_local, self.actor_target, 1)
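
Example #1 ends by calling `self.soft_update(..., 1)` to copy the freshly initialised local weights into the target networks, but the helper itself is not shown. A minimal sketch of the Polyak-averaging update such a method conventionally implements (the standalone signature and the meaning of `tau` are assumptions, not taken from this snippet):

import torch

def soft_update(local_model: torch.nn.Module, target_model: torch.nn.Module, tau: float) -> None:
    # theta_target <- tau * theta_local + (1 - tau) * theta_target
    with torch.no_grad():
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)

With `tau=1`, as in the constructor above, this degenerates into a hard copy, so the local and target networks start out identical.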
Example #2
    def __init__(self, state_shape, action_shape, stats):
        # self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.device = torch.device("cpu")

        self.state_shape = state_shape
        self.action_shape = action_shape

        self.stats = stats

        self.learn_rate = 3e-4
        self.num_epochs = 8

        self.entropy_weight = 0.001
        self.kl_clip = 0.1

        self.deterministic_test_mode = False

        self.hidden_state_size = 16
        self.lstm = LSTM(self.state_shape, self.hidden_state_size)
        self.actor = Actor(self.hidden_state_size,
                           self.action_shape).to(self.device)
        self.critic = Critic(self.hidden_state_size).to(self.device)

        self.optimizer = torch.optim.Adam(list(self.actor.parameters()) +
                                          list(self.critic.parameters()),
                                          lr=self.learn_rate)
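
Example #2 drives both networks with a single Adam optimizer built over the concatenated actor and critic parameter lists (note that, as written, the LSTM encoder's parameters are not included). A hedged sketch of how such a shared optimizer is typically stepped; the loss names and the 0.5 critic weight are assumptions, not taken from the snippet:

import torch

def joint_update(optimizer: torch.optim.Optimizer,
                 actor_loss: torch.Tensor,
                 critic_loss: torch.Tensor,
                 entropy: torch.Tensor,
                 entropy_weight: float = 0.001) -> None:
    # One gradient step over the combined actor + critic parameter list.
    loss = actor_loss + 0.5 * critic_loss - entropy_weight * entropy
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()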
Example #3
    def __init__(self, state_size, action_size, agent_num, random_seed):
        """
        Initialize an Agent object.
        :param state_size (int): dimension of each state
        :param action_size (int): dimension of each action
        :param agent_num (int): number of agents (passed to the critic)
        :param random_seed (int): random seed
        """

        # Actor Networks
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Critic Networks
        self.critic_local = Critic(state_size, action_size, agent_num,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, agent_num,
                                    random_seed).to(device)
        self.critic_optimizer = Adam(self.critic_local.parameters(),
                                     lr=LR_CRITIC,
                                     weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, random_seed, scale=0.1)
Example #4
    def __init__(self, idx, params):
        """Initialize an Agent object.
        
        Params
        ======
            idx (int): index of this agent
            params (dict-like): dictionary of parameters for the agent
        """
        super().__init__(params)

        self.idx = idx
        self.params = params
        self.update_every = params['update_every']
        self.gamma = params['gamma']
        self.num_agents = params['num_agents']
        
        # Actor Network (w/ Target Network)
        self.actor_local = Actor(params['actor_params']).to(device)
        self.actor_target = Actor(params['actor_params']).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=params['actor_params']['lr'])
        
        # Critic Network (w/ Target Network)
        self.critic_local = Critic(params['critic_params']).to(device)
        self.critic_target = Critic(params['critic_params']).to(device)

        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=params['critic_params']['lr'],
                                           weight_decay=params['critic_params']['weight_decay'])

        # Noise process
        self.noise = OUNoise(self.params['noise_params'])

        # Replay memory
        self.memory = params['experience_replay']
Example #5
    def __init__(self, state_size, action_size, random_seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(
            state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(
            state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(
            self.actor_local.parameters(), lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(
            state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(
            state_size, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(
            self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = ReplayBuffer(
            action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)
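
Several of these constructors (Examples #1, #3, and #5 among others) create an `OUNoise` process for exploration without showing the class. A minimal sketch of the usual Ornstein-Uhlenbeck noise implementation; the default `mu`, `theta`, and `sigma` values are plausible assumptions rather than values copied from any of these projects:

import copy

import numpy as np

class OUNoise:
    # Temporally correlated noise for continuous-action exploration.

    def __init__(self, size, seed, mu=0.0, theta=0.15, sigma=0.2):
        self.size = size
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.rng = np.random.default_rng(seed)
        self.reset()

    def reset(self):
        # Reset the internal state back to the mean.
        self.state = copy.copy(self.mu)

    def sample(self):
        # Drift towards the mean, add Gaussian noise, and return the new state.
        dx = self.theta * (self.mu - self.state) + self.sigma * self.rng.standard_normal(self.size)
        self.state = self.state + dx
        return self.state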
Example #6
    def __init__(self, state_size, action_size):

        self.discount = 0.99
        self.target_mix = 5e-3

        self.online_actor = Actor(state_size,
                                  action_size,
                                  fc1_units=256,
                                  fc2_units=128).to(DEVICE)
        self.target_actor = Actor(state_size,
                                  action_size,
                                  fc1_units=256,
                                  fc2_units=128).to(DEVICE)
        self.actor_opt = optim.Adam(self.online_actor.parameters(), lr=3e-3)

        self.online_critic = Critic(state_size,
                                    action_size,
                                    fc1_units=256,
                                    fc2_units=128).to(DEVICE)
        self.target_critic = Critic(state_size,
                                    action_size,
                                    fc1_units=256,
                                    fc2_units=128).to(DEVICE)
        self.critic_opt = optim.Adam(self.online_critic.parameters(), lr=3e-3)

        self.noise = OrnsteinUhlenbeck(action_size,
                                       mu=0.,
                                       theta=0.15,
                                       sigma=0.05)
        self.replay = Replay(action_size, buffer_size=int(1e6), batch_size=128)
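
Examples #1, #5, and #6 each construct a replay buffer with a slightly different argument order, and none of them include the class. A hedged sketch of a uniform-sampling buffer along the lines these agents appear to expect; the field names and the returned batch layout are assumptions:

import random
from collections import deque, namedtuple

Experience = namedtuple("Experience", ["state", "action", "reward", "next_state", "done"])

class ReplayBuffer:
    # Fixed-size buffer that stores experience tuples and samples them uniformly at random.

    def __init__(self, buffer_size: int, batch_size: int, seed: int = 0):
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        self.memory.append(Experience(state, action, reward, next_state, done))

    def sample(self):
        # Draw a random mini-batch of stored experiences.
        return random.sample(self.memory, k=self.batch_size)

    def __len__(self):
        return len(self.memory)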
Example #7
    def __init__(self, config: ac_parm, device, random_seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.config = config
        self.seed = random.seed(random_seed)
        self.name = config.name
        self.device = device

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(config, random_seed).to(device)
        self.actor_target = Actor(config, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=config.lr_actor)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(config, random_seed).to(device)
        self.critic_target = Critic(config, random_seed).to(device)
        self.critic_optimizer = optim.Adam(
            self.critic_local.parameters(), lr=config.lr_critic, weight_decay=config.weight_decay)

        # Noise process
        self.noise = OUNoise(config.action_size, random_seed)

        # Replay memory
        self.memory = ReplayBuffer(config, device, random_seed)

        self.step_number = 0
Example #8
    def __init__(self, params):

        action_size = params['action_size']
        state_size = params['state_size']
        buf_params = params['buf_params']

        nn_params = params['nn_params']
        nn_params['nn_actor']['l1'][0] = state_size
        nn_params['nn_actor']['l3'][1] = action_size
        nn_params['nn_critic']['l1'][0] = state_size + action_size

        self.__actor_local = Actor(nn_params['nn_actor']).to(device)
        self.__actor_target = Actor(nn_params['nn_actor']).to(device)
        self.__critic_local = Critic(nn_params['nn_critic']).to(device)
        self.__critic_target = Critic(nn_params['nn_critic']).to(device)

        self.__action_size = action_size
        self.__state_size = state_size
        self.__memory = ReplayBuffer(buf_params)
        self.__t = 0

        self.gamma = params['gamma']
        self.learning_rate_actor = params['learning_rate_actor']
        self.learning_rate_critic = params['learning_rate_critic']
        self.tau = params['tau']

        self.__optimiser_actor = optim.Adam(self.__actor_local.parameters(),
                                            self.learning_rate_actor)
        self.__optimiser_critic = optim.Adam(self.__critic_local.parameters(),
                                             self.learning_rate_critic)
        self.__uo_process = UOProcess()
        # other parameters
        self.agent_loss = 0.0
Example #9
    def __init__(self, config):
        super(PPO, self).__init__()
        self.config = config
        torch.manual_seed(self.config['seed'])
        np.random.seed(self.config['seed'])

        if self.config['experiment'][
                'orthogonal_initialization_and_layer_scaling']:
            weight_init_scheme = 'orthogonal'
        else:
            weight_init_scheme = 'normal'

        self.actor = Actor(
            device=self.config['device'],
            input_dim=self.config['env']['nS'],
            output_dim=self.config['env']['nA'],
            hidden_dims=self.config['model']['actor']['hidden_dims'],
            hidden_activation_fn=self.config['model']['actor']
            ['hidden_acivation_fn'],
            weight_init_scheme=weight_init_scheme)
        self.actor_optimizer = optim.Adam(
            self.actor.parameters(),
            lr=self.config['model']['actor']['lr'],
            betas=self.config['model']['actor']['betas'])

        self.critic = Critic(
            device=self.config['device'],
            input_dim=self.config['env']['nS'],
            hidden_dims=self.config['model']['critic']['hidden_dims'],
            hidden_activation_fn=self.config['model']['critic']
            ['hidden_acivation_fn'],
            weight_init_scheme=weight_init_scheme)
        self.critic_optimizer = optim.Adam(
            self.critic.parameters(),
            lr=self.config['model']['critic']['lr'],
            betas=self.config['model']['critic']['betas'])

        if self.config['train']['gail']:
            self.discriminator = Discriminator(
                device=self.config['device'],
                state_dim=self.config['env']['nS'],
                action_dim=self.config['env']['nA'],
                hidden_dims=self.config['model']['discriminator']
                ['hidden_dims'],
                hidden_activation_fn=self.config['model']['discriminator']
                ['hidden_acivation_fn'],
                weight_init_scheme=weight_init_scheme)
            self.discriminator_optimizer = optim.Adam(
                self.discriminator.parameters(),
                lr=self.config['model']['discriminator']['lr'],
                betas=self.config['model']['discriminator']['betas'])

        # [EXPERIMENT] - reward scaler: r / rs.std()
        if self.config['experiment']['reward_standardization']:
            self.reward_scaler = RewardScaler(
                gamma=self.config['train']['gamma'])

        # [EXPERIMENT] - observation scaler: (ob - ob.mean()) / (ob.std())
        if self.config['experiment']['observation_normalization']:
            self.observation_scaler = ObservationScaler()
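
Example #9 gates optional `RewardScaler` and `ObservationScaler` helpers behind experiment flags and only hints at their behaviour in the comments. A minimal sketch of an observation scaler matching the `(ob - ob.mean()) / (ob.std())` comment, using Welford-style running statistics; the class name comes from the snippet, the internals are an assumption:

import numpy as np

class ObservationScaler:
    # Normalise observations with a running mean and standard deviation.

    def __init__(self, epsilon: float = 1e-8):
        self.count = 0
        self.mean = 0.0
        self.m2 = 0.0          # running sum of squared deviations (Welford's algorithm)
        self.epsilon = epsilon

    def update(self, ob: np.ndarray) -> None:
        self.count += 1
        delta = ob - self.mean
        self.mean = self.mean + delta / self.count
        self.m2 = self.m2 + delta * (ob - self.mean)

    def __call__(self, ob: np.ndarray, update: bool = True) -> np.ndarray:
        if update:
            self.update(ob)
        std = np.sqrt(self.m2 / max(self.count, 1)) + self.epsilon
        return (ob - self.mean) / std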
Example #10
    def __init__(self, env, gamma, tau, buffer_maxlen, batch_size,
                 critic_learning_rate, actor_learning_rate, update_per_step,
                 seed):
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        # hyperparameters
        self.num_replay_updates_per_step = update_per_step
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau

        # initialize actor and critic networks
        self.critic = Critic(env.observation_space.shape[0],
                             env.action_space.shape[0], seed).to(self.device)
        self.critic_target = Critic(env.observation_space.shape[0],
                                    env.action_space.shape[0],
                                    seed).to(self.device)

        self.actor = Actor(env.observation_space.shape[0],
                           env.action_space.shape[0], seed).to(self.device)
        self.actor_target = Actor(env.observation_space.shape[0],
                                  env.action_space.shape[0],
                                  seed).to(self.device)

        # optimizers
        self.critic_optimizer = optim.Adam(self.critic.parameters(),
                                           lr=critic_learning_rate)
        self.actor_optimizer = optim.Adam(self.actor.parameters(),
                                          lr=actor_learning_rate)

        self.buffer = ReplayBuffer(buffer_maxlen, batch_size, seed)
        self.noise = OUNoise(env.action_space.shape[0])
Example #11
    def __init__(self, env, env_params, args, models=None, record_episodes=[0, .1, .25, .5, .75, 1.]):
        self.env = env
        self.env_params = env_params
        self.args = args


        # networks
        if models is None:
            self.actor = Actor(self.env_params).double()
            self.critic = Critic(self.env_params).double()
        else:
            self.actor, self.critic = self.LoadModels()
        # target networks used to compute stable action and value targets
        self.actor_target = Actor(self.env_params).double()
        self.critic_target = Critic(self.env_params).double()

        self.actor_target.load_state_dict(self.actor.state_dict())
        self.critic_target.load_state_dict(self.critic.state_dict())

        if self.args.cuda:
            self.actor.cuda()
            self.critic.cuda()
            self.actor_target.cuda()
            self.critic_target.cuda()


        self.actor_optim = torch.optim.Adam(self.actor.parameters(), lr=0.001)
        self.critic_optim = torch.optim.Adam(self.critic.parameters(), lr=0.001)

        self.normalize = Normalizer(env_params, self.args.gamma)
        self.buffer = ReplayBuffer(1_000_000, self.env_params)
        self.tensorboard = ModifiedTensorBoard(log_dir="logs")
        self.record_episodes = [int(eps * self.args.n_epochs) for eps in record_episodes]
Example #12
File: ddpg.py Project: marsXyr/DP-ERL
    def __init__(self, state_dim, action_dim, max_action, memory, args):

        # misc
        self.criterion = nn.MSELoss()
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.max_action = max_action
        self.memory = memory
        self.n = args.n_actor

        # actors
        self.actors = [
            Actor(state_dim,
                  action_dim,
                  max_action,
                  layer_norm=args.layer_norm) for i in range(self.n)
        ]
        self.actors_target = [
            Actor(state_dim,
                  action_dim,
                  max_action,
                  layer_norm=args.layer_norm) for i in range(self.n)
        ]
        self.actors_optimizer = [
            torch.optim.Adam(self.actors[i].parameters(), lr=args.actor_lr)
            for i in range(self.n)
        ]

        for i in range(self.n):
            self.actors_target[i].load_state_dict(self.actors[i].state_dict())

        # critic
        self.critic = Critic(state_dim, action_dim, layer_norm=args.layer_norm)
        self.critic_target = Critic(state_dim,
                                    action_dim,
                                    layer_norm=args.layer_norm)
        self.critic_target.load_state_dict(self.critic.state_dict())
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(),
                                                 lr=args.critic_lr)

        # cuda
        if torch.cuda.is_available():
            for i in range(self.n):
                self.actors[i] = self.actors[i].cuda()
                self.actors_target[i] = self.actors_target[i].cuda()
            self.critic = self.critic.cuda()
            self.critic_target = self.critic_target.cuda()

        # shared memory
        for i in range(self.n):
            self.actors[i].share_memory()
            self.actors_target[i].share_memory()
        self.critic.share_memory()
        self.critic_target.share_memory()

        # hyper-parameters
        self.tau = args.tau
        self.discount = args.discount
        self.batch_size = args.batch_size
        self.reward_scale = args.reward_scale
Example #13
    def __init__(self,
                 rows,
                 columns,
                 num_actions,
                 l_rate=1e-4,
                 gamma=0.99,
                 lam=0.95,
                 policy_kl_range=0.0008,
                 policy_params=20,
                 value_clip=1.0,
                 loss_coefficient=1.0,
                 entropy_coefficient=0.05):
        self.rows = rows
        self.columns = columns
        self.num_actions = num_actions

        self.actor = Actor(self.num_actions)
        self.critic = Critic()
        self.actor_old = Actor(self.num_actions)
        self.critic_old = Critic()

        self.optimizer = tf.keras.optimizers.Adam(l_rate)

        self.gamma = gamma
        self.lam = lam
        self.policy_kl_range = policy_kl_range
        self.policy_params = policy_params
        self.value_clip = value_clip
        self.loss_coefficient = loss_coefficient
        self.entropy_coefficient = entropy_coefficient
Example #14
    def __init__(self,
                 n_s,
                 n_a,
                 a_bound,
                 gamma=0.99,
                 memory_size=10000,
                 tau=0.01,
                 lr_a=0.001,
                 lr_c=0.002,
                 batch_size=64,
                 var=3,
                 var_decay=0.9995):
        self.n_s = n_s
        self.n_a = n_a
        self.a_bound = a_bound
        self.gamma = gamma
        self.memory_size = memory_size
        self.tau = tau
        self.batch_size = batch_size
        self.var = var
        self.var_decay = var_decay

        # memory
        self.replay_buffer = ReplayBuffer(n_s, n_a, memory_size)
        # actor
        self.eval_actor = Actor(n_s, n_a, a_bound)
        self.target_actor = deepcopy(self.eval_actor)
        self.actor_optim = torch.optim.Adam(self.eval_actor.parameters(),
                                            lr=lr_a)

        # critic
        self.eval_critic = Critic(n_s, n_a)
        self.target_critic = deepcopy(self.eval_critic)
        self.critic_optim = torch.optim.Adam(self.eval_critic.parameters(),
                                             lr=lr_c)
Example #15
    def __init__(self):
        self.discount_factor = 0.99
        self.num_steps = 1000000
        self.reset_interval = 100
        self.update_interval = 1
        self.batch_size = 100
        self.replay_buffer_length = 1000

        self.replay_buffer = []
        self.env = PendulumEnv()
        self.noise = SmoothNoise((1, ))
        self.draw_env = False

        self.actor = Actor()
        self.critic = Critic()

        self.actor_target = Actor()
        self.critic_target = Critic()

        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=0.0001)
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=0.001)

        self.actor_criterion = nn.MSELoss()
        self.critic_criterion = nn.MSELoss()

        self.update_target(self.actor_target, self.actor, 1.0)
        self.update_target(self.critic_target, self.critic, 1.0)

        self.env_reset()
Example #16
    def __init__(self, state_size, action_size, num_agents, seed):
        self.state_size = state_size
        self.action_size = action_size
        self.seed = seed
        self.num_agents = num_agents

        self.actor = Actor(self.state_size, self.action_size,
                           self.seed).to(device)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=LR_ACTOR)

        self.critic = Critic(self.state_size, self.action_size,
                             self.seed).to(device)
        self.critic_target = Critic(self.state_size, self.action_size,
                                    self.seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic.parameters(),
                                           lr=LR_CRITIC)

        self.copy_init_weights(self.actor, self.actor_target)
        self.copy_init_weights(self.critic, self.critic_target)

        self.noise = OUNoise((num_agents, action_size), seed)

        self.memory = ReplayBuffer(BUFFER_SIZE, BATCH_SIZE, seed)
Example #17
    def __init__(self, seed, nA, nS, L2, index):
        self.seed = seed
        self.nA = nA
        self.nS = nS
        self.nO = 52  # 24 * 2 state space + 2 * 2 action space
        self.L2 = L2
        self.index = index
        self.noise = OUnoise(nA, seed)

        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        self.local_critic = Critic(seed, self.nO, nA).to(self.device)
        self.target_critic = Critic(seed, self.nO, nA).to(self.device)
        self.local_actor = Actor(seed, nS, nA).to(self.device)
        self.target_actor = Actor(seed, nS, nA).to(self.device)

        # Copy the weights from local to target
        hard_update(self.local_critic, self.target_critic)
        hard_update(self.local_actor, self.target_actor)

        self.critic_optimizer = optim.Adam(self.local_critic.parameters(),
                                           lr=1e-3,
                                           weight_decay=self.L2)
        self.actor_optimizer = optim.Adam(self.local_actor.parameters(),
                                          lr=1e-4)
Example #18
    def __init__(self, agent_id, state_size, action_size, rand_seed,
                 meta_agent):
        """ Creates a new DDPG Agent """

        self.agent_id = agent_id
        self.action_size = action_size

        # Defines the Actor Networks
        self.actor_local = Actor(state_size, action_size, rand_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  rand_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Defines the Critic Networks
        self.critic_local = Critic(state_size, action_size,
                                   meta_agent.agents_qty, rand_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    meta_agent.agents_qty,
                                    rand_seed).to(device)

        self.critic_optimizer = optim.Adam(
            self.critic_local.parameters(),
            lr=LR_CRITIC)  #, weight_decay=WEIGHT_DECAY)

        self.noise = OUNoise(action_size, rand_seed)

        # Refers to the MA agent memory
        self.memory = meta_agent.memory

        self.t_step = 0
Example #19
    def __init__(self, gamma, tau, num_inputs, env, device, results_path=None):

        self.gamma = gamma
        self.tau = tau
        self.min_action, self.max_action = env.action_range()
        self.device = device
        self.num_actions = env.action_space()
        self.noise_stddev = 0.3

        self.results_path = results_path
        self.checkpoint_path = os.path.join(self.results_path, 'checkpoint/')
        os.makedirs(self.checkpoint_path, exist_ok=True)

        # Define the actor
        self.actor = Actor(num_inputs, self.num_actions).to(device)
        self.actor_target = Actor(num_inputs, self.num_actions).to(device)

        # Define the critic
        self.critic = Critic(num_inputs, self.num_actions).to(device)
        self.critic_target = Critic(num_inputs, self.num_actions).to(device)

        # Define the optimizers for both networks
        self.actor_optimizer = Adam(self.actor.parameters(), lr=1e-4)  # optimizer for the actor network
        self.critic_optimizer = Adam(self.critic.parameters(), lr=1e-4,
                                     weight_decay=0.002)  # optimizer for the critic network

        self.hard_swap()

        self.ou_noise = OrnsteinUhlenbeckActionNoise(
            mu=np.zeros(self.num_actions),
            sigma=float(self.noise_stddev) * np.ones(self.num_actions))
        self.ou_noise.reset()
Example #20
    def __init__(self,
                 env,
                 gamma=0.99,
                 tau=1e-3,
                 pol_lr=1e-4,
                 q_lr=5e-3,
                 batch_size=64,
                 buffer_size=10000,
                 target_noise=0.2,
                 action_noise=0.1,
                 clip_range=0.5,
                 update_delay=2):

        # environment stuff
        self.env = env
        self.num_act = env.action_space.shape[0]
        self.num_obs = env.observation_space.shape[0]
        self.eval_env = copy.deepcopy(env)

        # hyper parameters
        self.gamma = gamma
        self.tau = tau
        self.pol_lr = pol_lr
        self.q_lr = q_lr
        self.batch_size = batch_size
        self.buffer_size = buffer_size
        self.target_noise = target_noise
        self.action_noise = action_noise
        self.clip_range = clip_range
        self.update_delay = update_delay

        # networks
        self.pol = Actor(self.num_obs, self.num_act, [400, 300]).double()
        self.q1 = Critic(self.num_obs, self.num_act, [400, 300]).double()
        self.q2 = Critic(self.num_obs, self.num_act, [400, 300]).double()
        self.pol.init_weights()
        self.q1.init_weights()
        self.q2.init_weights()
        self.target_pol = copy.deepcopy(self.pol).double()
        self.target_q1 = copy.deepcopy(self.q1).double()
        self.target_q2 = copy.deepcopy(self.q2).double()

        # optimizers, buffer
        self.pol_opt = torch.optim.Adam(self.pol.parameters(), lr=self.pol_lr)
        self.q1_opt = torch.optim.Adam(
            self.q1.parameters(),
            lr=self.q_lr,
        )
        self.q2_opt = torch.optim.Adam(
            self.q2.parameters(),
            lr=self.q_lr,
        )
        self.buffer = ReplayBuffer(self.buffer_size, 1000)
        self.mse_loss = torch.nn.MSELoss()

        self.cum_q1_loss = 0
        self.cum_q2_loss = 0
        self.cum_obj = 0
Example #21
    def __init__(self,
                 state_dim,
                 action_dim,
                 max_action,
                 device,
                 memory_capacity=10000,
                 discount=0.99,
                 update_freq=2,
                 tau=0.005,
                 policy_noise_std=0.2,
                 policy_noise_clip=0.5,
                 actor_lr=1e-3,
                 critic_lr=1e-3,
                 train_mode=True):
        self.train_mode = train_mode  # whether the agent is in training or testing mode

        self.state_dim = state_dim  # dimension of the state space
        self.action_dim = action_dim  # dimension of the action space

        self.device = device  # defines which cuda or cpu device is to be used to run the networks
        self.discount = discount  # denoted as gamma in the equation for computation of the Q-value
        self.update_freq = update_freq  # defines how frequently the actor and the target networks are updated
        self.tau = tau  # defines the factor used for Polyak averaging (i.e., soft updating of the target networks)
        self.max_action = max_action  # the max value of the range in the action space (assumes a symmetric range in the action space)
        self.policy_noise_clip = policy_noise_clip  # max range within which the noise for the target policy smoothing must be contained
        self.policy_noise_std = policy_noise_std  # standard deviation, i.e. sigma, of the Gaussian noise applied for target policy smoothing

        # create an instance of the replay buffer
        self.memory = ReplayMemory(memory_capacity)

        # instances of the networks for the actor and the two critics
        self.actor = Actor(state_dim, action_dim, max_action, actor_lr)
        self.critic = Critic(
            state_dim, action_dim, critic_lr
        )  # the critic class encapsulates two copies of the neural network for the two critics used in TD3

        # instance of the target networks for the actor and the two critics
        self.target_actor = Actor(state_dim, action_dim, max_action, actor_lr)
        self.target_critic = Critic(state_dim, action_dim, critic_lr)

        # initialise the targets to the same weight as their corresponding current networks
        self.target_actor.load_state_dict(self.actor.state_dict())
        self.target_critic.load_state_dict(self.critic.state_dict())

        # since we do not learn/train on the target networks
        self.target_actor.eval()
        self.target_critic.eval()

        # for test mode
        if not self.train_mode:
            self.actor.eval()
            self.critic.eval()

        self.actor.to(self.device)
        self.critic.to(self.device)
        self.target_actor.to(self.device)
        self.target_critic.to(self.device)
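
Example #21 stores `policy_noise_std`, `policy_noise_clip`, and `max_action`, but the constructor never shows how they are used. A hedged sketch of TD3's target policy smoothing, where clipped Gaussian noise is added to the target actor's action before the clipped double-Q target is computed; the function name and tensor shapes are assumptions:

import torch

def smoothed_target_action(target_actor: torch.nn.Module,
                           next_states: torch.Tensor,
                           policy_noise_std: float,
                           policy_noise_clip: float,
                           max_action: float) -> torch.Tensor:
    # Perturb the target action with clipped Gaussian noise, then clamp to the valid action range.
    with torch.no_grad():
        next_actions = target_actor(next_states)
        noise = (torch.randn_like(next_actions) * policy_noise_std).clamp(-policy_noise_clip, policy_noise_clip)
        return (next_actions + noise).clamp(-max_action, max_action)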
Example #22
 def __init__(self, state_dim, action_dim, max_action, args):
     self.actor = Actor(state_dim, action_dim, max_action).to(args.device)
     self.actor_target = Actor(state_dim, action_dim, max_action).to(args.device)
     self.actor_target.load_state_dict(self.actor.state_dict())
     self.actor_optimizer = torch.optim.Adam(self.actor.parameters())
     self.critic = Critic(state_dim, action_dim).to(args.device)
     self.critic_optimizer = torch.optim.Adam(self.critic.parameters())
     self.list_target_critic = []
     # create the different target critic networks, initialised from the online critic
     for c in range(args.num_q_target):
         critic_target = Critic(state_dim, action_dim).to(args.device)
         critic_target.load_state_dict(self.critic.state_dict())
         self.list_target_critic.append(critic_target)

     self.target_critic = Critic(state_dim, action_dim).to(args.device)
     self.target_critic.load_state_dict(self.critic.state_dict())
     self.max_action = max_action
     self.num_q_target = args.num_q_target
     self.batch_size = args.batch_size
     self.discount = args.discount
     self.tau = args.tau 
     self.policy_noise = args.policy_noise
     self.noise_clip = args.noise_clip
     self.policy_freq = args.policy_freq
     self.device = args.device
     self.update_counter = 0
     self.step = 0 
     self.currentQNet = 0
Example #23
    def __init__(self, state_size, action_size, cfg, num_agents=1, agent_id=0):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            cfg (config object): main configuration with other passed settings
            num_agents (int): optional (default: 1). If >1 will multiply state and action
                            space sizes for critic. Used for usage with MADDPG.
            agent_id (int): optional (default: 0). Set agent id for MADDPG.
        """
        print("Initializing single DDPG agent!")

        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(cfg.random_seed)
        self.n_agents = num_agents
        self.agent_id = agent_id

        self.cfg = cfg

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, cfg.random_seed,
                                 cfg.dense_layers_actor).to(device)
        self.actor_target = Actor(state_size, action_size, cfg.random_seed,
                                  cfg.dense_layers_actor).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=cfg.lr_actor)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size * num_agents,
                                   action_size * num_agents, cfg.random_seed,
                                   cfg.dense_layers_critic).to(device)
        self.critic_target = Critic(state_size * num_agents,
                                    action_size * num_agents, cfg.random_seed,
                                    cfg.dense_layers_critic).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=cfg.lr_critic,
                                           weight_decay=cfg.weight_decay)

        self.hard_copy_weights(self.critic_local, self.critic_target)
        self.hard_copy_weights(self.actor_local, self.actor_target)

        self.t_step = 0

        # Noise process
        self.noise = OUNoise(action_size,
                             cfg.random_seed,
                             theta=cfg.theta_ou,
                             sigma=cfg.sigma_ou)

        # Replay memory
        self.memory = ReplayBuffer(action_size, cfg.buffer_size,
                                   cfg.batch_size, cfg.random_seed, cfg)
Example #24
    def __init__(self,
                 n_states,
                 n_actions,
                 n_goals,
                 action_bounds,
                 capacity,
                 env,
                 k_future,
                 batch_size,
                 action_size=1,
                 tau=0.05,
                 actor_lr=1e-3,
                 critic_lr=1e-3,
                 gamma=0.98):
        self.device = device("cpu")
        self.n_states = n_states
        self.n_actions = n_actions
        self.n_goals = n_goals
        self.k_future = k_future
        self.action_bounds = action_bounds
        self.action_size = action_size
        self.env = env

        self.actor = Actor(self.n_states,
                           n_actions=self.n_actions,
                           n_goals=self.n_goals).to(self.device)
        self.critic = Critic(self.n_states,
                             action_size=self.action_size,
                             n_goals=self.n_goals).to(self.device)
        self.sync_networks(self.actor)
        self.sync_networks(self.critic)
        self.actor_target = Actor(self.n_states,
                                  n_actions=self.n_actions,
                                  n_goals=self.n_goals).to(self.device)
        self.critic_target = Critic(self.n_states,
                                    action_size=self.action_size,
                                    n_goals=self.n_goals).to(self.device)
        self.init_target_networks()
        self.tau = tau
        self.gamma = gamma

        self.capacity = capacity
        self.memory = Memory(self.capacity, self.k_future, self.env)

        self.batch_size = batch_size
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr
        self.actor_optim = Adam(self.actor.parameters(), self.actor_lr)
        self.critic_optim = Adam(self.critic.parameters(), self.critic_lr)

        self.state_normalizer = Normalizer(self.n_states[0],
                                           default_clip_range=5)
        self.goal_normalizer = Normalizer(self.n_goals, default_clip_range=5)
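
Example #24 wraps states and goals in `Normalizer` objects with `default_clip_range=5`, a pattern common in HER-style training, but the class is not shown. A minimal sketch of such a clipping normaliser; the bookkeeping is simplified and the method names are assumptions:

import numpy as np

class Normalizer:
    # Running mean/std normaliser that clips the normalised output to a fixed range.

    def __init__(self, size: int, default_clip_range: float = 5.0, eps: float = 1e-2):
        self.size = size
        self.default_clip_range = default_clip_range
        self.eps = eps
        self.sum = np.zeros(size)
        self.sum_sq = np.zeros(size)
        self.count = 0

    def update(self, v: np.ndarray) -> None:
        v = v.reshape(-1, self.size)
        self.sum += v.sum(axis=0)
        self.sum_sq += np.square(v).sum(axis=0)
        self.count += v.shape[0]

    def normalize(self, v: np.ndarray) -> np.ndarray:
        mean = self.sum / max(self.count, 1)
        var = np.maximum(np.square(self.eps), self.sum_sq / max(self.count, 1) - np.square(mean))
        return np.clip((v - mean) / np.sqrt(var), -self.default_clip_range, self.default_clip_range)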
Example #25
 def __init__(self, act_dim, env_dim, act_range, buffer_size=20000, gamma=0.99, lr=0.00005, tau=0.001):
     """Initialization"""
     # Environment and A2C parameters
     self.act_dim = act_dim
     self.act_range = act_range
     self.env_dim = env_dim
     self.gamma = gamma
     self.lr = lr
     # Create actor and critic networks
     self.actor = Actor(self.env_dim, act_dim, act_range, 0.1 * lr, tau)
     self.critic = Critic(self.env_dim, act_dim, lr, tau)
     self.buffer = MemoryBuffer(buffer_size)
Example #26
    def __init__(self,
                 state_size,
                 action_size,
                 seed,
                 policy_lr=LR,
                 critic_lr=LR):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size

        self.epsilon = EPSILON

        # Noise process
        self.noise = OUNoise(action_size, seed)

        random.seed(seed)

        # Networks
        self.policy_local = Actor(self.state_size, self.action_size, seed)
        self.policy_target = Actor(self.state_size, self.action_size, seed)
        self.critic_local = Critic(self.state_size + self.action_size,
                                   self.action_size, seed)
        self.critic_target = Critic(self.state_size + self.action_size,
                                    self.action_size, seed)

        # initialize target networks weights
        for target_param, param in zip(self.policy_target.parameters(),
                                       self.policy_local.parameters()):
            target_param.data.copy_(param.data)
        for target_param, param in zip(self.critic_target.parameters(),
                                       self.critic_local.parameters()):
            target_param.data.copy_(param.data)

        # optimizer
        self.policy_optimizer = optim.Adam(self.policy_local.parameters(),
                                           lr=policy_lr)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=critic_lr)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)

        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
Example #27
    def __init__(self,
                 state_size,
                 hidden_size,
                 action_size,
                 actor_learning_rate=1e-4,
                 critic_learning_rate=1e-3,
                 gamma=0.99,
                 tau=1e-2,
                 use_cuda=False,
                 actor_path=None,
                 critic_path=None):
        # Params
        self.state_size, self.hidden_size, self.action_size = state_size, hidden_size, action_size
        self.gamma, self.tau = gamma, tau
        self.use_cuda = use_cuda

        # Networks
        self.actor = Actor(state_size, hidden_size, action_size)
        self.actor_target = Actor(state_size, hidden_size, action_size)

        self.critic = Critic(state_size + action_size, hidden_size,
                             action_size)
        self.critic_target = Critic(state_size + action_size, hidden_size,
                                    action_size)

        # Load model state_dicts from saved file
        if actor_path and path.exists(actor_path):
            self.actor.load_state_dict(torch.load(actor_path))

        if critic_path and path.exists(critic_path):
            self.critic.load_state_dict(torch.load(critic_path))

        # Hard copy params from original networks to target networks
        copy_params(self.actor, self.actor_target)
        copy_params(self.critic, self.critic_target)

        if self.use_cuda:
            self.actor.cuda()
            self.actor_target.cuda()
            self.critic.cuda()
            self.critic_target.cuda()

        # Create replay buffer for storing experience
        self.replay_buffer = ReplayBuffer(cache_size=int(1e6))

        # Training
        self.critic_criterion = nn.MSELoss()
        self.actor_optimizer = optim.Adam(self.actor.parameters(),
                                          lr=actor_learning_rate)
        self.critic_optimizer = optim.Adam(self.critic.parameters(),
                                           lr=critic_learning_rate)
Example #28
 def __init__(self, M, eh1, eh2, dh2, ci, lr_ac=0.001, lr_cr=0.001):
     ## Network initializations
     # Actor
     # number of inputs, units in encoder_hidden_layer1, encoder_hidden_layer2, decoder_hidden_layer1
     self.actor = VAE(M, eh1, eh2, dh2)
     # Critic
     self.critic = Critic(ci)  # Length of feature vector
     # Optimizers
     self.optim_actor = torch.optim.Adam(self.actor.parameters(), lr=lr_ac)
     self.optim_critic = torch.optim.Adam(self.critic.parameters(),
                                          lr=lr_cr)
     self.mse = torch.nn.MSELoss()
Example #29
    def __init__(self, state_size, action_size, random_seed, **kwargs):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        for key, value in kwargs.items():
            setattr(self, key, value)

        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, random_seed,
                                 self.nhidden_actor).to(self.device)
        self.actor_target = getattr(
            self, "actor_target",
            Actor(state_size, action_size, random_seed,
                  self.nhidden_actor).to(self.device))
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=self.lr_actor)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, random_seed,
                                   self.nhidden_critic).to(self.device)
        self.critic_target = Critic(state_size, action_size, random_seed,
                                    self.nhidden_critic).to(self.device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=self.lr_critic,
                                           weight_decay=self.weight_decay)

        # Copying weights from target to source neural networks
        self.copy_weights(self.actor_target, self.actor_local)
        self.copy_weights(self.critic_target, self.critic_local)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = getattr(
            self, "memory",
            ReplayBuffer(action_size, self.buffer_size, self.batch_size,
                         random_seed, self.device))

        self.nsteps = 0
Example #30
 def __init__(self, config: DefaultMunch):
     self.config = config
     self.memory = self.config.memory
     self.n_agents = self.config.n_agents
     self.action_size = self.config.action_size
     self.state_size = self.config.state_size
     self.critic_local = Critic(self.state_size, self.config.action_size,
                                self.config.n_agents).to(self.config.device)
     self.critic_target = Critic(self.state_size, self.config.action_size,
                                 self.config.n_agents).to(
                                     self.config.device)
     self.critic_target.load_state_dict(self.critic_local.state_dict())
     self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                        lr=self.config.lr_critic)
     self.agents = [Agent(self.config, self) for i in range(self.n_agents)]