Example #1
    def __init__(self, input_dims, env, n_actions):
        self.memory = ReplayBuffer(input_dims)
        self.n_actions = n_actions

        self.actor_nn = ActorNetwork(input_dims,
                                     n_actions=n_actions,
                                     name=Constants.env_id + '_actor',
                                     max_action=env.action_space.n)
        self.critic_local_1_nn = CriticNetwork(input_dims,
                                               n_actions=n_actions,
                                               name=Constants.env_id +
                                               '_critic_local_1')
        self.critic_local_2_nn = CriticNetwork(input_dims,
                                               n_actions=n_actions,
                                               name=Constants.env_id +
                                               '_critic_local_2')
        self.critic_target_1_nn = CriticNetwork(input_dims,
                                                n_actions=n_actions,
                                                name=Constants.env_id +
                                                '_critic_target_1')
        self.critic_target_2_nn = CriticNetwork(input_dims,
                                                n_actions=n_actions,
                                                name=Constants.env_id +
                                                '_critic_target_2')
        self.value_nn = ValueNetwork(input_dims,
                                     name=Constants.env_id + '_value')
        self.target_value_nn = ValueNetwork(input_dims,
                                            name=Constants.env_id +
                                            '_target_value')
        self.update_network_parameters(tau=1)
Example #2
    def __init__(self, alpha, beta, input_dims, tau, env,
                 env_id, gamma=0.99, n_actions=2, max_size=1000000,
                 layer1_size=256, layer2_size=256, batch_size=100,
                 reward_scale=2):
        self.gamma = gamma
        self.tau = tau
        self.memory = ReplayBuffer(max_size, input_dims, n_actions)
        self.batch_size = batch_size
        self.n_actions = n_actions

        self.actor = ActorNetwork(alpha, input_dims, layer1_size,
                                  layer2_size, n_actions=n_actions,
                                  name=env_id+'_actor', 
                                  max_action=env.action_space.high)
        self.critic_1 = CriticNetwork(beta, input_dims, layer1_size,
                                      layer2_size, n_actions=n_actions,
                                      name=env_id+'_critic_1')
        self.critic_2 = CriticNetwork(beta, input_dims, layer1_size,
                                      layer2_size, n_actions=n_actions,
                                      name=env_id+'_critic_2')
       
        self.value = ValueNetwork(beta, input_dims, layer1_size,
                                  layer2_size, name=env_id+'_value')
        self.target_value = ValueNetwork(beta, input_dims, layer1_size,
                                         layer2_size, name=env_id+'_target_value')

        self.scale = reward_scale
        self.update_network_parameters(tau=1)
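
The constructor ends with `update_network_parameters(tau=1)`, which hard-copies the value network into `target_value`; later calls with the default `tau` blend the two (Polyak averaging). The method itself is not shown here, so the following is only a minimal sketch of that kind of soft update in PyTorch, not this project's exact code:

import torch

def soft_update(target_net, source_net, tau):
    # theta_target <- tau * theta_source + (1 - tau) * theta_target
    # (tau=1 reduces to a hard copy, matching update_network_parameters(tau=1) above)
    with torch.no_grad():
        for target_param, source_param in zip(target_net.parameters(),
                                              source_net.parameters()):
            target_param.mul_(1.0 - tau).add_(tau * source_param)

# hypothetical usage inside the agent: soft_update(self.target_value, self.value, self.tau)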
Example #3
    def __init__(self, config: Config):
        self.config = config
        self.is_training = True
        if self.config.prioritized_replay:
            self.buffer = PrioritizedReplayBuffer(
                self.config.max_buff,
                alpha=self.config.prioritized_replay_alpha)
            prioritized_replay_beta_iters = self.config.prioritized_replay_beta_iters
            if prioritized_replay_beta_iters is None:
                prioritized_replay_beta_iters = self.config.frames
            self.beta_schedule = LinearSchedule(
                prioritized_replay_beta_iters,
                initial_p=self.config.prioritized_replay_beta0,
                final_p=1.0)
        else:
            self.buffer = ReplayBuffer(self.config.max_buff)
            self.beta_schedule = None

        self.model = CnnDQN(self.config.state_shape, self.config.action_dim)
        self.target_model = CnnDQN(self.config.state_shape,
                                   self.config.action_dim)
        self.target_model.load_state_dict(self.model.state_dict())
        self.model_optim = Adam(self.model.parameters(),
                                lr=self.config.learning_rate)

        if self.config.use_cuda:
            self.cuda()
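
`LinearSchedule` anneals the importance-sampling exponent beta from `prioritized_replay_beta0` to 1.0 over the training frames; its implementation is not shown. A minimal sketch with the same interface (an assumption modeled on the common baselines-style helper):

class LinearSchedule:
    def __init__(self, schedule_timesteps, final_p, initial_p=1.0):
        # linearly interpolate from initial_p to final_p over schedule_timesteps steps
        self.schedule_timesteps = schedule_timesteps
        self.final_p = final_p
        self.initial_p = initial_p

    def value(self, t):
        # after schedule_timesteps steps the schedule stays at final_p
        fraction = min(float(t) / self.schedule_timesteps, 1.0)
        return self.initial_p + fraction * (self.final_p - self.initial_p)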
Example #4
    def __init__(self,
                 num_agents,
                 state_size,
                 action_size,
                 hidden_layers,
                 seed,
                 gamma=GAMMA,
                 tau=TAU,
                 lr_actor=LR_ACTOR,
                 lr_critic=LR_CRITIC,
                 weight_decay=WEIGHT_DECAY,
                 buffer_size=BUFFER_SIZE,
                 batch_size=BATCH_SIZE):
        """Initialize MADDPG agent."""
        super(MADDPG, self).__init__()

        self.seed = random.seed(seed)

        self.num_agents = num_agents
        self.state_size = state_size
        self.action_size = action_size
        self.gamma = gamma
        self.tau = tau
        self.lr_actor = lr_actor
        self.lr_critic = lr_critic
        self.weight_decay = weight_decay
        self.buffer_size = buffer_size
        self.batch_size = batch_size

        self.agents = [DDPGAgent(state_size, action_size, hidden_layers, gamma, \
                                 tau, lr_actor, lr_critic, weight_decay, seed) \
                                     for _ in range(num_agents)]

        self.replay_buffer = ReplayBuffer(num_agents, buffer_size, batch_size)
Example #5
    def __init__(self,
                 net,
                 o_dim,
                 a_dim,
                 lr=1e-3,
                 batch_size=16,
                 algorithm="ddqn",
                 gamma=0.99,
                 tau=1e-3,
                 buffer_size=int(1e6)):
        """
        o_dim: observation space dim (or # of channels)
        a_dim: action space dimension
        """
        self.o_dim = o_dim
        self.a_dim = a_dim
        self.lr = lr
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.buffer_size = buffer_size
        # device for the networks below (assumption: the full class defines this elsewhere)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        if algorithm.lower() in ("dqn"):
            self.algorithm = "dqn"
        elif algorithm.lower() in ("ddqn", "double dqn", "doubledqn"):
            self.algorithm = "ddqn"
        else:
            raise TypeError("cannot recognize algorithm")

        self.buffer = ReplayBuffer(buffer_size, batch_size)

        self.online_net = net(o_dim, a_dim).to(self.device)
        self.target_net = net(o_dim, a_dim).to(self.device)

        self.optimizer = optim.Adam(self.online_net.parameters(), lr=lr)
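
Only the constructor is shown; the `algorithm` flag matters when the TD targets are computed. Plain DQN bootstraps from max_a Q_target(s', a), while double DQN ("ddqn") selects the argmax action with the online network and evaluates it with the target network. A hedged sketch of that distinction (the function name and tensor shapes are assumptions, not this repository's code):

import torch

def compute_td_targets(online_net, target_net, rewards, next_states, dones,
                       gamma, algorithm="ddqn"):
    # rewards and dones are expected as column tensors of shape (batch, 1)
    with torch.no_grad():
        if algorithm == "ddqn":
            # double DQN: the online net picks the action, the target net evaluates it
            best_actions = online_net(next_states).argmax(dim=1, keepdim=True)
            next_q = target_net(next_states).gather(1, best_actions)
        else:
            # vanilla DQN: the target net both picks and evaluates the action
            next_q = target_net(next_states).max(dim=1, keepdim=True)[0]
    return rewards + gamma * next_q * (1.0 - dones)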
Example #6
    def __init__(self, state_size, action_size, random_seed):

        self.state_size = state_size
        self.action_size = action_size
        self.seed = random_seed

        # ------------------ actor ------------------ #
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)

        # ------------------ critic ----------------- #
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)

        # ------------------ optimizers ------------- #
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC)

        # ----------------------- initialize target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, 1)
        self.soft_update(self.actor_local, self.actor_target, 1)
        self.t_step = 0

        # Noise process
        self.noise = OUNoise(action_size, random_seed)
        # Replay Buffer
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   device, random_seed)
Example #7
    def __init__(self, env_id, action_space, action_bound):

        self.env_id = env_id

        self.action_space = action_space

        self.action_bound = action_bound

        self.env = gym.make(self.env_id)

        self.replay_buffer = ReplayBuffer(max_len=self.MAX_EXPERIENCES)

        self.policy = GaussianPolicy(action_space=self.action_space,
                                     action_bound=self.action_bound)

        self.duqlqnet = DualQNetwork()

        self.target_dualqnet = DualQNetwork()

        self.log_alpha = tf.Variable(0.)  #: alpha=1

        self.alpha_optimizer = tf.keras.optimizers.Adam(3e-4)

        self.target_entropy = -0.5 * self.action_space

        self.global_steps = 0

        self._initialize_weights()
Example #8
File: SAC.py Project: bhargavCSSE/adv-RL
    def __init__(self, alpha=0.0003, beta=0.0003, input_dims=[8], env=None,
                 gamma=0.99, n_actions=2, max_size=1000000, tau=0.005,
                 ent_alpha=0.0001, batch_size=256, reward_scale=2,
                 layer1_size=256, layer2_size=256, chkpt_dir='tmp/sac'):
        self.gamma = gamma
        self.tau = tau
        self.memory = ReplayBuffer(max_size, input_dims, n_actions)
        self.batch_size = batch_size
        self.n_actions = n_actions
        self.ent_alpha = ent_alpha
        self.reward_scale = reward_scale

        self.actor = ActorNetwork(alpha, input_dims, n_actions=n_actions,
                                  fc1_dims=layer1_size, fc2_dims=layer2_size,
                                  name='actor', chkpt_dir=chkpt_dir)

        self.critic_1 = CriticNetwork(beta, input_dims, n_actions=n_actions,
                                      fc1_dims=layer1_size, fc2_dims=layer2_size,
                                      name='critic_1', chkpt_dir=chkpt_dir)
        self.critic_2 = CriticNetwork(beta, input_dims, n_actions=n_actions,
                                      fc1_dims=layer1_size, fc2_dims=layer2_size,
                                      name='critic_2', chkpt_dir=chkpt_dir)
        self.target_critic_1 = CriticNetwork(beta, input_dims, n_actions=n_actions,
                                             fc1_dims=layer1_size, fc2_dims=layer2_size,
                                             name='target_critic_1', chkpt_dir=chkpt_dir)
        self.target_critic_2 = CriticNetwork(beta, input_dims, n_actions=n_actions,
                                             fc1_dims=layer1_size, fc2_dims=layer2_size,
                                             name='target_critic_2', chkpt_dir=chkpt_dir)

        self.update_network_parameters(tau=1)
Example #9
    def __init__(self, state_shape, action_size, seed, cnn=False):
        """Initialize an Agent object.

        Params
        ======
            state_shape (int or tuple): shape of each state
            action_size (int): dimension of each action
            seed (int): random seed
            cnn (bool): whether to use convolutional NN
        """
        self.state_shape = state_shape
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.cnn = cnn

        if cnn:
            self.qnetwork_local = QNetworkFullyConvolutional(
                state_shape, action_size, seed).to(device)
            self.qnetwork_target = QNetworkFullyConvolutional(
                state_shape, action_size, seed).to(device)
        else:
            self.qnetwork_local = QNetworkFullyConnected(
                state_shape, action_size, seed).to(device)
            self.qnetwork_target = QNetworkFullyConnected(
                state_shape, action_size, seed).to(device)

        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
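
For context, this constructor is typically paired with an epsilon-greedy action-selection step using `qnetwork_local`. A hedged, standalone sketch (the helper name and signature are assumptions):

import random

import numpy as np
import torch

def epsilon_greedy_act(qnetwork, state, epsilon, action_size, device):
    # greedy action with probability 1 - epsilon, otherwise uniform random
    state_t = torch.from_numpy(np.asarray(state, dtype=np.float32)).unsqueeze(0).to(device)
    qnetwork.eval()
    with torch.no_grad():
        action_values = qnetwork(state_t)
    qnetwork.train()
    if random.random() > epsilon:
        return int(action_values.argmax(dim=1).item())
    return random.randrange(action_size)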
Example #10
    def __init__(self, env, model, target_model, config, name_agent="dqn"):

        self.name_agent = name_agent

        self.dim_space = env.observation_space.shape[0]
        self.nb_actions = env.action_space.n
        self.epsilon = config.epsilon_start

        self.epsilon_final = config.epsilon_final
        self.epsilon_start = config.epsilon_start
        self.epsilon_decay = config.epsilon_decay

        self.gamma = config.gamma
        self.replay_buffer = ReplayBuffer(10000, config.batch_size)
        self.environment = env
        self.batch_size = config.batch_size
        self.update_nb_iter = config.update_nb_iter

        # q0: online network
        self.model = model
        # q0_bar: target network
        self.target_model = target_model
        self.optimizer = optim.Adam(self.model.parameters(),
                                    lr=config.learning_rate)

        #
        self.loss_data = []
        self.rewards = []
Example #11
    def __init__(self, config: Config):
        self.config = config
        self.is_training = True
        # self.buffer = deque(maxlen=self.config.max_buff)
        self.buffer = ReplayBuffer(self.config.max_buff)

        self.actor = Actor(self.config.state_dim, self.config.action_dim,
                           self.config.max_action)
        self.actor_target = Actor(self.config.state_dim,
                                  self.config.action_dim,
                                  self.config.max_action)
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.actor_optimizer = Adam(self.actor.parameters(),
                                    lr=self.config.learning_rate)

        self.critic_1 = Critic(self.config.state_dim, self.config.action_dim)
        self.critic_1_target = Critic(self.config.state_dim,
                                      self.config.action_dim)
        self.critic_1_target.load_state_dict(self.critic_1.state_dict())
        self.critic_1_optimizer = Adam(self.critic_1.parameters(),
                                       lr=self.config.learning_rate)

        self.critic_2 = Critic(self.config.state_dim, self.config.action_dim)
        self.critic_2_target = Critic(self.config.state_dim,
                                      self.config.action_dim)
        self.critic_2_target.load_state_dict(self.critic_2.state_dict())
        self.critic_2_optimizer = Adam(self.critic_2.parameters(),
                                       lr=self.config.learning_rate)

        self.MseLoss = nn.MSELoss()

        if self.config.use_cuda:
            self.cuda()
Example #12
    def __init__(self, n_agents, act_spcs, ob_spcs, writer, args):
        self.args = args
        self.memory = ReplayBuffer(args.buffer_length, n_agents, device)
        # self.memory = ReplayMemory(args.buffer_length, n_agents, device)
        self.use_maddpg = args.algo == "maddpg"
        self.use_sac = args.use_sac
        self.use_td3 = args.use_td3
        self.use_single_q = args.single_q
        self.all_obs = args.all_obs
        self.n_agents = n_agents
        self.act_spcs = act_spcs
        self.ob_spcs = ob_spcs
        qnet_actspcs = [np.sum(self.act_spcs) if self.use_maddpg else self.act_spcs[i]
                        for i in range(n_agents)]
        qnet_obspcs = [np.sum(self.ob_spcs) if self.use_maddpg else self.ob_spcs[i]
                       for i in range(n_agents)]
        if self.use_sac and not self.use_td3:
            self.agents = [SAC_agent(self.act_spcs[i],
                                     qnet_obspcs[i] if self.all_obs else self.ob_spcs[i],
                                     qnet_obspcs[i], qnet_actspcs[i])
                           for i in range(n_agents)]
        elif self.use_td3:
            self.agents = [TD3_agent(self.act_spcs[i],
                                     qnet_obspcs[i] if self.all_obs else self.ob_spcs[i],
                                     qnet_obspcs[i], qnet_actspcs[i])
                           for i in range(n_agents)]
        else:
            self.agents = [DDPG_agent(self.act_spcs[i],
                                      qnet_obspcs[i] if self.all_obs else self.ob_spcs[i],
                                      qnet_obspcs[i], qnet_actspcs[i])
                           for i in range(n_agents)]
        self.n_steps = 0
        self.n_updates = 0
        self.writer = writer
        self.criterion = nn.MSELoss()
        self.sac_alpha = args.sac_alpha
        self.agent_actions = [[] for i in range(self.n_agents)]
Example #13
    def __init__(self,
                 alpha=0.0003,
                 beta=0.0003,
                 input_dims=[8],
                 env=None,
                 gamma=0.99,
                 n_actions=2,
                 max_size=1000000,
                 tau=0.005,
                 layer1_size=256,
                 layer2_size=256,
                 batch_size=256,
                 reward_scale=2):
        self.gamma = gamma
        self.tau = tau
        self.memory = ReplayBuffer(max_size, input_dims, n_actions)
        self.batch_size = batch_size
        self.n_actions = n_actions

        self.actor = ActorNetwork(n_actions=n_actions,
                                  name='actor',
                                  max_action=env.action_space.high)
        self.critic_1 = CriticNetwork(n_actions=n_actions, name='critic_1')
        self.critic_2 = CriticNetwork(n_actions=n_actions, name='critic_2')
        self.value = ValueNetwork(name='value')
        self.target_value = ValueNetwork(name='target_value')

        self.actor.compile(optimizer=Adam(learning_rate=alpha))
        self.critic_1.compile(optimizer=Adam(learning_rate=beta))
        self.critic_2.compile(optimizer=Adam(learning_rate=beta))
        self.value.compile(optimizer=Adam(learning_rate=beta))
        self.target_value.compile(optimizer=Adam(learning_rate=beta))

        self.scale = reward_scale
        self.update_network_parameters(tau=1)
Example #14
    def __init__(self,
                 num_agents,
                 x_dim,
                 o_dim,
                 a_dim,
                 lr_actor=1e-3,
                 lr_critic=1e-3,
                 batch_size=16,
                 gamma=0.99,
                 tau=0.001,
                 buffer_size=int(1e5),
                 seed=1234):

        self.num_agents = num_agents
        self.x_dim = x_dim
        self.o_dim = o_dim
        self.a_dim = a_dim
        self.lr_actor = lr_actor
        self.lr_critic = lr_critic
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.buffer_size = buffer_size
        self.seed = seed

        self.buffer = ReplayBuffer(buffer_size, batch_size, seed)
        self.agents = [DDPGAgent(num_agents, id, x_dim, o_dim, a_dim, lr_actor, lr_critic, gamma, seed) \
                       for id in range(num_agents)]
Example #15
    def __init__(self,
                 input_dims,
                 alpha=0.001,
                 beta=0.002,
                 env=None,
                 gamma=0.99,
                 n_actions=2,
                 max_size=1000000,
                 tau=0.005,
                 fc1=400,
                 fc2=300,
                 batch_size=64,
                 noise=0.1):
        self.gamma = gamma
        self.tau = tau
        self.memory = ReplayBuffer(max_size, input_dims, n_actions)
        self.batch_size = batch_size
        self.n_actions = n_actions
        self.noise = noise
        self.max_action = env.action_space.high[0]
        self.min_action = env.action_space.low[0]

        self.actor = ActorNetwork(n_actions=n_actions, name='actor')
        self.critic = CriticNetwork(name='critic')
        self.target_actor = ActorNetwork(n_actions=n_actions,
                                         name='target_actor')
        self.target_critic = CriticNetwork(name='target_critic')

        self.actor.compile(optimizer=Adam(learning_rate=alpha))
        self.critic.compile(optimizer=Adam(learning_rate=beta))
        self.target_actor.compile(optimizer=Adam(learning_rate=alpha))
        self.target_critic.compile(optimizer=Adam(learning_rate=beta))

        self.update_network_parameters(tau=1)
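
Here too, `update_network_parameters(tau=1)` hard-copies the freshly built actor/critic weights into their targets, and later calls with the default `tau=0.005` blend them. A hedged Keras-style sketch of such an update (an assumption, not necessarily this project's exact code):

import tensorflow as tf

def polyak_update(target_model, source_model, tau):
    # target_weights <- tau * source_weights + (1 - tau) * target_weights
    new_weights = [tau * w + (1.0 - tau) * tw
                   for w, tw in zip(source_model.get_weights(),
                                    target_model.get_weights())]
    target_model.set_weights(new_weights)

# hypothetical usage: polyak_update(self.target_actor, self.actor, tau)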
Example #16
    def __init__(self,
                 alpha=3e-4,
                 beta=3e-4,
                 input_dims=[8],
                 env=None,
                 gamma=0.99,
                 n_actions=2,
                 max_size=1000000,
                 tau=5e-3,
                 fc1_dim=256,
                 fc2_dim=256,
                 batch_size=256,
                 reward_scale=2):
        self.gamma = gamma
        self.tau = tau
        self.memory = ReplayBuffer(max_size, input_dims, n_actions)
        self.batch_size = batch_size
        self.n_actions = n_actions

        self.actor = ActorNetwork(alpha, input_dims, n_actions,
                                  env.action_space.high)
        self.critic1 = CriticNetwork(beta,
                                     input_dims,
                                     n_actions,
                                     name='critic1')
        self.critic2 = CriticNetwork(beta,
                                     input_dims,
                                     n_actions,
                                     name='critic2')
        self.value = ValueNetwork(beta, input_dims, name='value')
        self.target_value = ValueNetwork(beta, input_dims, name='target_value')

        self.scale = reward_scale
        self.update_network_parameters(tau=1)
Example #17
	def __init__(self, actor, critic, reward_fun, gamma=0.99, tau=0.005, # policy_noise=0.2, noise_clip=0.5,
				 policy_freq=2, max_buffer_size=1e6, batch_size=64, lr=3e-4
				 ):

		self._actor = actor
		self._actor_target = copy.deepcopy(self._actor)
		self._actor_optimizer = torch.optim.Adam(self._actor.parameters(), lr=lr)

		self._critic = critic
		self._critic_target = copy.deepcopy(self._critic)
		self._critic_loss = nn.MSELoss()
		self._critic_optimizer = torch.optim.Adam(self._critic.parameters(), lr=lr)

		self.reward_fun = reward_fun

		self._gamma = gamma
		self._tau = tau
		self._policy_freq = policy_freq

		self._rbuffer_max_size = max_buffer_size
		self._replay_buffer = ReplayBuffer(self._rbuffer_max_size)
		self._batch_size = batch_size

		self._steps = 0
		self._run = 0
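
The `policy_freq=2` default suggests a TD3-style delayed policy update, where the actor is refreshed only every `policy_freq` critic updates. A hedged sketch of that step, assuming the critic is callable as `critic(states, actions)`:

def delayed_actor_update(actor, critic, actor_optimizer, states, step, policy_freq):
    # TD3-style delay: skip the actor update on most steps
    if step % policy_freq != 0:
        return
    actor_loss = -critic(states, actor(states)).mean()  # ascend Q(s, pi(s))
    actor_optimizer.zero_grad()
    actor_loss.backward()
    actor_optimizer.step()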
Example #18
    def __init__(self, args):
        """
            init function
            Args:
                - args: class with args parameter
        """
        self.state_size = args.state_size
        self.action_size = args.action_size
        self.bs = args.bs
        self.gamma = args.gamma
        self.epsilon = args.epsilon
        self.tau = args.tau
        self.discrete = args.discrete
        self.randomer = OUNoise(args.action_size)
        self.buffer = ReplayBuffer(args.max_buff)

        self.actor = Actor(self.state_size, self.action_size)
        self.actor_target = Actor(self.state_size, self.action_size)
        self.actor_opt = AdamW(self.actor.parameters(), args.lr_actor)

        self.critic = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)
        self.critic_opt = AdamW(self.critic.parameters(), args.lr_critic)

        hard_update(self.actor_target, self.actor)
        hard_update(self.critic_target, self.critic)
Example #19
    def __init__(self,
                 state_size=37,
                 action_size=4,
                 gamma=0.99,
                 lr=0.001,
                 update_every=5):
        """
        Initializes the model.
        ----
        @param:
        1. state_size: dimension of the input state.
        2. action_size: number of possible actions.
        3. gamma: discount factor for returns.
        4. lr: learning rate for the model.
        5. update_every: update target_model every X time-steps.
        """
        self.state_size = state_size
        self.action_size = action_size

        self.gamma = gamma  # discount factor for returns

        # Q-networks: the two DQNs (double Q-learning style, with a fixed Q-target network)
        self.qnetwork_local = DQNetwork()
        self.qnetwork_target = DQNetwork()

        # define the optimizer
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=lr)

        # replay memory
        self.memory = ReplayBuffer(BUFFER_SIZE, BATCH_SIZE)
        self.update_every = update_every
        self.target_update_counter = 0
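
`update_every` and `target_update_counter` point to a periodic hard update of the target network rather than Polyak averaging. A hedged sketch of such a check (the helper name is an assumption):

def maybe_update_target(qnetwork_local, qnetwork_target, counter, update_every):
    # hard-copy local weights into the target every `update_every` learning steps
    counter += 1
    if counter % update_every == 0:
        qnetwork_target.load_state_dict(qnetwork_local.state_dict())
    return counter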
Example #20
class MADDPGAgent():
    def __init__(self, seed, checkpoint_filename=None):

        self.memory = ReplayBuffer(BUFFER_SIZE, BATCH_SIZE, DEVICE, seed)
        self.t = 0

        self.agents = [
            DDPGAgent(index, NUM_AGENTS, seed, DEVICE)
            for index in range(NUM_AGENTS)
        ]

        if checkpoint_filename:
            for i, to_load in enumerate(self.agents):
                f"{os.getcwd()}/models/{checkpoint_filename}_actor_{i}.weights"
                actor_file = torch.load(
                    f"{os.getcwd()}/models/{checkpoint_filename}_actor_{i}.weights",
                    map_location=DEVICE)
                critic_file = torch.load(
                    f"{os.getcwd()}/models/{checkpoint_filename}_critic_{i}.weights",
                    map_location=DEVICE)
                to_load.actor_local.load_state_dict(actor_file)
                to_load.actor_target.load_state_dict(actor_file)
                to_load.critic_local.load_state_dict(critic_file)
                to_load.critic_target.load_state_dict(critic_file)
            print(f'Files loaded with prefix {checkpoint_filename}')

    def step(self, all_states, all_actions, all_rewards, all_next_states,
             all_dones):
        all_states = all_states.reshape(1, -1)
        all_next_states = all_next_states.reshape(1, -1)
        self.memory.add(all_states, all_actions, all_rewards, all_next_states,
                        all_dones)
        self.t = (self.t + 1) % UPDATE_FREQUENCY
        if self.t == 0 and (len(self.memory) > BATCH_SIZE):
            experiences = [self.memory.sample() for _ in range(NUM_AGENTS)]
            self.learn(experiences, GAMMA)

    def act(self, all_states, random):
        all_actions = []
        for agent, state in zip(self.agents, all_states):
            action = agent.act(state, random=random)
            all_actions.append(action)
        return np.array(all_actions).reshape(1, -1)

    def learn(self, experiences, gamma):
        all_actions = []
        all_next_actions = []
        for i, agent in enumerate(self.agents):
            states, _, _, next_states, _ = experiences[i]
            agent_id = torch.tensor([i]).to(DEVICE)
            state = states.reshape(-1, 2, 24).index_select(1,
                                                           agent_id).squeeze(1)
            next_state = next_states.reshape(-1, 2, 24).index_select(
                1, agent_id).squeeze(1)
            all_actions.append(agent.actor_local(state))
            all_next_actions.append(agent.actor_target(next_state))
        for i, agent in enumerate(self.agents):
            agent.learn(i, experiences[i], gamma, all_next_actions,
                        all_actions)
Example #21
    def __init__(self, state_size, action_size, num_agents, random_seed):
        self.agents = [
            DDPGAgent(state_size, action_size, random_seed)
            for _ in range(num_agents)
        ]
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   device, random_seed)
        self.t_step = 0
Example #22
    def __init__(self, state_size, action_size, num_agents):
        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        self.whole_action_dim = self.action_size * self.num_agents
        self.memory = ReplayBuffer(BUFFER_SIZE, BATCH_SIZE)  # replay memory
        self.maddpg_agents = [DDPG(state_size, action_size, num_agents),
                              DDPG(state_size, action_size, num_agents)]  # create the agents
        self.episodes_before_training = EPISODES_BEFORE_TRAINING
Example #23
    def __init__(self,
                 state_size,
                 action_size,
                 num_agents=2,
                 eps_before_train=500,
                 gamma=0.99,
                 batch_size=128,
                 buffer_size=int(1e5),
                 lr_actor=1e-4,
                 lr_critic=1e-3,
                 weight_decay=0,
                 tau=1e-3,
                 noise_weight=1.0,
                 noise_decay=0.999998,
                 noise_min=1e-3,
                 seed=0,
                 device="cuda:0"):

        # (self, state_size, action_size, num_agents=2, random_seed=1, lr_actor=2e-4, lr_critic=1e-3,
        #          weight_decay=0, tau=2e-3, device=device)

        torch.manual_seed(seed)
        np.random.seed(seed)

        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        self.action_dim = action_size * num_agents

        self.eps_before_train = eps_before_train
        self.gamma = gamma
        self.batch_size = batch_size
        self.buffer_size = buffer_size

        self.lr_actor = lr_actor
        self.lr_critic = lr_critic
        self.weight_decay = weight_decay
        self.tau = tau

        self.noise_weight = noise_weight
        self.noise_decay = noise_decay
        self.noise_min = noise_min
        self.device = device
        self.i_episode = 0

        self.agents = [
            DDPG(self.state_size,
                 self.action_size,
                 self.num_agents,
                 random_seed=2 * i * seed,
                 lr_actor=self.lr_actor,
                 lr_critic=self.lr_critic,
                 weight_decay=self.weight_decay,
                 tau=self.tau,
                 device=self.device) for i in range(self.num_agents)
        ]
        self.memory = ReplayBuffer(self.action_size, self.buffer_size, seed)
Example #24
    def __init__(self, env_name,
                 num_quantiles=32, fqf_factor=0.000001*0.1, ent_coef=0.001,
                 state_embedding_dim=3136, quantile_embedding_dim=64,
                 gamma=0.99, n_frames=4, batch_size=32,
                 buffer_size=1000000,
                 update_period=8,
                 target_update_period=10000):

        self.env_name = env_name

        self.num_quantiles = num_quantiles

        self.state_embedding_dim = state_embedding_dim

        self.quantile_embedding_dim = quantile_embedding_dim

        self.k = 1.0

        self.ent_coef = ent_coef

        self.n_frames = n_frames

        self.action_space = gym.make(self.env_name).action_space.n

        self.fqf_network = FQFNetwork(
            action_space=self.action_space,
            num_quantiles=self.num_quantiles,
            state_embedding_dim=self.state_embedding_dim,
            quantile_embedding_dim=self.quantile_embedding_dim)

        self.target_fqf_network = FQFNetwork(
            action_space=self.action_space,
            num_quantiles=self.num_quantiles,
            state_embedding_dim=self.state_embedding_dim,
            quantile_embedding_dim=self.quantile_embedding_dim)

        self._define_network()

        self.optimizer = tf.keras.optimizers.Adam(
            learning_rate=0.00015, epsilon=0.01/32)

        #: fpl; fraction proposal layer
        self.optimizer_fpl = tf.keras.optimizers.Adam(
            learning_rate=0.00005 * fqf_factor,
            epsilon=0.0003125)

        self.gamma = gamma

        self.replay_buffer = ReplayBuffer(max_len=buffer_size)

        self.batch_size = batch_size

        self.update_period = update_period

        self.target_update_period = target_update_period

        self.steps = 0
Example #25
    def reset_parameters(self):
        self._q.reset_parameters()
        self._q_target.reset_parameters()

        hard_update(self._q_target, self._q)

        self._pi.reset_parameters()
        if self._use_rbuffer:
            self._replay_buffer = ReplayBuffer(self._rbuffer_max_size)
Example #26
    def __init__(
        self,
        policy,
        env,
        gamma,
        learning_rate,
        buffer_size,
        exploration_type,
        exploration_frac,
        exploration_ep,
        exploration_initial_eps,
        exploration_final_eps,
        double_q,
        policy_kwargs,
        seed,
        intent
        ):

        super(TabularRLModel, self).__init__(
            policy=policy,
            env=env, 
            policy_kwargs=policy_kwargs,
            seed=seed)

        self.gamma = gamma
        self.learning_rate = learning_rate
        self.buffer_size = buffer_size
        self.exploration_type = exploration_type
        self.exploration_frac = exploration_frac
        self.exploration_ep = exploration_ep
        self.exploration_initial_eps = exploration_initial_eps
        self.exploration_final_eps = exploration_final_eps
        self.double_q = double_q
        self.intent = intent
        # self.policy_kwargs = {} if policy_kwargs is None else policy_kwargs
        # self.policy = policy(self.observation_space, self.action_space, intent=True)

        self.policy_kwargs = get_default_args(self.policy)
        self.policy_kwargs['ob_space'] = self.observation_space
        self.policy_kwargs['ac_space'] = self.action_space
        self.policy_kwargs['intent'] = self.intent

        if policy_kwargs is not None:
            for key, val in policy_kwargs.items():
                self.policy_kwargs[key] = val
        # self.policy_kwargs['transform_func'] = transform_func

        # if policy_kwargs is None:
        #     self.policy = policy(self.observation_space, self.action_space,
        #                          intent=True, device=self.device)
        # else:
        self.policy = policy(**self.policy_kwargs)

        if self.buffer_size is None:
            self.replay_buffer = None
        else:
            self.replay_buffer = ReplayBuffer(self.buffer_size)
Example #27
File: agent.py Project: mfsuve/TORCS-RL
    def __init__(self, load_from=None, will_train=True):
        self.env = TorcsEnv(
            path='/usr/local/share/games/torcs/config/raceman/quickrace.xml')
        self.args = SAC_args()
        self.buffer = ReplayBuffer(self.args.buffer_size)

        action_dim = self.env.action_space.shape[0]
        state_dim = self.env.observation_space.shape[0]
        hidden_dim = 256

        self.action_size = action_dim
        self.state_size = state_dim

        self.value_net = ValueNetwork(state_dim,
                                      hidden_dim).to(self.args.device)
        self.target_value_net = ValueNetwork(state_dim,
                                             hidden_dim).to(self.args.device)

        self.soft_q_net1 = SoftQNetwork(state_dim, action_dim,
                                        hidden_dim).to(self.args.device)
        self.soft_q_net2 = SoftQNetwork(state_dim, action_dim,
                                        hidden_dim).to(self.args.device)

        self.policy_net = PolicyNetwork(state_dim, action_dim,
                                        hidden_dim).to(self.args.device)

        self.target_value_net.load_state_dict(self.value_net.state_dict())

        self.value_criterion = nn.MSELoss()
        self.soft_q_loss1 = nn.MSELoss()
        self.soft_q_loss2 = nn.MSELoss()

        self.value_opt = optim.Adam(self.value_net.parameters(),
                                    lr=self.args.lr)
        self.soft_q_opt1 = optim.Adam(self.soft_q_net1.parameters(),
                                      lr=self.args.lr)
        self.soft_q_opt2 = optim.Adam(self.soft_q_net2.parameters(),
                                      lr=self.args.lr)
        self.policy_opt = optim.Adam(self.policy_net.parameters(),
                                     lr=self.args.lr)

        if will_train:
            current_time = time.strftime('%d-%b-%y-%H.%M.%S', time.localtime())
            self.plot_folder = f'plots/{current_time}'
            self.model_save_folder = f'model/{current_time}'
            make_sure_dir_exists(self.plot_folder)
            make_sure_dir_exists(self.model_save_folder)
            self.cp = Checkpoint(self.model_save_folder)

        if load_from is not None:
            try:
                self.load_checkpoint(load_from)
            except FileNotFoundError:
                print(f'{load_from} not found. Running default.')
        else:
            print('Starting from scratch.')
Example #28
    def __init__(self, config: Config):
        self.config = config
        self.is_training = True
        self.buffer = ReplayBuffer(self.config.max_buff)

        self.model = DQN(self.config.state_dim, self.config.action_dim)  # moved to GPU below if use_cuda
        self.model_optim = Adam(self.model.parameters(), lr=self.config.learning_rate)

        if self.config.use_cuda:
            self.cuda()
Example #29
    def test_append(self):
        count = 100
        start_length = count // 2
        max_length = count
        buffer = ReplayBuffer(start_length=start_length, max_length=max_length)
        for append_count in range(max_length * 2):
            buffer.append(append_count)
            self.assertEqual(len(buffer.buffer), min(append_count + 1, max_length),
                             "Incorrect buffer size.")
            self.assertEqual(buffer.buffer[0], max(0, (append_count + 1) - max_length),
                             "Incorrect first value.")
            self.assertEqual(buffer.buffer[-1], append_count,
                             "Incorrect last value.")
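
A deque-backed buffer exposing a public `buffer` attribute and an `append` method satisfies these assertions. A minimal hedged sketch consistent with the test (the `ready` and `sample` helpers are assumptions about how `start_length` would be used):

import random
from collections import deque

class ReplayBuffer:
    def __init__(self, start_length=None, max_length=10000):
        # deque(maxlen=...) gives exactly the ring-buffer behaviour the test checks
        self.start_length = start_length if start_length is not None else 0
        self.max_length = max_length
        self.buffer = deque(maxlen=max_length)

    def append(self, experience):
        self.buffer.append(experience)

    def ready(self):
        # hypothetical: enough experiences collected to begin sampling
        return len(self.buffer) >= self.start_length

    def sample(self, batch_size):
        return random.sample(self.buffer, batch_size)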
Example #30
class MADDPG():
    def __init__(self, num_agents, state_size, action_size, random_seed):
        """ Initialize multiple Agents each with a Actor-Critic network
            but they share the replay buffer to learn from experience
        """
        self.num_agents = num_agents
        self.agents = []
        for _ in range(num_agents):
            agent = Agent(state_size, action_size, random_seed)
            self.agents.append(agent)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)

    def act(self, states, add_noise=True):
        clipped_actions = []
        for state, agent in zip(states, self.agents):
            clipped_actions.append(agent.act(state, add_noise))
        return clipped_actions

    def reset(self):
        for agent in self.agents:
            agent.reset()

    def learn(self, experiences, gamma):
        for agent in self.agents:
            agent.learn(experiences, gamma)

    def saveCheckPoints(self):
        for i, agent in enumerate(self.agents):
            torch.save(agent.actor_local.state_dict(),
                       f"checkpoints/actor_agent_{i}.pth")
            torch.save(agent.critic_local.state_dict(),
                       f"checkpoints/critic_agent_{i}.pth")

    def loadCheckPoints(self):
        for i, agent in enumerate(self.agents):
            agent.actor_local.load_state_dict(
                torch.load(f"checkpoints/actor_agent_{i}.pth"))
            agent.critic_local.load_state_dict(
                torch.load(f"checkpoints/critic_agent_{i}.pth"))

    def step(self, states, actions, rewards, next_states, dones):
        """Save experience in replay memory, and use random sample from buffer to learn."""

        # Save experience / reward
        for i in range(self.num_agents):
            self.memory.add(states[i], actions[i], rewards[i], next_states[i],
                            dones[i])
        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE:
            for agent in self.agents:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)