def test_replay_buffer_from_state_wrong_type():
    # Assign
    buffer = ReplayBuffer(batch_size=5, buffer_size=20)
    state = buffer.get_state()
    state.type = "WrongType"

    # Act
    with pytest.raises(ValueError):
        ReplayBuffer.from_state(state=state)
def test_replay_buffer_add():
    # Assign
    buffer = ReplayBuffer(batch_size=5, buffer_size=5)

    # Act
    assert len(buffer) == 0
    for sars in generate_sample_SARS(1, dict_type=True):
        buffer.add(**sars)

    # Assert
    assert len(buffer) == 1
def test_replay_buffer_from_state_without_data():
    # Assign
    buffer = ReplayBuffer(batch_size=5, buffer_size=20)
    state = buffer.get_state()

    # Act
    new_buffer = ReplayBuffer.from_state(state=state)

    # Assert
    assert new_buffer == buffer
    assert new_buffer.buffer_size == state.buffer_size
    assert new_buffer.batch_size == state.batch_size
    assert new_buffer.data == []
def test_replay_buffer_get_state_empty():
    # Assign
    batch_size = 10
    buffer_size = 20
    buffer = ReplayBuffer(batch_size=batch_size, buffer_size=buffer_size)

    # Act
    state: BufferState = buffer.get_state()

    # Assert
    assert state.type == ReplayBuffer.type
    assert state.batch_size == batch_size
    assert state.buffer_size == buffer_size
    assert state.data is None
def test_buffer_add():
    # Assign
    buffer = ReplayBuffer(batch_size=5, buffer_size=5)

    # Act
    assert len(buffer) == 0
    (state, actions, reward, next_state, done) = generate_sample_SARS()
    buffer.add_sars(state=state, action=actions, reward=reward, next_state=next_state, done=done)

    # Assert
    assert len(buffer) == 1
def test_replay_buffer_from_state_with_data():
    # Assign
    buffer = ReplayBuffer(batch_size=5, buffer_size=20)
    buffer = populate_buffer(buffer, 30)
    state = buffer.get_state()

    # Act
    new_buffer = ReplayBuffer.from_state(state=state)

    # Assert
    assert new_buffer == buffer
    assert new_buffer.buffer_size == state.buffer_size
    assert new_buffer.batch_size == state.batch_size
    assert new_buffer.data == state.data
    assert len(buffer.data) == state.buffer_size
def test_buffer_size():
    # Assign
    buffer_size = 10
    buffer = ReplayBuffer(batch_size=5, buffer_size=buffer_size)

    # Act
    for (state, action, reward, next_state, done) in generate_sample_SARS(buffer_size + 1):
        buffer.add(state=state, action=action, reward=reward, next_state=next_state, done=done)

    # Assert
    assert len(buffer) == buffer_size
def test_buffer_size():
    # Assign
    buffer_size = 10
    buffer = ReplayBuffer(batch_size=5, buffer_size=buffer_size)

    # Act
    for _ in range(buffer_size + 2):
        (state, action, reward, next_state, done) = generate_sample_SARS()
        buffer.add_sars(state=state, action=action, reward=reward, next_state=next_state, done=done)

    # Assert
    assert len(buffer) == buffer_size
def __init__(self, env, state_size: int, action_size: int, agents_number: int, config: Dict, **kwargs):
    self.env = env
    self.state_size = state_size
    self.action_size = action_size
    self.agents_number = agents_number

    hidden_layers = config.get('hidden_layers', (256, 128))
    noise_scale = float(config.get('noise_scale', 0.2))
    noise_sigma = float(config.get('noise_sigma', 0.1))
    actor_lr = float(config.get('actor_lr', 1e-3))
    critic_lr = float(config.get('critic_lr', 1e-3))
    self.maddpg_agent = [
        DDPGAgent(agents_number * state_size, action_size,
                  hidden_layers=hidden_layers, actor_lr=actor_lr, critic_lr=critic_lr,
                  noise_scale=noise_scale, noise_sigma=noise_sigma)
        for _ in range(agents_number)
    ]

    self.gamma: float = float(config.get('gamma', 0.99))
    self.tau: float = float(config.get('tau', 0.002))
    self.gradient_clip: Optional[float] = config.get('gradient_clip')

    self.batch_size: int = int(config.get('batch_size', 64))
    self.buffer_size = int(config.get('buffer_size', int(1e6)))
    self.buffer = ReplayBuffer(self.batch_size, self.buffer_size)

    self.warm_up: int = int(config.get('warm_up', 1e3))
    self.update_freq: int = int(config.get('update_freq', 2))
    self.number_updates: int = int(config.get('number_updates', 2))

    self.critic = CriticBody(agents_number * state_size, agents_number * action_size,
                             hidden_layers=hidden_layers).to(DEVICE)
    self.target_critic = CriticBody(agents_number * state_size, agents_number * action_size,
                                    hidden_layers=hidden_layers).to(DEVICE)
    self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=critic_lr)
    hard_update(self.target_critic, self.critic)

    self.reset()
def test_replay_buffer_dump():
    import torch

    # Assign
    filled_buffer = 8
    prop_keys = ["state", "action", "reward", "next_state"]
    buffer = ReplayBuffer(batch_size=5, buffer_size=10)
    for sars in generate_sample_SARS(filled_buffer):
        buffer.add(
            state=torch.tensor(sars[0]), action=[sars[1]], reward=sars[2],
            next_state=torch.tensor(sars[3]), done=sars[4],
        )

    # Act
    dump = list(buffer.dump_buffer())

    # Assert
    assert len(dump) == filled_buffer
    assert all([key in dump[0] for key in prop_keys])
def test_replay_buffer_dump_serializable():
    import json
    import torch

    # Assign
    filled_buffer = 8
    buffer = ReplayBuffer(batch_size=5, buffer_size=10)
    for sars in generate_sample_SARS(filled_buffer, dict_type=True):
        sars['state'] = torch.tensor(sars['state'])
        sars['next_state'] = torch.tensor(sars['next_state'])
        buffer.add(**sars)

    # Act
    dump = list(buffer.dump_buffer(serialize=True))

    # Assert
    ser_dump = json.dumps(dump)
    assert isinstance(ser_dump, str)
    assert json.loads(ser_dump) == dump
def test_replay_buffer_load_json_dump():
    # Assign
    prop_keys = ["state", "action", "reward", "next_state", "done"]
    buffer = ReplayBuffer(batch_size=20, buffer_size=20)
    ser_buffer = []
    for sars in generate_sample_SARS(10, dict_type=True):
        ser_buffer.append(Experience(**sars))

    # Act
    buffer.load_buffer(ser_buffer)

    # Assert
    samples = buffer.data
    assert len(buffer) == 10
    assert len(samples) == 10
    for sample in samples:
        assert all([hasattr(sample, key) for key in prop_keys])
        assert all([isinstance(getattr(sample, key), list) for key in prop_keys])
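# A minimal sketch (not part of the test suite) of how the dump and load features above
# can be combined to persist a buffer to disk and restore it. It assumes, as in the tests,
# that `dump_buffer(serialize=True)` yields JSON-friendly dicts and that `Experience(**entry)`
# accepts those same keys; the function name and file path are illustrative only.
def example_replay_buffer_json_roundtrip(path="buffer_dump.json"):
    import json

    buffer = ReplayBuffer(batch_size=5, buffer_size=10)
    for sars in generate_sample_SARS(8, dict_type=True):
        buffer.add(**sars)

    # Persist the serialized experiences.
    with open(path, "w") as f:
        json.dump(list(buffer.dump_buffer(serialize=True)), f)

    # Restore into a fresh buffer by rebuilding Experience objects.
    with open(path) as f:
        entries = json.load(f)
    new_buffer = ReplayBuffer(batch_size=5, buffer_size=10)
    new_buffer.load_buffer([Experience(**entry) for entry in entries])
    assert len(new_buffer) == len(buffer)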
def from_state(state: BufferState) -> BufferBase:
    if state.type == ReplayBuffer.type:
        return ReplayBuffer.from_state(state)
    elif state.type == PERBuffer.type:
        return PERBuffer.from_state(state)
    elif state.type == NStepBuffer.type:
        return NStepBuffer.from_state(state)
    elif state.type == RolloutBuffer.type:
        return RolloutBuffer.from_state(state)
    else:
        raise ValueError(f"Buffer state contains unsupported buffer type: '{state.type}'")
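# A short sketch of the dispatch above in use (illustrative; in the library this lives on a
# buffer factory, here it is called directly as defined). The buffer type recorded in the
# BufferState decides which concrete class reconstructs the buffer.
def example_buffer_state_roundtrip():
    buffer = ReplayBuffer(batch_size=5, buffer_size=20)
    state = buffer.get_state()     # state.type == ReplayBuffer.type
    restored = from_state(state)   # dispatches to ReplayBuffer.from_state
    assert isinstance(restored, ReplayBuffer)
    assert restored.batch_size == state.batch_size
    assert restored.buffer_size == state.buffer_size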
def test_replay_buffer_sample():
    # Assign
    batch_size = 5
    buffer = ReplayBuffer(batch_size=batch_size, buffer_size=10)

    # Act
    for (state, actions, reward, next_state, done) in generate_sample_SARS(20):
        buffer.add(state=state, action=actions, reward=reward, next_state=next_state, done=done)

    # Assert
    samples = buffer.sample()  # (states, actions, rewards, next_states, dones)
    assert len(samples["state"]) == batch_size
    assert len(samples["action"]) == batch_size
    assert len(samples["reward"]) == batch_size
    assert len(samples["next_state"]) == batch_size
    assert len(samples["done"]) == batch_size
def test_buffer_sample():
    # Assign
    batch_size = 5
    buffer = ReplayBuffer(batch_size=batch_size, buffer_size=10)

    # Act
    for _ in range(20):
        (state, actions, reward, next_state, done) = generate_sample_SARS()
        buffer.add_sars(state=state, action=actions, reward=reward, next_state=next_state, done=done)

    # Assert
    (states, actions, rewards, next_states, dones) = buffer.sample_sars()
    assert len(states) == batch_size
    assert len(actions) == batch_size
    assert len(rewards) == batch_size
    assert len(next_states) == batch_size
    assert len(dones) == batch_size
def test_replay_buffer_get_state_with_data():
    # Assign
    batch_size = 10
    buffer_size = 20
    buffer = ReplayBuffer(batch_size=batch_size, buffer_size=buffer_size)
    for (state, action, reward, next_state, done) in generate_sample_SARS(buffer_size + 1):
        buffer.add(state=state, action=action, reward=reward, next_state=next_state, done=done)

    # Act
    state: BufferState = buffer.get_state()
    state_data: BufferState = buffer.get_state(include_data=True)

    # Assert
    assert state == state_data, "Default option is to include all data"
    assert state.type == ReplayBuffer.type
    assert state.batch_size == batch_size
    assert state.buffer_size == buffer_size
    assert len(state.data) == buffer_size
    for d in state.data:
        b_keys = ("state", "action", "reward", "done", "next_state")
        assert all([k in b_keys for k in d.get_dict().keys()])
def test_replay_buffer_get_state_without_data():
    # Assign
    batch_size = 10
    buffer_size = 20
    buffer = ReplayBuffer(batch_size=batch_size, buffer_size=buffer_size)
    for (state, action, reward, next_state, done) in generate_sample_SARS(buffer_size + 1):
        buffer.add(state=state, action=action, reward=reward, next_state=next_state, done=done)

    # Act
    state: BufferState = buffer.get_state(include_data=False)

    # Assert
    assert state.type == ReplayBuffer.type
    assert state.batch_size == batch_size
    assert state.buffer_size == buffer_size
    assert state.data is None
def __init__(self, state_size: int, action_size: int,
             hidden_layers=(300, 200), config=None, device=None, **kwargs):
    config = config if config is not None else {}
    self.device = device if device is not None else DEVICE
    self.state_size = state_size
    self.action_size = action_size
    self.iteration = 0

    self.actor_lr = float(config.get('actor_lr', 3e-4))
    self.critic_lr = float(config.get('critic_lr', 1e-3))
    self.gamma: float = float(config.get("gamma", 0.99))
    self.ppo_ratio_clip: float = float(config.get("ppo_ratio_clip", 0.2))

    self.rollout_length: int = int(config.get("rollout_length", 48))  # "Much less than the episode length"
    self.batch_size: int = int(config.get("batch_size", self.rollout_length // 2))
    self.number_updates: int = int(config.get("number_updates", 5))
    self.entropy_weight: float = float(config.get("entropy_weight", 0.0005))
    self.value_loss_weight: float = float(config.get("value_loss_weight", 1.0))

    self.local_memory_buffer = {}
    self.memory = ReplayBuffer(batch_size=self.batch_size, buffer_size=self.rollout_length)

    self.action_scale: float = float(config.get("action_scale", 1))
    self.action_min: float = float(config.get("action_min", -2))
    self.action_max: float = float(config.get("action_max", 2))
    self.max_grad_norm_actor: float = float(config.get("max_grad_norm_actor", 100.0))
    self.max_grad_norm_critic: float = float(config.get("max_grad_norm_critic", 100.0))

    self.hidden_layers = config.get('hidden_layers', hidden_layers)
    self.actor = ActorBody(state_size, action_size, self.hidden_layers).to(self.device)
    self.critic = CriticBody(state_size, action_size, self.hidden_layers).to(self.device)
    self.policy = GaussianPolicy(action_size).to(self.device)

    self.actor_params = list(self.actor.parameters()) + [self.policy.std]
    self.critic_params = self.critic.parameters()
    self.actor_opt = torch.optim.SGD(self.actor_params, lr=self.actor_lr)
    self.critic_opt = torch.optim.SGD(self.critic_params, lr=self.critic_lr)
class DDPGAgent(AgentType):
    """
    Deep Deterministic Policy Gradients (DDPG).

    Instead of popular Ornstein-Uhlenbeck (OU) process for noise
    this agent uses Gaussian noise.
    """

    name = "DDPG"

    def __init__(self, state_size: int, action_size: int, hidden_layers: Sequence[int] = (128, 128),
                 actor_lr: float = 2e-3, actor_lr_decay: float = 0,
                 critic_lr: float = 2e-3, critic_lr_decay: float = 0,
                 noise_scale: float = 0.2, noise_sigma: float = 0.1,
                 clip: Tuple[int, int] = (-1, 1), config=None, device=None, **kwargs):
        config = config if config is not None else dict()
        self.device = device if device is not None else DEVICE

        # Reason sequence initiation.
        self.hidden_layers = config.get('hidden_layers', hidden_layers)
        self.actor = ActorBody(state_size, action_size, hidden_layers=hidden_layers).to(self.device)
        self.critic = CriticBody(state_size, action_size, hidden_layers=hidden_layers).to(self.device)
        self.target_actor = ActorBody(state_size, action_size, hidden_layers=hidden_layers).to(self.device)
        self.target_critic = CriticBody(state_size, action_size, hidden_layers=hidden_layers).to(self.device)

        # Noise sequence initiation
        self.noise = GaussianNoise(shape=(action_size,), mu=1e-8, sigma=noise_sigma,
                                   scale=noise_scale, device=device)

        # Target sequence initiation
        hard_update(self.target_actor, self.actor)
        hard_update(self.target_critic, self.critic)

        # Optimization sequence initiation.
        self.actor_optimizer = Adam(self.actor.parameters(), lr=actor_lr, weight_decay=actor_lr_decay)
        self.critic_optimizer = Adam(self.critic.parameters(), lr=critic_lr, weight_decay=critic_lr_decay)
        self.action_min = clip[0]
        self.action_max = clip[1]
        self.action_scale = config.get('action_scale', 1)

        self.gamma: float = float(config.get('gamma', 0.99))
        self.tau: float = float(config.get('tau', 0.02))
        self.batch_size: int = int(config.get('batch_size', 64))
        self.buffer_size: int = int(config.get('buffer_size', int(1e6)))
        self.buffer = ReplayBuffer(self.batch_size, self.buffer_size)

        self.warm_up: int = int(config.get('warm_up', 0))
        self.update_freq: int = int(config.get('update_freq', 1))
        self.number_updates: int = int(config.get('number_updates', 1))

        # Breath, my child.
        self.reset_agent()
        self.iteration = 0

    def reset_agent(self) -> None:
        self.actor.reset_parameters()
        self.critic.reset_parameters()
        self.target_actor.reset_parameters()
        self.target_critic.reset_parameters()

    def act(self, obs, noise: float = 0.0):
        with torch.no_grad():
            obs = torch.tensor(obs.astype(np.float32)).to(self.device)
            action = self.actor(obs)
            action += noise * self.noise.sample()
            return self.action_scale * torch.clamp(action, self.action_min, self.action_max).cpu().numpy().astype(np.float32)

    def target_act(self, obs, noise: float = 0.0):
        with torch.no_grad():
            obs = torch.tensor(obs).to(self.device)
            action = self.target_actor(obs) + noise * self.noise.sample()
            return torch.clamp(action, self.action_min, self.action_max).cpu().numpy().astype(np.float32)

    def step(self, state, action, reward, next_state, done):
        self.iteration += 1
        self.buffer.add(state=state, action=action, reward=reward, next_state=next_state, done=done)

        if self.iteration < self.warm_up:
            return

        if len(self.buffer) > self.batch_size and (self.iteration % self.update_freq) == 0:
            for _ in range(self.number_updates):
                self.learn(self.buffer.sample_sars())

    def learn(self, samples):
        """Update the critics and actors of all the agents."""
        states, actions, rewards, next_states, dones = samples
        rewards = rewards.to(self.device)
        dones = dones.type(torch.int).to(self.device)
        states = states.to(self.device)
        next_states = next_states.to(self.device)
        actions = actions.to(self.device)

        # Critic loss
        next_actions = self.target_actor(next_states)
        Q_target_next = self.target_critic(next_states, next_actions)
        Q_target = rewards + (self.gamma * Q_target_next * (1 - dones))
        Q_expected = self.critic(states, actions)
        critic_loss = mse_loss(Q_expected, Q_target)

        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        # torch.nn.utils.clip_grad_norm_(self.critic.parameters(), self.gradient_clip)
        self.critic_optimizer.step()
        self.critic_loss = critic_loss.item()

        # Compute actor loss
        pred_actions = self.actor(states)
        actor_loss = -self.critic(states, pred_actions).mean()
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()
        self.actor_loss = actor_loss.item()

        soft_update(self.target_actor, self.actor, self.tau)
        soft_update(self.target_critic, self.critic, self.tau)

    def describe_agent(self) -> Tuple[Any, Any, Any, Any]:
        """Returns network's weights in order: Actor, TargetActor, Critic, TargetCritic."""
        return (self.actor.state_dict(), self.target_actor.state_dict(),
                self.critic.state_dict(), self.target_critic.state_dict())

    def log_writer(self, writer, episode):
        writer.add_scalar("loss/actor", self.actor_loss, episode)
        writer.add_scalar("loss/critic", self.critic_loss, episode)

    def save_state(self, path: str):
        agent_state = dict(
            actor=self.actor.state_dict(),
            target_actor=self.target_actor.state_dict(),
            critic=self.critic.state_dict(),
            target_critic=self.target_critic.state_dict(),
        )
        torch.save(agent_state, path)

    def load_state(self, path: str):
        agent_state = torch.load(path)
        self.actor.load_state_dict(agent_state['actor'])
        self.critic.load_state_dict(agent_state['critic'])
        self.target_actor.load_state_dict(agent_state['target_actor'])
        self.target_critic.load_state_dict(agent_state['target_critic'])
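# A brief usage sketch for the agent above (not from the library's docs). It assumes a
# continuous-control task with 8 state dimensions and 2 action dimensions and feeds random
# observations; fewer steps are taken than `batch_size`, so no learning update is triggered.
def example_ddpg_interaction():
    import numpy as np

    agent = DDPGAgent(state_size=8, action_size=2)
    state = np.random.randn(8).astype(np.float32)
    for _ in range(10):
        action = agent.act(state, noise=0.1)            # noisy action for exploration
        next_state = np.random.randn(8).astype(np.float32)
        reward, done = 0.0, False                       # placeholder environment feedback
        agent.step(state, action, reward, next_state, done)
        state = next_state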
class MADDPGAgent(MultiAgentType):

    name = "MADDPG"

    def __init__(self, state_size: int, action_size: int, num_agents: int, **kwargs):
        """Initiation of the Multi Agent DDPG.

        All keywords are also passed to DDPG agents.

        Parameters:
            state_size (int): Dimensionality of the state.
            action_size (int): Dimensionality of the action.
            num_agents (int): Number of agents.

        Keyword Arguments:
            hidden_layers (tuple of ints): Shape for fully connected hidden layers.
            noise_scale (float): Default: 1.0. Noise amplitude.
            noise_sigma (float): Default: 0.5. Noise variance.
            actor_lr (float): Default: 0.001. Learning rate for actor network.
            critic_lr (float): Default: 0.001. Learning rate for critic network.
            gamma (float): Default: 0.99. Discount value.
            tau (float): Default: 0.02. Soft copy value.
            gradient_clip (optional float): Max norm for learning gradient. If None then no clip.
            batch_size (int): Number of samples per learning.
            buffer_size (int): Number of previous samples to remember.
            warm_up (int): Number of samples to see before start learning.
            update_freq (int): How many samples between learning sessions.
            number_updates (int): How many learning cycles per learning session.

        """
        self.device = self._register_param(kwargs, "device", DEVICE, update=True)
        self.state_size: int = state_size
        self.action_size = action_size
        self.num_agents: int = num_agents
        self.agent_names: List[str] = kwargs.get("agent_names", list(map(str, range(self.num_agents))))

        hidden_layers = to_numbers_seq(self._register_param(kwargs, 'hidden_layers', (100, 100), update=True))
        noise_scale = float(self._register_param(kwargs, 'noise_scale', 0.5))
        noise_sigma = float(self._register_param(kwargs, 'noise_sigma', 1.0))
        actor_lr = float(self._register_param(kwargs, 'actor_lr', 3e-4))
        critic_lr = float(self._register_param(kwargs, 'critic_lr', 3e-4))

        self.agents: Dict[str, DDPGAgent] = OrderedDict({
            agent_name: DDPGAgent(
                state_size, action_size,
                actor_lr=actor_lr, critic_lr=critic_lr,
                noise_scale=noise_scale, noise_sigma=noise_sigma,
                **kwargs,
            ) for agent_name in self.agent_names
        })

        self.gamma = float(self._register_param(kwargs, 'gamma', 0.99))
        self.tau = float(self._register_param(kwargs, 'tau', 0.02))
        self.gradient_clip: Optional[float] = self._register_param(kwargs, 'gradient_clip')

        self.batch_size = int(self._register_param(kwargs, 'batch_size', 64))
        self.buffer_size = int(self._register_param(kwargs, 'buffer_size', int(1e6)))
        self.buffer = ReplayBuffer(self.batch_size, self.buffer_size)

        self.warm_up = int(self._register_param(kwargs, 'warm_up', 0))
        self.update_freq = int(self._register_param(kwargs, 'update_freq', 1))
        self.number_updates = int(self._register_param(kwargs, 'number_updates', 1))

        self.critic = CriticBody(num_agents*state_size, num_agents*action_size, hidden_layers=hidden_layers).to(self.device)
        self.target_critic = CriticBody(num_agents*state_size, num_agents*action_size, hidden_layers=hidden_layers).to(self.device)
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=critic_lr)
        hard_update(self.target_critic, self.critic)

        self._step_data = {}
        self._loss_critic: float = float('inf')
        self._loss_actor: Dict[str, float] = {name: float('inf') for name in self.agent_names}

        self.reset()

    @property
    def loss(self) -> Dict[str, float]:
        out = {}
        for agent_name, agent in self.agents.items():
            for loss_name, loss_value in agent.loss.items():
                out[f"{agent_name}_{loss_name}"] = loss_value
            out[f"{agent_name}_actor"] = self._loss_actor[agent_name]
        out["critic"] = self._loss_critic
        return out

    def reset(self):
        self.iteration = 0
        self.reset_agents()

    def reset_agents(self):
        for agent in self.agents.values():
            agent.reset_agent()
        self.critic.reset_parameters()
        self.target_critic.reset_parameters()

    def step(self, agent_name: str, state: StateType, action: ActionType, reward, next_state, done) -> None:
        self._step_data[agent_name] = dict(
            state=state, action=action, reward=reward, next_state=next_state, done=done,
        )

    def commit(self):
        step_data = defaultdict(list)
        for agent in self.agents:
            agent_data = self._step_data[agent]
            step_data['state'].append(agent_data['state'])
            step_data['action'].append(agent_data['action'])
            step_data['reward'].append(agent_data['reward'])
            step_data['next_state'].append(agent_data['next_state'])
            step_data['done'].append(agent_data['done'])

        self.buffer.add(**step_data)
        self._step_data = {}
        self.iteration += 1

        if self.iteration < self.warm_up:
            return

        if len(self.buffer) > self.batch_size and (self.iteration % self.update_freq) == 0:
            for _ in range(self.number_updates):
                samples = self.buffer.sample()
                for agent_name in self.agents:
                    self.learn(samples, agent_name)
                self.update_targets()

    @torch.no_grad()
    def act(self, agent_name: str, states: List[StateType], noise: float = 0.0) -> List[float]:
        """Get an action for the given agent based on provided states.

        Parameters:
            agent_name: Name of the agent for which the action is requested.
            states: List of states for the agent. Positions need to be consistent.
            noise: Scale for the noise to include.

        Returns:
            action: List of action values the agent wants to perform.

        """
        tensor_states = torch.tensor(states).reshape(-1)
        agent = self.agents[agent_name]
        action = agent.act(tensor_states, noise)
        return action

    def __flatten_actions(self, actions):
        return actions.view(-1, self.num_agents*self.action_size)

    def learn(self, experiences, agent_name: str) -> None:
        """Update the critics and actors of all the agents."""
        # TODO: Just look at this mess.
        agent_number = list(self.agents).index(agent_name)
        agent_rewards = to_tensor(experiences['reward']).select(1, agent_number).unsqueeze(-1).float().to(self.device)
        agent_dones = to_tensor(experiences['done']).select(1, agent_number).unsqueeze(-1).type(torch.int).to(self.device)
        states = to_tensor(experiences['state']).to(self.device).view(self.batch_size, self.num_agents, self.state_size)
        actions = to_tensor(experiences['action']).to(self.device)
        next_states = to_tensor(experiences['next_state']).float().to(self.device).view(self.batch_size, self.num_agents, self.state_size)
        flat_states = states.view(-1, self.num_agents*self.state_size)
        flat_next_states = next_states.view(-1, self.num_agents*self.state_size)
        flat_actions = actions.view(-1, self.num_agents*self.action_size)
        assert agent_rewards.shape == agent_dones.shape == (self.batch_size, 1)
        assert states.shape == next_states.shape == (self.batch_size, self.num_agents, self.state_size)
        assert actions.shape == (self.batch_size, self.num_agents, self.action_size)
        assert flat_actions.shape == (self.batch_size, self.num_agents*self.action_size)

        agent = self.agents[agent_name]

        next_actions = actions.detach().clone()
        next_actions.data[:, agent_number] = agent.target_actor(next_states[:, agent_number, :])
        assert next_actions.shape == (self.batch_size, self.num_agents, self.action_size)

        # Critic loss
        Q_target_next = self.target_critic(flat_next_states, self.__flatten_actions(next_actions))
        Q_target = agent_rewards + (self.gamma * Q_target_next * (1 - agent_dones))
        Q_expected = self.critic(flat_states, flat_actions)
        loss_critic = F.mse_loss(Q_expected, Q_target)

        # Minimize the loss
        self.critic_optimizer.zero_grad()
        loss_critic.backward()
        if self.gradient_clip:
            nn.utils.clip_grad_norm_(self.critic.parameters(), self.gradient_clip)
        self.critic_optimizer.step()
        self._loss_critic = float(loss_critic.mean().item())

        # Compute actor loss
        pred_actions = actions.detach().clone()
        # pred_actions.data[:, agent_number] = agent.actor(flat_states)
        pred_actions.data[:, agent_number] = agent.actor(states[:, agent_number, :])
        loss_actor = -self.critic(flat_states, self.__flatten_actions(pred_actions)).mean()
        agent.actor_optimizer.zero_grad()
        loss_actor.backward()
        agent.actor_optimizer.step()
        self._loss_actor[agent_name] = loss_actor.mean().item()

    def update_targets(self):
        """Soft update targets."""
        for agent in self.agents.values():
            soft_update(agent.target_actor, agent.actor, self.tau)
        soft_update(self.target_critic, self.critic, self.tau)

    def log_metrics(self, data_logger: DataLogger, step: int, full_log: bool = False):
        data_logger.log_value('loss/critic', self._loss_critic, step)
        for agent_name, agent in self.agents.items():
            data_logger.log_values_dict(f"{agent_name}/loss", agent.loss, step)

    def get_state(self) -> Dict[str, dict]:
        """Returns agents' internal states."""
        agents_state = {}
        agents_state['config'] = self._config
        for agent_name, agent in self.agents.items():
            agents_state[agent_name] = {"state": agent.state_dict(), "config": agent._config}
        return agents_state

    def save_state(self, path: str):
        """Saves current state of the Multi Agent instance and all related agents.

        All states are stored via PyTorch's :func:`save <torch.save>` function.

        Parameters:
            path: (str) String path to a location where the state is stored.

        """
        agents_state = self.get_state()
        torch.save(agents_state, path)

    def load_state(self, *, path: Optional[str] = None, agent_state: Optional[dict] = None) -> None:
        """Loads the state into the Multi Agent.
        The state can be provided either via path to a file that contains the state,
        see :meth:`save_state <self.save_state>`, or directly via `agent_state`.

        Parameters:
            path: (str) A path where the state was saved via `save_state`.
            agent_state: (dict) Already loaded state kept in memory.

        """
        if path is None and agent_state is None:
            raise ValueError("Either `path` or `agent_state` must be provided to load agent's state.")
        if path is not None and agent_state is None:
            agent_state = torch.load(path)
        self._config = agent_state.get('config', {})
        self.__dict__.update(**self._config)
        for agent_name, agent in self.agents.items():
            _agent_state = agent_state[agent_name]
            agent.load_state(agent_state=_agent_state["state"])
            agent._config = _agent_state['config']
            agent.__dict__.update(**agent._config)

    def seed(self, seed: int) -> None:
        for agent in self.agents.values():
            agent.seed(seed)

    def state_dict(self) -> Dict[str, Any]:
        return {name: agent.state_dict() for (name, agent) in self.agents.items()}
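# A short usage sketch of the multi-agent loop above (illustrative only). It assumes two
# agents observing 4-dimensional states and producing 2-dimensional actions; rewards and
# observations are random placeholders, and fewer steps than `batch_size` are committed,
# so no learning update fires. The function name and save path are hypothetical.
def example_maddpg_interaction(save_path="maddpg_state.pt"):
    import random

    ma_agent = MADDPGAgent(state_size=4, action_size=2, num_agents=2, agent_names=["a", "b"])
    for _ in range(5):
        for name in ma_agent.agents:
            state = [random.random() for _ in range(4)]
            action = ma_agent.act(name, state)
            next_state = [random.random() for _ in range(4)]
            ma_agent.step(name, state, action, reward=0.0, next_state=next_state, done=False)
        ma_agent.commit()  # aggregates per-agent step data into one buffer entry

    ma_agent.save_state(save_path)  # persists per-agent networks and configs via torch.save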
class DDPGAgent(AgentBase):
    """
    Deep Deterministic Policy Gradients (DDPG).

    Instead of popular Ornstein-Uhlenbeck (OU) process for noise
    this agent uses Gaussian noise.
    """

    name = "DDPG"

    def __init__(self, state_size: int, action_size: int,
                 noise_scale: float = 0.2, noise_sigma: float = 0.1, **kwargs):
        """
        Parameters:
            state_size: Number of input dimensions.
            action_size: Number of output dimensions.
            noise_scale (float): Added noise amplitude. Default: 0.2.
            noise_sigma (float): Added noise variance. Default: 0.1.

        Keyword parameters:
            hidden_layers (tuple of ints): Tuple defining hidden dimensions in fully connected nets. Default: (64, 64).
            gamma (float): Discount value. Default: 0.99.
            tau (float): Soft-copy factor. Default: 0.02.
            actor_lr (float): Learning rate for the actor (policy). Default: 0.0003.
            critic_lr (float): Learning rate for the critic (value function). Default: 0.0003.
            max_grad_norm_actor (float): Maximum norm value for actor gradient. Default: 10.
            max_grad_norm_critic (float): Maximum norm value for critic gradient. Default: 10.
            batch_size (int): Number of samples used in learning. Default: 64.
            buffer_size (int): Maximum number of samples to store. Default: 1e6.
            warm_up (int): Number of samples to observe before starting any learning step. Default: 0.
            update_freq (int): Number of steps between each learning step. Default: 1.
            number_updates (int): How many times to use learning step in the learning phase. Default: 1.
            action_min (float): Minimum returned action value. Default: -1.
            action_max (float): Maximum returned action value. Default: 1.
            action_scale (float): Multiplier value for action. Default: 1.

        """
        super().__init__(**kwargs)
        self.device = self._register_param(kwargs, "device", DEVICE)
        self.state_size = state_size
        self.action_size = action_size

        # Reason sequence initiation.
        hidden_layers = to_numbers_seq(self._register_param(kwargs, 'hidden_layers', (64, 64)))
        self.actor = ActorBody(state_size, action_size, hidden_layers=hidden_layers, gate_out=torch.tanh).to(self.device)
        self.critic = CriticBody(state_size, action_size, hidden_layers=hidden_layers).to(self.device)
        self.target_actor = ActorBody(state_size, action_size, hidden_layers=hidden_layers, gate_out=torch.tanh).to(self.device)
        self.target_critic = CriticBody(state_size, action_size, hidden_layers=hidden_layers).to(self.device)

        # Noise sequence initiation
        self.noise = GaussianNoise(shape=(action_size,), mu=1e-8, sigma=noise_sigma, scale=noise_scale, device=self.device)

        # Target sequence initiation
        hard_update(self.target_actor, self.actor)
        hard_update(self.target_critic, self.critic)

        # Optimization sequence initiation.
        self.actor_lr = float(self._register_param(kwargs, 'actor_lr', 3e-4))
        self.critic_lr = float(self._register_param(kwargs, 'critic_lr', 3e-4))
        self.actor_optimizer = Adam(self.actor.parameters(), lr=self.actor_lr)
        self.critic_optimizer = Adam(self.critic.parameters(), lr=self.critic_lr)
        self.max_grad_norm_actor = float(self._register_param(kwargs, "max_grad_norm_actor", 10.0))
        self.max_grad_norm_critic = float(self._register_param(kwargs, "max_grad_norm_critic", 10.0))

        self.action_min = float(self._register_param(kwargs, 'action_min', -1))
        self.action_max = float(self._register_param(kwargs, 'action_max', 1))
        self.action_scale = float(self._register_param(kwargs, 'action_scale', 1))

        self.gamma = float(self._register_param(kwargs, 'gamma', 0.99))
        self.tau = float(self._register_param(kwargs, 'tau', 0.02))
        self.batch_size = int(self._register_param(kwargs, 'batch_size', 64))
        self.buffer_size = int(self._register_param(kwargs, 'buffer_size', int(1e6)))
        self.buffer = ReplayBuffer(self.batch_size, self.buffer_size)

        self.warm_up = int(self._register_param(kwargs, 'warm_up', 0))
        self.update_freq = int(self._register_param(kwargs, 'update_freq', 1))
        self.number_updates = int(self._register_param(kwargs, 'number_updates', 1))

        # Breath, my child.
        self.reset_agent()
        self.iteration = 0
        self._loss_actor = 0.
        self._loss_critic = 0.

    def reset_agent(self) -> None:
        self.actor.reset_parameters()
        self.critic.reset_parameters()
        self.target_actor.reset_parameters()
        self.target_critic.reset_parameters()

    @property
    def loss(self) -> Dict[str, float]:
        return {'actor': self._loss_actor, 'critic': self._loss_critic}

    @loss.setter
    def loss(self, value):
        if isinstance(value, dict):
            self._loss_actor = value['actor']
            self._loss_critic = value['critic']
        else:
            self._loss_actor = value
            self._loss_critic = value

    def __eq__(self, o: object) -> bool:
        return super().__eq__(o) \
            and self._config == o._config \
            and self.buffer == o.buffer \
            and self.get_network_state() == o.get_network_state()

    @torch.no_grad()
    def act(self, obs, noise: float = 0.0) -> List[float]:
        """Acting on the observations. Returns action.

        Returns:
            action: (list float) Action values.

        """
        obs = to_tensor(obs).float().to(self.device)
        action = self.actor(obs)
        action += noise * self.noise.sample()
        action = torch.clamp(action * self.action_scale, self.action_min, self.action_max)
        return action.cpu().numpy().tolist()

    def step(self, state, action, reward, next_state, done) -> None:
        self.iteration += 1
        self.buffer.add(state=state, action=action, reward=reward, next_state=next_state, done=done)

        if self.iteration < self.warm_up:
            return

        if len(self.buffer) > self.batch_size and (self.iteration % self.update_freq) == 0:
            for _ in range(self.number_updates):
                self.learn(self.buffer.sample())

    def compute_value_loss(self, states, actions, next_states, rewards, dones):
        next_actions = self.target_actor.act(next_states)
        assert next_actions.shape == actions.shape
        Q_target_next = self.target_critic.act(next_states, next_actions)
        Q_target = rewards + self.gamma * Q_target_next * (1 - dones)
        Q_expected = self.critic(states, actions)
        assert Q_expected.shape == Q_target.shape == Q_target_next.shape
        return mse_loss(Q_expected, Q_target)

    def compute_policy_loss(self, states):
        """Compute Policy loss based on provided states.

        Loss = Mean(-Q(s, _a) ), where _a is actor's estimate based on state, _a = Actor(s).
""" pred_actions = self.actor(states) return -self.critic(states, pred_actions).mean() def learn(self, experiences) -> None: """Update critics and actors""" rewards = to_tensor(experiences['reward']).float().to( self.device).unsqueeze(1) dones = to_tensor(experiences['done']).type(torch.int).to( self.device).unsqueeze(1) states = to_tensor(experiences['state']).float().to(self.device) actions = to_tensor(experiences['action']).to(self.device) next_states = to_tensor(experiences['next_state']).float().to( self.device) assert rewards.shape == dones.shape == (self.batch_size, 1) assert states.shape == next_states.shape == (self.batch_size, self.state_size) assert actions.shape == (self.batch_size, self.action_size) # Value (critic) optimization loss_critic = self.compute_value_loss(states, actions, next_states, rewards, dones) self.critic_optimizer.zero_grad() loss_critic.backward() nn.utils.clip_grad_norm_(self.critic.parameters(), self.max_grad_norm_critic) self.critic_optimizer.step() self._loss_critic = float(loss_critic.item()) # Policy (actor) optimization loss_actor = self.compute_policy_loss(states) self.actor_optimizer.zero_grad() loss_actor.backward() nn.utils.clip_grad_norm_(self.actor.parameters(), self.max_grad_norm_actor) self.actor_optimizer.step() self._loss_actor = loss_actor.item() # Soft update target weights soft_update(self.target_actor, self.actor, self.tau) soft_update(self.target_critic, self.critic, self.tau) def state_dict(self) -> Dict[str, dict]: """Describes agent's networks. Returns: state: (dict) Provides actors and critics states. """ return { "actor": self.actor.state_dict(), "target_actor": self.target_actor.state_dict(), "critic": self.critic.state_dict(), "target_critic": self.target_critic.state_dict() } def log_metrics(self, data_logger: DataLogger, step: int, full_log: bool = False): data_logger.log_value("loss/actor", self._loss_actor, step) data_logger.log_value("loss/critic", self._loss_critic, step) if full_log: for idx, layer in enumerate(self.actor.layers): if hasattr(layer, "weight"): data_logger.create_histogram(f"actor/layer_weights_{idx}", layer.weight, step) if hasattr(layer, "bias") and layer.bias is not None: data_logger.create_histogram(f"actor/layer_bias_{idx}", layer.bias, step) for idx, layer in enumerate(self.critic.layers): if hasattr(layer, "weight"): data_logger.create_histogram(f"critic/layer_weights_{idx}", layer.weight, step) if hasattr(layer, "bias") and layer.bias is not None: data_logger.create_histogram(f"critic/layer_bias_{idx}", layer.bias, step) def get_state(self) -> AgentState: return AgentState( model=self.name, state_space=self.state_size, action_space=self.action_size, config=self._config, buffer=copy.deepcopy(self.buffer.get_state()), network=copy.deepcopy(self.get_network_state()), ) def get_network_state(self) -> NetworkState: net = dict( actor=self.actor.state_dict(), target_actor=self.target_actor.state_dict(), critic=self.critic.state_dict(), target_critic=self.target_critic.state_dict(), ) return NetworkState(net=net) @staticmethod def from_state(state: AgentState) -> AgentBase: config = copy.copy(state.config) config.update({ 'state_size': state.state_space, 'action_size': state.action_space }) agent = DDPGAgent(**config) if state.network is not None: agent.set_network(state.network) if state.buffer is not None: agent.set_buffer(state.buffer) return agent def set_buffer(self, buffer_state: BufferState) -> None: self.buffer = BufferFactory.from_state(buffer_state) def set_network(self, network_state: 
        self.actor.load_state_dict(copy.deepcopy(network_state.net['actor']))
        self.target_actor.load_state_dict(network_state.net['target_actor'])
        self.critic.load_state_dict(network_state.net['critic'])
        self.target_critic.load_state_dict(network_state.net['target_critic'])

    def save_state(self, path: str) -> None:
        agent_state = self.get_state()
        torch.save(agent_state, path)

    def load_state(self, *, path: Optional[str] = None, agent_state: Optional[dict] = None):
        if path is None and agent_state is None:
            raise ValueError("Either `path` or `agent_state` must be provided to load agent's state.")
        if path is not None and agent_state is None:
            agent_state = torch.load(path)
        self._config = agent_state.get('config', {})
        self.__dict__.update(**self._config)
        self.actor.load_state_dict(agent_state['actor'])
        self.critic.load_state_dict(agent_state['critic'])
        self.target_actor.load_state_dict(agent_state['target_actor'])
        self.target_critic.load_state_dict(agent_state['target_critic'])
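# A small sketch (not part of the library) of the intended state round trip: serialize an
# agent with `get_state` and rebuild an equivalent one with `DDPGAgent.from_state`. Sizes
# are arbitrary; equality is expected to hold via the `__eq__` defined on the class, which
# compares config, buffer, and network state.
def example_ddpg_state_roundtrip():
    agent = DDPGAgent(state_size=4, action_size=2)
    state = agent.get_state()               # AgentState with config, buffer, and network
    restored = DDPGAgent.from_state(state)  # rebuilds networks and buffer from the state
    assert restored == agent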
class MADDPGAgent(AgentType):

    name = "MADDPG"

    def __init__(self, env, state_size: int, action_size: int, agents_number: int, config: Dict, **kwargs):
        self.env = env
        self.state_size = state_size
        self.action_size = action_size
        self.agents_number = agents_number

        hidden_layers = config.get('hidden_layers', (256, 128))
        noise_scale = float(config.get('noise_scale', 0.2))
        noise_sigma = float(config.get('noise_sigma', 0.1))
        actor_lr = float(config.get('actor_lr', 1e-3))
        critic_lr = float(config.get('critic_lr', 1e-3))
        self.maddpg_agent = [
            DDPGAgent(agents_number * state_size, action_size,
                      hidden_layers=hidden_layers, actor_lr=actor_lr, critic_lr=critic_lr,
                      noise_scale=noise_scale, noise_sigma=noise_sigma)
            for _ in range(agents_number)
        ]

        self.gamma: float = float(config.get('gamma', 0.99))
        self.tau: float = float(config.get('tau', 0.002))
        self.gradient_clip: Optional[float] = config.get('gradient_clip')

        self.batch_size: int = int(config.get('batch_size', 64))
        self.buffer_size = int(config.get('buffer_size', int(1e6)))
        self.buffer = ReplayBuffer(self.batch_size, self.buffer_size)

        self.warm_up: int = int(config.get('warm_up', 1e3))
        self.update_freq: int = int(config.get('update_freq', 2))
        self.number_updates: int = int(config.get('number_updates', 2))

        self.critic = CriticBody(agents_number * state_size, agents_number * action_size,
                                 hidden_layers=hidden_layers).to(DEVICE)
        self.target_critic = CriticBody(agents_number * state_size, agents_number * action_size,
                                        hidden_layers=hidden_layers).to(DEVICE)
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=critic_lr)
        hard_update(self.target_critic, self.critic)

        self.reset()

    def reset(self):
        self.iteration = 0
        self.reset_agents()

    def reset_agents(self):
        for agent in self.maddpg_agent:
            agent.reset_agent()
        self.critic.reset_parameters()
        self.target_critic.reset_parameters()

    def step(self, state, action, reward, next_state, done) -> None:
        self.iteration += 1
        self.buffer.add_sars(state=state, action=action, reward=reward, next_state=next_state, done=done)

        if self.iteration < self.warm_up:
            return

        if len(self.buffer) > self.batch_size and (self.iteration % self.update_freq) == 0:
            for _ in range(self.number_updates):
                for agent_number in range(self.agents_number):
                    batch = self.buffer.sample_sars()
                    self.learn(batch, agent_number)
                # self.update_targets()

    def act(self, states, noise=0.0):
        """Get actions from all agents in the MADDPG object."""
        tensor_states = torch.tensor(states)
        with torch.no_grad():
            actions = []
            for agent in self.maddpg_agent:
                agent.actor.eval()
                actions += agent.act(tensor_states, noise)
                agent.actor.train()
        return torch.stack(actions)

    def __flatten_actions(self, actions):
        return actions.view(-1, self.agents_number * self.action_size)

    def learn(self, samples, agent_number: int) -> None:
        """Update the critics and actors of all the agents."""
        action_offset = agent_number * self.action_size

        # No need to flip since there are no parallel agents.
        states, actions, rewards, next_states, dones = samples
        flat_states = states.view(-1, self.agents_number * self.state_size)
        flat_next_states = next_states.view(-1, self.agents_number * self.state_size)
        flat_actions = actions.view(-1, self.agents_number * self.action_size)
        agent_rewards = rewards.select(1, agent_number).view(-1, 1).detach()
        agent_dones = dones.select(1, agent_number).view(-1, 1).detach()

        agent = self.maddpg_agent[agent_number]
        next_actions = actions.detach().clone()
        next_actions.data[:, action_offset:action_offset + self.action_size] = agent.target_actor(flat_next_states)

        # Critic loss
        Q_target_next = self.target_critic(flat_next_states, self.__flatten_actions(next_actions))
        Q_target = agent_rewards + (self.gamma * Q_target_next * (1 - agent_dones))
        Q_expected = self.critic(flat_states, flat_actions)
        critic_loss = F.mse_loss(Q_expected, Q_target)

        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        if self.gradient_clip:
            torch.nn.utils.clip_grad_norm_(self.critic.parameters(), self.gradient_clip)
        self.critic_optimizer.step()
        self.critic_loss = critic_loss.mean().item()

        # Compute actor loss
        pred_actions = actions.detach().clone()
        pred_actions.data[:, action_offset:action_offset + self.action_size] = agent.actor(flat_states)
        actor_loss = -self.critic(flat_states, self.__flatten_actions(pred_actions)).mean()
        agent.actor_optimizer.zero_grad()
        actor_loss.backward()
        agent.actor_optimizer.step()
        self.actor_loss = actor_loss.mean().item()

        soft_update(agent.target_actor, agent.actor, self.tau)
        soft_update(self.target_critic, self.critic, self.tau)

    def update_targets(self):
        """Soft update targets."""
        for ddpg_agent in self.maddpg_agent:
            soft_update(ddpg_agent.target_actor, ddpg_agent.actor, self.tau)
        soft_update(self.target_critic, self.critic, self.tau)

    def log_writer(self, writer, episode):
        writer.add_scalar("loss/actor", self.actor_loss, episode)
        writer.add_scalar("loss/critic", self.critic_loss, episode)
def __init__(self, state_size: int, action_size: int, num_agents: int, **kwargs):
    """Initiation of the Multi Agent DDPG.

    All keywords are also passed to DDPG agents.

    Parameters:
        state_size (int): Dimensionality of the state.
        action_size (int): Dimensionality of the action.
        num_agents (int): Number of agents.

    Keyword Arguments:
        hidden_layers (tuple of ints): Shape for fully connected hidden layers.
        noise_scale (float): Default: 1.0. Noise amplitude.
        noise_sigma (float): Default: 0.5. Noise variance.
        actor_lr (float): Default: 0.001. Learning rate for actor network.
        critic_lr (float): Default: 0.001. Learning rate for critic network.
        gamma (float): Default: 0.99. Discount value.
        tau (float): Default: 0.02. Soft copy value.
        gradient_clip (optional float): Max norm for learning gradient. If None then no clip.
        batch_size (int): Number of samples per learning.
        buffer_size (int): Number of previous samples to remember.
        warm_up (int): Number of samples to see before start learning.
        update_freq (int): How many samples between learning sessions.
        number_updates (int): How many learning cycles per learning session.

    """
    self.device = self._register_param(kwargs, "device", DEVICE, update=True)
    self.state_size: int = state_size
    self.action_size = action_size
    self.num_agents: int = num_agents
    self.agent_names: List[str] = kwargs.get("agent_names", list(map(str, range(self.num_agents))))

    hidden_layers = to_numbers_seq(self._register_param(kwargs, 'hidden_layers', (100, 100), update=True))
    noise_scale = float(self._register_param(kwargs, 'noise_scale', 0.5))
    noise_sigma = float(self._register_param(kwargs, 'noise_sigma', 1.0))
    actor_lr = float(self._register_param(kwargs, 'actor_lr', 3e-4))
    critic_lr = float(self._register_param(kwargs, 'critic_lr', 3e-4))

    self.agents: Dict[str, DDPGAgent] = OrderedDict({
        agent_name: DDPGAgent(
            state_size, action_size,
            actor_lr=actor_lr, critic_lr=critic_lr,
            noise_scale=noise_scale, noise_sigma=noise_sigma,
            **kwargs,
        ) for agent_name in self.agent_names
    })

    self.gamma = float(self._register_param(kwargs, 'gamma', 0.99))
    self.tau = float(self._register_param(kwargs, 'tau', 0.02))
    self.gradient_clip: Optional[float] = self._register_param(kwargs, 'gradient_clip')

    self.batch_size = int(self._register_param(kwargs, 'batch_size', 64))
    self.buffer_size = int(self._register_param(kwargs, 'buffer_size', int(1e6)))
    self.buffer = ReplayBuffer(self.batch_size, self.buffer_size)

    self.warm_up = int(self._register_param(kwargs, 'warm_up', 0))
    self.update_freq = int(self._register_param(kwargs, 'update_freq', 1))
    self.number_updates = int(self._register_param(kwargs, 'number_updates', 1))

    self.critic = CriticBody(num_agents*state_size, num_agents*action_size, hidden_layers=hidden_layers).to(self.device)
    self.target_critic = CriticBody(num_agents*state_size, num_agents*action_size, hidden_layers=hidden_layers).to(self.device)
    self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=critic_lr)
    hard_update(self.target_critic, self.critic)

    self._step_data = {}
    self._loss_critic: float = float('inf')
    self._loss_actor: Dict[str, float] = {name: float('inf') for name in self.agent_names}

    self.reset()
def __init__(self, state_size: int, action_size: int,
             noise_scale: float = 0.2, noise_sigma: float = 0.1, **kwargs):
    """
    Parameters:
        state_size (int): Number of input dimensions.
        action_size (int): Number of output dimensions.
        noise_scale (float): Added noise amplitude. Default: 0.2.
        noise_sigma (float): Added noise variance. Default: 0.1.

    Keyword parameters:
        hidden_layers (tuple of ints): Tuple defining hidden dimensions in fully connected nets. Default: (128, 128).
        actor_lr (float): Learning rate for the actor (policy). Default: 0.003.
        critic_lr (float): Learning rate for the critic (value function). Default: 0.003.
        gamma (float): Discount value. Default: 0.99.
        tau (float): Soft-copy factor. Default: 0.02.
        actor_hidden_layers (tuple of ints): Shape of network for actor. Default: `hidden_layers`.
        critic_hidden_layers (tuple of ints): Shape of network for critic. Default: `hidden_layers`.
        max_grad_norm_actor (float): Maximum norm value for actor gradient. Default: 100.
        max_grad_norm_critic (float): Maximum norm value for critic gradient. Default: 100.
        batch_size (int): Number of samples used in learning. Default: 64.
        buffer_size (int): Maximum number of samples to store. Default: 1e6.
        warm_up (int): Number of samples to observe before starting any learning step. Default: 0.
        update_freq (int): Number of steps between each learning step. Default: 1.
        number_updates (int): How many times to use learning step in the learning phase. Default: 1.
        action_min (float): Minimum returned action value. Default: -1.
        action_max (float): Maximum returned action value. Default: 1.
        action_scale (float): Multiplier value for action. Default: 1.

    """
    super().__init__(**kwargs)
    self.device = self._register_param(kwargs, "device", DEVICE)  # Default device is CUDA if available

    # Reason sequence initiation.
    self.state_size = state_size
    self.action_size = action_size
    hidden_layers = to_numbers_seq(self._register_param(kwargs, 'hidden_layers', (128, 128)))
    self.actor = ActorBody(state_size, action_size, hidden_layers=hidden_layers).to(self.device)
    self.critic = DoubleCritic(state_size, action_size, CriticBody, hidden_layers=hidden_layers).to(self.device)
    self.target_actor = ActorBody(state_size, action_size, hidden_layers=hidden_layers).to(self.device)
    self.target_critic = DoubleCritic(state_size, action_size, CriticBody, hidden_layers=hidden_layers).to(self.device)

    # Noise sequence initiation
    # self.noise = GaussianNoise(shape=(action_size,), mu=1e-8, sigma=noise_sigma, scale=noise_scale, device=device)
    self.noise = OUProcess(shape=action_size, scale=noise_scale, sigma=noise_sigma, device=self.device)

    # Target sequence initiation
    hard_update(self.target_actor, self.actor)
    hard_update(self.target_critic, self.critic)

    # Optimization sequence initiation.
    actor_lr = float(self._register_param(kwargs, 'actor_lr', 3e-3))
    critic_lr = float(self._register_param(kwargs, 'critic_lr', 3e-3))
    self.actor_optimizer = AdamW(self.actor.parameters(), lr=actor_lr)
    self.critic_optimizer = AdamW(self.critic.parameters(), lr=critic_lr)
    self.max_grad_norm_actor: float = float(kwargs.get("max_grad_norm_actor", 100))
    self.max_grad_norm_critic: float = float(kwargs.get("max_grad_norm_critic", 100))

    self.action_min = float(self._register_param(kwargs, 'action_min', -1.))
    self.action_max = float(self._register_param(kwargs, 'action_max', 1.))
    self.action_scale = float(self._register_param(kwargs, 'action_scale', 1.))

    self.gamma = float(self._register_param(kwargs, 'gamma', 0.99))
    self.tau = float(self._register_param(kwargs, 'tau', 0.02))
    self.batch_size = int(self._register_param(kwargs, 'batch_size', 64))
    self.buffer_size = int(self._register_param(kwargs, 'buffer_size', int(1e6)))
    self.buffer = ReplayBuffer(self.batch_size, self.buffer_size)

    self.warm_up = int(self._register_param(kwargs, 'warm_up', 0))
    self.update_freq = int(self._register_param(kwargs, 'update_freq', 1))
    self.update_policy_freq = int(self._register_param(kwargs, 'update_policy_freq', 1))
    self.number_updates = int(self._register_param(kwargs, 'number_updates', 1))
    self.noise_reset_freq = int(self._register_param(kwargs, 'noise_reset_freq', 10000))

    # Breath, my child.
    self.reset_agent()
    self.iteration = 0
    self._loss_actor = 0.
    self._loss_critic = 0.
class TD3Agent(AgentBase):
    """
    Twin Delayed Deep Deterministic (TD3) Policy Gradient.

    In short, it's a slightly modified/improved version of the DDPG. Compared to the DDPG in this package,
    which uses Gaussian noise, this TD3 uses the Ornstein–Uhlenbeck process as the noise.
    """

    name = "TD3"

    def __init__(self, state_size: int, action_size: int,
                 noise_scale: float = 0.2, noise_sigma: float = 0.1, **kwargs):
        """
        Parameters:
            state_size (int): Number of input dimensions.
            action_size (int): Number of output dimensions.
            noise_scale (float): Added noise amplitude. Default: 0.2.
            noise_sigma (float): Added noise variance. Default: 0.1.

        Keyword parameters:
            hidden_layers (tuple of ints): Tuple defining hidden dimensions in fully connected nets. Default: (128, 128).
            actor_lr (float): Learning rate for the actor (policy). Default: 0.003.
            critic_lr (float): Learning rate for the critic (value function). Default: 0.003.
            gamma (float): Discount value. Default: 0.99.
            tau (float): Soft-copy factor. Default: 0.02.
            actor_hidden_layers (tuple of ints): Shape of network for actor. Default: `hidden_layers`.
            critic_hidden_layers (tuple of ints): Shape of network for critic. Default: `hidden_layers`.
            max_grad_norm_actor (float): Maximum norm value for actor gradient. Default: 100.
            max_grad_norm_critic (float): Maximum norm value for critic gradient. Default: 100.
            batch_size (int): Number of samples used in learning. Default: 64.
            buffer_size (int): Maximum number of samples to store. Default: 1e6.
            warm_up (int): Number of samples to observe before starting any learning step. Default: 0.
            update_freq (int): Number of steps between each learning step. Default: 1.
            number_updates (int): How many times to use learning step in the learning phase. Default: 1.
            action_min (float): Minimum returned action value. Default: -1.
            action_max (float): Maximum returned action value. Default: 1.
            action_scale (float): Multiplier value for action. Default: 1.

        """
        super().__init__(**kwargs)
        self.device = self._register_param(kwargs, "device", DEVICE)  # Default device is CUDA if available

        # Reason sequence initiation.
        self.state_size = state_size
        self.action_size = action_size
        hidden_layers = to_numbers_seq(self._register_param(kwargs, 'hidden_layers', (128, 128)))
        self.actor = ActorBody(state_size, action_size, hidden_layers=hidden_layers).to(self.device)
        self.critic = DoubleCritic(state_size, action_size, CriticBody, hidden_layers=hidden_layers).to(self.device)
        self.target_actor = ActorBody(state_size, action_size, hidden_layers=hidden_layers).to(self.device)
        self.target_critic = DoubleCritic(state_size, action_size, CriticBody, hidden_layers=hidden_layers).to(self.device)

        # Noise sequence initiation
        # self.noise = GaussianNoise(shape=(action_size,), mu=1e-8, sigma=noise_sigma, scale=noise_scale, device=device)
        self.noise = OUProcess(shape=action_size, scale=noise_scale, sigma=noise_sigma, device=self.device)

        # Target sequence initiation
        hard_update(self.target_actor, self.actor)
        hard_update(self.target_critic, self.critic)

        # Optimization sequence initiation.
        actor_lr = float(self._register_param(kwargs, 'actor_lr', 3e-3))
        critic_lr = float(self._register_param(kwargs, 'critic_lr', 3e-3))
        self.actor_optimizer = AdamW(self.actor.parameters(), lr=actor_lr)
        self.critic_optimizer = AdamW(self.critic.parameters(), lr=critic_lr)
        self.max_grad_norm_actor: float = float(kwargs.get("max_grad_norm_actor", 100))
        self.max_grad_norm_critic: float = float(kwargs.get("max_grad_norm_critic", 100))

        self.action_min = float(self._register_param(kwargs, 'action_min', -1.))
        self.action_max = float(self._register_param(kwargs, 'action_max', 1.))
        self.action_scale = float(self._register_param(kwargs, 'action_scale', 1.))

        self.gamma = float(self._register_param(kwargs, 'gamma', 0.99))
        self.tau = float(self._register_param(kwargs, 'tau', 0.02))
        self.batch_size = int(self._register_param(kwargs, 'batch_size', 64))
        self.buffer_size = int(self._register_param(kwargs, 'buffer_size', int(1e6)))
        self.buffer = ReplayBuffer(self.batch_size, self.buffer_size)

        self.warm_up = int(self._register_param(kwargs, 'warm_up', 0))
        self.update_freq = int(self._register_param(kwargs, 'update_freq', 1))
        self.update_policy_freq = int(self._register_param(kwargs, 'update_policy_freq', 1))
        self.number_updates = int(self._register_param(kwargs, 'number_updates', 1))
        self.noise_reset_freq = int(self._register_param(kwargs, 'noise_reset_freq', 10000))

        # Breath, my child.
        self.reset_agent()
        self.iteration = 0
        self._loss_actor = 0.
        self._loss_critic = 0.

    @property
    def loss(self) -> Dict[str, float]:
        return {'actor': self._loss_actor, 'critic': self._loss_critic}

    @loss.setter
    def loss(self, value):
        if isinstance(value, dict):
            self._loss_actor = value['actor']
            self._loss_critic = value['critic']
        else:
            self._loss_actor = value
            self._loss_critic = value

    def reset_agent(self) -> None:
        self.actor.reset_parameters()
        self.critic.reset_parameters()
        self.target_actor.reset_parameters()
        self.target_critic.reset_parameters()

    def act(self, state, epsilon: float = 0.0, training_mode=True) -> List[float]:
        """
        Agent acting on observations.

        When the training_mode is True (default) a noise is added to each action.
        """
        # Epsilon greedy
        if self._rng.random() < epsilon:
            rnd_actions = torch.rand(self.action_size) * (self.action_max - self.action_min) + self.action_min
            return rnd_actions.tolist()

        with torch.no_grad():
            state = to_tensor(state).float().to(self.device)
            action = self.actor(state)
            if training_mode:
                action += self.noise.sample()
            return (self.action_scale * torch.clamp(action, self.action_min, self.action_max)).tolist()

    def target_act(self, state, noise: float = 0.0):
        with torch.no_grad():
            state = to_tensor(state).float().to(self.device)
            action = self.target_actor(state) + noise * self.noise.sample()
            return torch.clamp(action, self.action_min, self.action_max).cpu().numpy().astype(np.float32)

    def step(self, state, action, reward, next_state, done):
        self.iteration += 1
        self.buffer.add(state=state, action=action, reward=reward, next_state=next_state, done=done)

        if (self.iteration % self.noise_reset_freq) == 0:
            self.noise.reset_states()

        if self.iteration < self.warm_up:
            return

        if len(self.buffer) <= self.batch_size:
            return

        if not (self.iteration % self.update_freq) or not (self.iteration % self.update_policy_freq):
            for _ in range(self.number_updates):
                # Note: Inside this there's a delayed policy update.
                #       Every `update_policy_freq` it will learn `number_updates` times.
                self.learn(self.buffer.sample())

    def learn(self, experiences):
        """Update critics and actors"""
        rewards = to_tensor(experiences['reward']).float().to(self.device).unsqueeze(1)
        dones = to_tensor(experiences['done']).type(torch.int).to(self.device).unsqueeze(1)
        states = to_tensor(experiences['state']).float().to(self.device)
        actions = to_tensor(experiences['action']).to(self.device)
        next_states = to_tensor(experiences['next_state']).float().to(self.device)

        if (self.iteration % self.update_freq) == 0:
            self._update_value_function(states, actions, rewards, next_states, dones)

        if (self.iteration % self.update_policy_freq) == 0:
            self._update_policy(states)
            soft_update(self.target_actor, self.actor, self.tau)
            soft_update(self.target_critic, self.critic, self.tau)

    def _update_value_function(self, states, actions, rewards, next_states, dones):
        # Critic loss
        next_actions = self.target_actor.act(next_states)
        Q_target_next = torch.min(*self.target_critic.act(next_states, next_actions))
        Q_target = rewards + (self.gamma * Q_target_next * (1 - dones))
        Q1_expected, Q2_expected = self.critic(states, actions)
        loss_critic = mse_loss(Q1_expected, Q_target) + mse_loss(Q2_expected, Q_target)

        # Minimize the loss
        self.critic_optimizer.zero_grad()
        loss_critic.backward()
        nn.utils.clip_grad_norm_(self.critic.parameters(), self.max_grad_norm_critic)
        self.critic_optimizer.step()
        self._loss_critic = float(loss_critic.item())

    def _update_policy(self, states):
        # Compute actor loss
        pred_actions = self.actor(states)
        loss_actor = -self.critic(states, pred_actions)[0].mean()
        self.actor_optimizer.zero_grad()
        loss_actor.backward()
        nn.utils.clip_grad_norm_(self.actor.parameters(), self.max_grad_norm_actor)
        self.actor_optimizer.step()
        self._loss_actor = loss_actor.item()

    def state_dict(self) -> Dict[str, dict]:
        """Describes agent's networks.

        Returns:
            state: (dict) Provides actors and critics states.

        """
        return {
            "actor": self.actor.state_dict(),
            "target_actor": self.target_actor.state_dict(),
            "critic": self.critic.state_dict(),
            "target_critic": self.target_critic.state_dict(),
        }

    def log_metrics(self, data_logger: DataLogger, step: int, full_log: bool = False):
        data_logger.log_value("loss/actor", self._loss_actor, step)
        data_logger.log_value("loss/critic", self._loss_critic, step)

    def get_state(self):
        return dict(
            actor=self.actor.state_dict(),
            target_actor=self.target_actor.state_dict(),
            critic=self.critic.state_dict(),
            target_critic=self.target_critic.state_dict(),
            config=self._config,
        )

    def save_state(self, path: str):
        agent_state = self.get_state()
        torch.save(agent_state, path)

    def load_state(self, path: str):
        agent_state = torch.load(path)
        self._config = agent_state.get('config', {})
        self.__dict__.update(**self._config)
        self.actor.load_state_dict(agent_state['actor'])
        self.critic.load_state_dict(agent_state['critic'])
        self.target_actor.load_state_dict(agent_state['target_actor'])
        self.target_critic.load_state_dict(agent_state['target_critic'])
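# A brief interaction sketch for the TD3 agent above (illustrative, not from the docs).
# It assumes a task with 6 state dimensions and 2 action dimensions and feeds random
# observations; with fewer steps than `batch_size`, no learning update is triggered.
def example_td3_interaction():
    import random

    agent = TD3Agent(state_size=6, action_size=2)
    state = [random.random() for _ in range(6)]
    for _ in range(10):
        action = agent.act(state, epsilon=0.1)  # random action with probability 0.1, otherwise noisy policy action
        next_state = [random.random() for _ in range(6)]
        agent.step(state, action, reward=0.0, next_state=next_state, done=False)
        state = next_state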
class DDPGAgent(AgentBase):
    """
    Deep Deterministic Policy Gradients (DDPG).

    Uses Gaussian noise for exploration instead of the commonly used
    Ornstein-Uhlenbeck (OU) process.
    """

    name = "DDPG"

    def __init__(self, state_size: int, action_size: int,
                 actor_lr: float = 2e-3, critic_lr: float = 2e-3,
                 noise_scale: float = 0.2, noise_sigma: float = 0.1, **kwargs):
        super().__init__(**kwargs)
        self.device = self._register_param(kwargs, "device", DEVICE)
        self.state_size = state_size
        self.action_size = action_size

        # Reason sequence initiation.
        hidden_layers = to_numbers_seq(self._register_param(kwargs, 'hidden_layers', (128, 128)))
        self.actor = ActorBody(state_size, action_size, hidden_layers=hidden_layers,
                               gate_out=torch.tanh).to(self.device)
        self.critic = CriticBody(state_size, action_size, hidden_layers=hidden_layers).to(self.device)
        self.target_actor = ActorBody(state_size, action_size, hidden_layers=hidden_layers,
                                      gate_out=torch.tanh).to(self.device)
        self.target_critic = CriticBody(state_size, action_size, hidden_layers=hidden_layers).to(self.device)

        # Noise sequence initiation
        self.noise = GaussianNoise(shape=(action_size,), mu=1e-8, sigma=noise_sigma,
                                   scale=noise_scale, device=self.device)

        # Target sequence initiation
        hard_update(self.target_actor, self.actor)
        hard_update(self.target_critic, self.critic)

        # Optimization sequence initiation.
        self.actor_lr = float(self._register_param(kwargs, 'actor_lr', actor_lr))
        self.critic_lr = float(self._register_param(kwargs, 'critic_lr', critic_lr))
        self.actor_optimizer = Adam(self.actor.parameters(), lr=self.actor_lr)
        self.critic_optimizer = Adam(self.critic.parameters(), lr=self.critic_lr)
        self.max_grad_norm_actor = float(self._register_param(kwargs, "max_grad_norm_actor", 10.0))
        self.max_grad_norm_critic = float(self._register_param(kwargs, "max_grad_norm_critic", 10.0))

        self.action_min = float(self._register_param(kwargs, 'action_min', -1))
        self.action_max = float(self._register_param(kwargs, 'action_max', 1))
        self.action_scale = float(self._register_param(kwargs, 'action_scale', 1))

        self.gamma = float(self._register_param(kwargs, 'gamma', 0.99))
        self.tau = float(self._register_param(kwargs, 'tau', 0.02))
        self.batch_size = int(self._register_param(kwargs, 'batch_size', 64))
        self.buffer_size = int(self._register_param(kwargs, 'buffer_size', int(1e6)))
        self.buffer = ReplayBuffer(self.batch_size, self.buffer_size)

        self.warm_up = int(self._register_param(kwargs, 'warm_up', 0))
        self.update_freq = int(self._register_param(kwargs, 'update_freq', 1))
        self.number_updates = int(self._register_param(kwargs, 'number_updates', 1))

        # Breath, my child.
        self.reset_agent()
        self.iteration = 0
        self._loss_actor = 0.
        self._loss_critic = 0.

    def reset_agent(self) -> None:
        self.actor.reset_parameters()
        self.critic.reset_parameters()
        self.target_actor.reset_parameters()
        self.target_critic.reset_parameters()

    @property
    def loss(self) -> Dict[str, float]:
        return {'actor': self._loss_actor, 'critic': self._loss_critic}

    @loss.setter
    def loss(self, value):
        if isinstance(value, dict):
            self._loss_actor = value['actor']
            self._loss_critic = value['critic']
        else:
            self._loss_actor = value
            self._loss_critic = value

    @torch.no_grad()
    def act(self, obs, noise: float = 0.0) -> List[float]:
        """Acting on the observations. Returns action.

        Returns:
            action: (list float) Action values.
""" obs = to_tensor(obs).float().to(self.device) action = self.actor(obs) action += noise * self.noise.sample() action = torch.clamp(action * self.action_scale, self.action_min, self.action_max) return action.cpu().numpy().tolist() def step(self, state, action, reward, next_state, done) -> None: self.iteration += 1 self.buffer.add(state=state, action=action, reward=reward, next_state=next_state, done=done) if self.iteration < self.warm_up: return if len(self.buffer) > self.batch_size and (self.iteration % self.update_freq) == 0: for _ in range(self.number_updates): self.learn(self.buffer.sample()) def compute_value_loss(self, states, actions, next_states, rewards, dones): next_actions = self.target_actor.act(next_states) assert next_actions.shape == actions.shape Q_target_next = self.target_critic.act(next_states, next_actions) Q_target = rewards + self.gamma * Q_target_next * (1 - dones) Q_expected = self.critic(states, actions) assert Q_expected.shape == Q_target.shape == Q_target_next.shape return mse_loss(Q_expected, Q_target) def compute_policy_loss(self, states) -> None: """Compute Policy loss based on provided states. Loss = Mean(-Q(s, _a) ), where _a is actor's estimate based on state, _a = Actor(s). """ pred_actions = self.actor(states) return -self.critic(states, pred_actions).mean() def learn(self, experiences) -> None: """Update critics and actors""" rewards = to_tensor(experiences['reward']).float().to( self.device).unsqueeze(1) dones = to_tensor(experiences['done']).type(torch.int).to( self.device).unsqueeze(1) states = to_tensor(experiences['state']).float().to(self.device) actions = to_tensor(experiences['action']).to(self.device) next_states = to_tensor(experiences['next_state']).float().to( self.device) assert rewards.shape == dones.shape == (self.batch_size, 1) assert states.shape == next_states.shape == (self.batch_size, self.state_size) assert actions.shape == (self.batch_size, self.action_size) # Value (critic) optimization loss_critic = self.compute_value_loss(states, actions, next_states, rewards, dones) self.critic_optimizer.zero_grad() loss_critic.backward() nn.utils.clip_grad_norm_(self.critic.parameters(), self.max_grad_norm_critic) self.critic_optimizer.step() self._loss_critic = float(loss_critic.item()) # Policy (actor) optimization loss_actor = self.compute_policy_loss(states) self.actor_optimizer.zero_grad() loss_actor.backward() nn.utils.clip_grad_norm_(self.actor.parameters(), self.max_grad_norm_actor) self.actor_optimizer.step() self._loss_actor = loss_actor.item() # Soft update target weights soft_update(self.target_actor, self.actor, self.tau) soft_update(self.target_critic, self.critic, self.tau) def state_dict(self) -> Dict[str, dict]: """Describes agent's networks. Returns: state: (dict) Provides actors and critics states. 
""" return { "actor": self.actor.state_dict(), "target_actor": self.target_actor.state_dict(), "critic": self.critic.state_dict(), "target_critic": self.target_critic.state_dict() } def log_metrics(self, data_logger: DataLogger, step: int, full_log: bool = False): data_logger.log_value("loss/actor", self._loss_actor, step) data_logger.log_value("loss/critic", self._loss_critic, step) if full_log: for idx, layer in enumerate(self.actor.layers): if hasattr(layer, "weight"): data_logger.create_histogram(f"actor/layer_weights_{idx}", layer.weight, step) if hasattr(layer, "bias") and layer.bias is not None: data_logger.create_histogram(f"actor/layer_bias_{idx}", layer.bias, step) for idx, layer in enumerate(self.critic.layers): if hasattr(layer, "weight"): data_logger.create_histogram(f"critic/layer_weights_{idx}", layer.weight, step) if hasattr(layer, "bias") and layer.bias is not None: data_logger.create_histogram(f"critic/layer_bias_{idx}", layer.bias, step) def get_state(self) -> AgentState: net = dict( actor=self.actor.state_dict(), target_actor=self.target_actor.state_dict(), critic=self.critic.state_dict(), target_critic=self.target_critic.state_dict(), ) network_state: NetworkState = NetworkState(net=net) return AgentState(model=self.name, state_space=self.state_size, action_space=self.action_size, config=self._config, buffer=self.buffer.get_state(), network=network_state) def save_state(self, path: str) -> None: agent_state = self.get_state() torch.save(agent_state, path) def load_state(self, *, path: Optional[str] = None, agent_state: Optional[dict] = None): if path is None and agent_state: raise ValueError( "Either `path` or `agent_state` must be provided to load agent's state." ) if path is not None and agent_state is None: agent_state = torch.load(path) self._config = agent_state.get('config', {}) self.__dict__.update(**self._config) self.actor.load_state_dict(agent_state['actor']) self.critic.load_state_dict(agent_state['critic']) self.target_actor.load_state_dict(agent_state['target_actor']) self.target_critic.load_state_dict(agent_state['target_critic'])
    def __init__(self, state_size: int, action_size: int,
                 hidden_layers: Sequence[int] = (128, 128),
                 actor_lr: float = 2e-3, critic_lr: float = 2e-3,
                 clip: Tuple[int, int] = (-1, 1),
                 alpha: float = 0.2, device=None, **kwargs):
        self.device = device if device is not None else DEVICE
        self.action_size = action_size

        # Reason sequence initiation.
        self.hidden_layers = kwargs.get('hidden_layers', hidden_layers)
        self.policy = GaussianPolicy(action_size).to(self.device)
        self.actor = ActorBody(state_size, action_size, hidden_layers=self.hidden_layers).to(self.device)
        self.double_critic = DoubleCritic(state_size, action_size, self.hidden_layers).to(self.device)
        self.target_double_critic = DoubleCritic(state_size, action_size, self.hidden_layers).to(self.device)

        # Target sequence initiation
        hard_update(self.target_double_critic, self.double_critic)

        # Optimization sequence initiation.
        self.target_entropy = -action_size
        self.alpha_lr = kwargs.get("alpha_lr")
        alpha_init = kwargs.get("alpha", alpha)
        self.log_alpha = torch.tensor(np.log(alpha_init), device=self.device, requires_grad=True)

        self.actor_params = list(self.actor.parameters()) + [self.policy.std]
        self.critic_params = list(self.double_critic.parameters())
        self.actor_optimizer = optim.Adam(self.actor_params, lr=actor_lr)
        self.critic_optimizer = optim.Adam(self.critic_params, lr=critic_lr)
        if self.alpha_lr is not None:
            self.alpha_optimizer = optim.Adam([self.log_alpha], lr=self.alpha_lr)

        self.action_min = clip[0]
        self.action_max = clip[1]
        self.action_scale = kwargs.get('action_scale', 1)

        self.max_grad_norm_alpha: float = float(kwargs.get("max_grad_norm_alpha", 1.0))
        self.max_grad_norm_actor: float = float(kwargs.get("max_grad_norm_actor", 20.0))
        self.max_grad_norm_critic: float = float(kwargs.get("max_grad_norm_critic", 20.0))

        self.gamma: float = float(kwargs.get('gamma', 0.99))
        self.tau: float = float(kwargs.get('tau', 0.02))
        self.batch_size: int = int(kwargs.get('batch_size', 64))
        self.buffer_size: int = int(kwargs.get('buffer_size', int(1e6)))
        self.memory = Buffer(self.batch_size, self.buffer_size)

        self.warm_up: int = int(kwargs.get('warm_up', 0))
        self.update_freq: int = int(kwargs.get('update_freq', 1))
        self.number_updates: int = int(kwargs.get('number_updates', 1))

        # Breath, my child.
        self.reset_agent()
        self.iteration = 0

        self.actor_loss = np.nan
        self.critic_loss = np.nan
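
# --- Illustrative sketch (assumption, not the author's code) ------------------
# The fields `log_alpha`, `target_entropy`, `alpha_optimizer` and
# `max_grad_norm_alpha` above are the usual ingredients of SAC-style automatic
# entropy-temperature tuning. One way such an update step typically looks:
import torch
import torch.nn as nn


def update_temperature_sketch(log_alpha: torch.Tensor,
                              alpha_optimizer: torch.optim.Optimizer,
                              log_prob: torch.Tensor,
                              target_entropy: float,
                              max_grad_norm: float = 1.0) -> float:
    """One gradient step on the temperature; returns the new alpha value."""
    alpha_loss = -(log_alpha * (log_prob + target_entropy).detach()).mean()
    alpha_optimizer.zero_grad()
    alpha_loss.backward()
    nn.utils.clip_grad_norm_([log_alpha], max_grad_norm)
    alpha_optimizer.step()
    return float(log_alpha.exp().item())
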
    def __init__(self, state_size: int, action_size: int,
                 hidden_layers: Sequence[int] = (128, 128),
                 actor_lr: float = 2e-3, actor_lr_decay: float = 0,
                 critic_lr: float = 2e-3, critic_lr_decay: float = 0,
                 noise_scale: float = 0.2, noise_sigma: float = 0.1,
                 clip: Tuple[int, int] = (-1, 1),
                 config=None, device=None, **kwargs):
        config = config if config is not None else dict()
        self.device = device if device is not None else DEVICE

        # Reason sequence initiation.
        self.hidden_layers = config.get('hidden_layers', hidden_layers)
        self.actor = ActorBody(state_size, action_size, hidden_layers=self.hidden_layers).to(self.device)
        self.critic = CriticBody(state_size, action_size, hidden_layers=self.hidden_layers).to(self.device)
        self.target_actor = ActorBody(state_size, action_size, hidden_layers=self.hidden_layers).to(self.device)
        self.target_critic = CriticBody(state_size, action_size, hidden_layers=self.hidden_layers).to(self.device)

        # Noise sequence initiation
        self.noise = GaussianNoise(shape=(action_size,), mu=1e-8, sigma=noise_sigma,
                                   scale=noise_scale, device=self.device)

        # Target sequence initiation
        hard_update(self.target_actor, self.actor)
        hard_update(self.target_critic, self.critic)

        # Optimization sequence initiation.
        self.actor_optimizer = Adam(self.actor.parameters(), lr=actor_lr, weight_decay=actor_lr_decay)
        self.critic_optimizer = Adam(self.critic.parameters(), lr=critic_lr, weight_decay=critic_lr_decay)
        self.action_min = clip[0]
        self.action_max = clip[1]
        self.action_scale = config.get('action_scale', 1)

        self.gamma: float = float(config.get('gamma', 0.99))
        self.tau: float = float(config.get('tau', 0.02))
        self.batch_size: int = int(config.get('batch_size', 64))
        self.buffer_size: int = int(config.get('buffer_size', int(1e6)))
        self.buffer = ReplayBuffer(self.batch_size, self.buffer_size)

        self.warm_up: int = int(config.get('warm_up', 0))
        self.update_freq: int = int(config.get('update_freq', 1))
        self.number_updates: int = int(config.get('number_updates', 1))

        # Breath, my child.
        self.reset_agent()
        self.iteration = 0
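
# --- Illustrative config (values are the defaults read by the constructor above) ---
# Only the keys listed here are consulted via `config.get(...)`; any key that
# is omitted falls back to the default shown.
ddpg_default_config = {
    'hidden_layers': (128, 128),
    'action_scale': 1,
    'gamma': 0.99,
    'tau': 0.02,
    'batch_size': 64,
    'buffer_size': int(1e6),
    'warm_up': 0,
    'update_freq': 1,
    'number_updates': 1,
}
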
def test_replay_buffer_seed():
    # Assign
    batch_size = 4
    buffer_0 = ReplayBuffer(batch_size)
    buffer_1 = ReplayBuffer(batch_size, seed=32167)
    buffer_2 = ReplayBuffer(batch_size, seed=32167)

    # Act
    for sars in generate_sample_SARS(400, dict_type=True):
        buffer_0.add(**copy.deepcopy(sars))
        buffer_1.add(**copy.deepcopy(sars))
        buffer_2.add(**copy.deepcopy(sars))

    # Assert
    for _ in range(10):
        samples_0 = buffer_0.sample()
        samples_1 = buffer_1.sample()
        samples_2 = buffer_2.sample()
        assert samples_0 != samples_1
        assert samples_0 != samples_2
        assert samples_1 == samples_2
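
# --- Illustrative sketch (assumption, not the tested implementation) ----------
# The seed test above only requires that two buffers created with the same seed
# draw identical samples. A per-instance RNG is one simple way to get that:
import random
from collections import deque


class SeededBufferSketch:
    def __init__(self, batch_size: int, buffer_size: int = 1000, seed=None):
        self.batch_size = batch_size
        self.data = deque(maxlen=buffer_size)
        self._rng = random.Random(seed)  # instance-local RNG => reproducible draws

    def add(self, **sars):
        self.data.append(sars)

    def sample(self):
        # Sampling from a list copy keeps the draw independent of deque internals.
        return self._rng.sample(list(self.data), self.batch_size)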