Example #1
0
class DQN(RLAlgorithm):
    def __init__(self, env, do_render, num_threads, gamma, lr,
                 global_max_episode):

        state_size = env.observation_space.shape[0]
        action_size = env.action_space.n

        self.qnetwork_global = QNetwork(state_size, action_size)  #.to(device)
        self.qnetwork_global.share_memory()

        self.qnetwork_target = QNetwork(state_size, action_size)  #.to(device)
        self.qnetwork_target.share_memory()

        self.agents = [
            DQNAgent(id=agent_id,
                     env=env,
                     do_render=do_render,
                     state_size=state_size,
                     action_size=action_size,
                     n_episodes=global_max_episode,
                     lr=lr,
                     gamma=gamma,
                     update_every=UPDATE_EVERY + num_threads,
                     global_network=self.qnetwork_global,
                     target_network=self.qnetwork_target)
            for agent_id in range(num_threads)
        ]

    def train(self):
        for agent in self.agents:
            agent.start()
        for agent in self.agents:
            agent.join()
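
A note on Example #1 above: qnetwork_global and qnetwork_target are placed in shared memory so every DQNAgent worker process reads and updates the same parameters. QNetwork and DQNAgent themselves are not part of this excerpt, so the following is only a minimal sketch with a hypothetical stand-in TinyQNetwork; it demonstrates that a module moved to shared memory via share_memory() can be modified in place by a child process and the parent observes the change.

import torch
import torch.nn as nn
import torch.multiprocessing as mp


class TinyQNetwork(nn.Module):
    """Hypothetical stand-in for the QNetwork used above."""
    def __init__(self, state_size, action_size):
        super().__init__()
        self.fc = nn.Linear(state_size, action_size)

    def forward(self, state):
        return self.fc(state)


def worker(global_network):
    # In-place updates to shared-memory parameters are visible to the parent.
    with torch.no_grad():
        for p in global_network.parameters():
            p.add_(1.0)


if __name__ == "__main__":
    net = TinyQNetwork(state_size=4, action_size=2)
    net.share_memory()                  # same call as on qnetwork_global above
    before = net.fc.weight.clone()
    proc = mp.Process(target=worker, args=(net,))
    proc.start()
    proc.join()
    assert not torch.equal(before, net.fc.weight)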
Example #2
0
    def __init__(self, state_size, action_size, num_agents, double_dqn=True):
        self.action_size = action_size
        self.double_dqn = double_dqn

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size).to(device)
        self.qnetwork_target = copy.deepcopy(self.qnetwork_local)
        self.optimizer = torch.optim.Adam(self.qnetwork_local.parameters(),
                                          lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(BUFFER_SIZE)
        self.num_agents = num_agents
        self.t_step = 0
Example #3
0
    def __init__(self, state_size, action_size, seed):
        
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device) #prediction net
        self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device) #target network
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory: store (s, a, r, s', done) tuples and train once a full batch is available
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        self.t_step = 0 # time step
Example #4
0
    def __init__(self, state_size, action_size, seed, num_threads,
                 update_every, lr):
        super(ParameterServer, self).__init__()

        # TODO: Figure out how to just create a tensor that can do ASGD without needing the model
        self.model = QNetwork(state_size, action_size, seed)  # TODO remove

        self.time = mp.Value('i', 0)

        p = list(self.model.parameters())
        # params = [torch.nn.Parameter(p[i].clone().detach()) for i in range(len(p))]
        # [g.share_memory_() for g in params]  # Store gradients in shared memory
        # self.parameters = params
        self.qs = [mp.Queue() for _ in range(len(p))]
        self.shard_mem = [p[i].share_memory_() for i in range(len(p))]
        self.shards = [
            ParameterServerShard(p[i], self.shard_mem[i], lr, self.qs[i])
            for i in range(len(p))
        ]
        for shard in self.shards:
            shard.start()
Example #5
0
class ParameterServer():
    def __init__(self, state_size, action_size, seed, num_threads,
                 update_every, lr):
        super(ParameterServer, self).__init__()

        # TODO: Figure out how to just create a tensor that can do ASGD without needing the model
        self.model = QNetwork(state_size, action_size, seed)  # TODO remove

        self.time = mp.Value('i', 0)

        p = list(self.model.parameters())
        # params = [torch.nn.Parameter(p[i].clone().detach()) for i in range(len(p))]
        # [g.share_memory_() for g in params]  # Store gradients in shared memory
        # self.parameters = params
        self.qs = [mp.Queue() for _ in range(len(p))]
        self.shard_mem = [p[i].share_memory_() for i in range(len(p))]
        self.shards = [
            ParameterServerShard(p[i], self.shard_mem[i], lr, self.qs[i])
            for i in range(len(p))
        ]
        for shard in self.shards:
            shard.start()

        # TODO: Readers / Writers lock on gradients

    #    self.gradients = SharedGradients(num_threads)

    # def initialize_gradients(self, i, gradients):
    #     self.gradients.initialize(i, gradients)
    #
    # def apply_gradients(self):
    #     self.optimizer.zero_grad()
    #     self.model.set_gradients(self.gradients.sum())
    #     self.optimizer.step()

# Need to fix with ASGD
# How to process asynchronously? Also do mini-batches

    def record_gradients(self, gradients):
        #with self.time.get_lock():
        # TODO just create this as a tensor instead, it works better
        self.time.value += 1

        for q, g in zip(self.qs, gradients):
            # shard.update(g, self.time.value)
            q.put(torch.from_numpy(g).share_memory_())

    # def set_gradients(self, gradients):
    #     for g, p in zip(gradients, self.parameters):
    #         if g is not None:
    #             p.grad = torch.tensor(g)
    #            # p.grad = torch.from_numpy(g)

    def get(self):
        return self.shard_mem
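
ParameterServerShard is used in Examples #4 and #5 but its definition is not part of this excerpt. Purely as a hypothetical sketch (the class and attribute names below are illustrative, not the author's), such a shard could be a process that drains gradient tensors from its queue and applies a plain SGD step to its shared parameter; the real class is constructed with (p[i], self.shard_mem[i], lr, self.qs[i]), and the sketch collapses the first two arguments because Tensor.share_memory_() returns the tensor itself.

import torch
import torch.multiprocessing as mp


class ParameterShardSketch(mp.Process):
    """Hypothetical stand-in for ParameterServerShard, for illustration only."""

    def __init__(self, shared_param, lr, grad_queue):
        super().__init__(daemon=True)
        self.shared_param = shared_param   # tensor already in shared memory
        self.lr = lr
        self.grad_queue = grad_queue

    def run(self):
        while True:
            grad = self.grad_queue.get()   # blocks until a gradient arrives
            if grad is None:               # sentinel to shut the shard down
                break
            with torch.no_grad():
                # Lock-free SGD step on the shared parameter
                self.shared_param.add_(grad, alpha=-self.lr)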
Example #6
0
    def __init__(self, state_size, action_size, seed=0):
        """Initialize an Agent object.
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, seed=seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed=seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
Example #7
0
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed=0):
        """Initialize an Agent object.
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, seed=seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed=seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.
        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tensors
            gamma (float): discount factor
        """
        # Unpack the experience batch, then compute and minimize the loss
        states, actions, rewards, next_states, dones = experiences

        # Get max predicted Q values (for next states) from target model
        Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)
        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
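
The indexing in learn() above is easy to misread, so here is a small, self-contained tensor walk-through with made-up numbers: max(1)[0].unsqueeze(1) collapses a (batch, actions) table into per-row maximum Q values of shape (batch, 1), and gather(1, actions) picks Q(s, a) for the actions that were actually taken.

import torch

q_next = torch.tensor([[0.2, 0.9, 0.1],
                       [0.3, 0.8, 0.7]])         # target net on next_states
q_now = torch.tensor([[1.0, 2.0, 3.0],
                      [4.0, 5.0, 6.0]])          # local net on states
actions = torch.tensor([[2], [0]])               # actions taken, shape (batch, 1)

q_targets_next = q_next.max(1)[0].unsqueeze(1)   # -> [[0.9], [0.8]]
q_expected = q_now.gather(1, actions)            # -> [[3.0], [4.0]]
print(q_targets_next.shape, q_expected.shape)    # both torch.Size([2, 1])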
Example #8
0
class Agent:
    def __init__(self, state_size, action_size, num_agents, double_dqn=True):
        self.action_size = action_size
        self.double_dqn = double_dqn

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size).to(device)
        self.qnetwork_target = copy.deepcopy(self.qnetwork_local)
        self.optimizer = torch.optim.Adam(self.qnetwork_local.parameters(),
                                          lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(BUFFER_SIZE)
        self.num_agents = num_agents
        self.t_step = 0

    def reset(self):
        self.finished = [False] * self.num_agents

    # Decide on an action to take in the environment

    def act(self, state, eps=0.):
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)

        # Epsilon-greedy action selection
        if random.random() > eps:
            return torch.argmax(action_values).item()
        else:
            return torch.randint(self.action_size, ()).item()

    # Record the results of the agent's action and update the model

    def step(self, handle, state, action, next_state, agent_done, episode_done,
             collision):
        if not self.finished[handle]:
            if agent_done:
                reward = 1
            elif collision:
                reward = -5
            else:
                reward = -.1

            # Save experience in replay memory
            self.memory.push(state, action, reward, next_state, agent_done
                             or episode_done)
            self.finished[handle] = agent_done or episode_done

        # Perform a gradient update every UPDATE_EVERY time steps
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0 and len(self.memory) > BATCH_SIZE * 20:
            self.learn(*self.memory.sample(BATCH_SIZE, device))

    def learn(self, states, actions, rewards, next_states, dones):
        self.qnetwork_local.train()

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        if self.double_dqn:
            Q_best_action = self.qnetwork_local(next_states).argmax(1)
            Q_targets_next = self.qnetwork_target(next_states).gather(
                1, Q_best_action.unsqueeze(-1))
        else:
            Q_targets_next = self.qnetwork_target(next_states).detach().max(
                1)[0].unsqueeze(-1)

        # Compute Q targets for current states
        Q_targets = rewards + GAMMA * Q_targets_next * (1 - dones)

        # Compute loss and perform a gradient step
        self.optimizer.zero_grad()
        loss = F.mse_loss(Q_expected, Q_targets)
        loss.backward()
        self.optimizer.step()

        # Update the target network parameters to `tau * local.parameters() + (1 - tau) * target.parameters()`
        for target_param, local_param in zip(self.qnetwork_target.parameters(),
                                             self.qnetwork_local.parameters()):
            target_param.data.copy_(TAU * local_param.data +
                                    (1.0 - TAU) * target_param.data)

    # Checkpointing methods

    def save(self, path, *data):
        torch.save(self.qnetwork_local.state_dict(),
                   path / 'dqn/model_checkpoint.local')
        torch.save(self.qnetwork_target.state_dict(),
                   path / 'dqn/model_checkpoint.target')
        torch.save(self.optimizer.state_dict(),
                   path / 'dqn/model_checkpoint.optimizer')
        with open(path / 'dqn/model_checkpoint.meta', 'wb') as file:
            pickle.dump(data, file)

    def load(self, path, *defaults):
        try:
            print("Loading model from checkpoint...")
            self.qnetwork_local.load_state_dict(
                torch.load(path / 'dqn/model_checkpoint.local'))
            self.qnetwork_target.load_state_dict(
                torch.load(path / 'dqn/model_checkpoint.target'))
            self.optimizer.load_state_dict(
                torch.load(path / 'dqn/model_checkpoint.optimizer'))
            with open(path / 'dqn/model_checkpoint.meta', 'rb') as file:
                return pickle.load(file)
        except Exception:
            print("No checkpoint file was found")
            return defaults
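
The double_dqn branch in learn() above selects the greedy action with the online (local) network and evaluates it with the target network. Below is a toy, self-contained check of that target computation; the numbers and the GAMMA value are made up for illustration, since the constant itself is not shown in the excerpt.

import torch

q_local_next = torch.tensor([[1.0, 5.0, 3.0],
                             [2.0, 0.5, 4.0]])   # local net on next_states
q_target_next = torch.tensor([[0.2, 0.9, 0.1],
                              [0.3, 0.8, 0.7]])  # target net on next_states

best_action = q_local_next.argmax(1)                           # -> [1, 2]
double_q = q_target_next.gather(1, best_action.unsqueeze(-1))  # -> [[0.9], [0.7]]

rewards = torch.tensor([[1.0], [0.0]])
dones = torch.tensor([[0.0], [1.0]])
GAMMA = 0.99                         # assumed value for this sketch
targets = rewards + GAMMA * double_q * (1 - dones)
print(targets)                       # [[1.891], [0.0]]: a done state keeps only its reward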
Example #9
0
class Agent():

    def __init__(self, state_size, action_size, seed):
        
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device) #prediction net
        self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device) #target network
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory: store (s, a, r, s', done) tuples and train once a full batch is available
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        self.t_step = 0 # time step
    
    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)
        
        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        
        states, actions, rewards, next_states, dones = experiences

        # Get max predicted Q values (for next states) from target model
        Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)
        # Compute Q targets for current states 
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)                     

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
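
None of the examples include the episode loop that drives these agents. The sketch below is not taken from the excerpt; it shows the usual shape of such a loop for agents with the act/step interface of Examples #7 and #9, and assumes env and agent already exist, with env following the classic gym API that returns (next_state, reward, done, info).

def run_training(env, agent, n_episodes=500, max_t=1000,
                 eps_start=1.0, eps_end=0.01, eps_decay=0.995):
    """Drive an epsilon-greedy DQN agent; hyperparameter values are illustrative."""
    scores = []
    eps = eps_start
    for _ in range(n_episodes):
        state = env.reset()
        score = 0.0
        for _ in range(max_t):
            action = agent.act(state, eps)
            next_state, reward, done, _ = env.step(action)
            agent.step(state, action, reward, next_state, done)  # store and learn
            state = next_state
            score += reward
            if done:
                break
        scores.append(score)
        eps = max(eps_end, eps_decay * eps)   # anneal exploration
    return scores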