Example #1
    # Loop until mission starts:
    print("Waiting for the mission to start ", end=' ')
    world_state = agent_host.getWorldState()
    while not world_state.has_mission_begun:
        print(".", end="")
        time.sleep(0.1)
        world_state = agent_host.getWorldState()
        for error in world_state.errors:
            print("Error:", error.text)

    print()
    print("Mission running ", end=' ')

    agent_host.sendCommand("chat /time set day")

    agent = ag.Agent(agent_host)

    # Loop until mission ends:
    while not agent.finished:
        state = transform_farm(copy.deepcopy(agent.state))
        action = agent.select_action(state, net)
        reward = agent.run(action.item())
        memory.push(state, action, transform_farm(copy.deepcopy(agent.state)),
                    torch.tensor([reward]).long())
        cnn.train(net, memory)

    print()
    print("Mission ended")
    agent_host.sendCommand("chat /kill @p")
    # Mission has ended.
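
This example (like Example #4 below) picks up after a mission has already been requested from a running Malmo client. For orientation, here is a minimal sketch of the setup such a script typically assumes, based on the standard Malmo tutorial; the mission XML and the retry policy are placeholders, not part of the original example.

# Hypothetical Malmo setup assumed before the loops above; missionXML is a placeholder.
import time
import MalmoPython

agent_host = MalmoPython.AgentHost()

my_mission = MalmoPython.MissionSpec(missionXML, True)
my_mission_record = MalmoPython.MissionRecordSpec()

# Request the mission, retrying a few times in case the client is still busy.
max_retries = 3
for retry in range(max_retries):
    try:
        agent_host.startMission(my_mission, my_mission_record)
        break
    except RuntimeError as e:
        if retry == max_retries - 1:
            print("Error starting mission:", e)
            exit(1)
        time.sleep(2)

world_state = agent_host.getWorldState()
# ...the "Loop until mission starts" block above continues from here.
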
Example #2
class DQN:
    def __init__(self, env, hparams):
        self.hparams = hparams
        self.env = env
        self.n = env.action_space.n
        self.Q = DCNN(4, self.n)
        self.T = DCNN(4, self.n)
        self.T.load_state_dict(self.Q.state_dict())
        self.T.eval()
        self.memory = ReplayMemory(hparams.memory_size)
        self.steps = 0
        self.state = env.reset()
        self.optimizer = torch.optim.RMSprop(self.Q.parameters(),
                                             lr=hparams.lr,
                                             momentum=hparams.momentum)
        self.n_episodes = 0

    @torch.no_grad()
    def select_action(self):
        hparams = self.hparams
        start = hparams.eps_start
        end = hparams.eps_end
        time = hparams.eps_time
        steps = self.steps
        self.steps += 1
        # Linearly anneal epsilon from eps_start to eps_end over eps_time steps.
        if steps < time:
            epsilon = start - (start - end) * steps / time
        else:
            epsilon = end

        sample = random.random()

        if sample > epsilon:
            return self.Q(s2t(self.state).to(device)).max(1)[1].item()
        else:
            return self.env.action_space.sample()

    def sample_step(self, fs_min=2, fs_max=6):
        """repeats a single action between fs_min and fs_max (inclusive) times"""
        fs = random.randint(fs_min, fs_max)
        action = self.select_action()
        r = 0
        for _ in range(fs):
            new_state, reward, done, _ = self.env.step(action)
            self.memory.push(self.state, action,
                             new_state if not done else None, reward)
            r += reward
            self.state = self.env.reset() if done else new_state
            if done:
                self.n_episodes += 1
        return r

    def optimize(self):
        hparams = self.hparams
        transitions = self.memory.sample(hparams.batch_size)
        batch = Transition(*zip(*transitions))
        states = torch.cat([s2t(state) for state in batch.state]).to(device)
        actions = torch.tensor(batch.action).unsqueeze(1).to(device)
        target_values = torch.tensor(
            batch.reward).unsqueeze(1).to(device).float()
        non_terminal_next_states = torch.cat([
            s2t(state) for state in batch.next_state if state is not None
        ]).to(device)
        non_terminal_mask = torch.tensor([
            state is not None for state in batch.next_state
        ]).to(device).unsqueeze(1)

        values = self.Q(states).gather(1, actions).float()
        target_values[non_terminal_mask] += hparams.gamma * self.T(
            non_terminal_next_states).detach().max(1)[0].float()

        #print(values.dtype,target_values.dtype)
        loss = F.smooth_l1_loss(values, target_values)
        self.optimizer.zero_grad()
        loss.backward()

        for param in self.Q.parameters():
            param.grad.data.clamp_(-1, 1)  # maybe try sign_?

        self.optimizer.step()
        return loss
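
The DQN class above depends on three helpers that the snippet does not show: a Transition record, a ReplayMemory buffer, and an s2t() function that turns a raw state into a batched tensor. A minimal sketch of plausible implementations follows, modeled on the standard PyTorch DQN tutorial; the field names and the (1, C, H, W) layout are assumptions, not taken from the original project.

# Minimal sketch of the assumed helpers (names and tensor layout are guesses).
import random
from collections import deque, namedtuple

import torch

Transition = namedtuple('Transition', ('state', 'action', 'next_state', 'reward'))


class ReplayMemory:
    """Fixed-capacity cyclic buffer of transitions."""

    def __init__(self, capacity):
        self.memory = deque([], maxlen=capacity)

    def push(self, *args):
        self.memory.append(Transition(*args))

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)


def s2t(state):
    """Convert a raw observation (assumed H x W x C array) into a (1, C, H, W) float tensor."""
    t = torch.as_tensor(state, dtype=torch.float32)
    if t.dim() == 3:
        t = t.permute(2, 0, 1)  # H x W x C -> C x H x W
    return t.unsqueeze(0)       # add the batch dimension
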
Example #3
class Agent:
    def __init__(self, env, env_w, device, config: Config):
        self.env = env
        self.env_w = env_w
        self.device = device
        self.cfg = config
        self.n_actions = config.n_actions
        self.policy_net = config.policy_net
        self.target_net = config.target_net
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()

        self.optimizer = optim.RMSprop(self.policy_net.parameters())
        self.memory = ReplayMemory(10000)
        self.steps_done = 0
        self.episode_durations = []

    def select_action(self, state):
        self.steps_done += 1
        sample = random.random()
        eps_threshold = self.cfg.EPS_END + (self.cfg.EPS_START - self.cfg.EPS_END) * \
            math.exp(-1. * self.steps_done / self.cfg.EPS_DECAY)
        if sample > eps_threshold:
            with torch.no_grad():
                # Exploit: max(1)[1] is the index of the largest Q-value in each
                # row, i.e. the action with the largest expected reward.
                action = self.policy_net(state).max(1)[1].item()
        else:
            action = random.randrange(self.n_actions)
        return torch.tensor([[action]], device=self.device, dtype=torch.long)

    def optimize_model(self):
        if len(self.memory) < self.cfg.BATCH_SIZE:
            return
        transitions = self.memory.sample(self.cfg.BATCH_SIZE)
        # Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for
        # detailed explanation). This converts batch-array of Transitions
        # to Transition of batch-arrays.
        batch = Transition(*zip(*transitions))

        # Compute a mask of non-final states and concatenate the batch elements
        # (a final state would've been the one after which simulation ended)
        non_final_mask = torch.tensor(tuple(map(lambda s: s is not None, batch.next_state)),
                                      device=self.device,
                                      dtype=torch.bool)
        non_final_next_states = torch.cat([s for s in batch.next_state if s is not None])
        state_batch = torch.cat(batch.state)
        action_batch = torch.cat(batch.action)
        reward_batch = torch.cat(batch.reward)

        # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
        # columns of actions taken. These are the actions which would've been taken
        # for each batch state according to policy_net
        state_action_values = self.policy_net(state_batch).gather(1, action_batch)

        # Compute V(s_{t+1}) for all next states.
        # Expected values of actions for non_final_next_states are computed based
        # on the "older" target_net; selecting their best reward with max(1)[0].
        # This is merged based on the mask, such that we'll have either the expected
        # state value or 0 in case the state was final.
        next_state_values = torch.zeros(self.cfg.BATCH_SIZE, device=self.device)
        next_state_values[non_final_mask] = self.target_net(non_final_next_states).max(1)[0].detach()
        # Compute the expected Q values
        expected_state_action_values = (next_state_values * self.cfg.GAMMA) + reward_batch

        # Compute Huber loss
        loss = F.smooth_l1_loss(state_action_values, expected_state_action_values.unsqueeze(1))

        # Optimize the model
        self.optimizer.zero_grad()
        loss.backward()
        for param in self.policy_net.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()

    def step(self, i_episode):
        # Initialize the environment and state
        self.env.reset()
        last_screen = self.env_w.get_screen()
        current_screen = self.env_w.get_screen()
        state = current_screen - last_screen
        for t in count():
            # Select and perform an action
            action = self.select_action(state)
            obs, reward, done, info = self.env.step(action.item())
            # Reward shaping: penalize the magnitude of obs[2] (e.g. the pole
            # angle in CartPole) instead of using the environment's reward.
            # reward = torch.tensor([reward], device=self.device)
            reward = torch.tensor([-abs(obs[2])], device=self.device, dtype=torch.float)

            # Observe new state
            last_screen = current_screen
            current_screen = self.env_w.get_screen()
            if not done:
                next_state = current_screen - last_screen
            else:
                next_state = None

            # Store the transition in memory
            self.memory.push(state, action, next_state, reward)

            # Move to the next state
            state = next_state

            # Perform one step of the optimization (on the policy network)
            self.optimize_model()
            if done:
                self.episode_durations.append(t + 1)
                self.env_w.plot_durations(self.episode_durations)
                break
        # Update the target network, copying all weights and biases in DQN
        if i_episode % self.cfg.TARGET_UPDATE == 0:
            self.target_net.load_state_dict(self.policy_net.state_dict())
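
The Agent in Example #3 reads its hyperparameters and networks from a Config object and is driven one episode at a time from outside the class. A plausible sketch of that wiring is shown below: the field names mirror the attributes the class accesses, the default values are borrowed from the PyTorch DQN tutorial, and the network class and driver loop are assumptions.

# Hypothetical Config container and training driver for the Agent above.
from dataclasses import dataclass

import torch.nn as nn

@dataclass
class Config:
    # Names mirror the attributes Agent reads; defaults follow the PyTorch DQN tutorial.
    n_actions: int
    policy_net: nn.Module
    target_net: nn.Module
    EPS_START: float = 0.9
    EPS_END: float = 0.05
    EPS_DECAY: int = 200
    BATCH_SIZE: int = 128
    GAMMA: float = 0.999
    TARGET_UPDATE: int = 10

# Hypothetical usage:
# cfg = Config(n_actions=env.action_space.n,
#              policy_net=PolicyNet().to(device),
#              target_net=PolicyNet().to(device))
# agent = Agent(env, env_w, device, cfg)
# for i_episode in range(num_episodes):
#     agent.step(i_episode)
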
Example #4
    while not world_state.has_mission_begun:
        print(".", end="")
        time.sleep(0.1)
        world_state = agent_host.getWorldState()
        for error in world_state.errors:
            print("Error:", error.text)

    print()
    print("Mission running ", end=' ')

    agent_host.sendCommand("chat /time set day")

    agent = ag.Agent(agent_host)

    # Loop until mission ends:
    while not agent.finished:
        state = agent.state.copy()
        action = agent.select_action(state)
        reward = agent.run(action)
        memory.push(state, action, agent.state.copy(), reward)
        net.train(memory)

    print(agent.state)
    memory.print_replay()
    #net.forward(agent.state)

    print()
    print("Mission ended")
    agent_host.sendCommand("chat /kill @p")
    # Mission has ended.
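
Example #4 (like Example #1) leans on three project-specific objects that the snippet does not define: the ag.Agent wrapper around agent_host, a replay buffer, and a trainable net. The Protocol stubs below only record the interfaces implied by the calls above; every name and signature here is inferred from usage, not taken from the original project.

# Hypothetical interfaces implied by the calls in Example #4 (inferred from usage).
from typing import Any, Protocol

class AgentLike(Protocol):
    finished: bool                       # True once the Malmo mission has ended
    state: dict                          # current observation, copied before/after each action
    def select_action(self, state: dict) -> Any: ...
    def run(self, action: Any) -> float: ...     # execute the action, return its reward

class MemoryLike(Protocol):
    def push(self, state, action, next_state, reward) -> None: ...
    def print_replay(self) -> None: ...           # debugging dump of the stored transitions

class NetLike(Protocol):
    def train(self, memory: MemoryLike) -> None: ...   # one update from sampled transitions
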