# Loop until mission starts:
print("Waiting for the mission to start ", end=' ')
world_state = agent_host.getWorldState()
while not world_state.has_mission_begun:
    print(".", end="")
    time.sleep(0.1)
    world_state = agent_host.getWorldState()
    for error in world_state.errors:
        print("Error:", error.text)

print()
print("Mission running ", end=' ')

agent_host.sendCommand("chat /time set day")
agent = ag.Agent(agent_host)

# Loop until mission ends:
while not agent.finished:
    state = transform_farm(copy.deepcopy(agent.state))
    action = agent.select_action(state, net)
    reward = agent.run(action.item())
    memory.push(state, action,
                transform_farm(copy.deepcopy(agent.state)),
                torch.tensor([reward]).long())
    cnn.train(net, memory)

print()
print("Mission ended")
agent_host.sendCommand("chat /kill @p")
# Mission has ended.
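# The loop above pushes (state, action, next_state, reward) tuples into `memory` and
# later samples minibatches from it.  The buffer itself is not shown in this listing;
# below is a minimal sketch consistent with that usage, modelled on the standard
# PyTorch DQN-tutorial replay buffer.  The class layout and field names are assumptions.
import random
from collections import deque, namedtuple

Transition = namedtuple('Transition', ('state', 'action', 'next_state', 'reward'))

class ReplayMemory:
    def __init__(self, capacity):
        # Bounded FIFO buffer: the oldest transitions are dropped once capacity is reached.
        self.buffer = deque(maxlen=capacity)

    def push(self, *args):
        # Store one transition, e.g. memory.push(state, action, next_state, reward).
        self.buffer.append(Transition(*args))

    def sample(self, batch_size):
        # Uniform random minibatch used by the optimisation step.
        return random.sample(self.buffer, batch_size)

    def __len__(self):
        return len(self.buffer)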
class DQN:
    def __init__(self, env, hparams):
        self.hparams = hparams
        self.env = env
        self.n = env.action_space.n
        self.Q = DCNN(4, self.n)
        self.T = DCNN(4, self.n)
        self.T.load_state_dict(self.Q.state_dict())
        self.T.eval()
        self.memory = ReplayMemory(hparams.memory_size)
        self.steps = 0
        self.state = env.reset()
        self.optimizer = torch.optim.RMSprop(self.Q.parameters(),
                                             lr=hparams.lr,
                                             momentum=hparams.momentum)
        self.n_episodes = 0

    @torch.no_grad()
    def select_action(self):
        hparams = self.hparams
        start = hparams.eps_start
        end = hparams.eps_end
        time = hparams.eps_time
        steps = self.steps
        self.steps += 1
        # Linearly anneal epsilon from eps_start to eps_end over eps_time steps.
        if steps < time:
            epsilon = start - (start - end) * steps / time
        else:
            epsilon = end
        sample = random.random()
        if sample > epsilon:
            return self.Q(s2t(self.state).to(device)).max(1)[1].item()
        else:
            return self.env.action_space.sample()

    def sample_step(self, fs_min=2, fs_max=6):
        """Repeat a single action between fs_min and fs_max (inclusive) times."""
        fs = random.randint(fs_min, fs_max)
        action = self.select_action()
        r = 0
        for _ in range(fs):
            new_state, reward, done, _ = self.env.step(action)
            self.memory.push(self.state, action,
                             new_state if not done else None, reward)
            r += reward
            self.state = self.env.reset() if done else new_state
            if done:
                self.n_episodes += 1
        return r

    def optimize(self):
        hparams = self.hparams
        transitions = self.memory.sample(hparams.batch_size)
        batch = Transition(*zip(*transitions))
        states = torch.cat([s2t(state) for state in batch.state]).to(device)
        actions = torch.tensor(batch.action).unsqueeze(1).to(device)
        target_values = torch.tensor(batch.reward).unsqueeze(1).to(device).float()
        non_terminal_next_states = torch.cat([
            s2t(state) for state in batch.next_state if state is not None
        ]).to(device)
        non_terminal_mask = torch.tensor([
            state is not None for state in batch.next_state
        ]).to(device).unsqueeze(1)
        values = self.Q(states).gather(1, actions).float()
        # Bootstrapped targets: r + gamma * max_a' T(s', a') for non-terminal next states.
        target_values[non_terminal_mask] += hparams.gamma * self.T(
            non_terminal_next_states).detach().max(1)[0].float()
        loss = F.smooth_l1_loss(values, target_values)
        self.optimizer.zero_grad()
        loss.backward()
        for param in self.Q.parameters():
            param.grad.data.clamp_(-1, 1)  # maybe try sign_?
        self.optimizer.step()
        return loss
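# Sketch only: the DQN class above initialises the target network T from Q, but the
# listing does not show when T is refreshed or how sample_step/optimize are driven.
# The loop below illustrates one plausible schedule; train_dqn, n_steps, warmup and
# target_sync are assumptions, not names or values from the original code.
def train_dqn(dqn, n_steps=100_000, warmup=1_000, target_sync=1_000):
    for step in range(n_steps):
        dqn.sample_step()                                # act with frame-skip, fill the replay memory
        if len(dqn.memory) >= max(warmup, dqn.hparams.batch_size):
            dqn.optimize()                               # one gradient step on the online net Q
        if step % target_sync == 0:
            dqn.T.load_state_dict(dqn.Q.state_dict())    # periodically refresh the target net T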
class Agent:
    def __init__(self, env, env_w, device, config: Config):
        self.env = env
        self.env_w = env_w
        self.device = device
        self.cfg = config
        self.n_actions = config.n_actions
        self.policy_net = config.policy_net
        self.target_net = config.target_net
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()
        self.optimizer = optim.RMSprop(self.policy_net.parameters())
        self.memory = ReplayMemory(10000)
        self.steps_done = 0
        self.episode_durations = []

    def select_action(self, state):
        self.steps_done += 1
        sample = random.random()
        eps_threshold = self.cfg.EPS_END + (self.cfg.EPS_START - self.cfg.EPS_END) * \
            math.exp(-1. * self.steps_done / self.cfg.EPS_DECAY)
        if sample > eps_threshold:
            with torch.no_grad():
                # t.max(1) returns the largest column value of each row. The second
                # column of the max result is the index of where the max element was
                # found, so we pick the action with the larger expected reward.
                # action = self.policy_net(state).max(1)[1]
                action = self.policy_net(state).argmax() % self.n_actions
        else:
            action = random.randrange(self.n_actions)
        return torch.tensor([[action]], device=self.device, dtype=torch.long)

    def optimize_model(self):
        if len(self.memory) < self.cfg.BATCH_SIZE:
            return
        transitions = self.memory.sample(self.cfg.BATCH_SIZE)
        # Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for a
        # detailed explanation). This converts a batch-array of Transitions
        # into a Transition of batch-arrays.
        batch = Transition(*zip(*transitions))

        # Compute a mask of non-final states and concatenate the batch elements
        # (a final state would've been the one after which the simulation ended).
        non_final_mask = torch.tensor(tuple(map(lambda s: s is not None, batch.next_state)),
                                      device=self.device, dtype=torch.bool)
        non_final_next_states = torch.cat([s for s in batch.next_state if s is not None])
        state_batch = torch.cat(batch.state)
        action_batch = torch.cat(batch.action)
        reward_batch = torch.cat(batch.reward)

        # Compute Q(s_t, a) - the model computes Q(s_t), then we select the columns
        # of the actions taken. These are the actions which would've been taken for
        # each batch state according to policy_net.
        state_action_values = self.policy_net(state_batch).gather(1, action_batch)

        # Compute V(s_{t+1}) for all next states.
        # Expected values of actions for non_final_next_states are computed based
        # on the "older" target_net, selecting their best reward with max(1)[0].
        # This is merged based on the mask, so that we have either the expected
        # state value or 0 in case the state was final.
        next_state_values = torch.zeros(self.cfg.BATCH_SIZE, device=self.device)
        next_state_values[non_final_mask] = self.target_net(non_final_next_states).max(1)[0].detach()
        # Compute the expected Q values
        expected_state_action_values = (next_state_values * self.cfg.GAMMA) + reward_batch

        # Compute Huber loss
        loss = F.smooth_l1_loss(state_action_values,
                                expected_state_action_values.unsqueeze(1))

        # Optimize the model
        self.optimizer.zero_grad()
        loss.backward()
        for param in self.policy_net.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()

    def step(self, i_episode):
        # Initialize the environment and state
        self.env.reset()
        last_screen = self.env_w.get_screen()
        current_screen = self.env_w.get_screen()
        state = current_screen - last_screen
        for t in count():
            # Select and perform an action
            action = self.select_action(state)
            obs, reward, done, _ = self.env.step(action.item())
            # reward = torch.tensor([reward], device=self.device)
            reward = torch.tensor([-abs(obs[2])], device=self.device, dtype=torch.float)

            # Observe the new state
            last_screen = current_screen
            current_screen = self.env_w.get_screen()
            if not done:
                next_state = current_screen - last_screen
            else:
                next_state = None

            # Store the transition in memory
            self.memory.push(state, action, next_state, reward)

            # Move to the next state
            state = next_state

            # Perform one step of the optimization (on the policy network)
            self.optimize_model()
            if done:
                self.episode_durations.append(t + 1)
                self.env_w.plot_durations(self.episode_durations)
                break
        # Update the target network, copying all weights and biases in DQN
        if i_episode % self.cfg.TARGET_UPDATE == 0:
            self.target_net.load_state_dict(self.policy_net.state_dict())
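# The Agent above is constructed from a `Config` object that is not defined in this
# listing.  Below is a minimal sketch, assuming a plain dataclass carrying exactly the
# fields the class reads (n_actions, the two networks, and the epsilon/optimisation
# constants); the default values shown are common tutorial settings, not values taken
# from the original code.
from dataclasses import dataclass
import torch.nn as nn

@dataclass
class Config:
    n_actions: int
    policy_net: nn.Module
    target_net: nn.Module
    BATCH_SIZE: int = 128
    GAMMA: float = 0.999
    EPS_START: float = 0.9
    EPS_END: float = 0.05
    EPS_DECAY: int = 200
    TARGET_UPDATE: int = 10

# Typical driver (also a sketch): run a fixed number of episodes and let Agent.step
# handle action selection, replay storage, optimisation and target-network syncing.
# for i_episode in range(num_episodes):
#     agent.step(i_episode)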
while not world_state.has_mission_begun:
    print(".", end="")
    time.sleep(0.1)
    world_state = agent_host.getWorldState()
    for error in world_state.errors:
        print("Error:", error.text)

print()
print("Mission running ", end=' ')

agent_host.sendCommand("chat /time set day")
agent = ag.Agent(agent_host)

# Loop until mission ends:
while not agent.finished:
    state = agent.state.copy()
    action = agent.select_action(state)
    reward = agent.run(action)
    memory.push(state, action, agent.state.copy(), reward)
    net.train(memory)
    print(agent.state)
    memory.print_replay()
    #net.forward(agent.state)

print()
print("Mission ended")
agent_host.sendCommand("chat /kill @p")
# Mission has ended.
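# The loop above reads agent.state on every step.  In a Malmo agent this state is
# usually rebuilt from the JSON observation attached to the world state; the helper
# below sketches that pattern.  It assumes the mission XML contains an
# ObservationFromGrid element named "floor" - the grid name and the function itself
# are illustrative assumptions, not part of the original code.
import json

def get_grid_observation(agent_host):
    """Return the latest grid observation from Malmo, or None if nothing new arrived."""
    world_state = agent_host.getWorldState()
    if world_state.number_of_observations_since_last_state > 0:
        obs = json.loads(world_state.observations[-1].text)  # most recent observation as JSON
        return obs.get("floor")                              # flat list of block names
    return None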