Example #1
def ReplayBuffer_Init(rep_buf_size, rep_buf_ini, env, action_space):
    """Initialize the ReplayBuffer with the size of rep_buf_ini
    :param rep_buf_ini: size of initialized replay buffer
    :param action_space: action space dimension of game
    :return:
    """
    replay_buffer = ReplayBuffer(rep_buf_size)
    while len(replay_buffer) < rep_buf_ini:

        observation = env.reset()
        done = False

        while not done:
            t_observation = torch.from_numpy(observation).float()
            # t_observation.shape before reshape: torch.Size([4, 84, 84])
            # t_observation.shape after reshape: torch.Size([1, 4, 84, 84])
            t_observation = t_observation.view(1, t_observation.shape[0],
                                               t_observation.shape[1],
                                               t_observation.shape[2])
            action = random.sample(range(len(action_space)), 1)[0]

            next_observation, reward, done, info = env.step(
                action_space[action])

            replay_buffer.push(observation, action, reward, next_observation,
                               done)
            observation = next_observation

    print('Experience Replay buffer initialized')
    return replay_buffer
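Both examples rely on a ReplayBuffer class that is not shown on this page. Below is a minimal deque-based sketch of what such a class might look like, assuming the push/sample/__len__ interface implied by the calls above; it is an illustration, not the original implementation.

import random
from collections import deque


class ReplayBuffer:
    """Fixed-size buffer of (observation, action, reward, next_observation, done) tuples."""

    def __init__(self, capacity):
        # deque with maxlen discards the oldest transitions automatically
        self.buffer = deque(maxlen=capacity)

    def push(self, observation, action, reward, next_observation, done):
        self.buffer.append((observation, action, reward, next_observation, done))

    def sample(self, batch_size):
        batch = random.sample(self.buffer, batch_size)
        # unzip into separate tuples, matching how Example #2 unpacks the result
        observations, actions, rewards, next_observations, dones = zip(*batch)
        return observations, actions, rewards, next_observations, dones

    def __len__(self):
        return len(self.buffer)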
Example #2
def main():
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    print("use_cuda: ", use_cuda)
    print("Device: ", device)

    env = atari_wrapper.make_atari('RiverraidNoFrameskip-v4')
    env = atari_wrapper.wrap_deepmind(env,
                                      clip_rewards=False,
                                      frame_stack=True,
                                      pytorch_img=True)

    action_space = [a for a in range(env.action_space.n)]
    n_action = len(action_space)

    # DQN Model and optimizer:
    policy_model = DQNModel().to(device)
    target_model = DQNModel().to(device)
    target_model.load_state_dict(policy_model.state_dict())

    optimizer = torch.optim.RMSprop(policy_model.parameters(),
                                    lr=lr,
                                    alpha=alpha)

    # Initialize the Replay Buffer
    replay_buffer = ReplayBuffer(rep_buf_size)

    while len(replay_buffer) < rep_buf_ini:

        observation = env.reset()
        done = False

        while not done:
            with torch.no_grad():
                t_observation = torch.from_numpy(observation).float().to(
                    device)
                t_observation = t_observation.view(1, t_observation.shape[0],
                                                   t_observation.shape[1],
                                                   t_observation.shape[2])
                action = random.sample(range(len(action_space)), 1)[0]

            next_observation, reward, done, info = env.step(
                action_space[action])

            replay_buffer.push(observation, action, reward, next_observation,
                               done)
            observation = next_observation

    print('Experience Replay buffer initialized')

    # Use log to record the performance
    logger = logging.getLogger('dqn_Riverraid')
    logger.setLevel(logging.INFO)
    logger_handler = logging.FileHandler('./dqn_Riverraid.log')
    logger.addHandler(logger_handler)

    # Training part
    env.reset()
    score = 0
    episode_score = []
    mean_episode_score = []
    episode_true = 0
    num_frames = 0
    episode = 0
    last_100episode_score = deque(maxlen=100)

    while episode < max_episodes:

        observation = env.reset()
        done = False
        # import time
        # start=time.time()

        while not done:

            with torch.no_grad():

                t_observation = torch.from_numpy(observation).float().to(
                    device) / 255
                t_observation = t_observation.view(1, t_observation.shape[0],
                                                   t_observation.shape[1],
                                                   t_observation.shape[2])
                epsilon = epsilon_by_frame(num_frames)
                if random.random() > epsilon:
                    q_value = policy_model(t_observation)
                    action = q_value.argmax(1).data.cpu().numpy().astype(
                        int)[0]
                else:
                    action = random.sample(range(len(action_space)), 1)[0]

            next_observation, reward, done, info = env.step(
                action_space[action])
            num_frames += 1
            score += reward

            replay_buffer.push(observation, action, reward, next_observation,
                               done)
            observation = next_observation

            # Update policy
            if (len(replay_buffer) > batch_size
                    and num_frames % skip_frame == 0):
                observations, actions, rewards, next_observations, dones = replay_buffer.sample(
                    batch_size)

                observations = torch.from_numpy(np.array(observations) /
                                                255).float().to(device)

                actions = torch.from_numpy(
                    np.array(actions).astype(int)).float().to(device)
                actions = actions.view(actions.shape[0], 1)

                rewards = torch.from_numpy(
                    np.array(rewards)).float().to(device)
                rewards = rewards.view(rewards.shape[0], 1)

                next_observations = torch.from_numpy(
                    np.array(next_observations) / 255).float().to(device)

                dones = torch.from_numpy(
                    np.array(dones).astype(int)).float().to(device)
                dones = dones.view(dones.shape[0], 1)

                q_values = policy_model(observations)
                next_q_values = target_model(next_observations)

                q_value = q_values.gather(1, actions.long())
                next_q_value = next_q_values.max(1)[0].unsqueeze(1)
                expected_q_value = rewards + gamma * next_q_value * (1 - dones)

                loss = huber_loss(q_value, expected_q_value)

                optimizer.zero_grad()
                loss.backward()

                optimizer.step()

                for target_param, policy_param in zip(
                        target_model.parameters(), policy_model.parameters()):
                    target_param.data.copy_(TAU * policy_param.data +
                                            (1 - TAU) * target_param.data)

        episode += 1
        # episode_score.append(score)
        # end=time.time()
        # print("Running time ( %i episode): %.3f Seconds "%(episode ,end-start))

        if info['ale.lives'] == 0:
            # episode_score.append(score)
            mean_score = score
            episode_true += 1
            score = 0

            # if episode % 20 == 0:
            # mean_score = np.mean(episode_score)
            mean_episode_score.append(mean_score)
            last_100episode_score.append(mean_score)
            # episode_score = []
            logger.info('Frame: ' + str(num_frames) + ' / Episode: ' +
                        str(episode_true) + ' / Average Score : ' +
                        str(int(mean_score)) + '   / epsilon: ' +
                        str(float(epsilon)))
            #plot_score(mean_episode_score, episode_true)
            pickle.dump(mean_episode_score,
                        open('./dqn_Riverraid_mean_scores.pickle', 'wb'))
            if episode_true % 50 == 1:
                logger.info('Frame: ' + str(num_frames) + ' / Episode: ' +
                            str(episode_true) + ' / Average Score : ' +
                            str(int(mean_score)) + '   / epsilon: ' +
                            str(float(epsilon)) +
                            '   / last_100episode_score: ' +
                            str(float(np.mean(last_100episode_score))))

        if episode % 50 == 0:
            torch.save(target_model.state_dict(),
                       './dqn_Riverraid_target_model_state_dict.pt')
            torch.save(policy_model.state_dict(),
                       './dqn_Riverraid_model_state_dict.pt')

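Example #2 also assumes several names defined elsewhere in the module: the hyperparameters lr, alpha, gamma, TAU, batch_size, skip_frame, rep_buf_size, rep_buf_ini and max_episodes, plus the helpers epsilon_by_frame, huber_loss and DQNModel. The sketch below shows one plausible set of definitions; the numeric values, the exponential epsilon schedule and the network layout are assumptions, not taken from the original source.

import math

import torch.nn as nn
import torch.nn.functional as F

# Hyperparameters (illustrative values, not from the original source)
lr = 2.5e-4
alpha = 0.95           # RMSprop smoothing constant
gamma = 0.99           # discount factor
TAU = 1e-3             # soft-update rate for the target network
batch_size = 32
skip_frame = 4         # update the policy every 4 environment steps
rep_buf_size = 100000  # replay buffer capacity
rep_buf_ini = 10000    # transitions collected before training starts
max_episodes = 10000

# Exponentially decaying exploration rate (assumed schedule)
epsilon_start, epsilon_final, epsilon_decay = 1.0, 0.02, 30000


def epsilon_by_frame(frame_idx):
    return epsilon_final + (epsilon_start - epsilon_final) * math.exp(
        -frame_idx / epsilon_decay)


def huber_loss(q_value, expected_q_value):
    # smooth_l1_loss is PyTorch's Huber loss
    return F.smooth_l1_loss(q_value, expected_q_value)


class DQNModel(nn.Module):
    # DeepMind-style convolutional network for 4x84x84 stacked frames;
    # the original DQNModel definition is not shown in the example, so this
    # layout is an assumption. Riverraid's full action set has 18 actions.
    def __init__(self, n_action=18):
        super().__init__()
        self.features = nn.Sequential(
            nn.Conv2d(4, 32, kernel_size=8, stride=4), nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2), nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1), nn.ReLU())
        self.fc = nn.Sequential(
            nn.Linear(64 * 7 * 7, 512), nn.ReLU(),
            nn.Linear(512, n_action))

    def forward(self, x):
        x = self.features(x)
        return self.fc(x.view(x.size(0), -1))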