Example #1
    def __init__(self, nn_module):
        self.eval_net, self.target_net = nn_module(), nn_module()
        self.eval_net.initialize_weights()
        self.target_net.load_state_dict(self.eval_net.state_dict())
        self.optimizer = torch.optim.Adam(self.eval_net.parameters(), lr=LR)
        self.loss_func = nn.MSELoss()

        self.consider_tasks, self.edges_num = CONSIDER_TASKS, EDGES_NUM
        self.bitrate_type, self.resolution_type = BIRATE_TYPE, RESOLUTION_TYPE

        self.learn_step_counter = 0  # for target updating
        self.memory_counter = 0  # for storing memory
        self.memory_size = MEMORY_SIZE
        self.memory = ReplayBuffer(MEMORY_SIZE)
Example #2
class DDPG_Agent:
    """
    DDPG Algorithm
    """
    def __init__(self,
                 state_size,
                 action_size,
                 actor_model,
                 critic_model,
                 device,
                 num_agents=1,
                 seed=0,
                 tau=1e-3,
                 batch_size=1024,
                 discount_factor=0.99,
                 actor_learning_rate=1e-4,
                 critic_learning_rate=1e-3):
        """
        Initialize the six networks and hard-copy the local weights into
        their targets:
        * actor_local  -> actor_target
        * critic_local -> critic_target
        * critic2_local -> critic2_target
        Also initialize the replay buffer and the OU noise process.

        Args:
            state_size: dimension of each state
            action_size: dimension of each action
            actor_model: constructor of the actor network
            critic_model: constructor of the critic network
            device: torch device the networks are moved to
            num_agents: number of agents sharing this policy
            seed: random seed
            tau: interpolation factor for soft target updates
            batch_size: size of each sampled mini-batch
            discount_factor: reward discount factor (gamma)
            actor_learning_rate: learning rate of the actor optimizer
            critic_learning_rate: learning rate of the critic optimizers
        """
        self.tau = tau
        self.state_size = state_size
        self.action_size = action_size
        self.actor_local = actor_model(state_size, action_size, seed)
        self.actor_target = actor_model(state_size, action_size, seed)
        self.critic_local = critic_model(state_size, action_size, seed)
        self.critic_target = critic_model(state_size, action_size, seed)
        self.critic2_local = critic_model(state_size, action_size, seed + 1)
        self.critic2_target = critic_model(state_size, action_size, seed + 1)
        self.soft_update(1.0)
        self.batch_size = batch_size
        self.replayBuffer = ReplayBuffer(batch_size=batch_size,
                                         buffer_size=300 * 1000,
                                         seed=seed,
                                         device=device)
        self.num_agents = num_agents
        self.noise_process = OUNoise(action_size * num_agents,
                                     seed,
                                     max_sigma=0.1,
                                     min_sigma=0.001,
                                     decay_period=300 * 300)
        self.discount_factor = discount_factor
        self.actor_opt = optim.Adam(self.actor_local.parameters(),
                                    lr=actor_learning_rate)
        self.critic_opt = optim.Adam(self.critic_local.parameters(),
                                     lr=critic_learning_rate)
        self.critic2_opt = optim.Adam(self.critic2_local.parameters(),
                                      lr=critic_learning_rate)
        self.critic_criterion = nn.MSELoss()
        self.critic2_criterion = nn.MSELoss()
        self.device = device
        for model in [
                self.actor_local, self.actor_target, self.critic_local,
                self.critic_target, self.critic2_local, self.critic2_target
        ]:
            model.to(device)

    def act(self, state, add_noise=True):
        """
        * Create actions using the local actor (policy) network
        * Optionally add exploration noise and return both versions

        Args:
            state: numpy array in shape of (num_agents, state_size).
            add_noise: whether to add OU noise to the actions

        Returns:
            actions_with_noise: numpy array of shape (num_agents, action_size), or None if add_noise is False
            actions_without_noise: numpy array of shape (num_agents, action_size)
        """
        state = torch.from_numpy(state).float().view(
            self.num_agents, self.state_size).to(self.device)
        self.actor_local.eval()
        actions_with_noise = None
        actions_without_noise = None
        with torch.no_grad():
            actions = self.actor_local(state)
            actions_without_noise = actions.cpu().numpy()
        self.actor_local.train()
        if add_noise:
            noise = self.noise_process.sample().reshape(self.num_agents, self.action_size)
            actions_with_noise = actions_without_noise + noise
        return actions_with_noise, actions_without_noise

    def step(self, state, action, reward, next_state, done):
        """
        * save sample in the replay buffer
        * if replay buffer is large enough
            * learn

        Args:
            state:
            action:
            reward:
            next_state:
            done:

        Returns:
            None
        """
        self.replayBuffer.push(state, action, reward, next_state, done)
        if len(self.replayBuffer) > self.batch_size:
            self.learn(*self.replayBuffer.sample())

    def learn(self, states, actions, rewards, next_states, dones):
        """
        Given a sampled batch:
        * set y from the rewards, the target critic networks and the target policy network
        * compute the critic losses between y and the local critics
        * update the actor with the sampled policy gradient (the gradient flows
          through the local critic, but only the actor parameters are stepped)
        * soft update the target critics and the target policy

        Args:
            states:
            actions:
            rewards:
            next_states:
            dones:

        Returns:
            None
        """
        # Update Critic
        next_actions = self.actor_target(next_states)
        # value = self.critic_target(next_states, next_actions).detach()
        value = (self.critic_target(next_states, next_actions).detach() +
                 self.critic2_target(next_states, next_actions).detach()) / 2.0
        # value = torch.min(self.critic_target(next_states, next_actions).detach(),
        #                   self.critic2_target(next_states, next_actions).detach())
        # zero the bootstrap term on terminal steps (assumes `dones` holds 0/1 floats)
        y = rewards + self.discount_factor * value * (1.0 - dones)

        Q = self.critic_local(states, actions)
        critic_loss = self.critic_criterion(Q, y)

        Q2 = self.critic2_local(states, actions)
        critic2_loss = self.critic2_criterion(Q2, y)

        # Update Actor
        action_predictions = self.actor_local(states)
        actor_loss = -self.critic_local(states, action_predictions).mean()

        # update networks
        self.actor_opt.zero_grad()
        actor_loss.backward()
        self.actor_opt.step()

        self.critic2_opt.zero_grad()
        critic2_loss.backward()
        self.critic2_opt.step()

        self.critic_opt.zero_grad()
        critic_loss.backward()
        self.critic_opt.step()

        # soft update
        self.soft_update(self.tau)

    def reset(self):
        self.noise_process.reset()

    def soft_update(self, tau):
        for target_param, local_param in zip(self.actor_target.parameters(),
                                             self.actor_local.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
        for target_param, local_param in zip(self.critic_target.parameters(),
                                             self.critic_local.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
        for target_param, local_param in zip(self.critic2_target.parameters(),
                                             self.critic2_local.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
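
The DDPG_Agent class above becomes usable once its collaborators (an actor/critic model pair, ReplayBuffer, and OUNoise) are importable. The sketch below shows one hypothetical way to drive it against a classic Gym-style environment; the Actor and Critic classes, the Pendulum-v1 environment, and the 4-tuple env.step API are assumptions, not part of the example itself.

# Hypothetical training-loop sketch for DDPG_Agent (assumes the classic Gym API
# and the repo's own Actor/Critic model classes).
import gym
import numpy as np
import torch

env = gym.make("Pendulum-v1")
state_size = env.observation_space.shape[0]
action_size = env.action_space.shape[0]
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

agent = DDPG_Agent(state_size, action_size, Actor, Critic, device)

for episode in range(100):
    state = env.reset()
    agent.reset()  # restart the OU noise process each episode
    episode_return = 0.0
    for _ in range(300):
        noisy_action, _ = agent.act(np.asarray(state, dtype=np.float32).reshape(1, -1))
        action = np.clip(noisy_action.reshape(-1), env.action_space.low, env.action_space.high)
        next_state, reward, done, _ = env.step(action)
        agent.step(state, action, reward, next_state, done)  # store and learn once the buffer is full enough
        state = next_state
        episode_return += reward
        if done:
            break
    print(f"episode {episode}: return {episode_return:.1f}")
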
Example #3
    def __init__(self,
                 state_size,
                 action_size,
                 actor_model,
                 critic_model,
                 device,
                 num_agents=1,
                 seed=0,
                 tau=1e-3,
                 batch_size=1024,
                 discount_factor=0.99,
                 actor_learning_rate=1e-4,
                 critic_learning_rate=1e-3):
        """
        Initialize the six networks and hard-copy the local weights into
        their targets:
        * actor_local  -> actor_target
        * critic_local -> critic_target
        * critic2_local -> critic2_target
        Also initialize the replay buffer and the OU noise process.

        Args:
            state_size: dimension of each state
            action_size: dimension of each action
            actor_model: constructor of the actor network
            critic_model: constructor of the critic network
            device: torch device the networks are moved to
            num_agents: number of agents sharing this policy
            seed: random seed
            tau: interpolation factor for soft target updates
            batch_size: size of each sampled mini-batch
            discount_factor: reward discount factor (gamma)
            actor_learning_rate: learning rate of the actor optimizer
            critic_learning_rate: learning rate of the critic optimizers
        """
        self.tau = tau
        self.state_size = state_size
        self.action_size = action_size
        self.actor_local = actor_model(state_size, action_size, seed)
        self.actor_target = actor_model(state_size, action_size, seed)
        self.critic_local = critic_model(state_size, action_size, seed)
        self.critic_target = critic_model(state_size, action_size, seed)
        self.critic2_local = critic_model(state_size, action_size, seed + 1)
        self.critic2_target = critic_model(state_size, action_size, seed + 1)
        self.soft_update(1.0)
        self.batch_size = batch_size
        self.replayBuffer = ReplayBuffer(batch_size=batch_size,
                                         buffer_size=300 * 1000,
                                         seed=seed,
                                         device=device)
        self.num_agents = num_agents
        self.noise_process = OUNoise(action_size * num_agents,
                                     seed,
                                     max_sigma=0.1,
                                     min_sigma=0.001,
                                     decay_period=300 * 300)
        self.discount_factor = discount_factor
        self.actor_opt = optim.Adam(self.actor_local.parameters(),
                                    lr=actor_learning_rate)
        self.critic_opt = optim.Adam(self.critic_local.parameters(),
                                     lr=critic_learning_rate)
        self.critic2_opt = optim.Adam(self.critic2_local.parameters(),
                                      lr=critic_learning_rate)
        self.critic_criterion = nn.MSELoss()
        self.critic2_criterion = nn.MSELoss()
        self.device = device
        for model in [
                self.actor_local, self.actor_target, self.critic_local,
                self.critic_target, self.critic2_local, self.critic2_target
        ]:
            model.to(device)
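
The constructors in Examples #2, #3, and #5 rely on an OUNoise helper that is not shown. Below is a minimal Ornstein-Uhlenbeck noise sketch compatible with the arguments used above (max_sigma, min_sigma, decay_period); it is an assumption about the interface, not the original class, and the decay_delay argument seen in Example #5 is omitted.

import copy
import numpy as np

class OUNoise:
    """Minimal Ornstein-Uhlenbeck process with a linearly decaying sigma
    (a sketch, not the original implementation)."""

    def __init__(self, size, seed, mu=0.0, theta=0.15,
                 max_sigma=0.1, min_sigma=0.001, decay_period=100000):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.max_sigma = max_sigma
        self.min_sigma = min_sigma
        self.decay_period = decay_period
        self.rng = np.random.RandomState(seed)
        self.reset()

    def reset(self):
        # restart from the long-run mean and reset the decay schedule
        self.state = copy.copy(self.mu)
        self.t = 0

    def sample(self):
        # anneal sigma linearly from max_sigma to min_sigma over decay_period steps
        frac = min(1.0, self.t / self.decay_period)
        sigma = self.max_sigma - (self.max_sigma - self.min_sigma) * frac
        dx = self.theta * (self.mu - self.state) + sigma * self.rng.standard_normal(len(self.state))
        self.state = self.state + dx
        self.t += 1
        return self.state
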
Example #4
        '-l',
        '--load',
        type=int,
        help="Indicates the step you want to start from; the file must exist")

    args = parser.parse_args()
    start = 0

    scenario = scenarios.load(args.scenario).Scenario()
    world = scenario.make_world()

    env = MultiAgentEnv(world, scenario.reset_world, scenario.reward,
                        scenario.observation)
    policies = MultiAgent(env, "Multi-Agent")

    REPLAY_BUFFER = ReplayBuffer(env, GeneralConfig())

    if args.load is not None:
        start = args.load

        if args.dir is None:
            print("[!] Please indicate a path for model storing")
            exit(1)

        policies.load(args.dir, start)

    for i in range(start, start + args.n_round):
        play(i, env, policies)

        if (i + 1) % args.every == 0:
            policies.save(args.dir, i)
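
The play helper called in the loop above is not shown. Assuming the standard multiagent-particle-envs interface (env.reset() returning per-agent observations and env.step(action_n) returning observations, rewards, dones, infos), a hypothetical episode driver could look like the sketch below; policies.act, policies.train, and REPLAY_BUFFER.push are assumed method names, not taken from the example.

def play(round_idx, env, policies, max_steps=100):
    # Hypothetical sketch: only the MultiAgentEnv reset/step interface follows
    # the multiagent-particle-envs convention; the policies/buffer calls are assumptions.
    obs_n = env.reset()
    episode_rewards = [0.0] * env.n

    for _ in range(max_steps):
        action_n = policies.act(obs_n)  # one action per agent (assumed API)
        next_obs_n, reward_n, done_n, _ = env.step(action_n)

        REPLAY_BUFFER.push(obs_n, action_n, reward_n, next_obs_n, done_n)
        policies.train(REPLAY_BUFFER)   # assumed training hook

        obs_n = next_obs_n
        for i, r in enumerate(reward_n):
            episode_rewards[i] += r
        if all(done_n):
            break

    print(f"[round {round_idx}] rewards: {episode_rewards}")
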
Example #5
class MADDPG:
    """
    DDPG for multi and interactive agent Algorithm
    """
    def __init__(self,
                 state_size,
                 action_size,
                 actor_model,
                 critic_model,
                 device,
                 num_agents=1,
                 num_interacting_agents=2,
                 seed=0,
                 tau=1e-3,
                 batch_size=1024,
                 discount_factor=0.99,
                 actor_learning_rate=1e-4,
                 critic_learning_rate=1e-3,
                 replayBuffer=None):
        """
        Initialize the six networks and hard-copy the local weights into
        their targets:
        * actor_local  -> actor_target
        * critic_local -> critic_target
        * critic2_local -> critic2_target
        Also initialize the replay buffer (or reuse a shared one) and the OU
        noise process.

        Args:
            state_size: dimension of each agent's state
            action_size: dimension of each agent's action
            actor_model: constructor of the actor network
            critic_model: constructor of the (joint) critic network
            device: torch device the networks are moved to
            num_agents: number of agents controlled by this instance
            num_interacting_agents: total number of interacting agents
            seed: random seed
            tau: interpolation factor for soft target updates
            batch_size: size of each sampled mini-batch
            discount_factor: reward discount factor (gamma)
            actor_learning_rate: learning rate of the actor optimizer
            critic_learning_rate: learning rate of the critic optimizers
            replayBuffer: optional shared replay buffer; a new one is created when None
        """
        self.tau = tau
        self.state_size = state_size
        self.action_size = action_size
        actor_layers = [32, 64, 64]
        critic_layers = [64, 16, 128, 16]  # [64, 64, 256, 32]
        self.actor_local = actor_model(state_size,
                                       action_size,
                                       seed,
                                       layer_sizes=actor_layers)
        self.actor_target = actor_model(state_size,
                                        action_size,
                                        seed,
                                        layer_sizes=actor_layers)
        self.num_interacting_agents = num_interacting_agents
        self.critic_local = critic_model(state_size,
                                         state_size,
                                         action_size,
                                         action_size,
                                         seed,
                                         layer_sizes=critic_layers)
        self.critic_target = critic_model(state_size,
                                          state_size,
                                          action_size,
                                          action_size,
                                          seed,
                                          layer_sizes=critic_layers)
        self.critic2_local = critic_model(state_size,
                                          state_size,
                                          action_size,
                                          action_size,
                                          seed + 1,
                                          layer_sizes=critic_layers)
        self.critic2_target = critic_model(state_size,
                                           state_size,
                                           action_size,
                                           action_size,
                                           seed + 1,
                                           layer_sizes=critic_layers)
        self.soft_update(1.0)
        self.batch_size = batch_size
        if replayBuffer is None:
            self.replayBuffer = ReplayBuffer(batch_size=batch_size,
                                             buffer_size=300 * 1000,
                                             seed=seed,
                                             device=device)
        else:
            self.replayBuffer = replayBuffer
        self.num_agents = num_agents
        self.noise_process = OUNoise(action_size * num_agents,
                                     seed,
                                     max_sigma=0.1,
                                     min_sigma=0.001,
                                     decay_period=30 * 300,
                                     decay_delay=4048 / 30)
        self.discount_factor = discount_factor
        self.actor_opt = optim.Adam(self.actor_local.parameters(),
                                    lr=actor_learning_rate)
        self.critic_opt = optim.Adam(self.critic_local.parameters(),
                                     lr=critic_learning_rate)
        self.critic2_opt = optim.Adam(self.critic2_local.parameters(),
                                      lr=critic_learning_rate)
        self.critic_criterion = nn.MSELoss()
        self.critic2_criterion = nn.MSELoss()
        self.device = device
        self.other = None
        for model in [
                self.actor_local, self.actor_target, self.critic_local,
                self.critic_target, self.critic2_local, self.critic2_target
        ]:
            model.to(device)

    def set_other_agent(self, agent):
        self.other = agent

    def act(self, state, add_noise=True):
        """
        * Create actions using the local actor (policy) network
        * Optionally add exploration noise and return both versions

        Args:
            state: numpy array in shape of (state_size,).
            add_noise:

        Returns:
            actions_with_noise: numpy arrays of size (num_agents, action_size)
            actions_without_noise: numpy arrays of size (num_agents, action_size)
        """
        state = torch.from_numpy(state).float().view(
            self.num_agents, self.state_size).to(self.device)
        self.actor_local.eval()
        actions_with_noise = None
        actions_without_noise = None
        with torch.no_grad():
            actions = self.actor_local(state)
            actions_without_noise = actions.cpu().numpy()
        self.actor_local.train()
        if add_noise:
            noise = self.noise_process.sample().reshape(self.num_agents, self.action_size)
            actions_with_noise = actions_without_noise + noise
        return actions_with_noise, actions_without_noise

    def step(self, this_state, others_state, this_action, others_action,
             reward, this_next_states, others_next_states, done):
        """
        * save sample in the replay buffer
        * if replay buffer is large enough
            * learn

        Args:
            this_state: 1D numpy array in shape of (1, num_states)
            others_state: 1D numpy array in shape of (1, num_states*num_other_agents)
            this_action: 1D numpy array in shape of (1, num_actions)
            others_action: 1D numpy array in shape of (1, num_actions*num_other_agents)
            reward: reward of this agent
            this_next_states: same as this_state but for the next time step
            others_next_states: same as others_state but for the next time step
            done: done flag of this agent

        Returns:
            None
        """
        # print(np.hstack((this_state, others_state)))
        # print(np.hstack((this_action, others_action)))
        # print(np.hstack((this_next_states, others_next_states)))
        # print(np.hstack((this_next_actions, others_next_actions)))
        # print(reward)
        self.replayBuffer.push(state=np.hstack((this_state, others_state)),
                               action=np.hstack((this_action, others_action)),
                               reward=reward,
                               next_states=np.hstack(
                                   (this_next_states, others_next_states)),
                               next_actions=None,
                               done=done)

        if len(self.replayBuffer) > self.batch_size * 2:
            self.learn(*self.replayBuffer.sample())

    def learn(self, states, actions, rewards, next_states, next_actions,
              dones):
        """
        * set y from the rewards, the target critic networks and the target policy networks
        * compute the critic losses between y and the local critics
        * update the actor with the sampled policy gradient (the gradient flows
          through the local critics, but only the actor parameters are stepped)
        * soft update the target critics and the target policy

        Args:
            states: state of all agents in a row
            actions: actions of all agents in a row
            rewards: rewards
            next_states: next state of all agents in a row
            next_actions: same shape as actions (recomputed inside from the local actors)
            dones: dones

        Returns:
            None
        """
        all_states = states
        this_state = states[:, 0:self.state_size]

        # next actions are taken from the local actors (not the targets) and detached
        next_actions = self.actor_local(next_states[:, 0:self.state_size])
        next_actions = torch.cat(
            (next_actions,
             self.other.actor_local(next_states[:, self.state_size:])),
            1).detach()

        # Update Critic
        value = (self.critic_target(next_states, next_actions).detach() +
                 self.critic2_target(next_states, next_actions).detach()) / 2.0
        # zero the bootstrap term on terminal steps (assumes `dones` holds 0/1 floats)
        y = rewards + self.discount_factor * value * (1.0 - dones)

        Q = self.critic_local(all_states, actions)
        critic_loss = self.critic_criterion(Q, y)

        Q2 = self.critic2_local(all_states, actions)
        critic2_loss = self.critic2_criterion(Q2, y)

        self.critic_opt.zero_grad()
        critic_loss.backward()
        self.critic_opt.step()

        self.critic2_opt.zero_grad()
        critic2_loss.backward()
        self.critic2_opt.step()

        # Update Actor
        action_predictions = self.actor_local(this_state)
        actions_pred_and_others = torch.cat(
            (action_predictions, next_actions[:, self.action_size:]), dim=1)
        actor_loss = (
            -self.critic_local(all_states, actions_pred_and_others).mean() -
            self.critic2_local(all_states,
                               actions_pred_and_others).mean()) * 0.5

        self.actor_opt.zero_grad()
        actor_loss.backward()
        self.actor_opt.step()

        # soft update
        self.soft_update(self.tau)
        # print(states.shape, this_state.shape, action_predictions.shape)

    def reset(self):
        self.noise_process.reset()

    def soft_update(self, tau):
        for target_param, local_param in zip(self.actor_target.parameters(),
                                             self.actor_local.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
        for target_param, local_param in zip(self.critic_target.parameters(),
                                             self.critic_local.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
        for target_param, local_param in zip(self.critic2_target.parameters(),
                                             self.critic2_local.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def save_agent(self, file_name):
        i = 0
        path = None
        while True:
            if not os.path.isfile(f'{file_name}_{i}.pth'):
                path = f'{file_name}_{i}.pth'
                break
            else:
                i += 1

        torch.save(
            {
                'actor_local': self.actor_local.state_dict(),
                'critic_local': self.critic_local.state_dict(),
                'critic2_local': self.critic2_local.state_dict(),
                'actor_opt': self.actor_opt.state_dict(),
                'critic_opt': self.critic_opt.state_dict(),
                'critic2_opt': self.critic2_opt.state_dict()
            }, path)

    def load_agent(self, file_name, is_exact_path=False):
        i = 0
        path = file_name
        while not is_exact_path:
            if os.path.isfile(f'{file_name}_{i}.pth'):
                path = f'{file_name}_{i}.pth'
                i += 1
            else:
                break

        ckpt = torch.load(path)
        self.actor_local.load_state_dict(ckpt['actor_local'])
        self.critic_local.load_state_dict(ckpt['critic_local'])
        self.critic2_local.load_state_dict(ckpt['critic2_local'])
        self.actor_opt.load_state_dict(ckpt['actor_opt'])
        self.critic_opt.load_state_dict(ckpt['critic_opt'])
        self.critic2_opt.load_state_dict(ckpt['critic2_opt'])
        self.actor_local.to(self.device)
        self.critic_local.to(self.device)
        self.critic2_local.to(self.device)
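
Because each MADDPG agent's critics condition on the other agent's state and action, two instances are normally wired together with set_other_agent, optionally sharing one replay buffer. A minimal, hypothetical two-agent setup is sketched below; the Actor and Critic model classes, the ReplayBuffer constructor arguments, and the state/action sizes are assumptions.

import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
state_size, action_size = 24, 2  # e.g. a two-agent continuous-control task (assumed sizes)

# one buffer shared by both agents (assumed ReplayBuffer signature, matching the default above)
shared_buffer = ReplayBuffer(batch_size=1024, buffer_size=300 * 1000, seed=0, device=device)

agent_a = MADDPG(state_size, action_size, Actor, Critic, device,
                 seed=0, replayBuffer=shared_buffer)
agent_b = MADDPG(state_size, action_size, Actor, Critic, device,
                 seed=1, replayBuffer=shared_buffer)

# each agent needs a handle to the other so learn() can query the
# other agent's local actor for its next actions
agent_a.set_other_agent(agent_b)
agent_b.set_other_agent(agent_a)
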
Example #6
class DQN(object):
    def __init__(self, nn_module):
        self.eval_net, self.target_net = nn_module(), nn_module()
        self.eval_net.initialize_weights()
        self.target_net.load_state_dict(self.eval_net.state_dict())
        self.optimizer = torch.optim.Adam(self.eval_net.parameters(), lr=LR)
        self.loss_func = nn.MSELoss()

        self.consider_tasks, self.edges_num = CONSIDER_TASKS, EDGES_NUM
        self.bitrate_type, self.resolution_type = BIRATE_TYPE, RESOLUTION_TYPE

        self.learn_step_counter = 0  # for target updating
        self.memory_counter = 0  # for storing memory
        self.memory_size = MEMORY_SIZE
        self.memory = ReplayBuffer(MEMORY_SIZE)

    def soft_update(self):
        # despite the name, this is a hard update: the target network is
        # simply overwritten with the evaluation network's weights
        self.target_net.load_state_dict(self.eval_net.state_dict())
        return

    def anneal_epsilon(self, anneal=0.999):
        # decay epsilon multiplicatively, but never below 0.1
        if self.eval_net.epsilon <= 0.1:
            return
        self.eval_net.epsilon = self.eval_net.epsilon * anneal

    def train(self):
        self.learn_step_counter += 1
        if self.learn_step_counter < BATCH_SIZE or self.learn_step_counter % 5 != 0:
            return
        # target parameter update
        if self.learn_step_counter % TARGET_REPLACE_ITER == 0:
            self.target_net.load_state_dict(self.eval_net.state_dict())

        # batch sample
        b_s, b_a, b_r, b_s_ = self.memory.sample(BATCH_SIZE)
        b_s = state2tensor(b_s, is_batch=True)
        b_a = [[bb[0], bb[1], bb[2]] for bb in b_a]  # keep the first three action components

        # q_eval w.r.t action in sample
        g = torch.from_numpy(np.array(b_a)[:, 0:2])
        self.eval_net.train()
        if type(self.eval_net) == TSNet:
            q_eval = self.eval_net(b_s, g)  # q_eval.shape (batch, data)
            b_a = [bb[-1:] for bb in b_a]
        else:
            q_eval = self.eval_net(b_s)
            b_a = [bb[:2] for bb in b_a]
        q_eval_wrt_a = torch.gather(q_eval,
                                    1,
                                    index=torch.LongTensor(np.array(b_a)))
        q_eval_wrt_a = q_eval_wrt_a.sum(dim=1).unsqueeze(0).t()

        # q_target with the maximum q of next_state
        if type(self.eval_net) == TSNet:
            q_target = torch.FloatTensor(list(b_r)).unsqueeze(0).t()  # for TSNet the target is just the reward
        else:
            b_s_ = state2tensor(b_s_, is_batch=True)
            q_next = self.target_net(
                b_s_).detach()  # detach() from graph, don't back propagate
            max_q_next = torch.cat([
                q_next[:, 0:BIRATE_TYPE].max(1)[0],
                q_next[:, -EDGES_NUM:].max(1)[0]
            ],
                                   dim=0).reshape(2, BATCH_SIZE).t()
            max_q_next = max_q_next.sum(dim=1).unsqueeze(0).t()
            b_r = torch.FloatTensor(list(b_r)).unsqueeze(0).t()
            q_target = b_r + DISCOUNT * max_q_next

        # MSELoss
        loss = self.loss_func(q_eval_wrt_a, q_target)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
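
The DQN example constructs its buffer as ReplayBuffer(MEMORY_SIZE) and unpacks sample(BATCH_SIZE) into four parallel batches. A buffer sketch consistent with that usage is shown below; the push method name and the absence of a done field are assumptions, since the storing side is not visible in the snippet.

import random
from collections import deque

class ReplayBuffer:
    """Minimal FIFO replay buffer matching the DQN example's sampling usage
    (a sketch; the actual class may store additional fields)."""

    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def push(self, state, action, reward, next_state):
        # assumed storing API; the DQN snippet only shows the sampling side
        self.buffer.append((state, action, reward, next_state))

    def sample(self, batch_size):
        batch = random.sample(self.buffer, batch_size)
        states, actions, rewards, next_states = zip(*batch)
        return states, actions, rewards, next_states

    def __len__(self):
        return len(self.buffer)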