Example #1
    def train(self,
              transitions: int,
              sigma_max: float = 1.,
              sigma_min: float = 0.,
              buffer_size: int = 10000,
              batch_size: int = 128,
              progress_upd_step: int = None,
              start_training: int = 1000,
              shaping_coef: float = 300.):
        history = ReplayBuffer(buffer_size)
        progress_upd_step = progress_upd_step if progress_upd_step else transitions // 100

        log = {
            "alpha": self.alpha,
            "gamma": self.gamma,
            "sigma_max": sigma_max,
            "sigma_min": sigma_min,
            "buffer_size": buffer_size,
            "batch_size": batch_size,
            "tau": self.tau,
            "shaping_coef": shaping_coef,
            "step": [],
            "reward_mean": [],
            "reward_std": []
        }

        state = self.reset()
        t = tqdm(range(transitions))
        for i in t:
            sigma = sigma_max - (sigma_max - sigma_min) * i / transitions
            action = self.act(state)
            noise = np.random.normal(scale=sigma, size=action.shape)
            action = np.clip(action + noise, -1, 1)

            next_state, reward, done, _ = self.env.step(action)
            reward += shaping_coef * (self.gamma * np.abs(next_state[1]) -
                                      np.abs(state[1]))
            done_ = next_state[0] >= 0.5

            history.add((state, action, next_state, reward, done_))

            state = self.reset() if done else next_state

            if i > start_training:
                batch = history.sample(batch_size)
                self.update_critic(batch)
                self.update_actor(batch)

            if (i + 1) % progress_upd_step == 0:
                reward_mean, reward_std = self.evaluate_policy()

                log["step"].append(i)
                log["reward_mean"].append(reward_mean)
                log["reward_std"].append(reward_std)

                t.set_description(
                    f"step: {i + 1} | Rmean = {reward_mean:0.4f} | Rstd = {reward_std:0.4f}"
                )

        return log
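Example #1 (and most of the examples below) assumes a ReplayBuffer with add, sample, and __len__, which is not shown on this page. The following is a minimal uniform-sampling sketch matching the usage above; the returned batch layout is an assumption, not the original class:

import random
from collections import deque

import numpy as np


class ReplayBuffer:
    """Minimal FIFO experience buffer (sketch, not the original implementation)."""

    def __init__(self, size: int = 10000):
        self.buffer = deque(maxlen=size)

    def add(self, transition):
        # transition is a (state, action, next_state, reward, done) tuple, as in Example #1
        self.buffer.append(transition)

    def sample(self, batch_size: int):
        batch = random.sample(self.buffer, batch_size)
        # unpack into per-field arrays: states, actions, next_states, rewards, dones
        return [np.array(field) for field in zip(*batch)]

    def __len__(self):
        return len(self.buffer)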
Example #2
    def train(self,
              transitions: int,
              eps_max: float = 0.5,
              eps_min: float = 0.,
              buffer_size: int = 10000,
              batch_size: int = 128,
              shaping_coef: float = 300.,
              progress_upd_step: int = None,
              start_training: int = 10000):
        history = ReplayBuffer(size=buffer_size)
        progress_upd_step = progress_upd_step if progress_upd_step else transitions // 100

        log = {
            "alpha": self.alpha,
            "gamma": self.gamma,
            "buffer_size": buffer_size,
            "batch_size": batch_size,
            "tau": self.tau,
            "shaping_coef": shaping_coef,
            "eps_max": eps_max,
            "eps_min": eps_min,
            "step": [],
            "reward_mean": [],
            "reward_std": []
        }

        state = self.reset()

        t = tqdm(range(transitions))
        for i in t:
            eps = eps_max - (eps_max - eps_min) * i / transitions
            if random() < eps:
                action = self.env.action_space.sample()
            else:
                action = self.act(state)

            next_state, reward, done, _ = self.env.step(action)
            reward += shaping_coef * (self.gamma * np.abs(next_state[1]) -
                                      np.abs(state[1]))
            done_ = next_state[0] >= 0.5

            history.add((state, action, next_state, reward, done_))

            state = self.reset() if done else next_state

            if i > start_training:
                self.update(history.sample(batch_size))

            if (i + 1) % progress_upd_step == 0:
                reward_mean, reward_std = self.evaluate_policy()

                log["step"].append(i)
                log["reward_mean"].append(reward_mean)
                log["reward_std"].append(reward_std)

                t.set_description(
                    f"step: {i + 1} | Rmean = {reward_mean:0.4f} | Rstd = {reward_std:0.4f}"
                )

        return log
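Both train variants above return the log dict. A hypothetical driver (the DDPGAgent name, its constructor, and the plotting code are illustrative assumptions, not part of the original snippets) might use it like this:

import matplotlib.pyplot as plt
import numpy as np

agent = DDPGAgent(env)  # Example #1 style agent: continuous actions plus Gaussian exploration noise
log = agent.train(transitions=100_000, sigma_max=1.0, sigma_min=0.0)

# the returned log dict can be plotted directly
mean = np.array(log["reward_mean"])
std = np.array(log["reward_std"])
plt.plot(log["step"], mean)
plt.fill_between(log["step"], mean - std, mean + std, alpha=0.3)
plt.xlabel("transition")
plt.ylabel("evaluation reward")
plt.show()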
Example #3
class Agent:
    def __init__(self,
                 input_dim,
                 output_dim,
                 tau=0.001,
                 gamma=0.99,
                 train_batch_size=640):
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.tau = tau
        self.gamma = gamma
        self.train_batch_size = train_batch_size
        self.main_critic = Critic(input_dim, output_dim, tau, gamma)
        self.target_critic = Critic(input_dim, output_dim, tau, gamma)

        self.main_actor = Actor(input_dim, output_dim, tau, gamma)
        self.target_actor = Actor(input_dim, output_dim, tau, gamma)

        self.target_critic.model.set_weights(
            self.main_critic.model.get_weights())
        self.target_actor.model.set_weights(
            self.main_actor.model.get_weights())

        self.memory = ReplayBuffer(batch_size=train_batch_size)

    def get_action(self, state):
        return self.main_actor.get_action(state)

    def train(self):
        data = self.memory.sample()
        states = np.vstack([e.state for e in data if e is not None])
        actions = np.array([e.action for e in data if e is not None
                            ]).astype(np.float32).reshape(-1, self.output_dim)
        rewards = np.array([e.reward for e in data if e is not None
                            ]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in data
                          if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in data if e is not None])

        actions_next = self.target_actor.model.predict_on_batch(next_states)
        Q_targets_next = self.target_critic.model.predict_on_batch(
            [next_states, actions_next])

        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)

        self.main_critic.train(states, actions, Q_targets)
        action_gradients = np.reshape(
            self.main_critic.get_gradient(states, actions),
            (-1, self.output_dim))

        self.main_actor.train(states, action_gradients)

        self.target_actor.model = self.main_actor.soft_update(
            self.target_actor.model)
        self.target_critic.model = self.main_critic.soft_update(
            self.target_critic.model)
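Example #3 assumes Keras-based Actor/Critic wrappers that expose soft_update(target_model) and return the updated target model. Below is a sketch of such a method, assuming the wrapper stores its Keras network as self.model and its blend rate as self.tau (both assumptions inferred from the call sites above):

# sketch of the Actor/Critic soft_update method assumed by Example #3
def soft_update(self, target_model):
    """Blend the main network's weights into the target: w_target <- tau*w_main + (1 - tau)*w_target."""
    new_weights = [self.tau * w_main + (1.0 - self.tau) * w_target
                   for w_main, w_target in zip(self.model.get_weights(),
                                               target_model.get_weights())]
    target_model.set_weights(new_weights)
    return target_model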
Example #4
class AgentCommon():
    """Interacts with and learns from the environment."""
    
    def __init__(self, state_size, action_size, num_agents, random_seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            num_agents (int): number of agents
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        self.seed = random.seed(random_seed)

        # Noise process
        #self.noise = OUNoise(action_size, random_seed)
        self.noise = OUNoise((self.num_agents, action_size), seed = random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)
        
        self.actorL = ActorAgent(state_size, action_size, num_agents, self.noise, LR_ACTOR, self.memory, random_seed)
        self.actorR = ActorAgent(state_size, action_size, num_agents, self.noise, LR_ACTOR, self.memory, random_seed)
        self.sharedcritic = CriticAgent(state_size, action_size, num_agents, LR_CRITIC, WEIGHT_DECAY, TAU, random_seed)
    
    def step(self, state, action, reward, next_state, done):
        self.actorL.step(state[0], action[0], reward[0], next_state[0], done[0])
        self.actorR.step(state[1], action[1], reward[1], next_state[1], done[1])
        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE:
            experiences1 = self.memory.sample()
            experiences2 = self.memory.sample()
            self.sharedcritic.learn(self.actorL,experiences1, GAMMA)
            self.sharedcritic.learn(self.actorR,experiences2, GAMMA)

    def act(self, state, add_noise=True):
        actionL = self.actorL.act(state[0], add_noise=add_noise)
        actionR = self.actorR.act(state[1], add_noise=add_noise)
        return [actionL, actionR]
    
    def reset(self):
        self.noise.reset()
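Example #4 references module-level constants (BUFFER_SIZE, BATCH_SIZE, GAMMA, TAU, LR_ACTOR, LR_CRITIC, WEIGHT_DECAY) that are defined outside the snippet. The values below are common DDPG defaults and are shown purely as illustrative assumptions:

# illustrative values only; the original snippet defines these elsewhere
BUFFER_SIZE = int(1e5)   # replay buffer size
BATCH_SIZE = 128         # minibatch size
GAMMA = 0.99             # discount factor
TAU = 1e-3               # soft-update interpolation factor
LR_ACTOR = 1e-4          # actor learning rate
LR_CRITIC = 1e-3         # critic learning rate
WEIGHT_DECAY = 0.0       # L2 weight decay for the critic optimizer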
Example #5
class Agent():
    """ DDPG Agent, interacts with environment and learns from environment """
    def __init__(self, device, state_size, n_agents, action_size, random_seed, \
                         buffer_size, batch_size, gamma, TAU, lr_actor, lr_critic, weight_decay,  \
                         learn_interval, learn_num, ou_sigma, ou_theta, checkpoint_folder = './'):

        # Set Computational device
        self.DEVICE = device

        # Init State, action and agent dimensions
        self.state_size = state_size
        self.n_agents = n_agents
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.l_step = 0
        self.log_interval = 200

        # Init Hyperparameters
        self.BUFFER_SIZE = buffer_size
        self.BATCH_SIZE = batch_size
        self.GAMMA = gamma
        self.TAU = TAU
        self.LR_ACTOR = lr_actor
        self.LR_CRITIC = lr_critic
        self.WEIGHT_DECAY = weight_decay
        self.LEARN_INTERVAL = learn_interval
        self.LEARN_NUM = learn_num

        # Init Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=lr_actor)

        # Init Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=lr_critic,
                                           weight_decay=weight_decay)

        # Init Noise Process
        self.noise = OUNoise((n_agents, action_size),
                             random_seed,
                             mu=0.,
                             theta=ou_theta,
                             sigma=ou_sigma)

        # Init Replay Memory
        self.memory = ReplayBuffer(device, action_size, buffer_size,
                                   batch_size, random_seed)

    # think
    def act(self, states, add_noise=True):
        """ Decide what action to take next """

        # evaluate state through actor_local
        states = torch.from_numpy(states).float().to(self.DEVICE)
        actions = np.zeros((self.n_agents, self.action_size))

        self.actor_local.eval()  # put actor_local network in "evaluation" mode
        with torch.no_grad():
            for n, state in enumerate(states):
                actions[n, :] = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()  # put actor_local back into "training" mode

        # add noise for better performance
        if add_noise:
            actions += self.noise.sample()

        return np.clip(actions, -1, 1)

    # embody
    def step(self, t, s, a, r, s_, done):
        """ Commit step into the brain """

        # Save SARS' to replay buffer --- state-action-reward-next_state tuple
        for n in range(self.n_agents):
            self.memory.add(s[n], a[n], r[n], s_[n], done[n])

        if t % self.LEARN_INTERVAL != 0:
            return

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.BATCH_SIZE:
            for _ in range(self.LEARN_NUM):
                experiences = self.memory.sample()  # get a memory sample
                self.learn(experiences, self.GAMMA)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """ Learn from experiences, with discount factor gamma
        
        Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value
        
        Params:
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """

        states, actions, rewards, next_states, dones = experiences

        # ------ Update Critic ------ #

        # get predicted next-state actions and Q values from target networks
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)

        # compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)

        # minimize loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        #         torch.nn.utils.clip_grad_norm(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ------ Update Actor ------ #

        # compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()

        # minimize loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ------ Update Target Networks ------ #
        self.soft_update(self.critic_local, self.critic_target, self.TAU)
        self.soft_update(self.actor_local, self.actor_target, self.TAU)


    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
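Examples #4 and #5 construct an OUNoise process with a per-agent shape, but the class itself is not included. A minimal Ornstein-Uhlenbeck noise sketch compatible with the OUNoise((n_agents, action_size), seed, mu=..., theta=..., sigma=...) call above; the exact update rule used in the original is an assumption:

import copy
import random

import numpy as np


class OUNoise:
    """Ornstein-Uhlenbeck process (sketch): dx = theta*(mu - x) + sigma*N(0, 1)."""

    def __init__(self, size, seed, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        random.seed(seed)
        np.random.seed(seed)
        self.reset()

    def reset(self):
        # restart the process at its mean
        self.state = copy.copy(self.mu)

    def sample(self):
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.standard_normal(self.mu.shape)
        self.state = x + dx
        return self.state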
Example #6
    buffer = ReplayBuffer(BUFFER_SIZE)

    losses = np.zeros(N_EPISODES)
    # Loop over episodes
    for episode in range(N_EPISODES):

        episode_losses = np.zeros(EPISODE_LENGTH)
        # Reset the environment for the start of the episode.
        agent.reset()
        # Loop over steps within this episode. The episode length here is 20.
        for step_num in range(EPISODE_LENGTH):
            # Step the agent once, and get the transition tuple for this step
            transition = agent.step()

            buffer.append(transition)

            if len(buffer) >= BATCH_SIZE:
                loss = dqn.batch_train_q_network(buffer.sample(BATCH_SIZE))
                episode_losses[step_num] = loss

            # time.sleep(0.2)

        losses[episode] = np.average(episode_losses)
        print("Finished episode {}, average loss = {}".format(
            episode, losses[episode]))

    # plot the per-episode average loss on a log scale
    ax.plot(losses, color='blue')
    plt.yscale('log')
    fig.savefig("dqn_erb_loss_vs_episodes.png")
Example #7
class TD3():
    """ Twin Delayed Deep Deterministic Policy Gradient Model """

    def __init__(self, state_size, action_size, random_seed):
        """ Initialize the model with arguments as follows:

            ARGUMENTS
            =========
                - state_size (int) = dimension of input space
                - action_size (int) = dimension of action space
                - random_seed (int) = random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        # create noise
        self.noise = OUNoise(action_size, random_seed)
        self.noise_decay = NOISE_DECAY

        # create memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed, device)

        # Actor Networks (local online net + target net)
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Critic Networks (local online net + target net)
        self.critic_local = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

        # initialize the online and target networks with the same weights
        self.soft_update(self.actor_local, self.actor_target, 1)
        self.soft_update(self.critic_local, self.critic_target, 1)

        self.learn_counter = 0
                
    def act(self, state, add_noise=True):
        """ Choose an action while interacting and learning in the environment """

        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample() * self.noise_decay
            self.noise_decay *= NOISE_DECAY  # decay the exploration noise scale geometrically
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma, noise_clip=0.5, policy_freq=2):
        """ Sample from experiences and learn """

        # update the learn counter
        self.learn_counter += 1

        # get experience tuples
        states, actions, rewards, next_states, dones  = experiences
            
        # build noise on the action
        # CAVE: actions must be moved to cpu() first to create a CPU tensor before sending it to CUDA with .to(device)
        # noise = torch.FloatTensor(actions.cpu()).data.normal_(0, policy_noise).to(device)
        # noise = noise.clamp(-noise_clip, noise_clip)
        # <<--- Gaussian target-policy noise like this is what the paper's GitHub implementation adds,
        # but OU noise is used in the act method, so the same noise process is reused while learning

        noise = torch.FloatTensor([self.noise.sample() for _ in range(len(actions))]).to(device)
        noise = noise.clamp(-noise_clip, noise_clip)  
        # clip between -/+ max action dims because action+noise might run oor
        next_action = (self.actor_target(next_states) + noise).clamp(-1, 1)

        # compute the target Q value
        target_Q1, target_Q2 = self.critic_target(next_states, next_action)
        target_Q = torch.min(target_Q1, target_Q2)
        target_Q = rewards + (gamma * target_Q * (1-dones)).detach()

        # get current Q estimates
        current_Q1, current_Q2 = self.critic_local(states, actions)

        # compute critic loss
        critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(current_Q2, target_Q)

        # update the critic
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # delay the policy update
        if self.learn_counter % policy_freq == 0:

            # get actor_local's predicted action and use critic_local to compute the policy loss
            actions_pred = self.actor_local.forward(states)
            actor_loss = -self.critic_local.Q1(states, actions_pred).mean()

            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()

            # delayed update of the actor and critic target models
            self.soft_update(self.actor_local, self.actor_target, TAU)
            self.soft_update(self.critic_local, self.critic_target, TAU)


    def soft_update(self, local_model, target_model, tau):
        # Perform a soft update of the target networks:
        # at every time step keep (1 - tau) of the target network weights
        # and blend in only a small fraction (tau) of the current online network weights
        # to prevent oscillation
        for local_param, target_param in zip(local_model.parameters(), target_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)

    def step(self, state, action, reward, next_state, done):
        # at every iteration, add new SARS' trajectory to memory, then learn from batches 
        # if learning_step is reached and enough samples are in the buffer
        
        self.memory.add(state, action, reward, next_state, done)
        if len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn(experiences, GAMMA)
Example #8
def main():

    ##########
    # CONFIG #
    ##########
    # Target Reward
    tgt_score = 0.5
    # Device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Seed
    seed = 7
    seeding(seed)
    # Model Architecture
    # Actor
    hidden_in_actor = 256
    hidden_out_actor = 128
    lr_actor = 1e-4
    # Critic
    hidden_in_critic = 256
    hidden_out_critic = 128
    lr_critic = 3e-4
    weight_decay_critic = 0
    # Episodes
    number_of_episodes = 10000
    episode_length = 2000
    # Buffer
    buffer_size = int(1e6)
    batchsize = 512
    # Agent Update Frequency
    episode_per_update = 1
    # Rewards Discounts Factor
    discount_factor = 0.95
    # Soft Update Weight
    tau = 1e-2
    # Noise Process
    noise_factor = 2
    noise_reduction = 0.9999
    noise_floor = 0.0
    # Window
    win_len = 100
    # Save Frequency
    save_interval = 200
    # Logger
    log_path = os.getcwd() + "/log"
    logger = SummaryWriter(log_dir=log_path)
    # Model Directory
    model_dir = os.getcwd() + "/model_dir"
    os.makedirs(model_dir, exist_ok=True)
    # Load Saved Model
    load_model = False

    ####################
    # Load Environment #
    ####################
    env = UnityEnvironment(file_name="./Tennis_Linux_NoVis/Tennis.x86_64")
    # Get brain
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    print('Brain Name:', brain_name)
    # Reset the environment
    env_info = env.reset(train_mode=True)[brain_name]
    # Number of Agents
    num_agents = len(env_info.agents)
    print('Number of agents:', num_agents)
    # size of each action
    action_size = brain.vector_action_space_size
    print('Size of each action:', action_size)
    # examine the state space
    states = env_info.vector_observations
    state_size = states.shape[1]
    print('There are {} agents. Each observes a state with length: {}'.format(
        states.shape[0], state_size))

    ####################
    # Show Progressbar #
    ####################
    widget = [
        'episode: ',
        pb.Counter(), '/',
        str(number_of_episodes), ' ',
        pb.Percentage(), ' ',
        pb.ETA(), ' ',
        pb.Bar(marker=pb.RotatingMarker()), ' '
    ]
    timer = pb.ProgressBar(widgets=widget, maxval=number_of_episodes).start()
    start = time.time()

    ###############
    # Multi Agent #
    ###############
    maddpg = MADDPG(state_size, action_size, num_agents, hidden_in_actor,
                    hidden_out_actor, lr_actor, hidden_in_critic,
                    hidden_out_critic, lr_critic, weight_decay_critic,
                    discount_factor, tau, seed, device)

    if load_model:
        load_dict_list = torch.load(os.path.join(model_dir,
                                                 'episode-saved.pt'))
        for i in range(num_agents):
            maddpg.maddpg_agent[i].actor.load_state_dict(
                load_dict_list[i]['actor_params'])
            maddpg.maddpg_agent[i].actor_optimizer.load_state_dict(
                load_dict_list[i]['actor_optim_params'])
            maddpg.maddpg_agent[i].critic.load_state_dict(
                load_dict_list[i]['critic_params'])
            maddpg.maddpg_agent[i].critic_optimizer.load_state_dict(
                load_dict_list[i]['critic_optim_params'])

    #################
    # Replay Buffer #
    #################
    rebuffer = ReplayBuffer(buffer_size, seed, device)

    #################
    # TRAINING LOOP #
    #################
    # initialize scores
    scores_history = []
    scores_window = deque(maxlen=save_interval)

    # i_episode = 0
    for i_episode in range(number_of_episodes):
        timer.update(i_episode)

        # Reset Environment
        env_info = env.reset(train_mode=True)[brain_name]
        states = env_info.vector_observations
        scores = np.zeros(num_agents)

        # Reset Agent
        maddpg.reset()

        # episode_t = 0
        for episode_t in range(episode_length):

            # Explore with decaying noise factor
            actions = maddpg.act(states, noise_factor=noise_factor)
            env_info = env.step(actions)[brain_name]  # Environment reacts
            next_states = env_info.vector_observations  # get the next states
            rewards = env_info.rewards  # get the rewards
            dones = env_info.local_done  # see if episode has finished

            ###################
            # Save Experience #
            ###################
            rebuffer.add(states, actions, rewards, next_states, dones)

            scores += rewards
            states = next_states

            if any(dones):
                break

        scores_history.append(np.max(scores))  # save most recent score
        scores_window.append(np.max(scores))  # save most recent score
        avg_rewards = np.mean(scores_window)
        noise_factor = max(noise_floor, noise_factor *
                           noise_reduction)  # Reduce Noise Factor

        #########
        # LEARN #
        #########
        if len(rebuffer) > batchsize and i_episode % episode_per_update == 0:
            for a_i in range(num_agents):
                samples = rebuffer.sample(batchsize)
                maddpg.update(samples, a_i, logger)
            # Soft Update
            maddpg.update_targets()

        ##################
        # Track Progress #
        ##################
        if i_episode % save_interval == 0 or i_episode == number_of_episodes - 1:
            logger.add_scalars('rewards', {
                'Avg Reward': avg_rewards,
                'Noise Factor': noise_factor
            }, i_episode)
            print(
                '\nElapsed time {:.1f} \t Update Count {} \t Last Episode t {}'
                .format((time.time() - start) / 60, maddpg.update_count,
                        episode_t),
                '\nEpisode {} \tAverage Score: {:.2f} \tNoise Factor {:.2f}'.
                format(i_episode, avg_rewards, noise_factor),
                end="\n")

        ##############
        # Save Model #
        ##############
        save_info = (i_episode % save_interval == 0
                     or i_episode == number_of_episodes - 1)
        if save_info:
            save_dict_list = []
            for i in range(num_agents):
                save_dict = {
                    'actor_params':
                    maddpg.maddpg_agent[i].actor.state_dict(),
                    'actor_optim_params':
                    maddpg.maddpg_agent[i].actor_optimizer.state_dict(),
                    'critic_params':
                    maddpg.maddpg_agent[i].critic.state_dict(),
                    'critic_optim_params':
                    maddpg.maddpg_agent[i].critic_optimizer.state_dict()
                }
                save_dict_list.append(save_dict)
            torch.save(save_dict_list,
                       os.path.join(model_dir, 'episode-Latest.pt'))

            pd.Series(scores_history).to_csv(
                os.path.join(model_dir, "scores.csv"))

            # plot the scores
            rolling_mean = pd.Series(scores_history).rolling(win_len).mean()
            fig = plt.figure()
            ax = fig.add_subplot(111)
            plt.plot(np.arange(len(scores_history)), scores_history)
            plt.axhline(y=tgt_score, color='r', linestyle='dashed')
            plt.plot(rolling_mean, lw=3)
            plt.ylabel('Score')
            plt.xlabel('Episode #')
            # plt.show()
            fig.savefig(os.path.join(model_dir, 'Average_Score.pdf'))
            fig.savefig(os.path.join(model_dir, 'Average_Score.jpg'))
            plt.close()

        if avg_rewards > tgt_score:
            logger.add_scalars('rewards', {
                'Avg Reward': avg_rewards,
                'Noise Factor': noise_factor
            }, i_episode)
            print(
                '\nElapsed time {:.1f} \t Update Count {} \t Last Episode t {}'
                .format((time.time() - start) / 60, maddpg.update_count,
                        episode_t),
                '\nEpisode {} \tAverage Score: {:.2f} \tNoise Factor {:.2f}'.
                format(i_episode, avg_rewards, noise_factor),
                end="\n")
            break

    env.close()
    logger.close()
    timer.finish()
Example #9
                 ob.speedZ, ob.wheelSpinVel / 100.0, ob.rpm))
print("s_t: ", s_t)
print("s_t size: ", s_t.size)
a = [[0, 1]]
#t_start = timeit.default_timer()
for i in range(max_step):
    ob, r_t, done, info = env.step(a[0])
    if done:
        break
    s_t1 = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY,
                      ob.speedZ, ob.wheelSpinVel / 100.0, ob.rpm))
    memory.put((s_t, a[0], r_t, s_t1, done))
    s_t = s_t1
#t_end = timeit.default_timer()
s_done = s_t
print('done?: ', s_done)
#print('{}steps, {} time spent'.format(i,t_end-t_start))
env.end()
s, a, r, sp, d = memory.sample(3)
print('s: ', s)
print('a: ', a)
print('r: ', r)
print('sp: ', sp)
print('d: ', d)

# # -- Test the noise. --
# noise = OrnsteinUhlenbeckNoise(mu = np.zeros(1),theta=0.1,dt=0.2,sigma = 0.1, x0 = np.array([0.5]))
# for i in range(300):
#     noise()
#     print(noise)
Example #10
class DDQNAgent():
    """Interacts with and learns from the environment."""
    def __init__(self,
                 state_size,
                 action_size,
                 seed,
                 hidden_layers=[64, 64],
                 buffer_size=int(1e5),
                 batch_size=64,
                 gamma=0.99,
                 tau=1e-3,
                 learning_rate=5e-4,
                 update_every=4,
                 head_name="DuelingDQN",
                 head_scale="max"):
        """Initialize an Agent object.
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            hidden_layers (list of int ; optional): number of each layer nodes
            buffer_size (int ; optional): replay buffer size
            batch_size (int; optional): minibatch size
            gamma (float; optional): discount factor
            tau (float; optional): for soft update of target parameters
            learning_rate (float; optional): learning rate
            update_every (int; optional): how often to update the network
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.lr = learning_rate
        self.update_every = update_every

        # detect GPU device
        self.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")

        # Assign model parameters and assign device
        model_params = [
            state_size, action_size, seed, hidden_layers, head_name, head_scale
        ]
        self.qnetwork_local = QNetwork(*model_params).to(self.device)
        self.qnetwork_target = QNetwork(*model_params).to(self.device)

        # Set up optimizer
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(),
                                    lr=self.lr)

        # Initialize Replay memory
        self.memory = ReplayBuffer(action_size, self.buffer_size,
                                   self.batch_size, seed, self.device)
        # Initialize time step (for updating every self.update_every steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Update time step
        self.t_step = self.t_step + 1

        # Learn every self.update_every time steps.
        if self.t_step % self.update_every == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > self.batch_size:
                experiences = self.memory.sample()
                self.learn(experiences, self.gamma)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(self.device)

        # Go to evaluation mode and get Q values for current state
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)

        # get back to train mode
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        # From the experiences buffer, separate out S_t, A_t, R_t, S_t+1, done data
        states, actions, rewards, next_states, dones = experiences

        # Go to evaluation mode
        self.qnetwork_target.eval()
        with torch.no_grad():
            # get Q values for the next state
            Q_dash_local = self.qnetwork_local(next_states)
            Q_dash_target = self.qnetwork_target(next_states)

            # Find the predicted action based on the local Q_network
            argmax_action = torch.max(Q_dash_local, dim=1, keepdim=True)[1]

            # Get the Q-value from the target network
            Q_dash_max = Q_dash_target.gather(1, argmax_action)

            # Update the target value
            y = rewards + gamma * Q_dash_max * (1 - dones)

        # Go back to train mode
        self.qnetwork_target.train()

        # Predict Q-values based on the local network
        self.optimizer.zero_grad()
        Q = self.qnetwork_local(states)
        y_pred = Q.gather(1, actions)

        # TD-error/loss function
        loss = torch.sum((y - y_pred)**2)

        # Optimize the network
        loss.backward()
        self.optimizer.step()

        # Update the target network using the local and target networks
        self.soft_update(self.qnetwork_local, self.qnetwork_target, self.tau)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
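The DDQNAgent above exposes act(state, eps) and step(state, action, reward, next_state, done). A minimal episode loop driving it, assuming the classic Gym step/reset API used throughout these examples and an illustrative epsilon schedule (both assumptions, not part of the original snippet):

import gym

env = gym.make("CartPole-v1")
agent = DDQNAgent(state_size=env.observation_space.shape[0],
                  action_size=env.action_space.n,
                  seed=0)

eps, eps_end, eps_decay = 1.0, 0.01, 0.995
for episode in range(1000):
    state = env.reset()
    done = False
    while not done:
        action = agent.act(state, eps)
        next_state, reward, done, _ = env.step(action)
        agent.step(state, action, reward, next_state, done)  # stores the transition and learns periodically
        state = next_state
    eps = max(eps_end, eps_decay * eps)  # anneal epsilon between episodes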
Example #11
class DQNAgent(Agent):
    class SAVE:
        MEMORY = 1
        TARGETNETWORK = 2
        TRAINNETWORK = 4
        HYPERPARAM = 8
        ALL = 15

    def __init__(self,
                 DQNType,
                 input_shape,
                 replaybuffersize=100000,
                 input_preprocess=[]):
        super().__init__(MOVEMENTS.COMPLEX)
        self.memory = ReplayBuffer(replaybuffersize)
        self.train_network = DQNType(input_shape, len(self.movements))
        self.target_network = self.train_network.clone_model()
        self.input_preprocess = input_preprocess

        ## Initialize
        self.counter = 0
        self.epsilon = 1

        ## hyperparameters
        self.hyperparams = {
            "burn_in": 10000,
            "copy_each": 5000,
            "learn_each": 1,
            "save_each": 5000,
            "final_epsilon": 0.1,
            "epsilon_decay_rate": 0.99998,
            "batch_size": 32,
            "gamma": 0.99
        }

    def setparam(self, **kwargs):
        for key, val in kwargs.items():
            self.hyperparams[key] = val
        return self

    def getparams(self):
        return self.hyperparams

    def preprocess(self, image):
        for pc in self.input_preprocess:
            image = pc(image)
        return image

    def reward(self, reward, info_old, info_new):
        return reward + (info_new["score"] - info_old["score"]) / 100

    def action(self, states):
        self.action_states = self.preprocess(states)
        ## Random exploration
        if random.uniform(0, 1) < self.epsilon:
            self.action_num = random.choice(range(len(self.movements)))
        ## Make a decision based on the network
        else:
            normalized_states = figure.normalize()(
                self.action_states)  # convert to 0-1 scale
            output = self.train_network.predict(normalized_states[None, ...])
            self.action_num = np.argmax(output)
        return self.movements[self.action_num]

    def feedback(self, states, reward, info, done):
        # what to do after getting a reward
        self.counter += 1
        self.memory.append((
            self.action_states,  # already preprocessed
            self.action_num,
            reward,
            info,
            done,
            self.preprocess(states)))
        self.updateNetwork()

    def save(self, file_path, saveMethod=None):
        if saveMethod is None:
            saveMethod = self.SAVE.ALL
        if (saveMethod & self.SAVE.MEMORY):
            self.memory.save(file_path + "memory")
        if (saveMethod & self.SAVE.TARGETNETWORK):
            self.target_network.save_model(file_path + "target_net")
        if (saveMethod & self.SAVE.TRAINNETWORK):
            self.train_network.save_model(file_path + "train_net")
        # if (saveMethod & self.SAVE.HYPERPARAM):
        #     with open(file_path + "hyperparam.json", "w") as f:
        #         json.dump(self.hyperparams, f, indent=2)

    def load(self, file_path):
        try:
            self.target_network.load_model(file_path + "target_net")
            self.train_network.load_model(file_path + "train_net")
            with open(file_path + "hyperparam.json", "r") as f:
                self.hyperparams = json.load(f)
        except Exception as e:
            print(e)

    def updateNetwork(self):
        if self.counter < self.hyperparams["burn_in"]:
            return
        self.epsilon *= self.hyperparams["epsilon_decay_rate"]
        self.epsilon = max(self.epsilon, self.hyperparams["final_epsilon"])
        if (self.counter - self.hyperparams["burn_in"]
            ) % self.hyperparams["learn_each"] == 0:
            self.learn()
        if (self.counter - self.hyperparams["burn_in"]
            ) % self.hyperparams["copy_each"] == 0:
            self.target_network = self.train_network.clone_model()
        if (self.counter - self.hyperparams["burn_in"]
            ) % self.hyperparams["save_each"] == 0:
            self.save("./autosave/step_" + str(self.counter))

    def learn(self):
        learn_sample = self.memory.sample(self.hyperparams["batch_size"])
        state_raw = np.stack(
            [states for states, _, _, _, _, _ in learn_sample], axis=0)
        actions = [action for _, action, _, _, _, _ in learn_sample]
        rewards = [reward for _, _, reward, _, _, _ in learn_sample]
        not_done = [not done for _, _, _, _, done, _ in learn_sample]
        next_state_raw = np.stack(
            [states for _, _, _, _, _, states in learn_sample], axis=0)
        state = figure.normalize()(state_raw)
        next_state = figure.normalize()(next_state_raw)
        best_action_next = np.argmax(self.train_network.predict(next_state),
                                     axis=1)
        # Predicts the Q values calculated at the best_action_next
        # We shall only keep those entries corresponding to the real actions taken
        # Terminal states should not involve calculating the expected Q value.
        Q_value_next_target_mat = self.target_network.predict(
            next_state, actions=best_action_next)
        Q_value_next_target_vec = np.max(Q_value_next_target_mat, axis=1)
        Q_value_target_vec = np.array(rewards) + self.hyperparams[
            "gamma"] * np.array(not_done) * Q_value_next_target_vec
        Q_value_target_mat = np.zeros(Q_value_next_target_mat.shape)
        for id, num in enumerate(actions):
            Q_value_target_mat[id, num] = Q_value_target_vec[id]

        self.train_network.fit(state, actions, Q_value_target_mat, verbose=0)
Example #12
    # Loop over episodes
    for episode in range(N_EPISODES):
        epsilon = min(10 / (episode + 1), 1)

        episode_losses = np.zeros(EPISODE_LENGTH)
        # Reset the environment for the start of the episode.
        agent.reset()
        # Loop over steps within this episode. The episode length here is 20.
        for step_num in range(EPISODE_LENGTH):
            # Step the agent once, and get the transition tuple for this step
            transition = agent.step(dqn, epsilon)

            buffer.append(transition)

            if len(buffer) >= BATCH_SIZE:
                loss = dqn.batch_train_q_network(buffer.sample(BATCH_SIZE), target_network=target)
                episode_losses[step_num] = loss

            if (episode * EPISODE_LENGTH + step_num) % TARGET_SWAP == 0:
                print("Swapped target network on step {}".format(episode * EPISODE_LENGTH + step_num))
                target.q_network.load_state_dict(dqn.q_network.state_dict())
            # time.sleep(0.05)
        
        losses[episode] = np.average(episode_losses)
        print("Finished episode {}, average loss = {}".format(episode, losses[episode]))

    
    # evaluate Q-value
    q_values = np.zeros((10, 10, 4))
    for col in range(10):
        x = col / 10 + 0.05
Example #13
class MADDPG:
    def __init__(self,
                 num_agents,
                 local_obs_dim,
                 local_action_size,
                 global_obs_dim,
                 global_action_size,
                 discount_factor=0.95,
                 tau=0.02,
                 device=device,
                 random_seed=4,
                 lr_critic=1.0e-4,
                 weight_decay=0.0):
        super(MADDPG, self).__init__()

        # parameter configuration
        self.num_agents = num_agents
        self.device = device
        self.discount_factor = discount_factor
        self.tau = tau
        self.global_action_size = global_action_size
        self.global_obs_dim = global_obs_dim
        torch.manual_seed(random_seed)
        random.seed(random_seed)
        self.random_seed = random_seed
        self.weight_decay = weight_decay

        # define actors
        self.actors = [
            DDPGActor(num_agents,
                      local_obs_dim,
                      local_action_size,
                      global_obs_dim,
                      global_action_size,
                      device=device) for _ in range(num_agents)
        ]
        # define centralized critic
        self.critic = Critic(global_obs_dim, global_action_size,
                             self.random_seed).to(self.device)
        self.target_critic = Critic(global_obs_dim, global_action_size,
                                    self.random_seed).to(self.device)
        hard_update(self.target_critic, self.critic)

        self.critic_optimizer = Adam(self.critic.parameters(),
                                     lr=lr_critic,
                                     weight_decay=self.weight_decay)

        # noise coef
        self.noise_coef = 1.0
        self.noise_coef_decay = 1e-6

        # Replay memory
        self.memory = ReplayBuffer(BUFFER_SIZE, BATCH_SIZE, random_seed)

    def act(self, obs_all_agents):
        actions = [
            ddpg_actor.act(local_obs, self.noise_coef)
            for ddpg_actor, local_obs in zip(self.actors, obs_all_agents)
        ]
        return actions

    def target_act(self, obs_all_agents):
        actions = [
            ddpg_actor.target_act(local_obs, noise_coef=0, add_noise=False)
            for ddpg_actor, local_obs in zip(self.actors, obs_all_agents)
        ]
        return actions

    def step(self, obs, obs_full, actions, rewards, next_obs, next_obs_full,
             dones, timestep):
        self.memory.add(obs, obs_full, actions, rewards, next_obs,
                        next_obs_full, dones)

        timestep = timestep % TRAIN_EVERY

        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE and timestep == 0:
            for _ in range(N_LEARN_UPDATES):
                experiences = self.memory.sample()
                self.learn(experiences, self.discount_factor)

    def learn(self, experiences, gamma):
        obs, obs_full, action, reward, next_obs, next_obs_full, done = experiences

        obs = obs.permute(1, 0, -1)  # agent_id * batch_size * state_size
        obs_full = obs_full.view(-1, self.global_obs_dim)
        next_obs = next_obs.permute(1, 0, -1)
        next_obs_full = next_obs_full.view(-1, self.global_obs_dim)
        action = action.reshape(-1, self.global_action_size)

        # ---------------- update centralized critic ----------------------- #
        self.critic_optimizer.zero_grad()

        # get target actions from all target_actors
        target_actions = np.array(self.target_act(next_obs))
        target_actions = torch.from_numpy(target_actions).float().permute(
            1, 0, -1)
        target_actions = target_actions.reshape(-1, self.global_action_size)

        # update critic
        with torch.no_grad():
            q_next = self.target_critic.forward(next_obs_full,
                                                target_actions.to(self.device))

        y = reward + gamma * q_next * (1 - done)

        q = self.critic.forward(obs_full, action)

        critic_loss = 0
        for i in range(self.num_agents):
            critic_loss += F.mse_loss(q, y[:, i].detach().reshape(
                -1, 1)) / self.num_agents
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------- update actor for all agents --------------------- #
        for ii in range(len(self.actors)):
            self.actors[ii].actor_optimizer.zero_grad()

            q_action = [ self.actors[i].actor_local(ob) if i == ii \
                   else self.actors[i].actor_local(ob).detach()
                   for i, ob in enumerate(obs) ]

            q_action = torch.stack(q_action).permute(1, 0, -1)
            q_action = q_action.reshape(-1, self.global_action_size).to(
                self.device)

            # policy_gradient
            actor_loss = -self.critic.forward(obs_full, q_action).mean()
            actor_loss.backward()
            self.actors[ii].actor_optimizer.step()

        # --------------- soft update all target networks ------------------- #
        soft_update(self.target_critic, self.critic, self.tau)
        for actor in self.actors:
            actor.update_target(self.tau)

        # -------------- reset noise --------------------------------------- #
        for actor in self.actors:
            actor.action_noise.reset()

        self.noise_coef -= self.noise_coef_decay
        if self.noise_coef < 0.01:
            self.noise_coef = 0.01
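Example #13 calls the free functions hard_update and soft_update, which are defined outside the snippet. A sketch consistent with its call sites and with the soft-update rule θ_target = τ*θ_local + (1 - τ)*θ_target used in the other examples (the exact originals are not shown here):

# sketch of the helpers assumed by Example #13; argument order follows its call sites
def hard_update(target, source):
    """Copy source network parameters into the target network."""
    for target_param, source_param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(source_param.data)


def soft_update(target, source, tau):
    """Polyak-average source parameters into the target: theta_t <- tau*theta + (1 - tau)*theta_t."""
    for target_param, source_param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(tau * source_param.data + (1.0 - tau) * target_param.data)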
Example #14
        done = 0
        total_reward = 0
        step = agent.step_move()
        epsilon = max(1 - step * arg.epsilon_decrease, arg.epsilon_min)
        while not done:
            if np.random.uniform(0, 1) < epsilon:
                action = agent.random_action()
            else:
                action = agent.choose_action(obs)
            obs_, reward, done, _ = env.step(
                action + 1)  # offset by 1 because the environment uses only three actions
            replay_buffer.store_transition(obs, obs_, action, reward, done)
            total_reward += reward
            obs = obs_

        print('in {}, {}th game: the reward {} '.format(
            arg.run_name, step, total_reward))

        if step % train_period == 0:
            s1, s2, a, r, d = replay_buffer.sample(batch_size=train_batch)
            if step % record_period == 0:
                loss = agent.train(s1, s2, a, r, d, True)
                agent.log_reward(total_reward)
                agent.save()
            else:
                loss = agent.train(s1, s2, a, r, d, False)
            print('{}th game: the training loss {}'.format(step, loss))

        if step % arg.update_period == 0:
            agent.update_target_network()
class DDPG_Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, num_agents, seed, device,
                 buffer_size=int(1e5), batch_size=128, num_batches = 5, update_every=10,
                 gamma=0.99, tau=8e-3,
                 learning_rate_actor=1e-3, learning_rate_critic=1e-3, weight_decay=0.0001,                
                 hidden_layers_actor=[32,32], hidden_layers_critic=[32, 32, 32],
                 add_noise=True, start_eps=5.0, end_eps=0.0, end_eps_episode=500,
                 agent_id=-1):
        """Initialize an Agent object.
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            num_agents (int): number of agents
            seed (int): random seed
            hidden_layers (list of int ; optional): number of each layer nodes
            buffer_size (int ; optional): replay buffer size
            batch_size (int; optional): minibatch size
            gamma (float; optional): discount factor
            tau (float; optional): for soft update of target parameters
            learning_rate_X (float; optional): learning rate for X=actor or critic
        """
        print('In DDPG_AGENT: seed = ', seed)
        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        self.seed = random.seed(seed)
        self.device = device
        
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.update_every = update_every
        self.num_batches = num_batches
        
        self.gamma = gamma
        self.tau = tau
        
        self.lr_actor = learning_rate_actor
        self.lr_critic = learning_rate_critic
        self.weight_decay_critic = weight_decay
        
        self.add_noise = add_noise
        self.start_eps = start_eps
        self.eps = start_eps
        self.end_eps = end_eps
        self.eps_decay = 1/(end_eps_episode*num_batches)  # set decay rate based on epsilon end target
        self.timestep = 0
        
        self.agent_id = agent_id
     
        ### SET UP THE ACTOR NETWORK ###
        # Assign model parameters and assign device
        model_params_actor  = [state_size, action_size, seed, hidden_layers_actor]
        
        # Create the Actor Network (w/ Target Network)
        self.actor_local = Actor(*model_params_actor).to(self.device)
        self.actor_target = Actor(*model_params_actor).to(self.device)
        #print('actor_local network is: ', print(self.actor_local))
        
        # Set up optimizer for the Actor network
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=self.lr_actor)       
        
        ### SET UP THE CRITIC NETWORK ###
        model_params_critic = [state_size, action_size, seed, hidden_layers_critic]

        # Create the Critic Network (w/ Target Network)
        self.critic_local = Critic(*model_params_critic).to(self.device)
        self.critic_target = Critic(*model_params_critic).to(self.device)
        
        # Set up optimizer for the Critic Network
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=self.lr_critic, weight_decay=self.weight_decay_critic)

        # Noise process
        self.noise = OUNoise(action_size, self.seed)
        
        # Replay memory
        self.memory = ReplayBuffer(action_size, buffer_size, batch_size, seed, device)

    def step(self, states, actions, rewards, next_states, dones, agent_number):
        # Increment timestep by 1
        self.timestep += 1
        
        # Save experience in replay memory
        self.memory.add(states, actions, rewards, next_states, dones)
        
         # If there are enough samples and a model update is to be made at this time step
        if len(self.memory) > self.batch_size and self.timestep%self.update_every == 0:
            # For each batch
            for i in range(self.num_batches):
                # Sample experiences from memory
                experiences = self.memory.sample()
        
                # Learn from the experience
                self.learn(experiences, self.gamma, agent_number)

    def act(self, state, scale_noise=True):
        """Returns actions for given state as per current policy.
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().to(self.device)
        
        # Go to evaluation mode and get Q values for current state
        self.actor_local.eval()
        with torch.no_grad():
            # Get action for the agent and concatenate them
            action = [self.actor_local(state[0]).cpu().data.numpy()]
            
        # get back to train mode
        self.actor_local.train()
        
        # Add noise to the action probabilities
        # Note, we want the magnitude of noise to decrease as the agent keeps learning
        action += int(scale_noise)*(self.eps)*self.noise.sample()
        
        return np.clip(action, -1.0, 1.0)
    
    def reset(self):
        """
        Reset the noise, and all neural network parameters for the current agent
        """
        self.noise.reset()
        self.eps = self.start_eps
        self.timestep = 0
        self.critic_local.reset_parameters()
        self.actor_local.reset_parameters()
        self.critic_target.reset_parameters()
        self.actor_target.reset_parameters()
        
        # ReSet up optimizer for the Actor network
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=self.lr_actor)
        
        # Set up optimizer for the Critic Network
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=self.lr_critic, weight_decay=self.weight_decay_critic)
        
        # Clear the experience buffer
        self.memory.clear_buffer()
        
    def reset_noise(self):
        """
        Reset the noise only
        """
        self.noise.reset()
   
    def learn(self, experiences, gamma, agent_number):
        ####     DRAW FROM MEMORY AND PREPARE SARS DATA        ####
        # From the experiences buffer, separate out S_t, A_t, R_t, S_t+1, done data
        states, actions, rewards, next_states, dones = experiences
        
        # NOTE: actions has shape (batch_size, concatenated actions of all agents)
      
        # get the next action for the current agent for the entire batch
        actions_next = self.actor_target(next_states)
    
        # Construct next action vector for the agent
        if agent_number == 0:
            actions_next = torch.cat((actions_next, actions[:,2:]), dim=1)
        else:
            actions_next = torch.cat((actions[:,:2], actions_next), dim=1)
        
        ####    UPDATE CRITIC   ####
        # Get predicted next-state actions and Q values from target models
        # Get the next targets
        Q_targets_next = self.critic_target(next_states, actions_next)
        
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones))
        
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        
        # Define the loss
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        
        # Clip the gradient norm at 1
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()
   

        # --------------UPDATE ACTOR -----------------------#
        # Compute actor loss
        actions_pred = self.actor_local(states)

        # Construct action prediction vector relative to each agent
        if agent_number == 0:
            actions_pred = torch.cat((actions_pred, actions[:,2:]), dim=1)
        else:
            actions_pred = torch.cat((actions[:,:2], actions_pred), dim=1)
        
        # Calculate the loss. Note the negative sign since we use steepest ascent
        actor_loss = -self.critic_local(states, actions_pred).mean()
        
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # Update the target networks using the local and target networks
        self.soft_update(self.critic_local, self.critic_target, self.tau)
        self.soft_update(self.actor_local, self.actor_target, self.tau)
        
        # update noise decay parameter
        self.eps -= self.eps_decay
        self.eps = max(self.eps, self.end_eps)
        self.noise.reset()

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        X_target = tau*X_local + (1 - tau)*X_target
        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
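
The learn() method above builds the joint action vector for the critic by hard-coding two agents with 2-dimensional actions (the actions[:, :2] and actions[:, 2:] slices). A minimal sketch of the same construction for an arbitrary number of agents, assuming equal per-agent action sizes; the helper name is illustrative and not part of the class above:

import torch

def build_joint_actions(own_actions, stored_actions, agent_number, action_size):
    # Replace only the current agent's slice of the stored joint actions,
    # keeping the other agents' actions as sampled from the replay buffer.
    start = agent_number * action_size
    joint = stored_actions.clone()
    joint[:, start:start + action_size] = own_actions
    return joint

With agent_number=0 and action_size=2 this reproduces torch.cat((actions_next, actions[:, 2:]), dim=1) from the code above.
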
class Agent(object):
    """
    The Agent interacts with and learns from the environment.
    """
    def __init__(self,
                 state_size,
                 action_size,
                 num_agents,
                 random_seed=0,
                 params=params):
        """
        Initialize an Agent object.
        Params
        ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        num_agents (int): number of agents
        random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        self.seed = random.seed(random_seed)
        self.params = params

        # Actor (Policy) Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(self.params['DEVICE'])
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(self.params['DEVICE'])
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=self.params['LR_ACTOR'])

        # Critic (Value) Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(self.params['DEVICE'])
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(self.params['DEVICE'])
        self.critic_optimizer = optim.Adam(
            self.critic_local.parameters(),
            lr=self.params['LR_CRITIC'],
            weight_decay=self.params['WEIGHT_DECAY'])

        # Initialize target and local to same weights
        self.hard_update(self.actor_local, self.actor_target)
        self.hard_update(self.critic_local, self.critic_target)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, self.params['BUFFER_SIZE'],
                                   self.params['BATCH_SIZE'], random_seed)

    def hard_update(self, local_model, target_model):
        """
        Hard update model parameters.
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(local_param.data)

    def step(self, states, actions, rewards, next_states, dones):
        """
        Save experiences in replay memory and use random sample from buffer to learn.
        """

        # Save experience / reward, cater for when multiples
        for state, action, reward, next_state, done in zip(
                states, actions, rewards, next_states, dones):
            self.memory.add(state, action, reward, next_state, done)

        # Learn if enough samples are available in memory
        if len(self.memory) > self.params['BATCH_SIZE']:
            experiences = self.memory.sample()
            self.learn(experiences, self.params['GAMMA'])

    def act(self, states, add_noise=True):
        """
        Returns actions for a given state as per current policy.
        """
        states = torch.from_numpy(states).float().to(self.params['DEVICE'])
        actions = np.zeros((self.num_agents, self.action_size))
        self.actor_local.eval()
        with torch.no_grad():
            for i, state in enumerate(states):
                actions[i, :] = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            actions += self.noise.sample()
        return np.clip(actions, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma=params['GAMMA']):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Update Critic(Value)
        # Get predicted next-state actions and Q-Values from target Network
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)

        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)

        # Minimise the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(
            self.critic_local.parameters(),
            1)  # Stabilize learning per benchmark guidelines
        self.critic_optimizer.step()

        # Update Actor (Policy)
        # Compute Actor Loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()

        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # Update target networks
        self.soft_update(self.critic_local,
                         self.critic_target,
                         tau=self.params['TAU'])
        self.soft_update(self.actor_local,
                         self.actor_target,
                         tau=self.params['TAU'])

    def soft_update(self, local_model, target_model, tau=params['TAU']):
        """
        Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
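
The Agent class above only defines the per-step API (act, step, learn); the episode loop that drives it is not shown. A minimal driver sketch, assuming a gym-style multi-agent environment whose reset() and step() return per-agent numpy arrays (the env object is an assumption, not part of the snippet):

import numpy as np

def run_episode(env, agent, max_t=1000):
    states = env.reset()
    scores = np.zeros(agent.num_agents)
    for _ in range(max_t):
        actions = agent.act(states)  # noisy actions clipped to [-1, 1]
        next_states, rewards, dones, _ = env.step(actions)
        agent.step(states, actions, rewards, next_states, dones)
        states = next_states
        scores += rewards
        if np.any(dones):
            break
    return scores
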
Example #17
def train(graph_distr, epochs, batch_size, eps, n_step, discount, capacity,
          gcn_params, opt_params):
    '''
    graph_distr: object that wraps the graph generating distr
    epochs: int
    batch_size: int
    eps: float for exploration probability
    n_step: int, number of steps for n-step Q-learning
    discount: float, how much to discount future state/action value
    capacity: int, number of episodes to keep in memory
    gcn_params: dictionary of graph conv net parameters
    opt_params: dictionary of params for optimizer
    '''
    qnet = QNetwork(gcn_params)
    memory = ReplayBuffer(capacity)
    opt_params['params'] = qnet.parameters()
    optimizer = get_optimizer(opt_params)

    for e in range(epochs):
        node_labels, edge_weights, adj = graph_distr.next()
        embedding = qnet.embed_graph(node_labels, edge_weights, adj)

        state = []  # s_0
        state_vec = Variable(torch.zeros((1, qnet.embed_dim)))
        state_vec_prev = None
        actions = []
        rewards = []
        s_complement = set(range(len(adj)))
        losses = []
        best_actions = []

        for t in range(len(adj)):
            if t > 0:
                v_best_t = qnet.best_action(state, list(s_complement),
                                            embedding)
            if random.random() < eps or t == 0:
                v_t = random.choice(tuple(s_complement))
            else:
                v_t = v_best_t

            action_vec = embedding[v_t].unsqueeze(0)
            vprev = None if t == 0 else state[-1]
            r_t = 0 if t == 0 else -edge_weights.data[vprev, v_t]
            s_complement.remove(v_t)

            # ideally store: s_0 , a_0, r_0, s_1, v_best_1
            # ideally store: s_1 , a_1, r_1, s_2, v_best_2
            if t >= n_step:
                new_state = state[:]
                # action_vec_prev is the action that was actually taken;
                # v_best_t must be the argmax action of the current state
                v_best_embedding = embedding[v_best_t].unsqueeze(0)
                episode = (state_vec_prev, action_vec_prev, rewards[-1],
                           state_vec, v_best_embedding)
                # should try to add v_best_t so we don't recompute it later

                memory.push(*episode)
                if len(memory) > batch_size:
                    batch = memory.sample(batch_size)
                    batch_loss = qnet.backprop_batch(batch, optimizer)
                    losses.append(batch_loss)

            state_vec_prev = state_vec
            action_vec_prev = action_vec
            state.append(v_t)
            state_vec = state_vec + action_vec
            rewards.append(r_t)

        epoch_loss = torch.mean(torch.cat(losses))
        print('Epoch {} | avg loss: {:.3f} | Exploration rate: {:.3f}'.format(
            e, float(epoch_loss), eps))
        eps = update_exploration(eps)
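
update_exploration() is called at the end of every epoch but is not defined in this snippet. A minimal sketch under the assumption of multiplicative decay with a floor; the decay rate and floor are illustrative and the original helper may differ:

def update_exploration(eps, decay=0.995, eps_min=0.05):
    # Hypothetical schedule: shrink the exploration rate geometrically,
    # but never let it drop below eps_min.
    return max(eps * decay, eps_min)
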
Example #18
class Agent:
    def __init__(self,
                 state_size,
                 action_size,
                 device,
                 buffer_size=int(1e5),
                 batch_size=64,
                 gamma=0.99,
                 tau=1e-3,
                 lr=5e-4,
                 update_every=4):
        self.state_size = state_size
        self.action_size = action_size
        self.device = device
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.lr = lr
        self.update_every = update_every

        # model settings
        self.qnet_local = Model(state_size, action_size).to(self.device)
        self.qnet_target = Model(state_size, action_size).to(self.device)
        self.optimizer = optim.Adam(self.qnet_local.parameters(), lr=self.lr)

        # replay buffer settings
        self.replay_buffer = ReplayBuffer(self.buffer_size, self.batch_size)
        self.update_step = 0

    def step(self, state, action, reward, next_state, done):
        self.replay_buffer.add(state, action, reward, next_state, done)

        self.update_step = (self.update_step + 1) % self.update_every
        if (self.update_step
                == 0) and (len(self.replay_buffer) > self.batch_size):
            experiences = self.replay_buffer.sample()
            self.learn(experiences)

    def act(self, state, eps=0.0):
        state = torch.from_numpy(state).float().unsqueeze(0).to(self.device)

        self.qnet_local.eval()
        with torch.no_grad():
            action_values = self.qnet_local(state)
        self.qnet_local.train()

        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return np.random.choice(self.action_size)

    def learn(self, experiences):
        states, actions, rewards, next_states, dones = experiences

        # convert to tensors and send to device
        states = torch.from_numpy(states).float().to(self.device)
        actions = torch.from_numpy(actions).long().to(self.device)
        rewards = torch.from_numpy(rewards).float().to(self.device)
        next_states = torch.from_numpy(next_states).float().to(self.device)
        dones = torch.from_numpy(dones).float().to(self.device)

        # max returns max values (0) and indices (1)
        # unsqueeze is needed to add batch dim B x 1
        q_max = self.qnet_target(next_states).detach().max(1)[0].unsqueeze(1)
        y = rewards + self.gamma * q_max * (1 - dones)

        # select action values corresponding to actions
        # this is what .gather does
        # note for the expected we pass states, not next_states
        q_expected = self.qnet_local(states).gather(1, actions)

        loss = F.mse_loss(q_expected, y)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        self.soft_update()

    def soft_update(self):
        for target_param, local_param in zip(self.qnet_target.parameters(),
                                             self.qnet_local.parameters()):
            target_param.data.copy_(self.tau * local_param.data +
                                    (1 - self.tau) * target_param.data)

    def train(self,
              env,
              n_episodes=2000,
              max_t=1000,
              eps_start=1.0,
              eps_end=0.01,
              eps_decay=0.995):
        scores = []
        scores_window = deque(maxlen=100)
        eps = eps_start

        brain_name = env.brain_names[0]

        for i_episode in range(1, n_episodes + 1):

            env_info = env.reset(train_mode=True)[brain_name]
            state = env_info.vector_observations[0]

            score = 0
            for t in range(max_t):
                action = self.act(state, eps)
                env_info = env.step(action)[brain_name]
                next_state = env_info.vector_observations[0]
                reward = env_info.rewards[0]
                done = env_info.local_done[0]

                self.step(state, action, reward, next_state, done)

                state = next_state
                score += reward

                if done:
                    break
            scores_window.append(score)
            scores.append(score)
            avg_scores = np.mean(scores_window)
            eps = max(eps_end, eps_decay * eps)

            print(f'\rEpisode {i_episode}\tAverage Score: {avg_scores:.2f}',
                  end='')
            if i_episode % 100 == 0:
                print(
                    f'\rEpisode {i_episode}\tAverage Score: {avg_scores:.2f}')
            if avg_scores >= 13.0:
                print(f'\nEnvironment solved in {i_episode - 100} episodes!'
                      f'\tAverage Score: {np.mean(scores_window):.2f}')
                torch.save(self.qnet_local.state_dict(), 'checkpoint.pth')
                break

        return scores

    def evaluate(self, env):

        brain_name = env.brain_names[0]
        env_info = env.reset(train_mode=False)[brain_name]
        state = env_info.vector_observations[0]

        score = 0
        for i in range(2000):

            action = self.act(state)
            env_info = env.step(action)[brain_name]
            next_state = env_info.vector_observations[0]
            reward = env_info.rewards[0]
            done = env_info.local_done[0]
            state = next_state

            score += reward
            if done:
                break

        print(f'Total score: {score:.2f}')
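
The agent above expects a ReplayBuffer whose sample() returns plain numpy arrays, since learn() converts them with torch.from_numpy. A minimal sketch of a buffer matching that interface (an assumption about the interface only, not the buffer actually used):

import random
from collections import deque

import numpy as np

class SimpleReplayBuffer:
    def __init__(self, buffer_size, batch_size):
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size

    def add(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def sample(self):
        batch = random.sample(self.memory, self.batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)
        # Stack into (batch, dim) arrays; actions, rewards and dones get a trailing dim of 1
        return (np.vstack(states).astype(np.float32),
                np.vstack(actions).astype(np.int64),
                np.vstack(rewards).astype(np.float32),
                np.vstack(next_states).astype(np.float32),
                np.vstack(dones).astype(np.float32))

    def __len__(self):
        return len(self.memory)
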
class Agent():
    """Interacts with and learns from the environment"""

    def __init__(self, state_size, action_size, fc1_units=256, fc2_units=128, device=torch.device('cpu')):
        """DQN agent

        Args:
          state_size (int): dimension of each state
          action_size (int): dimension of each action (or the number of action choices)
          seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.device = device

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       fc1_units=fc1_units, fc2_units=fc2_units).to(self.device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        fc1_units=fc1_units, fc2_units=fc2_units).to(self.device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Initialize qnetwork_target parameters to qnetwork_local
        self.soft_update(self.qnetwork_local, self.qnetwork_target, 1)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, device=self.device)

        # Initialize the time step counter (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get a random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Args:
          state (array_like): current state
          eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(self.device)

        # Set qnetwork_local to evaluation mode
        self.qnetwork_local.eval()

        # This operation should not be included in gradient calculation
        with torch.no_grad():
            action_values = self.qnetwork_local(state)

        # Set back qnetwork_local to training mode
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Args:
          experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
          gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Get max predicted Q values (for next states) from target model
        Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)

        # Compute Q targets for current states with actual rewards
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)

        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ----- Update the target network -----
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        theta_target = tau * theta_local + (1 - tau) * theta_target

        Args:
          local_model (torch.nn.Module): weights will be copied from
          target_model (torch.nn.Module): weights will be copied to
          tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1. - tau) * target_param.data)
Example #20
class DDPG(Model):
    """ Interface """
    def __init__(self,
                 name,
                 args,
                 sess=None,
                 reuse=False,
                 log_tensorboard=True,
                 save=True):
        self.learn_steps = 0

        # hyperparameters
        self.gamma = args[name]['gamma']
        self.tau = args[name]['tau']
        self.init_noise_sigma = args[name]['init_noise_sigma']
        self.noise_decay = args[name]['noise_decay']

        # replay buffer
        self.buffer = ReplayBuffer(sample_size=args['batch_size'],
                                   max_len=args[name]['buffer_size'])

        super(DDPG, self).__init__(name,
                                   args,
                                   sess=sess,
                                   reuse=reuse,
                                   build_graph=True,
                                   log_tensorboard=log_tensorboard,
                                   save=save)

        self._initialize_target_net()

    @property
    def main_variables(self):
        return self.actor_critic.trainable_variables

    @property
    def _target_variables(self):
        return self._target_actor_critic.trainable_variables

    def act(self, state):
        self.sess.run(self.noise_op)
        state = state.reshape((-1, self.state_size))
        action = self.sess.run(self.actor_critic.actor_action,
                               feed_dict={self.actor_critic.state: state})
        self.sess.run(self.denoise_op)
        return np.squeeze(action)

    def step(self, state, action, reward, next_state, done):
        self.buffer.add(state, action, reward, next_state, done)

        if len(self.buffer) > self.buffer.sample_size + 100:
            self._learn()

    """ Implementation """

    def _build_graph(self):
        # env info
        self._setup_env()

        # main actor-critic
        self.actor_critic = self._create_actor_critic()
        # target actor-critic
        self._target_actor_critic = self._create_actor_critic(is_target=True)

        # losses
        self.actor_loss, self.critic_loss = self._loss()

        # optimization operation
        self.opt_op = self._optimize([self.actor_loss, self.critic_loss])

        # target net update operations
        self.init_target_op, self.update_target_op = self._targetnet_ops()

        # operations that add/remove noise from parameters
        self.noise_op, self.denoise_op = self._noise_params()

    def _setup_env(self):
        self.state_size = self._args[self.name]['state_size']
        self.action_size = self._args[self.name]['action_size']
        self.env_info = {}
        with tf.name_scope('placeholders'):
            self.env_info['state'] = tf.placeholder(tf.float32,
                                                    shape=(None,
                                                           self.state_size),
                                                    name='state')
            self.env_info['action'] = tf.placeholder(tf.float32,
                                                     shape=(None,
                                                            self.action_size),
                                                     name='action')
            self.env_info['next_state'] = tf.placeholder(
                tf.float32, shape=(None, self.state_size), name='next_state')
            self.env_info['reward'] = tf.placeholder(tf.float32,
                                                     shape=(None, 1),
                                                     name='reward')
            self.env_info['done'] = tf.placeholder(tf.uint8,
                                                   shape=(None, 1),
                                                   name='done')

    def _create_actor_critic(self, is_target=False):
        name = 'target_actor_critic' if is_target else 'actor_critic'
        log_tensorboard = not is_target
        actor_critic = ActorCritic(name,
                                   self._args,
                                   self.env_info,
                                   self.action_size,
                                   reuse=self.reuse,
                                   log_tensorboard=log_tensorboard,
                                   is_target=is_target)

        return actor_critic

    def _loss(self):
        with tf.name_scope('loss'):
            with tf.name_scope('l2_loss'):
                encoder_l2_loss = tf.losses.get_regularization_loss(
                    scope=self.actor_critic.variable_scope + '/state_encoder',
                    name='encoder_l2_loss')
                actor_l2_loss = tf.losses.get_regularization_loss(
                    scope=self.actor_critic.variable_scope + '/actor',
                    name='actor_l2_loss')
                critic_l2_loss = tf.losses.get_regularization_loss(
                    scope=self.actor_critic.variable_scope + '/critic',
                    name='critic_l2_loss')

            with tf.name_scope('actor_loss'):
                actor_loss = tf.negative(
                    tf.reduce_mean(self.actor_critic.Q_with_actor),
                    name='actor_loss') + encoder_l2_loss + actor_l2_loss

            with tf.name_scope('critic_loss'):
                target_Q = tf.stop_gradient(
                    self.env_info['reward'] +
                    self.gamma * tf.cast(1 - self.env_info['done'], tf.float32)
                    * self._target_actor_critic.Q_with_actor,
                    name='target_Q')
                critic_loss = tf.losses.mean_squared_error(
                    target_Q,
                    self.actor_critic.Q) + encoder_l2_loss + critic_l2_loss

            if self.log_tensorboard:
                tf.summary.scalar('actor_l2_loss_', actor_l2_loss)
                tf.summary.scalar('critic_l2_loss_', critic_l2_loss)
                tf.summary.scalar('encoder_l2_loss_', encoder_l2_loss)
                tf.summary.scalar('actor_loss_', actor_loss)
                tf.summary.scalar('critic_loss_', critic_loss)

        return actor_loss, critic_loss

    def _optimize(self, losses):
        with tf.variable_scope('optimizer'):
            actor_loss, critic_loss = losses
            actor_opt_op = self._optimize_objective(actor_loss, 'actor')
            critic_opt_op = self._optimize_objective(critic_loss, 'critic')

            opt_op = tf.group(actor_opt_op, critic_opt_op)

        return opt_op

    def _optimize_objective(self, loss, name):
        # params for optimizer (fall back to defaults when not provided)
        net_args = self._args['actor_critic'][name]
        learning_rate = net_args.get('learning_rate', 1e-3)
        beta1 = net_args.get('beta1', .9)
        beta2 = net_args.get('beta2', .999)
        clip_norm = self._args['actor_critic'].get('clip_norm', 5.)

        with tf.variable_scope(name + '_opt', reuse=self.reuse):
            # setup optimizer
            self._optimizer = tf.train.AdamOptimizer(
                learning_rate=learning_rate, beta1=beta1, beta2=beta2)

            tvars = self.actor_critic.actor_trainable_variables if name == 'actor' else self.actor_critic.critic_trainable_variables
            grads, tvars = list(
                zip(*self._optimizer.compute_gradients(loss, var_list=tvars)))
            grads, _ = tf.clip_by_global_norm(grads, clip_norm)
            opt_op = self._optimizer.apply_gradients(zip(grads, tvars))

        if self.log_tensorboard:
            with tf.name_scope(name):
                with tf.name_scope('gradients_'):
                    for grad, var in zip(grads, tvars):
                        if grad is not None:
                            tf.summary.histogram(var.name.replace(':0', ''),
                                                 grad)
                with tf.name_scope('params_'):
                    for var in tvars:
                        tf.summary.histogram(var.name.replace(':0', ''), var)

        return opt_op

    def _targetnet_ops(self):
        with tf.name_scope('target_net_op'):
            target_main_var_pairs = list(
                zip(self._target_variables, self.main_variables))
            init_target_op = list(
                map(lambda v: tf.assign(v[0], v[1], name='init_target_op'),
                    target_main_var_pairs))
            update_target_op = list(
                map(
                    lambda v: tf.assign(v[0],
                                        self.tau * v[1] +
                                        (1. - self.tau) * v[0],
                                        name='update_target_op'),
                    target_main_var_pairs))

        return init_target_op, update_target_op

    def _learn(self):
        states, actions, rewards, next_states, dones = self.buffer.sample()

        feed_dict = {
            self.env_info['state']: states,
            self.env_info['action']: actions,
            self.env_info['reward']: rewards,
            self.env_info['next_state']: next_states,
            self.env_info['done']: dones,
        }

        # update the main networks
        if self.log_tensorboard:
            _, summary = self.sess.run([self.opt_op, self.merged_op],
                                       feed_dict=feed_dict)
            self.writer.add_summary(summary, self.learn_steps)
        else:
            _ = self.sess.run(self.opt_op, feed_dict=feed_dict)

        # update the target networks
        self.sess.run(self.update_target_op)

        self.learn_steps += 1

    def _noise_params(self):
        with tf.variable_scope('noise'):
            noise_sigma = tf.get_variable('noise_sigma',
                                          initializer=self.init_noise_sigma,
                                          trainable=False)

            noise_decay_op = tf.assign(noise_sigma,
                                       self.noise_decay * noise_sigma,
                                       name='noise_decay_op')

            param_noise_pairs = []
            for var in self.actor_critic.actor_perturbable_variables:
                noise = tf.truncated_normal(tf.shape(var), stddev=noise_sigma)
                param_noise_pairs.append((var, noise))

            with tf.control_dependencies([noise_decay_op]):
                noise_op = list(
                    map(
                        lambda v: tf.assign(v[0], v[0] + v[1], name='noise_op'
                                            ), param_noise_pairs))
                denoise_op = list(
                    map(
                        lambda v: tf.assign(
                            v[0], v[0] - v[1], name='denoise_op'),
                        param_noise_pairs))

        return noise_op, denoise_op

    def _initialize_target_net(self):
        self.sess.run(self.init_target_op)
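
act() above implements parameter-space exploration: noise_op adds noise to the actor's perturbable variables (while decaying the noise scale), the action is computed from the perturbed parameters, and denoise_op subtracts the same noise again. A minimal framework-agnostic sketch of that add-act-remove pattern, using plain Gaussian noise instead of the truncated normal above; the names are illustrative:

import numpy as np

def act_with_parameter_noise(params, policy_fn, state, sigma):
    # params: list of mutable weight arrays; policy_fn reads them when called
    noise = [np.random.normal(scale=sigma, size=p.shape) for p in params]
    for p, n in zip(params, noise):  # analogue of noise_op
        p += n
    action = policy_fn(state)
    for p, n in zip(params, noise):  # analogue of denoise_op
        p -= n
    return action
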
Example #21
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, agent_id, args):

        self.state_size = state_size
        self.action_size = action_size
        self.seed = args['seed']
        self.device = args['device']
        self.args = args

        # Q-Network
        self.actor_network = ActorNetwork(state_size, action_size,
                                          args).to(self.device)
        self.actor_target = ActorNetwork(state_size, action_size,
                                         args).to(self.device)
        self.actor_optimizer = optim.Adam(self.actor_network.parameters(),
                                          lr=args['LR_ACTOR'])

        # Model takes too long to run --> load model weights from a previous run (took > 24 hours on my machine)
        if not agent_id:
            self.actor_network.load_state_dict(torch.load(
                args['agent_p0_path']),
                                               strict=False)
            self.actor_target.load_state_dict(torch.load(
                args['agent_p0_path']),
                                              strict=False)
        else:
            self.actor_network.load_state_dict(torch.load(
                args['agent_p1_path']),
                                               strict=False)
            self.actor_target.load_state_dict(torch.load(
                args['agent_p1_path']),
                                              strict=False)

        # Replay memory
        self.memory = ReplayBuffer(action_size, args['BUFFER_SIZE'],
                                   args['BATCH_SIZE'], self.seed)

        # Noise process
        self.noise = OUNoise(action_size, self.seed)

        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory

        self.memory.add(state, action, reward, next_state, done)

        if len(self.memory) > self.args['BATCH_SIZE']:
            experiences = self.memory.sample()
            self.train(experiences)

    def act(self, current_state):

        # Evaluate the actor without tracking gradients
        self.actor_network.eval()
        with torch.no_grad():
            input_state = torch.from_numpy(current_state).float().to(self.device)
            action = self.actor_network(input_state).cpu().data.numpy()
        self.actor_network.train()

        # Add exploration noise
        action += self.noise.sample()

        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def train(self, experiences):

        global states_
        global next_states_
        global actions_
        global max_min_actions_vector
        global max_min_states_vector

        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #

        with torch.no_grad():
            # Get predicted next-state actions and Q values from target models
            actions_next = self.actor_target(next_states)
            Q_targets_next = mCritic.target(next_states, actions_next)

            # Compute Q targets for current states (y_i)
            Q_targets = rewards + (GAMMA * Q_targets_next * (1 - dones))

        # Compute critic loss
        Q_expected = mCritic.network(states, actions)
        mCritic_loss = F.mse_loss(Q_expected, Q_targets)

        # Minimize the loss
        mCritic.optimizer.zero_grad()
        mCritic_loss.backward()
        mCritic.optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_network(states)
        actor_loss = -mCritic.network(states, actions_pred).mean()

        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(mCritic.network, mCritic.target, TAU)
        self.soft_update(self.actor_network, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
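
train() above updates a critic through the module-level globals mCritic, GAMMA and TAU, none of which are defined in this snippet. A minimal sketch of the container shape train() appears to assume; the constructor arguments and hyperparameter values are placeholders, not the original code:

import torch.optim as optim

class SharedCritic:
    # mCritic is expected to expose .network, .target and .optimizer
    def __init__(self, critic_network, critic_target, lr=1e-3):
        self.network = critic_network
        self.target = critic_target
        self.target.load_state_dict(self.network.state_dict())
        self.optimizer = optim.Adam(self.network.parameters(), lr=lr)

GAMMA = 0.99  # discount factor (assumed value)
TAU = 1e-3    # soft-update rate (assumed value)
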
Example #22
class Agent():
    """ Interacts with and learns from then environment."""
    def __init__(self, state_size, action_size, seed, model=QNetwork):
        """Initialize an Agent object.
        
        Param
        =====
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            model (object): model to use
            
        Return
        ======
            None
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = seed

        # Q-Network
        self.qnetwork_local = model(state_size, action_size, seed).to(device)
        self.qnetwork_target = model(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(),
                                    lr=hyperparameters["lr"])

        # Replay memory
        self.memory = ReplayBuffer(action_size, hyperparameters["buffer_size"],
                                   hyperparameters["batch_size"], seed, device)
        # Initialize time step (for updating every hyperparameters["update_every"] steps)
        self.t_step = 0

        # Init tracking of params
        wandb.login()
        wandb.init(project=project_name, name=name, config=hyperparameters)
        jovian.log_hyperparams(hyperparameters)

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every hyperparameters["update_every"] time steps.
        self.t_step = (self.t_step + 1) % hyperparameters["update_every"]
        if self.t_step == 0:
            # If enough samples are available in memory, get a random subset and learn
            if len(self.memory) > hyperparameters["batch_size"]:
                experiences = self.memory.sample()
                self.learn(experiences, hyperparameters["gamma"])

    def act(self, state, eps=0.):
        """Return actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.
        
        Params:
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', d) tuples
            gamma (float): discount factor
        """

        states, actions, rewards, next_states, dones = experiences

        # Get max predicted Q values (for next states) from target model
        Q_targets_next = self.qnetwork_target(next_states).detach().max(
            1)[0].unsqueeze(1)
        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ---------------- update target network ----------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target,
                         hyperparameters["tau"])

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        
        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def get_model_name(self):
        return name

    def get_project_name(self):
        return project_name
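
The agent above reads its settings from a module-level hyperparameters dict (and from name / project_name for experiment tracking). A sketch of the keys it accesses; the values shown are illustrative defaults, not the ones used in the original run:

hyperparameters = {
    "lr": 5e-4,               # Adam learning rate
    "buffer_size": int(1e5),  # replay buffer capacity
    "batch_size": 64,         # minibatch size
    "gamma": 0.99,            # discount factor
    "tau": 1e-3,              # soft-update interpolation
    "update_every": 4,        # learn every N environment steps
}
project_name = "dqn-example"  # placeholder
name = "dqn-run"              # placeholder
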
Example #23
class Agent():
    """Code adapted from the Udacity course"""
    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.
        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Double DQN: select the best next action with the local network,
        # then evaluate that action with the target network
        indexes_of_Q_local_for_next_states = self.qnetwork_local(
            next_states).detach().max(1)[1].unsqueeze(1)
        Q_target_for_next_states = self.qnetwork_target(next_states).detach()
        Q_thetas = Q_target_for_next_states.gather(
            1, indexes_of_Q_local_for_next_states)

        Q_targets = rewards + (gamma * Q_thetas * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        Polyak averaging
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
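
The learn() above uses Double DQN: the local network selects the greedy next action and the target network evaluates it, which reduces the overestimation bias of the plain max used in the earlier DQN examples. A compact sketch of the two target rules side by side; the function names are illustrative:

import torch

def vanilla_dqn_target(qnet_target, rewards, next_states, dones, gamma):
    q_next = qnet_target(next_states).detach().max(1)[0].unsqueeze(1)
    return rewards + gamma * q_next * (1 - dones)

def double_dqn_target(qnet_local, qnet_target, rewards, next_states, dones, gamma):
    best_actions = qnet_local(next_states).detach().argmax(dim=1, keepdim=True)
    q_next = qnet_target(next_states).detach().gather(1, best_actions)
    return rewards + gamma * q_next * (1 - dones)
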
Example #24
class DDPG:
    def __init__(self, env, batch_size, mem_size, discount, actor_params,
                 critic_params):
        self._batch_size = batch_size
        self._mem_size = mem_size
        self._discount = discount
        self._sess = tensorflow.Session()
        k_backend.set_session(self._sess)
        self._env = env
        self._state_dim = env.observation_space.shape[0]
        self._action_dim = env.action_space.shape[0]
        self._action_min = env.action_space.low
        self._action_max = env.action_space.high
        self._state_min = env.observation_space.low
        self._state_max = env.observation_space.high
        self._actor = Actor(self._sess, self._state_dim, self._action_dim,
                            self._action_min, self._action_max, actor_params)
        self._critic = Critic(self._sess, 0.5, self._state_dim,
                              self._action_dim, critic_params)
        self._memory = ReplayBuffer(mem_size)

    def get_action(self, state):
        return self._actor._model.predict(state)

    def train(self):
        '''
        No training takes place until the replay buffer contains
        at least batch size number of experiences
        '''

        if (self._memory.size() > self._batch_size):
            self._train()

    def _train(self):
        states, actions, rewards, done, next_states = self._memory.sample(
            self._batch_size)
        self._train_critic(states, actions, rewards, done, next_states)
        action_gradients = self._critic.action_gradients(states, actions)
        self._actor.train(states, action_gradients)

    def q_estimate(self, state, action):
        return self._critic._model.predict(state, action)

    def _get_q_targets(self, next_states, done, rewards):
        '''
        q = r if done, else q = r + gamma * qnext
        '''
        # use actor network to determine the next action under current policy
        # estimate Q values from the critic network

        actions = self.get_action(next_states)
        qnext = self.q_estimate(next_states, actions)

        q_targets = [
            reward if end else reward + self._discount * next_q
            for (reward, next_q, end) in zip(rewards, qnext, done)
        ]
        return q_targets

    def _train_critic(self, states, actions, rewards, done, next_states):
        q_targets = self._get_q_targets(next_states, done, rewards)
        self._critic.train(states, actions, q_targets)

    def experience(self, state, action, reward, done, next_state):
        # store in replay buffer
        self._memory.add(state, action, reward, done, next_state)

        self.train()
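
_get_q_targets() above computes the targets element by element; the same rule can be written in vectorized form. A minimal numpy sketch (an illustrative helper, not part of the class):

import numpy as np

def q_targets_vectorized(rewards, q_next, done, discount):
    rewards = np.asarray(rewards, dtype=np.float32).reshape(-1)
    q_next = np.asarray(q_next, dtype=np.float32).reshape(-1)
    not_done = 1.0 - np.asarray(done, dtype=np.float32).reshape(-1)
    return rewards + discount * not_done * q_next
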
Example #25
class QMixTrainer:
    def __init__(self, env, args):
        self.env = env
        self.args = args
        self.agents = MultiAgents(args)
        self.train_datacollector = DataCollector(self.env, self.agents, args)
        self.replaybuffer = ReplayBuffer(args)

    def evaluate(self):
        mean_episode_reward = 0
        for epsd in range(self.args.eval_episodes):
            _, episode_reward = self.train_datacollector.collect_one_episode_data(
                if_train=False)
            mean_episode_reward += episode_reward

        return mean_episode_reward / self.args.eval_episodes

    def train(self):
        episode_rewards = []
        loss_history = []
        eval_episode_rewards = []
        train_steps = 0

        print("Initializing replay buffer...")
        episodes_data = []
        for epsd in range(10000):
            #print("simulating {} episode ...".format(epsd))
            episode_data, episode_reward = self.train_datacollector.collect_one_episode_data(
                epsd, if_train=False, if_init_buffer=True)
            if episode_reward == 1:
                print("goal !!!!!")
                episodes_data.append(episode_data)
        print("collected {} episodes".format(len(episodes_data)))
        l = len(episodes_data)
        batch_data = {}
        for key in episodes_data[0].keys():
            batch_data[key] = np.zeros((l, ) + episodes_data[0][key].shape)

        #batch_data = episodes_data[0]
        #episodes_data.pop(0)
        for epsd in range(l):
            for key in batch_data.keys():
                #print("key {} shape {}".format(key,batch_data[key].shape))
                batch_data[key][epsd] = episodes_data[epsd][key]

        self.replaybuffer.store_episode(batch_data)

        print("Start to train")
        plt.figure()
        for epoch in range(self.args.n_epoch):
            print("Training Epoch {} epsilon: {}".format(
                epoch, self.train_datacollector.epsilon))

            episodes_data = []
            reward_sum = 0
            for epsd in range(self.args.n_episodes_per_epoch):
                episode_data, episode_reward = self.train_datacollector.collect_one_episode_data(
                    epsd, if_train=True)
                #print("Episode {} reward is {}".format(epsd, episode_reward))
                episodes_data.append(episode_data)
                #reward_sum += episode_reward
                episode_rewards.append(episode_reward)
            #episode_rewards.append(reward_sum / self.args.n_episodes_per_epoch)

            batch_data = {}
            for key in episodes_data[0].keys():
                batch_data[key] = np.zeros((self.args.n_episodes_per_epoch, ) +
                                           episodes_data[0][key].shape)

            #batch_data = episodes_data[0]
            #episodes_data.pop(0)
            for epsd in range(self.args.n_episodes_per_epoch):
                for key in batch_data.keys():
                    #print("key {} shape {}".format(key,batch_data[key].shape))
                    batch_data[key][epsd] = episodes_data[epsd][key]

            self.replaybuffer.store_episode(batch_data)
            for t_stps in range(self.args.n_train_steps_per_epoch):
                mini_batch = self.replaybuffer.sample(
                    min(self.replaybuffer.current_size, self.args.batch_size))
                loss = self.agents.train(mini_batch, train_steps)
                loss_history.append(loss)
                train_steps = train_steps + 1

            if epoch % self.args.evaluate_freq == 0:
                mean_episode_reward = self.evaluate()
                eval_episode_rewards.append(mean_episode_reward)

                print(
                    "Evaluation Result (Mean Episode Reward) of Epoch {} is : {}"
                    .format(epoch, mean_episode_reward))

                plt.cla()
                plt.plot(range(len(episode_rewards)), episode_rewards)
                plt.xlabel('episode')
                plt.ylabel('episode reward')
                plt.savefig(
                    os.path.join(self.args.resource_dir,
                                 "episode_reward_epoch_{}.png".format(epoch)))
                '''
                plt.figure()
                plt.plot(range(len(eval_episode_rewards)), eval_episode_rewards)
                plt.xlabel('episode')
                plt.ylabel('episode reward')
                plt.savefig(os.path.join(self.args.resource_dir,"eval_episode_reward_epoch_{}.png".format(epoch)))
                '''

                np.savetxt(os.path.join(self.args.resource_dir,
                                        "episode_rewards.txt"),
                           episode_rewards,
                           fmt="%.4f")
                np.savetxt(os.path.join(self.args.resource_dir,
                                        "eval_episode_rewards.txt"),
                           eval_episode_rewards,
                           fmt="%.4f")
                np.savetxt(
                    os.path.join(self.args.resource_dir, "loss_history.txt"),
                    loss_history)
        plt.cla()
        plt.plot(range(len(episode_rewards)), episode_rewards)
        plt.xlabel('episode')
        plt.ylabel('episode reward')
        plt.savefig(
            os.path.join(self.args.resource_dir,
                         "episode_reward_epoch_{}.png".format(epoch)))
        '''
        plt.figure()
        plt.plot(range(len(eval_episode_rewards)), eval_episode_rewards)
        plt.xlabel('episode')
        plt.ylabel('episode reward')
        plt.savefig(os.path.join(self.args.resource_dir,"eval_episode_reward_epoch_{}.png".format(epoch)))
        '''

        np.savetxt(os.path.join(self.args.resource_dir, "episode_rewards.txt"),
                   episode_rewards,
                   fmt="%.4f")
        np.savetxt(os.path.join(self.args.resource_dir,
                                "eval_episode_rewards.txt"),
                   eval_episode_rewards,
                   fmt="%.4f")
        np.savetxt(os.path.join(self.args.resource_dir, "loss_history.txt"),
                   loss_history)
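
train() above stacks a list of per-episode dicts into a single batch dict twice, once when initializing the buffer and once per epoch. A small helper that performs the same stacking, assuming every episode dict has identical keys and per-key shapes:

import numpy as np

def stack_episodes(episodes_data):
    # Add a leading batch dimension per key, like the zeros-and-assign loops above
    return {
        key: np.stack([ep[key] for ep in episodes_data], axis=0)
        for key in episodes_data[0].keys()
    }
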
Example #26
    def train(self, transitions: int, eps_max: float = 0.5, eps_min: float = 0., buffer_size: int = 10000,
              batch_size: int = 128, shaping_coef: float = 300., progress_upd_step: int = 0,
              start_training: int = 10000, to_sink: bool = False):
        history = ReplayBuffer(size=buffer_size)
        progress_upd_step = progress_upd_step if progress_upd_step else transitions // 100

        log = {
            "alpha": self.alpha,
            "gamma": self.gamma,
            "buffer_size": buffer_size,
            "batch_size": batch_size,
            "tau": self.tau,
            "shaping_coef": shaping_coef,
            "eps_max": eps_max,
            "eps_min": eps_min,
            "bins": self.num_bins,
            "to_sink": to_sink,
            "step": [],
            "reward_mean": [],
            "reward_std": []
        }

        state = self.reset()

        t = tqdm(range(transitions))
        for i in t:
            eps = eps_max - (eps_max - eps_min) * i / transitions
            if random() < eps:
                action = self.env.action_space.sample()
            else:
                action = self.act(state)

            next_state, reward, done, _ = self.env.step(action)
            reward += shaping_coef * (self.gamma * np.abs(next_state[1]) - np.abs(state[1]))
            done_ = next_state[0] > 0.5

            history.add((state, action, next_state, reward, done_))

            state = self.reset() if done else next_state

            if i > start_training:
                self.update(history.sample(batch_size))

            # soft update
            with torch.no_grad():
                for param, param_target in zip(self.dqn.parameters(), self.dqn_target.parameters()):
                    param_target.data.mul_(1 - self.tau)
                    param_target.data.add_(self.tau * param.data)

            if (i + 1) % progress_upd_step == 0:
                reward_mean, reward_std = self.evaluate_policy()

                log["step"].append(i)
                log["reward_mean"].append(reward_mean)
                log["reward_std"].append(reward_std)

                t.set_description(f"step: {i + 1} | Rmean = {reward_mean:0.4f} | Rstd = {reward_std:0.4f}")

                if to_sink and reward_mean >= 90 and self.evaluate_policy(episodes=100)[0] >= 90:
                    self.sink(history, start_training, eps, shaping_coef)
                    shaping_coef = 1
                    to_sink = False

        return log
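
The with torch.no_grad() loop above is a Polyak (soft) target update, the same idea the later examples wrap as soft_update. A minimal standalone sketch of that operation (the helper name and signature are illustrative, not from the source):

import torch


def soft_update(online_net: torch.nn.Module, target_net: torch.nn.Module,
                tau: float) -> None:
    """theta_target <- tau * theta_online + (1 - tau) * theta_target."""
    with torch.no_grad():
        for param, target_param in zip(online_net.parameters(),
                                       target_net.parameters()):
            target_param.data.mul_(1.0 - tau)
            target_param.data.add_(tau * param.data)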
Example #27
0
class DDPG():
    """Reinforcement Learning agent that learns using DDPG."""
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0.0  # 0.0
        self.exploration_theta = 0.1  # 0.15
        self.exploration_sigma = 0.1  # 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.001  # for soft update of target parameters

    def reset_episode(self):
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state

    def act(self, state):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action +
                    self.noise.sample())  # add some noise for exploration

    def act_no_noise(self, state):
        """Returns actions for given state(s) as per current policy, without exploration noise."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action)  # deterministic action (no exploration noise)

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences
                            if e is not None]).astype(np.float32).reshape(
                                -1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None
                            ]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences
                          if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions],
                                               y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients,
                                   1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(
            target_weights
        ), "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 -
                                                  self.tau) * target_weights
        target_model.set_weights(new_weights)
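
The DDPG class above exposes reset_episode, act, and step but no driver loop. A hedged usage sketch, assuming the task object follows a gym-like step(action) -> (next_state, reward, done) protocol; the episode budget and reward bookkeeping are illustrative, not part of the original example:

# `task` is whatever environment object the DDPG constructor above expects.
agent = DDPG(task)

for episode in range(1000):  # episode budget is an assumption
    state = agent.reset_episode()  # resets the task and the OU noise process
    episode_reward, done = 0.0, False
    while not done:
        action = agent.act(state)  # noisy action for exploration
        next_state, reward, done = task.step(action)  # assumed task API
        agent.step(action, reward, next_state, done)  # store transition, learn once buffer is warm
        state = next_state
        episode_reward += reward
    print("episode {}: reward = {:.2f}".format(episode, episode_reward))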
Example #28
0
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, random_seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.eps = 3.0
        self.eps_decay = 0.9999

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size * 2, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size * 2, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size * 2, action_size * 2,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size * 2, action_size * 2,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=0)

        # Noise process
        self.noise = OUNoise((1, action_size), random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)

    def step(self,
             state,
             action,
             reward,
             next_state,
             done,
             agent_number,
             learn_iterations=5):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        #self.timestep += 1
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)
        # Learn, if enough samples are available in memory and at learning interval settings
        if len(self.memory
               ) > BATCH_SIZE:  #and self.timestep % LEARN_EVERY == 0:
            for _ in range(learn_iterations):
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA, agent_number)

    def act(self, states, add_noise):
        """Returns actions for both agents as per current policy, given their respective states."""
        states = torch.from_numpy(states).float().to(device)

        self.actor_local.eval()
        with torch.no_grad():
            actions = self.actor_local(states).cpu().data.numpy()
        self.actor_local.train()
        # add noise to actions
        if add_noise:
            actions += self.eps * self.noise.sample()
        actions = np.clip(actions, -1, 1)
        return actions

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma, agent_number):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value
        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        # Since the critic takes the actions of both agents we need to update only
        # one part of the given action
        if agent_number == 0:
            actions_next = torch.cat((actions_next, actions[:, 2:]), dim=1)
        elif agent_number == 1:
            actions_next = torch.cat((actions[:, :2], actions_next), dim=1)
        # Compute Q targets for current states (y_i)
        Q_targets_next = self.critic_target(next_states, actions_next)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        # Since the critic takes the actions of both agents we need to update only
        # one part of the given action
        if agent_number == 0:
            actions_pred = torch.cat((actions_pred, actions[:, 2:]), dim=1)
        elif agent_number == 1:
            actions_pred = torch.cat((actions[:, :2], actions_pred), dim=1)
        # Compute actor loss
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

        # update epsilon
        self.eps *= self.eps_decay
        self.eps = max(self.eps, 1)
        self.noise.reset()

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
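
The agent_number branches in learn splice one agent's freshly predicted actions into the stored joint action before feeding the shared critic (each agent owns a 2-dimensional slice of the joint action). A tiny self-contained sketch of that splice with made-up shapes, only for illustration:

import torch

batch_size, per_agent_dim = 4, 2
joint_actions = torch.zeros(batch_size, 2 * per_agent_dim)  # stored actions of both agents
fresh_actions = torch.ones(batch_size, per_agent_dim)  # one agent's new actor output

agent_number = 0  # agent 0 owns columns [:2], agent 1 owns columns [2:]
if agent_number == 0:
    spliced = torch.cat((fresh_actions, joint_actions[:, per_agent_dim:]), dim=1)
else:
    spliced = torch.cat((joint_actions[:, :per_agent_dim], fresh_actions), dim=1)

print(spliced)  # first two columns replaced, last two kept from the stored batch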
Example #29
0
def train_v1(eps_start, eps_end, eps_decay, n_step, mem_capacity, num_episodes,
             embed_dim, iters):
    graph_generator = GraphGenerator(16, 16)
    memory = ReplayBuffer(mem_capacity)
    steps_done = 0
    gnn = Struc2Vec(embed_dim, iters)
    qnet = QNet(embed_dim)
    optimizer = optim.Adam(list(gnn.parameters()) + list(qnet.parameters()),
                           lr=0.0001,
                           weight_decay=1e-4)
    for e in range(num_episodes):
        node_labels, adj, edge_weights = graph_generator.next()
        vtx_feats = gnn(node_labels, adj, edge_weights)
        remaining_vertices = set([i for i in range(len(adj))])
        state = Variable(torch.zeros(embed_dim))
        curr_tour = []
        T = len(adj)
        rewards = []
        states = [state]

        for t in range(T):
            eps_threshold = util.get_eps_threshold(eps_start, eps_end,
                                                   eps_decay, steps_done)
            if random.random() > eps_threshold:
                # greedy action (arg_max_action is assumed to return a (vertex, value) pair)
                curr_vtx, _ = arg_max_action(qnet, vtx_feats,
                                             remaining_vertices)
            else:
                # random action
                curr_vtx = random.sample(remaining_vertices, 1)[0]

            action = vtx_feats[curr_vtx]
            # reward maintenance
            est_reward = qnet(state, curr_vtx)
            reward = get_reward(curr_tour, curr_vtx, edge_weights)
            rewards.append(reward)

            # update states
            curr_tour.append(curr_vtx)
            remaining_vertices.remove(curr_vtx)
            states.append(state + action)
            # wait till after doing the memory stuff to add the state

            # we only do these updates after n steps
            if t >= n_step:
                _, next_reward = arg_max_action(qnet, vtx_feats,
                                                remaining_vertices)
                state_tminusn = states[-n_step]  # this is a torch tensor
                action_tminusn = vtx_feats[
                    curr_tour[-n_step]]  # embedding of the vertex chosen n_step steps ago
                reward_tminusn = sum(rewards[-n_step:])
                memory.push(state_tminusn, action_tminusn, reward_tminusn,
                            state, action)

                transitions = memory.sample(batch_size)  # batch_size assumed to be a module-level constant
                # batch.state, batch.action, batch.reward, etc. are now tuples
                # TODO: this zip(*transitions) unpacking looks a bit gross
                batch = Transition(*zip(*transitions))
                state_batch = torch.cat([s.unsqueeze(0) for s in batch.state],
                                        dim=0)
                action_batch = torch.cat(
                    [a.unsqueeze(0) for a in batch.action], dim=0)
                reward_batch = torch.cat(batch.reward)
                newstate_batch = torch.cat(
                    [ns.unsqueeze(0) for ns in batch.new_state], dim=0)
                max_action_batch = torch.cat(
                    [ma.unsqueeze(0) for ma in batch.max_action], dim=0)

                # TODO: make qnet allow batch
                # does the experience replay memory contain state/action/reward/next_state
                # from only the current episode's graph? Or can any graph seen before be
                # in the memory?
                # The argmax action is the thing taken at time t-n_step right?
                oldstate_action_value = qnet(state_batch, action_batch)
                newstate_action_value = qnet(newstate_batch, max_action_batch)
                expected_sa_values = reward_batch + gamma * newstate_action_value  # gamma assumed to be a global discount factor
                loss = F.mse_loss(oldstate_action_value, expected_sa_values)

                optimizer.zero_grad()
                loss.backward()
                # optionally clip gradients here
                optimizer.step()

            state += action
            steps_done += 1
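
The training function above relies on a Transition namedtuple and a ReplayBuffer with push/sample that are not shown. A minimal sketch of what those assumed helpers could look like, with field names inferred from the batch.state / batch.new_state / batch.max_action accesses in the loop:

import random
from collections import deque, namedtuple

# Field names inferred from the batch.* accesses in the training loop above.
Transition = namedtuple(
    "Transition", ["state", "action", "reward", "new_state", "max_action"])


class ReplayBuffer:
    """Fixed-size FIFO buffer of Transition tuples (assumed interface)."""
    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def push(self, *args):
        self.buffer.append(Transition(*args))

    def sample(self, batch_size):
        return random.sample(list(self.buffer), min(batch_size, len(self.buffer)))

    def __len__(self):
        return len(self.buffer)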
Example #30
0
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self,
                 state_size,
                 action_size,
                 seed,
                 hidden_layers=[64, 64],
                 drop_p=0.3,
                 with_dueling=False,
                 isDDQN=False):
        """Initialize an Agent object.
        
        Params  
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            hidden_layers (array): Hidden number of nodes in each layer
            drop_p (float [0-1]) : Probability of dropping nodes (implementation of dropout)
            with_dueling (boolean) : If true, network is dueling network, otherwise false.
            isDDQN (boolean) : If true, double dqn in implemented, otherwise false.
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size,
                                       action_size,
                                       seed,
                                       hidden_layers=hidden_layers,
                                       drop_p=drop_p,
                                       dueling=with_dueling).to(device)
        self.qnetwork_target = QNetwork(state_size,
                                        action_size,
                                        seed,
                                        hidden_layers=hidden_layers,
                                        drop_p=drop_p,
                                        dueling=with_dueling).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

        # Parameter instance of DDQN.
        self.isDDQN = isDDQN

    def step(self, state, action, reward, next_state, done):
        """Takes a step and with each time step sample from buffer and learn"""
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        if self.isDDQN:
            # Double DQN: select the greedy action for next_states with the local (online) network
            best_local_actions = self.qnetwork_local(next_states).detach().max(
                1)[1].unsqueeze(1)
            double_dqn_targets = self.qnetwork_target(next_states).detach()
            # Evaluate the target network at the locally selected optimal action
            Q_targets_next = torch.gather(double_dqn_targets, 1,
                                          best_local_actions)
        else:
            # Get max predicted Q values (for next states) from target model (without ddqn)
            Q_targets_next = self.qnetwork_target(next_states).detach().max(
                1)[0].unsqueeze(1)

        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # update target network
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
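
The agent above leaves epsilon decay and episode bookkeeping to the caller. A hedged driver sketch, assuming a classic gym environment with the old 4-tuple step API; the environment choice, episode count, and epsilon schedule are illustrative, not from the source:

import gym  # assumed gym-style environment

env = gym.make("CartPole-v1")  # illustrative choice: 4-dim state, 2 actions
agent = Agent(state_size=4, action_size=2, seed=0, isDDQN=True)

eps, eps_end, eps_decay = 1.0, 0.01, 0.995  # illustrative schedule
for episode in range(1, 1001):  # episode budget is an assumption
    state = env.reset()
    score, done = 0.0, False
    while not done:
        action = agent.act(state, eps)  # epsilon-greedy action
        next_state, reward, done, _ = env.step(action)  # old gym 4-tuple API
        agent.step(state, action, reward, next_state, done)  # store + learn every UPDATE_EVERY
        state = next_state
        score += reward
    eps = max(eps_end, eps_decay * eps)  # decay exploration
    print("episode {}: score = {:.1f} | eps = {:.3f}".format(episode, score, eps))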