Python QNetwork.load_state_dict 예제들

프로그래밍 언어: Python

네임스페이스/패키지 이름: model

클래스/타입: QNetwork

메소드/함수: load_state_dict

hotexamples.com에서의 예제들: 30

Python QNetwork.load_state_dict - 30개의 예제를 찾았습니다. 오픈소스 프로젝트에서 추출한 Python model.QNetwork.load_state_dict의 실제 사용 예제 중 평가가 가장 높은 것들입니다. 예제를 평가해 주시면 예제 품질을 높이는 데 도움이 됩니다.

자주 사용되는 메소드들

보기 숨기기

QNetwork(30)

eval(30)

train(30)

state_dict(30)

load_state_dict(30)

parameters(30)

forward(23)

to(8)

set_weights(6)

cuda(5)

get_weights(4)

sample_action(3)

cpu(3)

save_weights(2)

load_weights(2)

decide_action(2)

load_model(2)

items(2)

criterion(2)

trainNet(1)

backward(1)

step(1)

update_mean(1)

spectrum(1)

update_nn(1)

update_target_network(1)

soft_update(1)

restore(1)

set_params(1)

save(1)

sample_actions(1)

qvalue(1)

predict_act(1)

__str__(1)

named_parameters(1)

loss_fn(1)

load(1)

initialize_weights(1)

get_action(1)

foward(1)

update_weights(1)

예제 #1

파일 보기

def load_checkpoints(filepath):
    """Rebuild a QNetwork from a checkpoint file and restore its weights.

    The checkpoint loaded from ``filepath`` is indexed with the keys
    ``state_size``, ``action_size``, ``hidden_layers`` (constructor
    arguments, in that order) and ``state_dict`` (the saved parameters).

    Returns the reconstructed network.
    """
    saved = torch.load(filepath)
    architecture = (
        saved['state_size'],
        saved['action_size'],
        saved['hidden_layers'],
    )
    network = QNetwork(*architecture)
    network.load_state_dict(saved['state_dict'])
    return network

예제 #2

파일 보기

def load_model_into_agent(agent):
    """
    Loads a pretrained network into the created agent.

    The weights stored at ``MODEL_TO_LOAD`` are loaded into a fresh local
    network, and a second, independent network with identical weights is
    installed as the target. The two attributes must not share one module
    object: otherwise every update to the local network would instantly
    change the target as well.
    """
    local_net = QNetwork(PARAM.STATE_SIZE, PARAM.ACTION_SIZE, 0).to(device)
    local_net.load_state_dict(torch.load(MODEL_TO_LOAD))

    # Separate instance for the target so it can lag behind the local network.
    target_net = QNetwork(PARAM.STATE_SIZE, PARAM.ACTION_SIZE, 0).to(device)
    target_net.load_state_dict(local_net.state_dict())

    agent.qnetwork_target = target_net
    agent.qnetwork_local = local_net

예제 #3

파일 보기

class DDQN(nn.Module):
    """Q-learning agent with an online network and a periodically synced target.

    ``self.q`` is the network being trained; ``self.target`` is a copy that
    is overwritten with the online weights every
    ``config.target_net_update_freq`` calls to :meth:`update_policy`.
    The bootstrap value is the maximum Q-value the target network assigns
    to the next state.
    """

    def __init__(self, obs, ac, config):
        """Build both networks and start with identical weights.

        Args:
            obs: forwarded as the first argument of ``QNetwork``.
            ac: forwarded as the second argument of ``QNetwork``.
            config: object exposing ``target_net_update_freq``.
        """
        super().__init__()

        self.q = QNetwork(obs, ac)
        self.target = QNetwork(obs, ac)

        self.target.load_state_dict(self.q.state_dict())

        self.target_net_update_freq = config.target_net_update_freq
        self.update_counter = 0

    def get_action(self, x):
        """Return the greedy action (as a Python int) for a single input ``x``."""
        with torch.no_grad():
            a = self.q(x).max(1)[1]

        return a.item()

    def update_policy(self, adam, memory, params):
        """Run one optimisation step on a batch sampled from ``memory``.

        Args:
            adam: optimizer stepping the online network's parameters.
            memory: object whose ``sample(batch_size)`` returns
                ``(states, actions, rewards, next_states, masks)``.
            params: object exposing ``batch_size``.
        """
        b_states, b_actions, b_rewards, b_next_states, b_masks = memory.sample(
            params.batch_size)

        states = torch.tensor(b_states).float()
        actions = torch.tensor(b_actions).long().reshape(-1, 1)
        rewards = torch.tensor(b_rewards).float().reshape(-1, 1)
        next_states = torch.tensor(b_next_states).float()
        masks = torch.tensor(b_masks).float().reshape(-1, 1)

        current_q_values = self.q(states).gather(1, actions)

        # The bootstrap value must not contribute gradients.
        with torch.no_grad():
            max_next_q_vals = self.target(next_states).max(1)[0].reshape(-1, 1)

        # masks multiplies the bootstrap term, so a zero mask removes it.
        expected_q_vals = rewards + max_next_q_vals * 0.99 * masks
        loss = F.mse_loss(expected_q_vals, current_q_values)

        adam.zero_grad()
        loss.backward()

        # Clamp every gradient element to [-1, 1]. clip_grad_value_ skips
        # parameters whose grad is None instead of raising on them.
        nn.utils.clip_grad_value_(self.q.parameters(), 1.)
        adam.step()

        self.update_counter += 1
        if self.update_counter % self.target_net_update_freq == 0:
            self.update_counter = 0
            self.target.load_state_dict(self.q.state_dict())

예제 #4

파일 보기

class DQNAgent():
    """Epsilon-greedy DQN agent with a policy network and a target network."""

    def __init__(self, state_size, action_size):
        """Create the networks, optimizer, replay buffer and counters."""
        self.state_size = state_size
        self.action_size = action_size

        self.policy_network = QNetwork(state_size, action_size).to(device)
        self.target_network = QNetwork(state_size, action_size).to(device)
        # Start both networks from the same weights; otherwise the first
        # SYNC_TARGET_EVERY updates bootstrap from an unrelated random net.
        self.target_network.load_state_dict(self.policy_network.state_dict())
        self.optimizer = optim.Adam(self.policy_network.parameters(), lr=LR)

        self.eps = EPS_START
        self.memory = ReplayBuffer(BUFFER_SIZE)
        self.t_step = 0
        self.learn_count = 0

    def step(self, state, action, reward, next_state, done):
        """Store a transition and learn once every UPDATE_EVERY calls."""
        self.memory.store_transition(state, action, reward, next_state, done)

        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0 and len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample(BATCH_SIZE, device)
            self.learn(experiences)

    def act(self, state):
        """Return a random action with probability ``eps``, else the greedy one."""
        if np.random.rand() < self.eps:
            return np.random.randint(self.action_size)

        state = torch.from_numpy(state).unsqueeze(0).to(device)
        # Action selection needs no autograd graph.
        with torch.no_grad():
            action_values = self.policy_network(state)
        return torch.argmax(action_values).item()

    def update_eps(self):
        """Decay epsilon multiplicatively, never going below EPS_END."""
        self.eps = max(EPS_END, EPS_DECAY * self.eps)

    def learn(self, experiences):
        """Run one gradient step on a batch of transitions.

        Args:
            experiences: tuple ``(states, actions, rewards, next_states, dones)``.
        """
        states, actions, rewards, next_states, dones = experiences

        Q_current = self.policy_network(states).gather(1, actions)

        # Targets are constants for the loss: computing them without autograd
        # avoids accumulating unused gradients on the target network.
        with torch.no_grad():
            Q_targets_next = self.target_network(next_states).max(1)[0].unsqueeze(1)
            Q_targets = rewards + GAMMA * Q_targets_next * (1 - dones)

        loss = F.mse_loss(Q_current, Q_targets)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        self.learn_count += 1
        if self.learn_count % SYNC_TARGET_EVERY == 0:
            self.target_network.load_state_dict(
                self.policy_network.state_dict())

예제 #5

파일 보기

파일: ddqn_agent.py 프로젝트: jzchang288/Udacity_DRLND_Navigation

    def load_qnet(self, model_name):
        """Restore the local Q-Network from the file ``<model_name>.pth``.

        Params
        ======
            model_name (str): name of the Q-Network
        """
        # The saved QNetwork is always the CPU version, so it is rebuilt and
        # loaded first and only moved to ``device`` afterwards.
        restored = QNetwork(
            self.aug_state_size,
            self.action_size,
            self.hsize1,
            self.hsize2,
            seed=None,
        )
        checkpoint_path = model_name + '.pth'
        restored.load_state_dict(torch.load(checkpoint_path))

        # Copy the loaded network weights to the local network.
        restored_on_device = restored.to(device)
        self.qnetwork_local.update_weights(restored_on_device)

예제 #6

파일 보기

파일: evaluate.py 프로젝트: kumiko-oreyome/DSAI-HW4-Mountain-car

def evaluate(model_path, history_num, max_episode_steps, episode_num, result_save_path):
    """Roll out a saved Q-network on MountainCar-v0 and pickle the results.

    Args:
        model_path: checkpoint file with ``model_hyper`` (constructor
            arguments for ``QNetwork``) and ``model`` (its state dict).
        history_num: forwarded to ``State``.
        max_episode_steps: upper bound on steps per episode.
        episode_num: number of episodes to run.
        result_save_path: file receiving the pickled tuple
            ``(test_success_history, test_reward_history)``.

    An episode only contributes to the histories when the environment
    reports ``done`` within ``max_episode_steps`` steps.
    """
    checkpoint = torch.load(model_path)
    qnetwork = QNetwork(*checkpoint['model_hyper'])
    qnetwork.load_state_dict(checkpoint['model'])

    env = gym.make('MountainCar-v0')
    test_success_history = []
    test_reward_history = []
    try:
        for episode in range(episode_num):
            print('episode %d'%(episode))
            observation = env.reset()
            # Initialize the state from the first observation.
            state = State(history_num)
            state.init_state(observation)
            reward_sum = 0

            for _ in range(max_episode_steps):
                env.render()
                state.display()
                # Select the action with the maximum Q-value.
                action = qnetwork.decide_action(state.toTensor().view(1, -1))
                action = action.sum().item()
                observation, reward, done, info = env.step(action)
                reward_sum = reward_sum + reward

                if done:
                    print('done')
                    print(reward_sum)
                    # Success means the first observation component reached 0.5.
                    success = bool(observation[0] >= 0.5)
                    test_success_history.append(success)
                    test_reward_history.append(reward_sum)
                    break

                state.update_state_by_observation(observation, action)
    finally:
        # Release the environment (and its render window) even on failure.
        env.close()

    print('- '*100)
    print('save to %s'%(result_save_path))
    with open(result_save_path, 'wb') as f:
        pkl.dump((test_success_history, test_reward_history), f)

예제 #7

파일 보기

class AgentPriority():
    """Interacts with and learns from the environment."""
    def __init__(self,
                 state_size,
                 action_size,
                 seed,
                 hidden_layers,
                 lr=5e-4,
                 alpha=0.5,
                 beta=0.4):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            hidden_layers (list[int, int, ...]): size of hidden layers
            lr (float): learning rate
            alpha (float (0<=alpha<=1)): parameter alpha for priority
            beta (float (0<=beta<=1)): parameter for importance sampling weight
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = seed

        # Q-Network
        self.lr = lr
        self.qnetwork_local = QNetwork(state_size, action_size, self.seed,
                                       hidden_layers).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, self.seed,
                                        hidden_layers).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(),
                                    lr=self.lr)

        # Replay memory
        self.alpha = alpha
        self.beta = beta
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed,
                                   self.alpha, self.beta)

        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

        # discount
        self.gamma = GAMMA

        self.checkpoint = self._build_checkpoint()
        self.checkpointfile = 'priority_ddqn.pth'

    def _build_checkpoint(self):
        """Return the dict written by save_model for the current local network."""
        return {
            "input_size":
            self.state_size,
            "output_size":
            self.action_size,
            "hidden_layers":
            [each.out_features for each in self.qnetwork_local.hidden_layers],
            "state_dict":
            self.qnetwork_local.state_dict()
        }

    def step(self, state, action, reward, next_state, done):
        """Store one transition and, every UPDATE_EVERY calls, learn NUM_LEARNS times."""
        # Save experience in replay memory
        delta = self.comp_delta(state, action, reward, next_state, done)
        self.memory.add(state, action, reward, next_state, done, delta)

        # Learn NUM_LEARNS times per every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0 and len(self.memory) >= MIN_BUF_SIZE:
            self.memory.set_priority_params(self.alpha, self.beta)
            for i in range(NUM_LEARNS):
                if i % SORT_EVERY == 0:
                    # Sort memory based on delta every SORT_EVERY learnings
                    self.memory.argsort_deltas()

                    # Update q_target with q_local
                    self.update_qtarget()

                    # If PARAMETER_ANNEALING is set to True, anneal alpha & beta.
                    if PARAMETER_ANNEALING:
                        self.parameter_anneal()

                experiences, weights, mem_idxs = self.memory.sample()
                self.learn(experiences, weights, mem_idxs)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy()).astype(np.int32)
        else:
            return random.choice(np.arange(self.action_size)).astype(np.int32)

    def learn(self, experiences, weights, mem_idxs):
        """Update value parameters using given batch of experience tuples.

        The discount factor is taken from ``self.gamma``.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            weights: per-sample factors multiplied into both loss operands
            mem_idxs (list of ints): indices in the replay buffer corresponding to
                                     the given experiences (used to update delta)
        """
        states, actions, rewards, next_states, dones = experiences

        # Get argmax of Q values (for next states) from Q_local model
        Q_local_actions = self.qnetwork_local(next_states).detach().max(
            1)[1].unsqueeze(1)

        # Evaluate those actions with Q_target model
        Q_targets_next = self.qnetwork_target(next_states).gather(
            1, Q_local_actions).detach()

        # Compute Q targets for current states
        Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # update deltas in self.memory
        deltas = (Q_targets - Q_expected).detach().cpu().numpy()
        self.memory.update_deltas(deltas, mem_idxs)

        # Compute loss
        loss = F.mse_loss(weights * Q_expected, weights * Q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def update_qtarget(self):
        """Hard update: copy every local parameter into the target network."""
        for target_param, local_param in zip(self.qnetwork_target.parameters(),
                                             self.qnetwork_local.parameters()):
            target_param.data.copy_(local_param.data)

    def comp_delta(self, state, action, reward, next_state, done):
        """Compute delta given an experience
        delta = reward + gamma*(1 - done)*max_a(Q_target(next_state, a)) - Q_local(state, action)
        """
        state_ts = torch.from_numpy(np.expand_dims(state,
                                                   0)).float().to(device)
        action_ts = torch.from_numpy(np.array([[action]])).long().to(device)
        reward_ts = torch.from_numpy(np.array([[reward]])).float().to(device)
        next_state_ts = torch.from_numpy(np.expand_dims(next_state,
                                                        0)).float().to(device)
        done_ts = torch.from_numpy(np.array([[int(done)]])).float().to(device)

        Q_targets_next = self.qnetwork_target(next_state_ts).detach().max(
            1)[0].unsqueeze(1)
        Q_targets = reward_ts + (self.gamma * Q_targets_next * (1 - done_ts))
        Q_expected = self.qnetwork_local(state_ts).gather(1, action_ts)

        delta = (Q_targets - Q_expected).detach().cpu().numpy()[0, 0]
        return delta

    def get_gamma(self):
        """Return the discount factor."""
        return self.gamma

    def save_model(self):
        """Write ``self.checkpoint`` to ``self.checkpointfile``."""
        torch.save(self.checkpoint, self.checkpointfile)

    def set_lr(self, lr):
        """Set the learning rate on the agent and on the live optimizer.

        Updating only ``self.lr`` would leave the optimizer running with the
        rate it was constructed with.
        """
        self.lr = lr
        for group in self.optimizer.param_groups:
            group["lr"] = lr

    def load_model(self, filepath):
        """Replace the local network with the one stored at ``filepath``.

        The optimizer, target network and checkpoint dict are rebuilt as
        well so that none of them keeps pointing at the discarded network.
        """
        checkpoint = torch.load(filepath, map_location=device)

        self.qnetwork_local = QNetwork(checkpoint["input_size"],
                                       checkpoint["output_size"], self.seed,
                                       checkpoint["hidden_layers"]).to(device)
        self.qnetwork_local.load_state_dict(checkpoint["state_dict"])

        # Same architecture for the target so update_qtarget can copy into it.
        self.qnetwork_target = QNetwork(checkpoint["input_size"],
                                        checkpoint["output_size"], self.seed,
                                        checkpoint["hidden_layers"]).to(device)
        self.update_qtarget()

        # The previous optimizer holds references to the old parameters.
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(),
                                    lr=self.lr)
        self.checkpoint = self._build_checkpoint()

    def set_uniform_sampling(self):
        """ Set alpha to 0.0 and beta to 1.0 so that the agent
        becomes equivalent to the uniform sampling.
        """
        self.alpha = 0.0
        self.beta = 1.0
        self.memory.set_priority_params(self.alpha, self.beta)

    def parameter_anneal(self):
        """Step alpha down (floor 0.0) and beta up (cap 1.0), then push both to memory."""
        self.alpha = max(0.0, self.alpha - ALPHA_ANNEALING)
        self.beta = min(1.0, self.beta + BETA_ANNEALING)
        self.memory.set_priority_params(self.alpha, self.beta)

예제 #8

파일 보기

class Agent:
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, double_dqn=True):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            double_dqn (bool): select next actions with the local network and
                evaluate them with the target network when True
        """
        self.state_size = state_size
        self.action_size = action_size
        self.double_dqn = double_dqn
        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size).to(device)
        self.qnetwork_target = copy.deepcopy(self.qnetwork_local)

        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def save(self, filename):
        """Write both state dicts to ``filename + ".local"`` / ``".target"``."""
        torch.save(self.qnetwork_local.state_dict(), filename + ".local")
        torch.save(self.qnetwork_target.state_dict(), filename + ".target")

    def load(self, filename):
        """Load whichever of the ``.local`` / ``.target`` files exist.

        Tensors are mapped onto ``device`` so a checkpoint written on another
        device can still be restored.
        """
        if os.path.exists(filename + ".local"):
            self.qnetwork_local.load_state_dict(
                torch.load(filename + ".local", map_location=device))
        if os.path.exists(filename + ".target"):
            self.qnetwork_target.load_state_dict(
                torch.load(filename + ".target", map_location=device))

    def step(self, state, action, reward, next_state, done, train=True):
        """Store a transition and, every UPDATE_EVERY calls, learn from a batch."""
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                if train:
                    self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # The targets are constants for the loss. Building them without
        # autograd keeps unused gradients off the target network.
        with torch.no_grad():
            if self.double_dqn:
                # Double DQN: pick actions with the local net, score with target.
                q_best_action = self.qnetwork_local(next_states).max(1)[1]
                Q_targets_next = self.qnetwork_target(next_states).gather(
                    1, q_best_action.unsqueeze(-1))
            else:
                # DQN: maximum target-network value.
                Q_targets_next = self.qnetwork_target(next_states).max(1)[0].unsqueeze(-1)

            # Compute Q targets for current states
            Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)

예제 #9

파일 보기

파일: sac.py 프로젝트: dmitrySorokin/SAC

class SAC(object):
    """Soft Actor-Critic agent: a policy, a twin critic and a target critic."""

    def __init__(self, num_inputs, action_space, args):
        """Build networks and optimizers.

        Args:
            num_inputs: forwarded as the input size of the networks.
            action_space: object exposing ``shape``; ``shape[0]`` is used as
                the action dimension.
            args: object exposing ``gamma``, ``tau``, ``alpha``,
                ``target_update_interval``, ``automatic_entropy_tuning``,
                ``cuda``, ``hidden_size`` and ``lr``.
        """
        self.gamma = args.gamma
        self.tau = args.tau
        self.alpha = args.alpha

        self.target_update_interval = args.target_update_interval
        self.automatic_entropy_tuning = args.automatic_entropy_tuning

        self.device = torch.device("cuda" if args.cuda else "cpu")

        self.critic = QNetwork(num_inputs, action_space.shape[0],
                               args.hidden_size).to(device=self.device)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.lr)

        self.critic_target = QNetwork(num_inputs, action_space.shape[0],
                                      args.hidden_size).to(self.device)
        hard_update(self.critic_target, self.critic)

        # Target Entropy = −dim(A) (e.g. , -6 for HalfCheetah-v2) as given in the paper
        # Truthiness test, matching update_parameters: a truthy flag that is
        # not literally True must still create log_alpha and its optimizer.
        if self.automatic_entropy_tuning:
            self.target_entropy = -torch.prod(
                torch.Tensor(action_space.shape).to(self.device)).item()
            self.log_alpha = torch.zeros(1,
                                         requires_grad=True,
                                         device=self.device)
            self.alpha_optim = Adam([self.log_alpha], lr=args.lr)

        self.policy = GaussianPolicy(num_inputs, action_space.shape[0],
                                     args.hidden_size,
                                     action_space).to(self.device)
        self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)

    def select_action(self, state, evaluate=False):
        """Return an action for ``state`` as a NumPy array.

        When ``evaluate`` is False the first element returned by
        ``policy.sample`` is used, otherwise the third.
        """
        state = torch.FloatTensor(state).to(self.device).unsqueeze(0)
        if evaluate is False:
            action, _, _ = self.policy.sample(state)
        else:
            _, _, action = self.policy.sample(state)
        return action.detach().cpu().numpy()[0]

    def update_parameters(self, memory, batch_size, updates):
        """Run one critic, policy and (optionally) entropy-coefficient update.

        Args:
            memory: object whose ``sample(batch_size=...)`` returns
                ``(state, action, reward, next_state, mask)`` batches.
            batch_size: number of transitions to sample.
            updates: update counter; the target critic is soft-updated when
                it is a multiple of ``target_update_interval``.

        Returns:
            Tuple of floats: both critic losses, the policy loss, the alpha
            loss and the current alpha.
        """
        # Sample a batch from memory
        state_batch, action_batch, reward_batch, next_state_batch, mask_batch = memory.sample(
            batch_size=batch_size)

        state_batch = torch.FloatTensor(state_batch).to(self.device)
        next_state_batch = torch.FloatTensor(next_state_batch).to(self.device)
        action_batch = torch.FloatTensor(action_batch).to(self.device)
        reward_batch = torch.FloatTensor(reward_batch).to(
            self.device).unsqueeze(1)
        mask_batch = torch.FloatTensor(mask_batch).to(self.device).unsqueeze(1)

        with torch.no_grad():
            next_state_action, next_state_log_pi, _ = self.policy.sample(
                next_state_batch)
            qf1_next_target, qf2_next_target = self.critic_target(
                next_state_batch, next_state_action)
            min_qf_next_target = torch.min(
                qf1_next_target,
                qf2_next_target) - self.alpha * next_state_log_pi
            next_q_value = reward_batch + mask_batch * self.gamma * min_qf_next_target
        qf1, qf2 = self.critic(
            state_batch, action_batch
        )  # Two Q-functions to mitigate positive bias in the policy improvement step
        qf1_loss = F.mse_loss(
            qf1, next_q_value
        )  # JQ = 𝔼(st,at)~D[0.5(Q1(st,at) - r(st,at) - γ(𝔼st+1~p[V(st+1)]))^2]
        qf2_loss = F.mse_loss(
            qf2, next_q_value
        )  # JQ = 𝔼(st,at)~D[0.5(Q1(st,at) - r(st,at) - γ(𝔼st+1~p[V(st+1)]))^2]
        critic_loss = qf1_loss + qf2_loss

        self.critic_optim.zero_grad()
        critic_loss.backward()
        self.critic_optim.step()

        pi, log_pi, _ = self.policy.sample(state_batch)
        qf1_pi, qf2_pi = self.critic(state_batch, pi)
        min_qf_pi = torch.min(qf1_pi, qf2_pi)
        policy_loss = ((self.alpha * log_pi) - min_qf_pi).mean(
        )  # Jπ = 𝔼st∼D,εt∼N[α * logπ(f(εt;st)|st) − Q(st,f(εt;st))]

        self.policy_optim.zero_grad()
        policy_loss.backward()
        self.policy_optim.step()

        if self.automatic_entropy_tuning:
            alpha_loss = -(self.log_alpha *
                           (log_pi + self.target_entropy).detach()).mean()

            self.alpha_optim.zero_grad()
            alpha_loss.backward()
            self.alpha_optim.step()

            self.alpha = self.log_alpha.exp()
            alpha_tlogs = self.alpha.clone()  # For TensorboardX logs
        else:
            alpha_loss = torch.tensor(0.).to(self.device)
            alpha_tlogs = torch.tensor(self.alpha)  # For TensorboardX logs

        if updates % self.target_update_interval == 0:
            soft_update(self.critic_target, self.critic, self.tau)

        return qf1_loss.item(), qf2_loss.item(), policy_loss.item(
        ), alpha_loss.item(), alpha_tlogs.item()

    # Save model parameters
    def save_model(self,
                   env_name,
                   suffix="",
                   actor_path=None,
                   critic_path=None):
        """Save the policy and critic state dicts.

        Paths default to ``models/sac_actor_<env>_<suffix>`` and
        ``models/sac_critic_<env>_<suffix>``. The ``models/`` directory is
        created if missing.
        """
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs('models/', exist_ok=True)

        if actor_path is None:
            actor_path = "models/sac_actor_{}_{}".format(env_name, suffix)
        if critic_path is None:
            critic_path = "models/sac_critic_{}_{}".format(env_name, suffix)
        print('Saving models to {} and {}'.format(actor_path, critic_path))
        torch.save(self.policy.state_dict(), actor_path)
        torch.save(self.critic.state_dict(), critic_path)

    # Load model parameters
    def load_model(self, actor_path, critic_path, device='cpu'):
        """Load policy and/or critic weights; a ``None`` path is skipped.

        ``device`` is passed to ``torch.load`` as the ``map_location``.
        """
        print('Loading models from {} and {}'.format(actor_path, critic_path))
        if actor_path is not None:
            self.policy.load_state_dict(
                torch.load(actor_path, map_location=torch.device(device)))
        if critic_path is not None:
            self.critic.load_state_dict(
                torch.load(critic_path, map_location=torch.device(device)))

예제 #10

파일 보기

class SAC(object):
    def __init__(self, num_inputs, action_space, args):
        """Build the critic and the policy-specific networks and optimizers.

        Args:
            num_inputs: forwarded as the input size of the networks.
            action_space: object exposing ``shape``; ``shape[0]`` is stored
                as ``self.action_space``.
            args: object exposing ``gamma``, ``tau``, ``alpha``, ``policy``,
                ``target_update_interval``, ``automatic_entropy_tuning``,
                ``hidden_size`` and ``lr``.

        With ``args.policy == "Gaussian"`` a value network and its target
        are created; otherwise a deterministic policy and a target critic.
        """
        self.num_inputs = num_inputs
        self.action_space = action_space.shape[0]
        self.gamma = args.gamma
        self.tau = args.tau

        self.policy_type = args.policy
        self.target_update_interval = args.target_update_interval
        self.automatic_entropy_tuning = args.automatic_entropy_tuning

        self.critic = QNetwork(self.num_inputs, self.action_space,
                               args.hidden_size)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.lr)

        if self.policy_type == "Gaussian":
            self.alpha = args.alpha
            # Target Entropy = −dim(A) (e.g. , -6 for HalfCheetah-v2) as given in the paper
            # Truthiness test, matching the check used in update_parameters.
            if self.automatic_entropy_tuning:
                self.target_entropy = -torch.prod(
                    torch.Tensor(action_space.shape)).item()
                self.log_alpha = torch.zeros(1, requires_grad=True)
                self.alpha_optim = Adam([self.log_alpha], lr=args.lr)

            self.policy = GaussianPolicy(self.num_inputs, self.action_space,
                                         args.hidden_size)
            self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)

            self.value = ValueNetwork(self.num_inputs, args.hidden_size)
            self.value_target = ValueNetwork(self.num_inputs, args.hidden_size)
            self.value_optim = Adam(self.value.parameters(), lr=args.lr)
            hard_update(self.value_target, self.value)
        else:
            # update_parameters reads self.alpha for every policy type, so it
            # must exist here too or the first update raises AttributeError.
            # NOTE(review): 0 assumes the deterministic policy carries no
            # entropy term — confirm against the intended algorithm.
            self.alpha = 0
            self.policy = DeterministicPolicy(self.num_inputs,
                                              self.action_space,
                                              args.hidden_size)
            self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)

            self.critic_target = QNetwork(self.num_inputs, self.action_space,
                                          args.hidden_size)
            hard_update(self.critic_target, self.critic)

    def select_action(self, state, eval=False):
        state = torch.FloatTensor(state).unsqueeze(0)
        if eval == False:
            self.policy.train()
            action, _, _, _, _ = self.policy.sample(state)
        else:
            self.policy.eval()
            _, _, _, action, _ = self.policy.sample(state)
            if self.policy_type == "Gaussian":
                action = torch.tanh(action)
            else:
                pass
        #action = torch.tanh(action)
        action = action.detach().cpu().numpy()
        return action[0]

    def update_parameters(self, state_batch, action_batch, reward_batch,
                          next_state_batch, mask_batch, updates):
        state_batch = torch.FloatTensor(state_batch)
        next_state_batch = torch.FloatTensor(next_state_batch)
        action_batch = torch.FloatTensor(action_batch)
        reward_batch = torch.FloatTensor(reward_batch).unsqueeze(1)
        mask_batch = torch.FloatTensor(np.float32(mask_batch)).unsqueeze(1)
        """
        Use two Q-functions to mitigate positive bias in the policy improvement step that is known
        to degrade performance of value based methods. Two Q-functions also significantly speed
        up training, especially on harder task.
        """
        expected_q1_value, expected_q2_value = self.critic(
            state_batch, action_batch)
        new_action, log_prob, _, mean, log_std = self.policy.sample(
            state_batch)

        if self.policy_type == "Gaussian":
            if self.automatic_entropy_tuning:
                """
                Alpha Loss
                """
                alpha_loss = -(
                    self.log_alpha *
                    (log_prob + self.target_entropy).detach()).mean()
                self.alpha_optim.zero_grad()
                alpha_loss.backward()
                self.alpha_optim.step()
                self.alpha = self.log_alpha.exp()
                alpha_logs = self.alpha.clone()  # For TensorboardX logs
            else:
                alpha_loss = torch.tensor(0.)
                alpha_logs = self.alpha  # For TensorboardX logs
            """
            Including a separate function approximator for the soft value can stabilize training.
            """
            expected_value = self.value(state_batch)
            target_value = self.value_target(next_state_batch)
            next_q_value = reward_batch + mask_batch * self.gamma * (
                target_value).detach()
        else:
            """
            There is no need in principle to include a separate function approximator for the state value.
            We use a target critic network for deterministic policy and eradicate the value value network completely.
            """
            alpha_loss = torch.tensor(0.)
            alpha_logs = self.alpha  # For TensorboardX logs
            next_state_action, _, _, _, _, = self.policy.sample(
                next_state_batch)
            target_critic_1, target_critic_2 = self.critic_target(
                next_state_batch, next_state_action)
            target_critic = torch.min(target_critic_1, target_critic_2)
            next_q_value = reward_batch + mask_batch * self.gamma * (
                target_critic).detach()
        """
        Soft Q-function parameters can be trained to minimize the soft Bellman residual
        JQ = 𝔼(st,at)~D[0.5(Q1(st,at) - r(st,at) - γ(𝔼st+1~p[V(st+1)]))^2]
        ∇JQ = ∇Q(st,at)(Q(st,at) - r(st,at) - γV(target)(st+1))
        """
        q1_value_loss = F.mse_loss(expected_q1_value, next_q_value)
        q2_value_loss = F.mse_loss(expected_q2_value, next_q_value)
        q1_new, q2_new = self.critic(state_batch, new_action)
        expected_new_q_value = torch.min(q1_new, q2_new)

        if self.policy_type == "Gaussian":
            """
            Including a separate function approximator for the soft value can stabilize training and is convenient to 
            train simultaneously with the other networks
            Update the V towards the min of two Q-functions in order to reduce overestimation bias from function approximation error.
            JV = 𝔼st~D[0.5(V(st) - (𝔼at~π[Qmin(st,at) - α * log π(at|st)]))^2]
            ∇JV = ∇V(st)(V(st) - Q(st,at) + (α * logπ(at|st)))
            """
            next_value = expected_new_q_value - (self.alpha * log_prob)
            value_loss = F.mse_loss(expected_value, next_value.detach())
        else:
            pass
        """
        Reparameterization trick is used to get a low variance estimator
        f(εt;st) = action sampled from the policy
        εt is an input noise vector, sampled from some fixed distribution
        Jπ = 𝔼st∼D,εt∼N[α * logπ(f(εt;st)|st) − Q(st,f(εt;st))]
        ∇Jπ = ∇log π + ([∇at (α * logπ(at|st)) − ∇at Q(st,at)])∇f(εt;st)
        """
        policy_loss = ((self.alpha * log_prob) - expected_new_q_value).mean()

        # Regularization Loss
        mean_loss = 0.001 * mean.pow(2).mean()
        std_loss = 0.001 * log_std.pow(2).mean()

        policy_loss += mean_loss + std_loss

        self.critic_optim.zero_grad()
        q1_value_loss.backward()
        self.critic_optim.step()

        self.critic_optim.zero_grad()
        q2_value_loss.backward()
        self.critic_optim.step()

        if self.policy_type == "Gaussian":
            self.value_optim.zero_grad()
            value_loss.backward()
            self.value_optim.step()
        else:
            value_loss = torch.tensor(0.)

        self.policy_optim.zero_grad()
        policy_loss.backward()
        self.policy_optim.step()
        """
        We update the target weights to match the current value function weights periodically
        Update target parameter after every n(args.target_update_interval) updates
        """
        if updates % self.target_update_interval == 0 and self.policy_type == "Deterministic":
            soft_update(self.critic_target, self.critic, self.tau)

        elif updates % self.target_update_interval == 0 and self.policy_type == "Gaussian":
            soft_update(self.value_target, self.value, self.tau)
        return value_loss.item(), q1_value_loss.item(), q2_value_loss.item(
        ), policy_loss.item(), alpha_loss.item(), alpha_logs

    # Save model parameters
    def save_model(self,
                   env_name,
                   suffix="",
                   actor_path=None,
                   critic_path=None,
                   value_path=None):
        """Write the value, policy and critic state dicts to disk.

        Args:
            env_name: Inserted into the default file names.
            suffix: Appended to the default file names.
            actor_path: Destination for ``self.policy`` weights; when ``None``
                defaults to ``models/sac_actor_<env_name>_<suffix>``.
            critic_path: Destination for ``self.critic`` weights; when ``None``
                defaults to ``models/sac_critic_<env_name>_<suffix>``.
            value_path: Destination for ``self.value`` weights; when ``None``
                defaults to ``models/sac_value_<env_name>_<suffix>``.
        """
        # exist_ok=True avoids the check-then-create race of
        # os.path.exists() followed by os.makedirs().
        os.makedirs('models/', exist_ok=True)

        if actor_path is None:
            actor_path = "models/sac_actor_{}_{}".format(env_name, suffix)
        if critic_path is None:
            critic_path = "models/sac_critic_{}_{}".format(env_name, suffix)
        if value_path is None:
            value_path = "models/sac_value_{}_{}".format(env_name, suffix)
        print('Saving models to {}, {} and {}'.format(actor_path, critic_path,
                                                      value_path))
        torch.save(self.value.state_dict(), value_path)
        torch.save(self.policy.state_dict(), actor_path)
        torch.save(self.critic.state_dict(), critic_path)

    # Load model parameters
    def load_model(self, actor_path, critic_path, value_path):
        """Restore policy, critic and value weights from the given files.

        A path of ``None`` leaves the corresponding network untouched.

        Args:
            actor_path: File holding the state dict for ``self.policy``.
            critic_path: File holding the state dict for ``self.critic``.
            value_path: File holding the state dict for ``self.value``.
        """
        print('Loading models from {}, {} and {}'.format(
            actor_path, critic_path, value_path))
        # Attribute names are resolved lazily so a network is only looked up
        # when its path was actually supplied.
        checkpoints = (
            (actor_path, 'policy'),
            (critic_path, 'critic'),
            (value_path, 'value'),
        )
        for checkpoint_path, attr_name in checkpoints:
            if checkpoint_path is None:
                continue
            getattr(self, attr_name).load_state_dict(
                torch.load(checkpoint_path))

예제 #11

파일 보기

파일: train.py 프로젝트: bluerobotcat/rl-football-veronica

class Agent():
    """Q-learning agent for one player: online network, target network, replay buffer.

    Relies on module-level names that are not defined in this class:
    ``QNetwork``, ``ReplayBuffer``, ``device``, ``env``, ``sensor_front_sig``
    and ``sensor_back_sig``.
    """

    def __init__(self,
                 state_size,
                 action_size,
                 behavior_name,
                 index_player,
                 replay_memory_size=1e4,
                 batch_size=512,
                 gamma=0.99,
                 learning_rate=1e4,
                 target_tau=1e3,
                 update_rate=100,
                 seed=0):
        """Store hyper-parameters and build the networks, optimizer and replay buffer.

        Args:
            state_size: Forwarded to ``QNetwork``.
            action_size: Forwarded to ``QNetwork`` and ``ReplayBuffer``; also
                the number of actions sampled from during exploration.
            behavior_name: Passed to ``env.get_steps`` in :meth:`Read`.
            index_player: Row of the observation arrays read in :meth:`Read`.
            replay_memory_size: Replay capacity; cast to ``int``.
            batch_size: Forwarded to ``ReplayBuffer``; learning only happens
                once the buffer holds more than this many entries.
            gamma: Discount factor used in :meth:`learn`.
            learning_rate: Learning rate of the Adam optimizer.
            target_tau: Interpolation weight used by :meth:`soft_update`.
            update_rate: :meth:`model_step` learns every ``update_rate`` calls.
            seed: Seeds ``random`` and is forwarded to ``QNetwork`` and
                ``ReplayBuffer``.
        """
        # NOTE(review): the defaults learning_rate=1e4 and target_tau=1e3 look
        # like they were meant to be 1e-4 / 1e-3 (a tau above 1 makes
        # soft_update extrapolate instead of interpolate). They are kept
        # unchanged for interface compatibility -- confirm with the callers.
        self.state_size = state_size
        self.current_state = []
        self.action_size = action_size
        self.buffer_size = int(replay_memory_size)
        self.batch_size = batch_size
        self.gamma = gamma
        self.learn_rate = learning_rate
        self.tau = target_tau
        self.update_rate = update_rate
        # random.seed() returns None, so self.seed ends up None; the call is
        # kept for its side effect of seeding the global RNG.
        self.seed = random.seed(seed)
        self.behavior_name = behavior_name
        self.index_player = index_player
        self.close_ball_reward = 0
        self.touch_ball_reward = 0

        # Two models: `network` is trained directly, `target_network` follows
        # it through soft_update() at rate `target_tau`.
        self.network = QNetwork(state_size, action_size, seed).to(device)
        self.target_network = QNetwork(state_size, action_size,
                                       seed).to(device)
        self.optimizer = optim.Adam(self.network.parameters(),
                                    lr=self.learn_rate)

        # Replay memory
        self.memory = ReplayBuffer(action_size, self.buffer_size,
                                   self.batch_size, seed)

        # Step counter used to learn every `update_rate` steps.
        self.t_step = 0

    def load_model(self, path_model, path_target=None):
        """Load weights for the online network and, optionally, the target network.

        Args:
            path_model: Checkpoint for ``self.network``.
            path_target: Checkpoint for ``self.target_network``; skipped when
                ``None``.
        """
        params = torch.load(path_model)
        self.network.set_params(params)
        # The checkpoint is loaded a second time on purpose: whether
        # set_params() mutates `params` cannot be told from this file.
        self.network.load_state_dict(torch.load(path_model))
        if path_target is not None:
            self.target_network.load_state_dict(torch.load(path_target))

    def model_step(self, state, action, reward, next_state):
        """Record one transition and learn every ``update_rate`` calls."""
        # save experience in replay memory
        self.memory.add(state, action, reward, next_state)

        self.t_step = self.t_step + 1
        if self.t_step % self.update_rate == 0:
            # Learn only once the buffer holds more than one batch.
            if len(self.memory) > self.batch_size:
                experiences = self.memory.sample()
                self.learn(experiences, self.gamma, self.t_step)

    def choose_action(self, state, eps=0.0):
        """Pick an action epsilon-greedily from the online network.

        Args:
            state: NumPy array; a leading batch dimension is added here.
            eps: Probability threshold for taking a uniformly random action.

        Returns:
            The argmax index over the network output, or a random index in
            ``[0, action_size)``.
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.network.eval()
        with torch.no_grad():
            action_values = self.network(state)
        self.network.train()

        # epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma, stp):
        """Run one gradient step on a sampled batch.

        Args:
            experiences: Tuple ``(states, actions, rewards, next_states)``.
            gamma: Discount factor applied to the target network's max Q value.
            stp: Step counter; the target network is soft-updated when it is
                a multiple of 100.
        """
        states, actions, rewards, next_states = experiences

        # Q(s, a) from the online network for the actions actually taken.
        self.network.train()
        Q_sa = self.network(states).gather(1, actions)

        # max_a' Q_target(s', a'), detached so no gradient reaches the target.
        Q_sa_prime_target_values = self.target_network(next_states).max(
            1)[0].to(device).float().detach()

        # Bootstrapped targets; there is no terminal-state mask here.
        Q_sa_targets = rewards + gamma * Q_sa_prime_target_values.unsqueeze(1)

        # Summed (not averaged) squared error.
        criterion = torch.nn.MSELoss(reduction='sum')
        loss = criterion(Q_sa.to(device), Q_sa_targets.to(device))

        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # update target network
        if stp % 100 == 0:
            print('Updating Model')
            self.soft_update(self.network, self.target_network, self.tau)

    def soft_update(self, local_model, target_model, tau):
        """Blend local weights into the target: ``t = tau * l + (1 - tau) * t``.

        Args:
            local_model: Model whose parameters are read.
            target_model: Model whose parameters are overwritten in place.
            tau: Interpolation weight given to the local parameters.
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def Read(self):
        """Read this player's observation and derive the shaping rewards.

        On success ``self.current_state`` is replaced by the concatenated
        front/back sensor signals and the reward attributes are refreshed.
        If anything in that processing raises, both ball rewards are reset to
        0 and the previous ``self.current_state`` is returned.

        Returns:
            ``self.current_state``.
        """
        decision_steps, _ = env.get_steps(self.behavior_name)
        try:
            signal_front = np.array(
                sensor_front_sig(
                    decision_steps.obs[0][self.index_player, :]))  # 3 x 11 x 8
            signal_back = np.array(
                sensor_back_sig(
                    decision_steps.obs[1][self.index_player, :]))  # 3 x 3 x 8
            self.current_state = np.concatenate((signal_front, signal_back),
                                                axis=1)

            count_close_to_ball = 0
            count_touch_ball = 0
            count_back_touch = 0
            count_back_close = 0
            self.rew_d_to_our_post = 0
            self.rew_for_ball_dist = -0.1

            # Front observation
            for ray in signal_front[0]:
                if ray[0] == 1.0:
                    count_close_to_ball += 1
                    self.rew_for_ball_dist = max(0.3 * (1 - ray[7]),
                                                 self.rew_for_ball_dist)

                    # Kicked the ball at the front
                    if ray[7] <= 0.03:
                        count_touch_ball += 1

                if ray[1] == 1.0:
                    self.rew_d_to_our_post = -0.1
                if ray[2] == 1.0:
                    self.rew_d_to_our_post = 0.1

            # Back observation
            for ray in signal_back[0]:
                if ray[0] == 1.0:
                    count_back_close += 0.2

                    # Touches the ball at the back
                    if ray[7] <= 0.03:
                        count_back_touch += 0.3

            self.back_touch = 1 if count_back_touch > 0 else 0.2
            self.back_close = 1 if count_back_close > 0 else 0.1

            # add reward if kick the ball
            self.touch_ball_reward = 1 if count_touch_ball > 0 else -0.15
            # Penalize for back touching the ball
            if count_back_touch > 0:
                self.touch_ball_reward = -0.25

            # Penalize if the ball is not in view
            self.close_ball_reward = 0.25 if count_close_to_ball > 0 else -0.05
            # Penalize if the ball is behind the agent
            if count_back_close > 0:
                self.close_ball_reward = -0.1
        except Exception:
            # Best-effort fallback kept from the original, narrowed from a
            # bare `except:` so KeyboardInterrupt/SystemExit still propagate.
            self.touch_ball_reward = 0
            self.close_ball_reward = 0

        return self.current_state

    def upd_after_goal(self, n_upds):
        """Call ``memory.upd_goal(n_upds)`` and learn once if enough samples exist."""
        self.memory.upd_goal(n_upds)
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences, self.gamma, self.t_step)

    def we_goll(self):
        """Call ``memory.we_goll()`` and learn twice if enough samples exist."""
        self.memory.we_goll()
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences, self.gamma, self.t_step)
            experiences = self.memory.sample()
            self.learn(experiences, self.gamma, self.t_step)

    def us_goll(self):
        """Call ``memory.us_goll()`` and learn twice if enough samples exist."""
        self.memory.us_goll()
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences, self.gamma, self.t_step)
            experiences = self.memory.sample()
            self.learn(experiences, self.gamma, self.t_step)

예제 #12

파일 보기

파일: sac.py 프로젝트: tienmanhptit1312/Meta-RL

class SAC(object):
    """Soft Actor-Critic learner with twin Q critics and an optional learned temperature.

    Relies on module-level names that are not defined in this class:
    ``QNetwork``, ``GaussianPolicy``, ``DeterministicPolicy``, ``Adam``,
    ``hard_update``, ``soft_update`` and ``F``.
    """

    def __init__(self, num_inputs, action_space, variant):
        """Build critic, target critic, policy and their optimizers.

        Args:
            num_inputs: Forwarded to the network constructors.
            action_space: Object exposing ``shape``; ``shape[0]`` is forwarded
                to the networks as the action dimension.
            variant: Dict with required keys ``gamma``, ``tau``, ``alpha``,
                ``policy_type``, ``target_update_interval``,
                ``automatic_entropy_tuning`` and ``cuda``; optional ``lr``
                (default ``1e-3``) and ``hidden_size`` (default ``[128, 128]``).
        """
        self.gamma = variant['gamma']
        self.tau = variant['tau']
        self.alpha = variant['alpha']
        self.policy_type = variant['policy_type']
        self.target_update_interval = variant['target_update_interval']
        self.automatic_entropy_tuning = variant['automatic_entropy_tuning']
        self.lr = variant.get("lr", 1e-3)

        self.device = torch.device("cuda" if variant['cuda'] else "cpu")
        self.hidden_size = variant.get('hidden_size', [128, 128])

        self.critic = QNetwork(num_inputs, action_space.shape[0],
                               self.hidden_size).to(self.device)
        self.critic_optim = Adam(self.critic.parameters(), lr=self.lr)

        self.critic_target = QNetwork(num_inputs, action_space.shape[0],
                                      self.hidden_size).to(self.device)
        hard_update(self.critic_target, self.critic)

        if self.policy_type == 'Gaussian':
            if self.automatic_entropy_tuning:
                # Target entropy is minus the product of the action shape.
                self.target_entropy = -torch.prod(
                    torch.Tensor(action_space.shape).to(self.device)).item()
                self.log_alpha = torch.zeros(1,
                                             requires_grad=True,
                                             device=self.device)
                self.alpha_optim = Adam([self.log_alpha], lr=self.lr)

            self.policy = GaussianPolicy(num_inputs, action_space.shape[0],
                                         self.hidden_size,
                                         action_space).to(self.device)
            self.policy_optim = Adam(self.policy.parameters(), lr=self.lr)

        else:
            # Any other policy type: no entropy term and no temperature tuning.
            self.alpha = 0
            self.automatic_entropy_tuning = False
            self.policy = DeterministicPolicy(num_inputs,
                                              action_space.shape[0],
                                              self.hidden_size,
                                              action_space).to(self.device)
            self.policy_optim = Adam(self.policy.parameters(), lr=self.lr)

    def select_action(self, state, evaluate=False):
        """Return an action for a single state as a NumPy array.

        Args:
            state: Single observation; a batch dimension is added here.
            evaluate: When exactly ``False`` the first element returned by
                ``policy.sample`` is used, otherwise the third.
        """
        state = torch.FloatTensor(state).to(self.device).unsqueeze(0)
        if evaluate is False:
            action, _, _ = self.policy.sample(state)
        else:
            _, _, action = self.policy.sample(state)
        return action.detach().cpu().numpy()[0]

    def update_parameters(self, memory, batch_size, updates):
        """Run one update of critic, policy and (optionally) temperature.

        Args:
            memory: Object whose ``sample(batch_size=...)`` returns
                ``(state, action, reward, next_state, mask)`` batches.
            batch_size: Number of transitions to sample.
            updates: Update counter; the target critic is soft-updated when it
                is a multiple of ``target_update_interval``.

        Returns:
            Tuple of floats ``(qf1_loss, qf2_loss, policy_loss, alpha_loss)``.
        """
        # sample a batch from memory
        state_batch, action_batch, reward_batch, next_state_batch, mask_batch = memory.sample(
            batch_size=batch_size)

        state_batch = torch.FloatTensor(state_batch).to(self.device)
        next_state_batch = torch.FloatTensor(next_state_batch).to(self.device)
        action_batch = torch.FloatTensor(action_batch).to(self.device)
        reward_batch = torch.FloatTensor(reward_batch).to(self.device)
        mask_batch = torch.FloatTensor(mask_batch).to(self.device)

        with torch.no_grad():
            next_state_action, next_state_log_pi, _ = self.policy.sample(
                next_state_batch)
            qf1_next_target, qf2_next_target = self.critic_target(
                next_state_batch, next_state_action)
            min_qf_next_target = torch.min(
                qf1_next_target,
                qf2_next_target) - self.alpha * next_state_log_pi
            next_q_value = reward_batch + mask_batch * self.gamma * (
                min_qf_next_target)

        # Two Q-functions to mitigate positive bias in the policy improvement step
        qf1, qf2 = self.critic(state_batch, action_batch)
        # JQ = 𝔼(st,at)~D[0.5(Q1(st,at) - r(st,at) - γ(𝔼st+1~p[V(st+1)]))^2]
        qf1_loss = F.mse_loss(qf1, next_q_value)
        qf2_loss = F.mse_loss(qf2, next_q_value)

        # One backward/step for both critics: stepping between two backward
        # passes modifies weights in place that the second pass still needs.
        qf_loss = qf1_loss + qf2_loss
        self.critic_optim.zero_grad()
        qf_loss.backward()
        self.critic_optim.step()

        # Build the policy loss only AFTER the critic step so its graph uses
        # the critic weights as they are at backward time; building it before
        # the step makes autograd reject the in-place modified weights.
        pi, log_pi, _ = self.policy.sample(state_batch)

        qf1_pi, qf2_pi = self.critic(state_batch, pi)
        min_qf_pi = torch.min(qf1_pi, qf2_pi)

        # Jπ = 𝔼st∼D,εt∼N[α * logπ(f(εt;st)|st) − Q(st,f(εt;st))]
        policy_loss = ((self.alpha * log_pi) - min_qf_pi).mean()

        self.policy_optim.zero_grad()
        policy_loss.backward()
        self.policy_optim.step()

        if self.automatic_entropy_tuning:
            alpha_loss = -(self.log_alpha *
                           (log_pi + self.target_entropy).detach()).mean()

            self.alpha_optim.zero_grad()
            alpha_loss.backward()
            self.alpha_optim.step()

            self.alpha = self.log_alpha.exp()
        else:
            alpha_loss = torch.tensor(0.0).to(self.device)

        # Fixed: the original read the undefined name `update` here.
        if updates % self.target_update_interval == 0:
            soft_update(self.critic_target, self.critic, self.tau)

        return qf1_loss.item(), qf2_loss.item(), policy_loss.item(
        ), alpha_loss.item()

    def save_model(self,
                   env_nam,
                   suffix=".pkl",
                   actor_path=None,
                   critic_path=None):
        """Write the policy and critic state dicts to disk.

        Args:
            env_nam: Environment name inserted into the default file names.
                The parameter name is kept as-is for caller compatibility.
            suffix: Appended to the default file names.
            actor_path: Destination for the policy weights; when ``None``
                defaults to ``models/sac_actor_<env_nam>_<suffix>``.
            critic_path: Destination for the critic weights; when ``None``
                defaults to ``models/sac_critic_<env_nam>_<suffix>``.
        """
        # exist_ok=True avoids the exists()/makedirs() race.
        os.makedirs('models/', exist_ok=True)
        # Fixed: the original formatted with the undefined name `env_name`.
        if actor_path is None:
            actor_path = "models/sac_actor_{}_{}".format(env_nam, suffix)
        if critic_path is None:
            critic_path = "models/sac_critic_{}_{}".format(env_nam, suffix)

        print("Saving models to {} and {}".format(actor_path, critic_path))
        torch.save(self.policy.state_dict(), actor_path)
        torch.save(self.critic.state_dict(), critic_path)

    def load_model(self, actor_path, critic_path):
        """Load policy and critic weights; a path of ``None`` is skipped."""
        print('loading models from {} and {}'.format(actor_path, critic_path))
        if actor_path is not None:
            self.policy.load_state_dict(torch.load(actor_path))
        if critic_path is not None:
            self.critic.load_state_dict(torch.load(critic_path))

예제 #13

파일 보기

파일: dqn_agent.py 프로젝트: wytyang00/Udacity-DRL-Nanodegree-Navigation-Project

class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed=SEED, batch_size=BATCH_SIZE,
                 buffer_size=BUFFER_SIZE, start_since=START_SINCE, gamma=GAMMA, target_update_every=T_UPDATE,
                 tau=TAU, lr=LR, weight_decay=WEIGHT_DECAY, update_every=UPDATE_EVERY, priority_eps=P_EPS,
                 a=A, initial_beta=INIT_BETA, n_multisteps=N_STEPS,
                 v_min=V_MIN, v_max=V_MAX, clip=CLIP, n_atoms=N_ATOMS,
                 initial_sigma=INIT_SIGMA, linear_type=LINEAR, factorized=FACTORIZED, **kwds):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            batch_size (int): size of each sample batch
            buffer_size (int): size of the experience memory buffer
            start_since (int): number of steps to collect before start training
            gamma (float): discount factor
            target_update_every (int): how often to update the target network
            tau (float): target network soft-update parameter
            lr (float): learning rate
            weight_decay (float): weight decay for optimizer
            update_every (int): update(learning and target update) interval
            priority_eps (float): small base value for priorities
            a (float): priority exponent parameter
            initial_beta (float): initial importance-sampling weight
            n_multisteps (int): number of steps to consider for each experience
            v_min (float): minimum reward support value
            v_max (float): maximum reward support value
            clip (float): gradient norm clipping (`None` to disable)
            n_atoms (int): number of atoms in the discrete support distribution
            initial_sigma (float): initial noise parameter weights
            linear_type (str): one of ('linear', 'noisy'); type of linear layer to use
            factorized (bool): whether to use factorized gaussian noise in noisy layers
            **kwds: any other keyword arguments are reported and ignored
        """
        if kwds:
            print("Ignored keyword arguments: ", end='')
            print(*kwds, sep=', ')
        # Argument validation. Kept as asserts so callers still see
        # AssertionError; note that asserts are stripped under `python -O`.
        assert isinstance(state_size, int)
        assert isinstance(action_size, int)
        assert isinstance(seed, int)
        assert isinstance(batch_size, int) and batch_size > 0
        assert isinstance(buffer_size, int) and buffer_size >= batch_size
        assert isinstance(start_since, int) and batch_size <= start_since <= buffer_size
        assert isinstance(gamma, (int, float)) and 0 <= gamma <= 1
        assert isinstance(target_update_every, int) and target_update_every > 0
        assert isinstance(tau, (int, float)) and 0 <= tau <= 1
        assert isinstance(lr, (int, float)) and lr >= 0
        assert isinstance(weight_decay, (int, float)) and weight_decay >= 0
        assert isinstance(update_every, int) and update_every > 0
        assert isinstance(priority_eps, (int, float)) and priority_eps >= 0
        assert isinstance(a, (int, float)) and 0 <= a <= 1
        assert isinstance(initial_beta, (int, float)) and 0 <= initial_beta <= 1
        assert isinstance(n_multisteps, int) and n_multisteps > 0
        assert isinstance(v_min, (int, float)) and isinstance(v_max, (int, float)) and v_min < v_max
        if clip: assert isinstance(clip, (int, float)) and clip >= 0
        assert isinstance(n_atoms, int) and n_atoms > 0
        assert isinstance(initial_sigma, (int, float)) and initial_sigma >= 0
        assert isinstance(linear_type, str) and linear_type.strip().lower() in ('linear', 'noisy')
        assert isinstance(factorized, bool)

        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)
        torch.cuda.manual_seed(seed)

        self.state_size          = state_size
        self.action_size         = action_size
        self.seed                = seed
        self.batch_size          = batch_size
        self.buffer_size         = buffer_size
        self.start_since         = start_since
        self.gamma               = gamma
        self.target_update_every = target_update_every
        self.tau                 = tau
        self.lr                  = lr
        self.weight_decay        = weight_decay
        self.update_every        = update_every
        self.priority_eps        = priority_eps
        self.a                   = a
        self.beta                = initial_beta
        self.n_multisteps        = n_multisteps
        self.v_min               = v_min
        self.v_max               = v_max
        self.clip                = clip
        self.n_atoms             = n_atoms
        self.initial_sigma       = initial_sigma
        self.linear_type         = linear_type.strip().lower()
        self.factorized          = factorized

        # Distribution: n_atoms evenly spaced support values in [v_min, v_max]
        self.supports = torch.linspace(v_min, v_max, n_atoms, device=device)
        self.delta_z  = (v_max - v_min) / (n_atoms - 1)

        # Q-Network: the target starts as an exact copy of the local network
        self.qnetwork_local  = QNetwork(state_size, action_size, n_atoms, linear_type, initial_sigma, factorized).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, n_atoms, linear_type, initial_sigma, factorized).to(device)
        self.qnetwork_target.load_state_dict(self.qnetwork_local.state_dict())

        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=lr, weight_decay=weight_decay)

        # Replay memory
        self.memory = ReplayBuffer(action_size, buffer_size, batch_size, n_multisteps, gamma, a)
        # Initialize time step (for updating every UPDATE_EVERY steps and TARGET_UPDATE_EVERY steps)
        self.u_step = 0
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        """Store a transition, then learn and/or update the target on schedule."""
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.u_step = (self.u_step + 1) % self.update_every
        if self.u_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) >= self.start_since:
                experiences, target_discount, is_weights, indices = self.memory.sample(self.beta)
                new_priorities = self.learn(experiences, is_weights, target_discount)
                self.memory.update_priorities(indices, new_priorities)

        # update the target network every TARGET_UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % self.target_update_every
        if self.t_step == 0:
            self.soft_update(self.qnetwork_local, self.qnetwork_target, self.tau)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        with torch.no_grad():
            # Expected value per action: sum over atoms of support * probability.
            z_probs       = F.softmax(self.qnetwork_local(state), dim=-1)
            action_values = self.supports.mul(z_probs).sum(dim=-1, keepdim=False)

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        return random.choice(np.arange(self.action_size))

    def learn(self, experiences, is_weights, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            is_weights (torch.Tensor): tensor of importance-sampling weights
            gamma (float): discount factor for the target max-Q value

        Returns
        =======
            new_priorities (numpy.ndarray): new priority value for each sample
                (per-sample KL divergence plus `priority_eps`)
        """
        states, actions, rewards, next_states, dones = experiences

        with torch.no_grad():
            rows = tuple(range(next_states.size(0)))
            # Greedy next action chosen by the local network, evaluated by the
            # target network below.
            a_argmax = F.softmax(self.qnetwork_local(next_states), dim=2)\
                           .mul(self.supports)\
                           .sum(dim=2, keepdim=False)\
                           .argmax(dim=1, keepdim=False)
            p = F.softmax(self.qnetwork_target(next_states)[rows, a_argmax], dim=1)
            tz_projected = torch.clamp(rewards + (1 - dones) * gamma * self.supports, min=self.v_min, max=self.v_max)

            # Fractional atom position of every projected support value.
            b     = (tz_projected - self.v_min) / self.delta_z
            upper = b.ceil()
            lower = b.floor()
            # When b lands exactly on an atom (upper == lower) both plain
            # weights would be 0; the eq() term gives the full mass to `upper`.
            upper_updates = b - lower + upper.eq(lower).type(upper.dtype)
            lower_updates = upper - b
            # Turn per-row atom indices into indices of the flattened target
            # by adding each row's offset (row * number_of_atoms).
            indices_flat = torch.cat((upper.long(), lower.long()), dim=1)
            indices_flat = indices_flat.add(
                               torch.arange(start=0,
                                            end=b.size(0) * b.size(1),
                                            step=b.size(1),
                                            dtype=indices_flat.dtype,
                                            layout=indices_flat.layout,
                                            device=indices_flat.device).unsqueeze(1)
                           ).view(-1)
            updates_flat = torch.cat((upper_updates.mul(p), lower_updates.mul(p)), dim=1).view(-1)
            target_distributions = torch.zeros_like(p)
            target_distributions.view(-1).index_add_(0, indices_flat, updates_flat)

        # Logits of the actions that were actually taken.
        pred_distributions = self.qnetwork_local(states)
        pred_distributions = pred_distributions.gather(dim=1, index=actions.unsqueeze(1).expand(-1, -1, pred_distributions.size(2))).squeeze(1)

        # reduction='none' replaces the deprecated reduce=False (same result).
        kl_divergence = F.kl_div(F.log_softmax(pred_distributions, dim=-1), target_distributions, reduction='none').sum(dim=-1, keepdim=False)
        new_priorities = kl_divergence.detach().add(self.priority_eps).cpu().numpy()
        loss = kl_divergence.mul(is_weights.view(-1)).mean()

        self.optimizer.zero_grad()
        loss.backward()
        if self.clip:
            torch.nn.utils.clip_grad_norm_(self.qnetwork_local.parameters(), self.clip)
        self.optimizer.step()

        return new_priorities

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)

예제 #14

파일 보기

파일: dqn_agent.py 프로젝트: atbasu/deep-reinforcement-learning-navigation

class Agent():
    """Interacts with and learns from the environment.

    Attributes:
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        seed (int): random seed
    """
    def __init__(self,
                 state_size,
                 action_size,
                 seed,
                 hidden_layers=(128, 64),
                 filename=None):
        """Initialize an Agent object.

        Args:
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            hidden_layers: kept for interface compatibility; this
                constructor does not pass it on to QNetwork
            filename: path of .pth file with trained weights
        """
        self.state_size = state_size
        self.action_size = action_size
        # random.seed() returns None, so seed the generator and store the
        # seed value itself rather than the call's result.
        random.seed(seed)
        self.seed = seed

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        if filename:
            # map_location lets weights saved on another device (e.g. a GPU)
            # load onto whatever `device` is in use here.
            weights = torch.load(filename, map_location=device)
            self.qnetwork_local.load_state_dict(weights)
            self.qnetwork_target.load_state_dict(weights)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        """Store one transition and, every UPDATE_EVERY calls, learn from a batch.

        Args:
            state, action, reward, next_state, done: the transition to add
                to the replay memory
        """
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Args:
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        # Evaluate without gradients, then put the network back in train mode.
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Args:
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Max predicted Q value for each next state, from the target network
        Q_t_next = self.qnetwork_target(next_states).detach().max(
            1)[0].unsqueeze(1)
        # TD target; the (1 - dones) factor removes the bootstrap term for
        # transitions flagged as done
        Q_t = rewards + (gamma * Q_t_next * (1 - dones))
        # Q value the local network assigns to the action actually taken
        Q_e = self.qnetwork_local(states).gather(1, actions)
        loss = F.mse_loss(Q_e, Q_t)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Args:
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

예제 #15

파일 보기

class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, layer_spec, seed=0):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            layer_spec: hidden-layer specification forwarded to QNetwork
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.layer_spec = layer_spec
        # random.seed() returns None, so seed the generator and store the
        # seed value itself rather than the call's result.
        random.seed(seed)
        self.seed = seed

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       layer_spec).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        layer_spec).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # (Prioritized) experience replay setup
        self.buffer_size = BUFFER_SIZE
        self.batch_size = BATCH_SIZE
        self.min_prio = MIN_PRIO
        self.alpha = ALPHA
        self.beta = INIT_BETA
        self.beta_increment = BETA_INC
        if USE_PER:
            self.memory = PrioritizedReplayBuffer(size=self.buffer_size,
                                                  alpha=self.alpha)
        else:
            # NOTE(review): the buffer seed is fixed at 42 and does not follow
            # the `seed` argument -- confirm this is intentional.
            self.memory = DequeReplayBuffer(action_size=self.action_size,
                                            buffer_size=self.buffer_size,
                                            batch_size=self.batch_size,
                                            seed=42)

        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

        # print info about Agent
        print('Units in the hidden layers are {}.'.format(str(layer_spec)))
        print('Using Double-DQN is \"{}\".'.format(str(USE_DDQN)))
        print('Using prioritized experience replay is \"{}\".'.format(
            str(USE_PER)))

    def step(self, state, action, reward, next_state, done):
        """Store one transition and, every UPDATE_EVERY calls, learn from a batch.

        Each learning step also moves `beta` towards 1.0 by `beta_increment`.
        """
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get subset and learn
            if len(self.memory) > BATCH_SIZE:
                self.beta = min(1., self.beta + self.beta_increment)
                experiences = self.memory.sample(self.batch_size,
                                                 beta=self.beta)
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        # Evaluate without gradients, then put the network back in train mode.
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of
                (s, a, r, s', done, weights, idxes)
            gamma (float): discount factor
        """
        # Get TD step from experiences
        states, actions, rewards, next_states, dones, weights, idxes = experiences

        if USE_DDQN:
            # DOUBLE DQN: Select action based on _local, evaluate action based on _target
            Q_action_select = self.qnetwork_local(next_states).detach().max(
                1)[1].unsqueeze(1)
            Q_targets_next = self.qnetwork_target(next_states).detach().gather(
                1, Q_action_select)
        else:
            # Get max predicted Q values (for next states) from target model.
            # Only computed on this branch so the Double-DQN path does not
            # pay for a forward pass whose result it would throw away.
            Q_targets_next = self.qnetwork_target(next_states).detach().max(
                1)[0].unsqueeze(1)

        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute (PER-weighted) MSE loss
        if USE_PER:
            TD_error = Q_targets - Q_expected
            weighted_TD_error = weights * (TD_error**2)
            loss = torch.mean(weighted_TD_error)
            # Update priorities in Replay Buffer
            prio_updates = np.abs(
                TD_error.detach().squeeze(1).cpu().numpy()) + self.min_prio
            self.memory.update_priorities(idxes, prio_updates.tolist())
        else:
            loss = F.mse_loss(Q_expected, Q_targets)

        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # soft-update target network
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def save_checkpoint(self):
        """Write the local network's weights and sizes to 'checkpoint.pth'."""
        checkpoint = {
            'input_size': self.state_size,
            'output_size': self.action_size,
            'layer_spec': self.layer_spec,
            'state_dict': self.qnetwork_local.state_dict()
        }
        torch.save(checkpoint, 'checkpoint.pth')
        print('Checkpoint succesfully saved.')

    def load_checkpoint(self, filepath='checkpoint.pth'):
        """Rebuild the agent's networks from a checkpoint file.

        The checkpoint is expected to hold the keys written by
        `save_checkpoint`. Both networks are recreated from the stored sizes
        and loaded with the stored weights, and the optimizer is recreated so
        that it updates the new local network's parameters.

        Params
        ======
            filepath (str): path of the checkpoint file
        """
        # map_location lets a checkpoint saved on another device load here.
        checkpoint = torch.load(filepath, map_location=device)

        # Keep the stored metadata in step with the network actually in use,
        # so a later save_checkpoint() describes the right architecture.
        self.state_size = checkpoint['input_size']
        self.action_size = checkpoint['output_size']
        self.layer_spec = checkpoint['layer_spec']

        self.qnetwork_local = QNetwork(checkpoint['input_size'],
                                       checkpoint['output_size'],
                                       checkpoint['layer_spec']).to(device)
        self.qnetwork_local.load_state_dict(checkpoint['state_dict'])

        # The target network must match the new local network, otherwise
        # soft_update() would blend weights of two unrelated models.
        self.qnetwork_target = QNetwork(checkpoint['input_size'],
                                        checkpoint['output_size'],
                                        checkpoint['layer_spec']).to(device)
        self.qnetwork_target.load_state_dict(checkpoint['state_dict'])

        # The previous optimizer still points at the discarded network's
        # parameters; bind a new one to the network that was just loaded.
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)
        print('Checkpoint successfully loaded.')

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

예제 #16

파일 보기

파일: dqn_agent.py 프로젝트: aboerzel/udacity-deep-reinforcement-learning-p1-navigation

class Agent:
    """Interacts with and learns from the environment."""
    def __init__(self,
                 state_size,
                 action_size,
                 seed,
                 double_dqn=False,
                 dueling_network=False,
                 prioritized_replay=False):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            double_dqn (bool): use Double DQN method
            dueling_network (bool): use Dueling Network
            prioritized_replay (bool): use Prioritized Replay Buffer
        """
        self.state_size = state_size
        self.action_size = action_size
        self.dueling_network = dueling_network
        self.double_dqn = double_dqn
        self.prioritized_replay = prioritized_replay

        random.seed(seed)

        # Q-Network
        self.hidden_layers = [128, 32]

        if self.dueling_network:
            self.hidden_state_value_layers = [64, 32]

            self.qnetwork_local = DuelingQNetwork(
                state_size, action_size, seed, self.hidden_layers,
                self.hidden_state_value_layers).to(device)
            self.qnetwork_target = DuelingQNetwork(
                state_size, action_size, seed, self.hidden_layers,
                self.hidden_state_value_layers).to(device)
            self.qnetwork_target.eval()
        else:
            self.qnetwork_local = QNetwork(state_size, action_size, seed,
                                           self.hidden_layers).to(device)
            self.qnetwork_target = QNetwork(state_size, action_size, seed,
                                            self.hidden_layers).to(device)
            self.qnetwork_target.eval()

        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)
        self.lr_scheduler = optim.lr_scheduler.ExponentialLR(
            self.optimizer, LR_DECAY)

        # Replay memory
        if prioritized_replay:
            self.memory = PrioritizedReplayBuffer(action_size,
                                                  BUFFER_SIZE,
                                                  BATCH_SIZE,
                                                  seed,
                                                  device,
                                                  alpha=0.6,
                                                  beta=0.4,
                                                  beta_scheduler=1.0)
        else:
            self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                       seed, device)

        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def load(self, filepath):
        """Load weights from `filepath` into the local network and set it to eval mode."""
        # map_location lets weights saved on another device (e.g. a GPU)
        # load onto whatever `device` is in use here.
        state_dict = torch.load(filepath, map_location=device)
        self.qnetwork_local.load_state_dict(state_dict)
        self.qnetwork_local.eval()

    def save(self, filepath):
        """Save the local network's weights to `filepath`."""
        torch.save(self.qnetwork_local.state_dict(), filepath)

    def step(self, state, action, reward, next_state, done):
        """Store one transition and, every UPDATE_EVERY calls, learn from a batch."""
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        # Epsilon-greedy action selection
        if random.random() >= eps:
            state = torch.from_numpy(state).float().unsqueeze(0).to(device)

            # Evaluate without gradients, then restore train mode.
            self.qnetwork_local.eval()
            with torch.no_grad():
                action_values = self.qnetwork_local(state)
            self.qnetwork_local.train()

            return np.argmax(action_values.cpu().data.numpy()).astype(int)

        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done, w) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones, w = experiences

        with torch.no_grad():
            # Use of Double DQN method
            if self.double_dqn:
                # Select the greedy actions (maximum Q target for next states) from local model
                greedy_actions = self.qnetwork_local(next_states).max(
                    dim=1, keepdim=True)[1]

                # Get the Q targets (for next states) for the greedy actions from target model
                q_targets_next = self.qnetwork_target(next_states).gather(
                    1, greedy_actions)

            # Use of Fixed Q-Target
            else:
                # Get max predicted Q values (for next states) from target model
                q_targets_next = self.qnetwork_target(next_states).max(
                    dim=1, keepdim=True)[0]

        # Compute Q targets for current states
        q_targets = rewards + (gamma * q_targets_next * (1 - dones))

        # Get expected Q values from local model
        q_expected = self.qnetwork_local(states).gather(
            1, actions)  # shape: [batch_size, 1]

        # Compute loss
        if self.prioritized_replay:
            # Use separate tensors for the TD error, the priorities and the
            # loss. Mutating one tensor in place for all three lets the
            # square root taken for the priorities leak into the loss value
            # and hands the replay buffer a tensor that is rescaled later.
            td_error = (q_targets - q_expected).squeeze()

            # New priorities are |TD error|, detached from the graph so the
            # buffer never holds on to autograd state.
            self.memory.update_priorities(td_error.detach().abs())

            # Importance-weighted squared error.
            # assumes w broadcasts element-wise against the squeezed TD
            # errors (e.g. shape [batch_size]) -- TODO confirm
            loss = (td_error.pow(2) * w).mean()
        else:
            loss = F.mse_loss(q_expected, q_targets)

        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        self.lr_scheduler.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    @staticmethod
    def soft_update(local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

예제 #17

파일 보기

class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        # random.seed() returns None, so seed the generator and store the
        # seed value itself rather than the call's result.
        random.seed(seed)
        self.seed = seed

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        """Store one transition and, every UPDATE_EVERY calls, learn from a batch."""
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        # Evaluate without gradients, then put the network back in train mode.
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Get max predicted Q values (for next states) from target model
        Q_targets_next = self.qnetwork_target(next_states).detach().max(
            1)[0].unsqueeze(1)
        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def dqn(self,
            env,
            brain_name,
            n_episodes=2000,
            max_t=1000,
            eps_start=1.0,
            eps_end=0.01,
            eps_decay=0.995):
        """Deep Q-Learning.

        Trains until the mean of the last 100 episode scores reaches 13.0 or
        `n_episodes` have run; on reaching 13.0 the local network's weights
        are saved to 'checkpoint.pth'.

        Params
        ======
            env: environment whose reset()/step() results are indexed by
                `brain_name`
            brain_name: key selecting this agent's info in the env results
            n_episodes (int): maximum number of training episodes
            max_t (int): maximum number of timesteps per episode
            eps_start (float): starting value of epsilon, for epsilon-greedy action selection
            eps_end (float): minimum value of epsilon
            eps_decay (float): multiplicative factor (per episode) for decreasing epsilon

        Returns
        =======
            list with the score of every episode played
        """
        scores = []  # list containing scores from each episode
        scores_window = deque(maxlen=100)  # last 100 scores
        eps = eps_start  # initialize epsilon
        for i_episode in range(1, n_episodes + 1):
            env_info = env.reset(
                train_mode=False)[brain_name]  # reset the environment
            state = env_info.vector_observations[0]  # get the current state
            score = 0  # reset the score
            for t in range(max_t):
                action = self.act(state, eps).astype(
                    int)  # choose action based on epsilon-greedy policy
                env_info = env.step(action)[
                    brain_name]  # send the action to the environment
                next_state = env_info.vector_observations[
                    0]  # get the next state
                reward = env_info.rewards[0]  # get the reward
                done = env_info.local_done[0]  # see if episode has finished
                self.step(state, action, reward, next_state,
                          done)  # make the agent take a step
                state = next_state  # update the state
                score += reward  # add the reward to the score
                if done:  # (if done)
                    break  # end episode
            scores_window.append(score)  # save most recent score
            scores.append(score)  # save most recent score
            eps = max(eps_end, eps_decay * eps)  # decrease epsilon
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(
                i_episode, np.mean(scores_window)),
                  end="")
            if i_episode % 100 == 0:
                print('\rEpisode {}\tAverage Score: {:.2f}'.format(
                    i_episode, np.mean(scores_window)))
            if np.mean(scores_window) >= 13.0:
                print(
                    '\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                    .format(i_episode - 100, np.mean(scores_window)))
                torch.save(self.qnetwork_local.state_dict(), 'checkpoint.pth')
                break
        return scores

    def test(self, env, brain_name):
        """Load 'checkpoint.pth' and play one episode with eps=0.

        Params
        ======
            env: environment whose reset()/step() results are indexed by
                `brain_name`
            brain_name: key selecting this agent's info in the env results

        Returns
        =======
            total reward collected during the episode
        """
        # map_location lets a checkpoint saved on another device (e.g. a GPU)
        # load onto whatever `device` is in use here.
        self.qnetwork_local.load_state_dict(
            torch.load('checkpoint.pth', map_location=device))
        env_info = env.reset(
            train_mode=False)[brain_name]  # reset the environment
        state = env_info.vector_observations[0]  # get the current state
        score = 0  # initialize the score
        while True:
            action = self.act(state).astype(int)  # select an action
            env_info = env.step(action)[
                brain_name]  # send the action to the environment
            next_state = env_info.vector_observations[0]  # get the next state
            reward = env_info.rewards[0]  # get the reward
            done = env_info.local_done[0]  # see if episode has finished
            score += reward  # update the score
            state = next_state  # roll over the state to next time step
            if done:  # exit loop if episode finished
                break

        return score

예제 #18

파일 보기

파일: agent.py 프로젝트: salvioli/deep-monkey

class Agent:
    """Base DQN agent; subclasses supply the target computation in `_q_target`."""

    def __init__(self,
                 state_size,
                 action_size,
                 seed,
                 fc1_size=64,
                 fc2_size=64,
                 checkpoint_filename=''):
        """
        Initializes an agent object
        TODO make the structure of the qfunction approximator more flexible
        :param state_size: dimension of each state
        :param action_size: dimension of each action
        :param seed: random seed
        :param fc1_size: number of units of the first fully connected layer of the q function approximator
        :param fc2_size: number of units of the second fully connected layer of the q function approximator
        :param checkpoint_filename: name of the checkpoint file which contains the load_state_dict pickled
                                    weights of the q function approximator. An empty value (or None)
                                    skips loading.
        """

        self.state_size = state_size
        self.action_size = action_size
        # random.seed() returns None, so seed the generator and store the
        # seed value itself rather than the call's result.
        random.seed(seed)
        self.seed = seed

        self.fc1_size = fc1_size
        self.fc2_size = fc2_size

        self.BUFFER_SIZE = int(1e5)  # replay buffer size
        self.BATCH_SIZE = 64  # minibatch size
        self.GAMMA = 0.99  # discount factor
        self.TAU = 1e-3  # for soft update of target parameters
        self.LR = 5e-4  # learning rate
        self.UPDATE_EVERY = 4  # how often to update the network

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, seed, fc1_size,
                                       fc2_size).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed,
                                        fc1_size, fc2_size).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(),
                                    lr=self.LR)
        self.criterion = torch.nn.MSELoss()

        # Replay memory
        self.memory = ReplayBuffer(action_size, self.BUFFER_SIZE,
                                   self.BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

        # Truthiness check so that None is treated like '' instead of being
        # handed to torch.load.
        if checkpoint_filename:
            # map_location lets weights saved on another device (e.g. a GPU)
            # load onto whatever `device` is in use here.
            # NOTE(review): only the local network is loaded; the target
            # network keeps its initial weights -- confirm this is intended.
            self.qnetwork_local.load_state_dict(
                torch.load(checkpoint_filename, map_location=device))

    def step(self, state, action, reward, next_state, done):
        """Store one transition and, every UPDATE_EVERY calls, learn from a batch."""
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % self.UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > self.BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, self.GAMMA)

    def act(self, state, eps=0.):
        """
        Returns actions for given state as per current policy
        :param state: (array_like) current state
        :param eps: (float) epsilon, for epsilon-greedy action selection
        """

        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        # Evaluate without gradients, then put the network back in train mode.
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """
        Update value parameters using given batch of experience tuples.
        :param experiences: (Tuple[torch.Variable]) tuple of (s, a, r, s', done) tuples
        :param gamma: (float) discount factor
        """

        y = self._q_target(experiences, gamma)
        y_pred = self._q_estimated(experiences, gamma)

        loss = self.criterion(y_pred, y)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, self.TAU)

    def soft_update(self, local_model, target_model, tau):
        """
        Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        :param local_model: (PyTorch model) weights will be copied from
        :param target_model: (PyTorch model) weights will be copied to
        :param tau: interpolation parameter
        :return:
        """

        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def _q_target(self, experiences, gamma):
        """Method that calculates the target q value used for training"""
        raise NotImplementedError

    def _q_estimated(self, experiences, gamma):
        """Method that calculates the estimated q value used for training"""
        states, actions, rewards, next_states, dones = experiences
        # feedforward the local network
        return self.qnetwork_local(states).gather(1, actions)

예제 #19

파일 보기

def main(args):
    """Train a Q-network on ``args.env``, then run four episodes through a ``Monitor`` wrapper.

    Attributes read from ``args``: ``env``, ``base_path``, ``lr``, ``memory``,
    ``max_steps``, ``gamma``, ``train_every``, ``batch_size`` and
    ``update_every``.
    """
    env = gym.make(args.env)
    if 'MiniGrid' in args.env:
        env = ImgObsWrapper(env)
    path = args.base_path + args.env
    os.makedirs(path, exist_ok=True)
    # obs_shape = np.prod(env.observation_space.shape).astype(int)
    obs_shape = env.observation_space.shape
    act_shape = env.action_space.n

    q = QNetwork(obs_shape, act_shape)
    q_target = QNetwork(obs_shape, act_shape)
    opt = optim.Adam(lr=args.lr, params=q.parameters())
    memory = Memory(capacity=args.memory)
    scheduler = LinearSchedule(schedule_timesteps=int(args.max_steps * 0.1), final_p=0.01)

    # Rolling windows over the last 40 finished episodes (used for logging only).
    avg_rw = deque(maxlen=40)
    avg_len = deque(maxlen=40)

    def get_action(s, t):
        """Pick an epsilon-greedy action for observation ``s`` at step ``t``."""
        s = torch.Tensor(s[None,:])
        # Action selection never back-propagates, so skip building the graph.
        with torch.no_grad():
            _q = q(s)
        if np.random.sample() > scheduler.value:
            best_action = np.argmax(_q.detach(), axis=-1).item()
        else:
            best_action = np.random.randint(0, act_shape)
            # NOTE(review): the schedule is only refreshed on exploratory
            # steps; confirm this is intended rather than an indentation slip.
            scheduler.update(t)
        return best_action

    def train(batch):
        """Take one gradient step on ``q`` using a sampled batch of transitions."""
        batch = Transition(*zip(*batch))
        s = torch.Tensor(batch.state)
        a = torch.Tensor(one_hot(np.array(batch.action), num_classes=act_shape))
        r = torch.Tensor(batch.reward)
        d = torch.Tensor(batch.done)
        s1 = torch.Tensor(batch.next_state)

        # Q(s, a) for the action actually taken (one-hot mask, then sum).
        value = (q(s) * a).sum(dim=-1)
        # The bootstrap target is a constant w.r.t. the optimised parameters:
        # compute it without autograd so no graph is built through q_target
        # and no stale gradients accumulate on its parameters.
        with torch.no_grad():
            next_value = r + args.gamma * (1. - d) * torch.max(q_target(s1), dim=-1)[0]
        loss = (.5 * (next_value - value) ** 2).mean()
        opt.zero_grad()
        loss.backward()
        opt.step()

    state = env.reset()

    # Start with the target network identical to the online network.
    q_target.load_state_dict(q.state_dict())

    ep_rw = 0
    ep_len = 0
    ep = 0
    for t in range(args.max_steps):
        action = get_action(state, t)
        next_state, reward, done, _ = env.step(action)
        memory.push(state, action, next_state, reward, done)
        ep_rw += reward
        ep_len += 1

        state = next_state.copy()
        if done:
            ep += 1
            avg_rw.append(ep_rw)
            avg_len.append(ep_len)
            ep_rw = 0
            ep_len = 0
            state = env.reset()

        if t % args.train_every == 0 and len(memory) > args.batch_size:
            batch = memory.sample(batch_size=args.batch_size)
            train(batch)

        if t % args.update_every == 0:
            # Hard update of the target network, plus a progress line.
            q_target.load_state_dict(q.state_dict())
            print(f't:{t}\tep:{ep}\tavg_rw:{np.mean(avg_rw)}\tavg_len:{np.mean(avg_len)}\teps:{scheduler.value}')

    env = Monitor(env, directory=path)

    # NOTE(review): get_action(s, t=0) may call scheduler.update(0) on an
    # exploratory draw; depending on LinearSchedule this could reset epsilon
    # during these rollouts -- verify.
    for ep in range(4):
        s = env.reset()
        while True:
            a = get_action(s, t=0)
            s1, r, d, _ = env.step(a)
            s = s1.copy()
            if d:
                break

    # Close the monitored environment so the wrapper can release its resources.
    env.close()

예제 #20

파일 보기

# Draw every obstacle as a red square marker.
for obstacle_x, obstacle_y in zip(env.OBSTACLE_X, env.OBSTACLE_Y):
    plt.plot(obstacle_x,
             obstacle_y,
             marker="s",
             color="red",
             markersize=22)

index = 0
eps = 0
# NOTE(review): the loop variable is unused and `index` stays 0, so the same
# checkpoint file is loaded and replayed on both iterations -- confirm intent.
for i in range(3, 5):

    model = QNetwork(state_size=(len(OBSTACLE_X) + 1) * 2,
                     action_size=81,
                     seed=0)

    model.load_state_dict(
        torch.load('dqn_models{}checkpoint{}.pth'.format('/', index)))
    state = env.reset()
    env.render()
    time.sleep(5)
    for t in range(max_t):

        state = torch.from_numpy(state).float().unsqueeze(0)
        # Pure inference: no gradients are needed for action selection.
        with torch.no_grad():
            action_values = model(state)
        # Epsilon-greedy action selection
        if random.random() > eps:
            action = np.argmax(action_values.cpu().data.numpy())
        else:
            action = random.choice(np.arange(model.action_size))

        next_state, _, done, _ = env.step(action)
        state = next_state
        # Stop once the episode has ended instead of stepping a finished env.
        if done:
            break

예제 #21

파일 보기

class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed=SEED, batch_size=BATCH_SIZE,
                 buffer_size=BUFFER_SIZE, start_since=START_SINCE, gamma=GAMMA, target_update_every=T_UPDATE,
                 tau=TAU, lr=LR, weight_decay=WEIGHT_DECAY, update_every=UPDATE_EVERY, priority_eps=P_EPS,
                 a=A, initial_beta=INIT_BETA, n_multisteps=N_STEPS, clip=CLIP, initial_sigma=INIT_SIGMA, linear_type=LINEAR, **kwds):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            batch_size (int): size of each sample batch
            buffer_size (int): size of the experience memory buffer
            start_since (int): number of steps to collect before start training
            gamma (float): discount factor
            target_update_every (int): how often to update the target network
            tau (float): target network soft-update parameter
            lr (float): learning rate
            weight_decay (float): weight decay for optimizer
            update_every (int): update(learning and target update) interval
            priority_eps (float): small base value for priorities
            a (float): priority exponent parameter
            initial_beta (float): initial importance-sampling weight
            n_multisteps (int): number of steps to consider for each experience
            clip (float): gradient norm clipping (`None` to disable)
            initial_sigma (float): initial noise parameter weights
            linear_type (str): one of ('linear', 'noisy'); type of linear layer to use
            **kwds: unrecognised keyword arguments; they are reported and ignored
        """
        if kwds:
            print("Ignored keyword arguments: ", end='')
            print(*kwds, sep=', ')
        # NOTE: these checks are plain asserts and therefore vanish under `python -O`.
        assert isinstance(state_size, int)
        assert isinstance(action_size, int)
        assert isinstance(seed, int)
        assert isinstance(batch_size, int) and batch_size > 0
        assert isinstance(buffer_size, int) and buffer_size >= batch_size
        assert isinstance(start_since, int) and batch_size <= start_since <= buffer_size
        assert isinstance(gamma, (int, float)) and 0 <= gamma <= 1
        assert isinstance(target_update_every, int) and target_update_every > 0
        assert isinstance(tau, (int, float)) and 0 <= tau <= 1
        assert isinstance(lr, (int, float)) and lr >= 0
        assert isinstance(weight_decay, (int, float)) and weight_decay >= 0
        assert isinstance(update_every, int) and update_every > 0
        assert isinstance(priority_eps, (int, float)) and priority_eps >= 0
        assert isinstance(a, (int, float)) and 0 <= a <= 1
        assert isinstance(initial_beta, (int, float)) and 0 <= initial_beta <= 1
        assert isinstance(n_multisteps, int) and n_multisteps > 0
        if clip: assert isinstance(clip, (int, float)) and clip >= 0
        assert isinstance(initial_sigma, (int, float)) and initial_sigma >= 0
        assert isinstance(linear_type, str) and linear_type.strip().lower() in ('linear', 'noisy')

        # random.seed() returns None, so seed the RNG and keep the value separately.
        random.seed(seed)

        self.state_size          = state_size
        self.action_size         = action_size
        self.seed                = seed
        self.batch_size          = batch_size
        self.buffer_size         = buffer_size
        self.start_since         = start_since
        self.gamma               = gamma
        self.target_update_every = target_update_every
        self.tau                 = tau
        self.lr                  = lr
        self.weight_decay        = weight_decay
        self.update_every        = update_every
        self.priority_eps        = priority_eps
        self.a                   = a
        self.beta                = initial_beta
        self.n_multisteps        = n_multisteps
        self.clip                = clip
        self.initial_sigma       = initial_sigma
        self.linear_type         = linear_type.strip().lower()

        # Q-Network: the target starts as an exact copy of the local network.
        self.qnetwork_local  = QNetwork(state_size, action_size, linear_type, initial_sigma, seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, linear_type, initial_sigma, seed).to(device)
        self.qnetwork_target.load_state_dict(self.qnetwork_local.state_dict())

        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=lr, weight_decay=weight_decay)

        # Replay memory
        self.memory = ReplayBuffer(action_size, buffer_size, batch_size, n_multisteps, gamma, a, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps and TARGET_UPDATE_EVERY steps)
        self.u_step = 0
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        """Store one transition, then learn and/or update the target net when due."""
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.u_step = (self.u_step + 1) % self.update_every
        if self.u_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) >= self.start_since:
                experiences, target_discount, is_weights, indices = self.memory.sample(self.beta)
                new_priorities = self.learn(experiences, is_weights, target_discount)
                self.memory.update_priorities(indices, new_priorities)

        # update the target network every TARGET_UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % self.target_update_every
        if self.t_step == 0:
            self.soft_update(self.qnetwork_local, self.qnetwork_target, self.tau)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        # Evaluate in eval mode without gradients, then restore train mode.
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        return random.choice(np.arange(self.action_size))

    def learn(self, experiences, is_weights, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            is_weights (torch.Tensor): tensor of importance-sampling weights
            gamma (float): discount factor for the target max-Q value

        Returns
        =======
            new_priorities (numpy.ndarray): flat array of new priority values for the given sample
        """
        states, actions, rewards, next_states, dones = experiences

        with torch.no_grad():
            # Double-DQN target: the local net picks the action, the target net scores it.
            target_q_next = self.qnetwork_target(next_states)
            greedy_actions = self.qnetwork_local(next_states).argmax(dim=1, keepdim=True)
            target = rewards + gamma * (1 - dones) * target_q_next.gather(dim=1, index=greedy_actions)

        pred = self.qnetwork_local(states)

        diff = target.sub(pred.gather(dim=1, index=actions))
        # Use the instance's configured epsilon, not the module-level default.
        new_priorities = diff.detach().abs().add(self.priority_eps).cpu().numpy().reshape((-1,))
        # Importance-sampling-weighted squared TD error.
        loss = diff.pow(2).mul(is_weights).mean()

        self.optimizer.zero_grad()
        loss.backward()
        if self.clip:
            # Clip with the threshold given to the constructor, not the global CLIP.
            torch.nn.utils.clip_grad_norm_(self.qnetwork_local.parameters(), self.clip)
        self.optimizer.step()

        return new_priorities

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)

예제 #22

파일 보기

파일: dqn.py 프로젝트: xuezzee/-

class Agent:
    """DQN / double-DQN agent shared by ``num_agents`` handles, with checkpointing."""

    def __init__(self, state_size, action_size, num_agents, double_dqn=False):
        """Build the local/target networks, optimizer, LR scheduler and replay memory.

        Args:
            state_size: forwarded to ``QNetwork``.
            action_size: number of discrete actions.
            num_agents: number of handles tracked in ``self.finished``.
            double_dqn: use the double-DQN target in ``learn`` when true.
        """
        self.action_size = action_size
        self.double_dqn = double_dqn

        # Q-Network: the target starts as a deep copy of the local network.
        self.qnetwork_local = QNetwork(state_size, action_size).to(device)
        self.qnetwork_target = copy.deepcopy(self.qnetwork_local)
        self.optimizer = torch.optim.Adam(self.qnetwork_local.parameters(), lr=LR)
        self.lr_scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=4000, gamma=0.98, last_epoch=-1)

        # Replay memory
        self.memory = ReplayBuffer(BUFFER_SIZE)
        self.num_agents = num_agents
        self.t_step = 0
        # Created here as well so step() works even if reset() was never called.
        self.finished = [False] * num_agents

    def reset(self):
        """Mark every agent handle as not finished."""
        self.finished = [False] * self.num_agents

    # Decide on an action to take in the environment

    def act(self, state, eps=0.):
        """Return an epsilon-greedy action index for ``state``."""
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        # learn() switches the network back to train mode before each update.
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)

        # Epsilon-greedy action selection
        if random.random() > eps:
            return torch.argmax(action_values).item()
        return torch.randint(self.action_size, ()).item()

    # Record the results of the agent's action and update the model

    def step(self, handle, state, action, reward, next_state, agent_done):
        """Store the transition for ``handle`` (unless it already finished) and learn when due."""
        if not self.finished[handle]:
            # Save experience in replay memory
            self.memory.push(state, action, reward, next_state, agent_done)
            self.finished[handle] = agent_done

        # Perform a gradient update every UPDATE_EVERY time steps
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0 and len(self.memory) > BATCH_SIZE * 1: # 320
            self.learn(*self.memory.sample(BATCH_SIZE, device))

    def learn(self, states, actions, rewards, next_states, dones):
        """Take one gradient step on the local network and soft-update the target."""
        self.qnetwork_local.train()

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Targets are constants for the optimiser, so build them without
        # autograd in both branches (previously only the vanilla branch
        # detached its result).
        with torch.no_grad():
            if self.double_dqn:
                # Local net selects the action, target net evaluates it.
                Q_best_action = self.qnetwork_local(next_states).argmax(1)
                Q_targets_next = self.qnetwork_target(next_states).gather(1, Q_best_action.unsqueeze(-1))
            else:
                Q_targets_next = self.qnetwork_target(next_states).max(1)[0].unsqueeze(-1)

            # Compute Q targets for current states
            Q_targets = rewards + GAMMA * Q_targets_next * (1 - dones)

        # Compute loss and perform a gradient step
        self.optimizer.zero_grad()
        loss = F.mse_loss(Q_expected, Q_targets)
        loss.backward()
        self.optimizer.step()
        self.lr_scheduler.step()

        # Update the target network parameters to `tau * local.parameters() + (1 - tau) * target.parameters()`
        for target_param, local_param in zip(self.qnetwork_target.parameters(), self.qnetwork_local.parameters()):
            target_param.data.copy_(TAU * local_param.data + (1.0 - TAU) * target_param.data)

    # Checkpointing methods

    def save(self, path, *data):
        """Write both networks, the optimizer state and the pickled ``data`` tuple under ``path``."""
        torch.save(self.qnetwork_local.state_dict(), path / 'model_checkpoint.local')
        torch.save(self.qnetwork_target.state_dict(), path / 'model_checkpoint.target')
        torch.save(self.optimizer.state_dict(), path / 'model_checkpoint.optimizer')
        with open(path / 'model_checkpoint.meta', 'wb') as file:
            pickle.dump(data, file)

    def load(self, path, *defaults):
        """Restore a checkpoint written by ``save``.

        Returns the unpickled metadata, or ``defaults`` when a checkpoint file
        is missing. Any other failure (corrupt file, mismatched shapes, ...)
        now propagates instead of being reported as a missing checkpoint.
        """
        # SECURITY: torch.load / pickle.load can execute arbitrary code; only
        # load checkpoints from trusted locations.
        try:
            print("Loading model from checkpoint...")
            self.qnetwork_local.load_state_dict(torch.load(path / 'model_checkpoint.local'))
            self.qnetwork_target.load_state_dict(torch.load(path / 'model_checkpoint.target'))
            self.optimizer.load_state_dict(torch.load(path / 'model_checkpoint.optimizer'))
            with open(path / 'model_checkpoint.meta', 'rb') as file:
                return pickle.load(file)
        except FileNotFoundError:
            print("No checkpoint file was found")
            return defaults

예제 #23

파일 보기

class SAC(object):
    """Soft Actor-Critic agent with a twin-Q critic and a Gaussian or deterministic policy.

    NOTE(review): the non-Gaussian path looks incomplete -- ``self.alpha`` and
    ``self.value`` are only created for the Gaussian policy, yet
    ``update_parameters`` / ``save_model`` / ``load_model`` use them
    unconditionally, and ``policy.sample`` is unpacked into five values there
    but three elsewhere. Left untouched because the fix depends on
    ``DeterministicPolicy``; confirm before relying on that path.
    """

    def __init__(self, num_inputs, action_space, args):
        """Create networks and optimizers.

        Reads ``gamma``, ``tau``, ``policy``, ``target_update_interval``,
        ``automatic_entropy_tuning``, ``cuda``, ``hidden_size``, ``lr`` and
        (Gaussian only) ``alpha`` from ``args``.
        """
        self.num_inputs = num_inputs
        self.action_space = action_space.shape[0]
        self.gamma = args.gamma
        self.tau = args.tau

        self.policy_type = args.policy
        self.target_update_interval = args.target_update_interval
        self.automatic_entropy_tuning = args.automatic_entropy_tuning

        self.device = torch.device("cuda" if args.cuda else "cpu")

        self.critic = QNetwork(self.num_inputs, self.action_space,
                               args.hidden_size).to(device=self.device)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.lr)

        if self.policy_type == "Gaussian":
            self.alpha = args.alpha
            # Target Entropy = −dim(A) (e.g. , -6 for HalfCheetah-v2) as given in the paper
            # Truthiness test, matching the check used in update_parameters.
            if self.automatic_entropy_tuning:
                self.target_entropy = -torch.prod(
                    torch.Tensor(action_space.shape).to(self.device)).item()
                self.log_alpha = torch.zeros(1,
                                             requires_grad=True,
                                             device=self.device)
                self.alpha_optim = Adam([self.log_alpha], lr=args.lr)

            self.policy = GaussianPolicy(self.num_inputs, self.action_space,
                                         args.hidden_size).to(self.device)
            self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)

            self.value = ValueNetwork(self.num_inputs,
                                      args.hidden_size).to(self.device)
            self.value_target = ValueNetwork(self.num_inputs,
                                             args.hidden_size).to(self.device)
            self.value_optim = Adam(self.value.parameters(), lr=args.lr)
            hard_update(self.value_target, self.value)
        else:
            self.policy = DeterministicPolicy(self.num_inputs,
                                              self.action_space,
                                              args.hidden_size).to(self.device)
            self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)

            self.critic_target = QNetwork(self.num_inputs, self.action_space,
                                          args.hidden_size).to(self.device)
            hard_update(self.critic_target, self.critic)

    def select_action(self, state, eval=False):
        """Return an action for a single ``state`` as a NumPy array.

        With ``eval`` false the first element returned by ``policy.sample`` is
        used (policy in train mode); otherwise the third element is used
        (policy in eval mode).
        """
        state = torch.FloatTensor(state).to(self.device).unsqueeze(0)
        if not eval:
            self.policy.train()
            action, _, _ = self.policy.sample(state)
        else:
            self.policy.eval()
            _, _, action = self.policy.sample(state)
        action = action.detach().cpu().numpy()
        return action[0]

    def update_parameters(self, state_batch, action_batch, reward_batch,
                          next_state_batch, mask_batch, updates):
        """Run one update of alpha (optional), policy, critic and value networks.

        Returns:
            tuple of floats: ``(value_loss, qf1_loss, qf2_loss, policy_loss,
            alpha_loss, alpha)``.
        """
        state_batch = torch.FloatTensor(state_batch).to(self.device)
        next_state_batch = torch.FloatTensor(next_state_batch).to(self.device)
        action_batch = torch.FloatTensor(action_batch).to(self.device)
        reward_batch = torch.FloatTensor(reward_batch).to(
            self.device).unsqueeze(1)
        mask_batch = torch.FloatTensor(mask_batch).to(self.device).unsqueeze(1)

        qf1, qf2 = self.critic(
            state_batch, action_batch
        )  # Two Q-functions to mitigate positive bias in the policy improvement step
        pi, log_pi, _ = self.policy.sample(state_batch)

        if self.policy_type == "Gaussian":
            if self.automatic_entropy_tuning:
                alpha_loss = -(self.log_alpha *
                               (log_pi + self.target_entropy).detach()).mean()
                self.alpha_optim.zero_grad()
                alpha_loss.backward()
                self.alpha_optim.step()
                self.alpha = self.log_alpha.exp()
                # self.alpha is a tensor here; copy it the supported way
                # instead of torch.tensor(tensor).
                alpha_logs = self.alpha.clone().detach()  # For TensorboardX logs
            else:
                alpha_loss = torch.tensor(0.).to(self.device)
                alpha_logs = torch.tensor(self.alpha)  # For TensorboardX logs

            vf = self.value(
                state_batch
            )  # separate function approximator for the soft value can stabilize training.
            with torch.no_grad():
                vf_next_target = self.value_target(next_state_batch)
                next_q_value = reward_batch + mask_batch * self.gamma * (
                    vf_next_target)
        else:
            alpha_loss = torch.tensor(0.).to(self.device)
            # NOTE(review): self.alpha is never assigned for this policy type
            # in __init__ -- this line looks like it raises AttributeError.
            alpha_logs = self.alpha  # For TensorboardX logs
            with torch.no_grad():
                # NOTE(review): five values are unpacked here but three
                # everywhere else -- verify against DeterministicPolicy.sample.
                next_state_action, _, _, _, _, = self.policy.sample(
                    next_state_batch)
                # Use a target critic network for deterministic policy and eradicate the value value network completely.
                qf1_next_target, qf2_next_target = self.critic_target(
                    next_state_batch, next_state_action)
                min_qf_next_target = torch.min(qf1_next_target,
                                               qf2_next_target)
                next_q_value = reward_batch + mask_batch * self.gamma * (
                    min_qf_next_target)

        qf1_loss = F.mse_loss(
            qf1, next_q_value
        )  # JQ = 𝔼(st,at)~D[0.5(Q1(st,at) - r(st,at) - γ(𝔼st+1~p[V(st+1)]))^2]
        qf2_loss = F.mse_loss(
            qf2, next_q_value
        )  # JQ = 𝔼(st,at)~D[0.5(Q1(st,at) - r(st,at) - γ(𝔼st+1~p[V(st+1)]))^2]
        qf1_pi, qf2_pi = self.critic(state_batch, pi)
        min_qf_pi = torch.min(qf1_pi, qf2_pi)

        if self.policy_type == "Gaussian":
            vf_target = min_qf_pi - (self.alpha * log_pi)
            value_loss = F.mse_loss(
                vf, vf_target.detach()
            )  # JV = 𝔼st~D[0.5(V(st) - (𝔼at~π[Qmin(st,at) - α * log π(at|st)]))^2]

        policy_loss = ((self.alpha * log_pi) - min_qf_pi).mean(
        )  # Jπ = 𝔼st∼D,εt∼N[α * logπ(f(εt;st)|st) − Q(st,f(εt;st))]

        # Regularization Loss
        # mean_loss = 0.001 * mean.pow(2).mean()
        # std_loss = 0.001 * log_std.pow(2).mean()

        # policy_loss += mean_loss + std_loss

        # The policy loss back-propagates through the critic, so it must run
        # BEFORE the critic's weights are modified in place by its optimizer
        # (otherwise autograd raises on PyTorch >= 1.5). The gradients it
        # leaves on the critic are cleared by critic_optim.zero_grad() below.
        self.policy_optim.zero_grad()
        policy_loss.backward()
        self.policy_optim.step()

        # One step on the summed critic loss instead of two separate steps,
        # so neither Q-head is back-propagated after an optimizer step.
        qf_loss = qf1_loss + qf2_loss
        self.critic_optim.zero_grad()
        qf_loss.backward()
        self.critic_optim.step()

        if self.policy_type == "Gaussian":
            self.value_optim.zero_grad()
            value_loss.backward()
            self.value_optim.step()
        else:
            value_loss = torch.tensor(0.).to(self.device)

        # We update the target weights to match the current value function
        # weights periodically: every `target_update_interval` updates.
        if updates % self.target_update_interval == 0 and self.policy_type == "Deterministic":
            soft_update(self.critic_target, self.critic, self.tau)

        elif updates % self.target_update_interval == 0 and self.policy_type == "Gaussian":
            soft_update(self.value_target, self.value, self.tau)
        return value_loss.item(), qf1_loss.item(), qf2_loss.item(
        ), policy_loss.item(), alpha_loss.item(), alpha_logs.item()

    # Save model parameters
    def save_model(self,
                   env_name,
                   suffix="",
                   actor_path=None,
                   critic_path=None,
                   value_path=None):
        """Save the value, policy and critic state dicts (default paths live under ``models/``)."""
        # exist_ok avoids the check-then-create race of the exists() test.
        os.makedirs('models/', exist_ok=True)

        if actor_path is None:
            actor_path = "models/sac_actor_{}_{}".format(env_name, suffix)
        if critic_path is None:
            critic_path = "models/sac_critic_{}_{}".format(env_name, suffix)
        if value_path is None:
            value_path = "models/sac_value_{}_{}".format(env_name, suffix)
        print('Saving models to {}, {} and {}'.format(actor_path, critic_path,
                                                      value_path))
        # NOTE(review): self.value only exists for the Gaussian policy.
        torch.save(self.value.state_dict(), value_path)
        torch.save(self.policy.state_dict(), actor_path)
        torch.save(self.critic.state_dict(), critic_path)

    # Load model parameters
    def load_model(self, actor_path, critic_path, value_path):
        """Load whichever of the three state dicts has a non-``None`` path."""
        print('Loading models from {}, {} and {}'.format(
            actor_path, critic_path, value_path))
        if actor_path is not None:
            self.policy.load_state_dict(torch.load(actor_path))
        if critic_path is not None:
            self.critic.load_state_dict(torch.load(critic_path))
        if value_path is not None:
            self.value.load_state_dict(torch.load(value_path))

예제 #24

파일 보기

파일: agents.py 프로젝트: es94129/DQN-DDQN-Dueling

class DQN_Agent:
    """DQN agent with an online network (``q_eval``) and a periodically synced target (``q_target``)."""

    def __init__(self, state_size, action_size, seed=42):
        """Create both Q-networks, the RMSprop optimizer and the replay buffer."""
        self.action_size = action_size

        # Q-Network
        self.q_eval = QNetwork(state_size, action_size, seed).to(device)
        self.q_target = QNetwork(state_size, action_size, seed).to(device)
        # Start from identical weights explicitly rather than relying on both
        # networks happening to initialise the same way from `seed`.
        self.q_target.load_state_dict(self.q_eval.state_dict())
        self.optimizer = optim.RMSprop(self.q_eval.parameters(), lr=LR)

        # Replay Buffer
        self.memory = ReplayBuffer(seed=seed)

        self.step_count = 0
        # random.seed() returns None, so seed the RNG and store the value itself.
        random.seed(seed)
        self.seed = seed

    def act(self, state):
        """Return an epsilon-greedy action; epsilon decays exponentially with ``step_count``."""
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.q_eval.eval()
        with torch.no_grad():
            q_values = self.q_eval(state)
        self.q_eval.train()

        epsilon = EPS_END + (EPS_START - EPS_END) * math.exp(-1. * self.step_count / EPS_DECAY)
        if random.random() > epsilon:
            # greedy
            return np.argmax(q_values.cpu().data.numpy())
        else:
            # explore
            return random.choice(np.arange(self.action_size))

    def step(self, state, action, reward, next_state, done):
        """Store a transition and, once enough are buffered, take one learning step.

        Returns the loss as a float, or ``None`` when no update was performed.
        """
        self.memory.push(state, action, reward, next_state, done)

        loss_value = None
        if len(self.memory) >= BATCH_SIZE:
            # sample transitions from replay buffer
            states, actions, rewards, next_states, dones = self.memory.sample()

            #  r                                   if done
            #  r + max_a \gamma Q(s, a; \theta')   if not done
            q_next_values = self.q_target(next_states).detach().max(1)[0].unsqueeze(1)
            q_learning_targets = rewards + GAMMA * q_next_values * (1 - dones)

            # Q(s, a; \theta)
            q_values = self.q_eval(states).gather(1, actions)

            # perform gradient descent on the loss
            loss = F.mse_loss(q_values, q_learning_targets)
            loss_value = loss.item()

            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

            # update target Q-Network
            self.update_target()

        self.step_count += 1
        return loss_value

    def update_target(self):
        """Copy the online weights into the target net every ``UPDATE_TARGET_STEPS`` steps."""
        if self.step_count % UPDATE_TARGET_STEPS == 0:
            self.q_target.load_state_dict(self.q_eval.state_dict())

예제 #25

파일 보기

파일: agent.py 프로젝트: ronrest/deepQ_bananas

class Agent():
    """ Creates an agent that interacts with a Unity-ML Environment
        using a Deep Q-learning model (in pytorch).
    """
    def __init__(self,
                 n_state,
                 n_actions,
                 n_hidden=32,
                 n_layers=2,
                 seed=333,
                 snapshotfile="snapshot.pth"):
        """ Initialize the agent.

        Args:
            n_state     (int):  Number of features that represent the state
            n_actions   (int):  Number of actions available to agent
            n_hidden    (int):  Number of units in hidden neural net layers
            n_layers    (int):  Number of layers for neural network
            seed        (int):  Set the random seed (for reproducibility)
            snapshotfile (str): Filepath to use for saving weights
        """
        self.n_state = n_state
        self.n_actions = n_actions
        # random.seed() returns None, so seed the RNG and store the value itself.
        random.seed(seed)
        self.seed = seed
        self.snapshotfile = snapshotfile

        # Deep Q-Network
        # NOTE(review): `n_hidden` and `n_layers` are accepted but not used;
        # both networks are built with a fixed n_hidden=64. Left as-is so
        # existing snapshots keep loading -- confirm the intended behaviour.
        self.qnetwork_local = QNetwork(n_state, n_actions, seed,
                                       n_hidden=64).to(device)
        self.qnetwork_target = QNetwork(n_state, n_actions, seed,
                                        n_hidden=64).to(device)
        # Start from identical weights explicitly rather than relying on both
        # networks happening to initialise the same way from `seed`.
        self.qnetwork_target.load_state_dict(self.qnetwork_local.state_dict())
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)
        # `reduce=True` is deprecated; reduction='mean' is the equivalent setting.
        self.loss_func = torch.nn.MSELoss(reduction='mean')

        # Experience Replay Memory
        self.memory = ReplayBuffer(n_actions, EXPERIENCE_MEMORY_SIZE,
                                   BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

        # TODO: have the is_training attribute control eval and train
        #       mode in pytorch network
        self.is_training = True

    def memorize_and_learn_step(self, state, action, reward, next_state, done):
        """ Given S, A, R', S' and whether the episode finished, save the
            experience to memory, and occasionally sample from memorized
            experiences and learn from them.
        """
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Once every UPDATE_EVERY steps, randomly sample memories to learn from
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def choose_action(self, state, epsilon=0.0):
        """ Given an environment state, it returns an action using epsilon
            greedy policy.

        Args:
            state   (array_like): current state
            epsilon (float)     : probability of choosing a random action
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():  # no gradient tracking while choosing an action
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > epsilon:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.n_actions))

    def learn(self, experiences, gamma):
        """ Update the weights of the neural network representing the Q values,
            given a batch of experience tuples.

        Args:
            experiences (tuple of torch.Variable): tuple with the following
                        torch tensors
                        (states, actions, rewards, next_states, dones)
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Q_TARGET
        next_logits = self.qnetwork_target(
            next_states).detach()  # no need to calculate gradients, so detach
        q_next = torch.max(next_logits, dim=1, keepdim=True)[0]
        # where dones=1, it  will ignore q_next, and just use current reward
        q_target = rewards + ((1 - dones) * (gamma * q_next))

        # Q_CURRENT - based on action taken in experience
        current_logits = self.qnetwork_local(states)
        q_pred = torch.gather(current_logits, 1, actions)

        # LOSS
        loss = self.loss_func(q_pred, q_target)
        # loss = F.mse_loss(q_pred, q_target)

        # OPTIMIZE WEIGHTS
        self.optimizer.zero_grad()  # zero the parameter gradients
        loss.backward()
        self.optimizer.step()

        # UPDATE TARGET NETWORK
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """ Performs a soft update on the target Q network weights, by
            shifting them slightly towards the local Q network by a factor of
            `tau`.

            θ_target = τ*θ_local + (1 - τ)*θ_target

        Args:
            local_model  (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def snapshot(self, file=None):
        """ Takes a snapshot file of the neural network weights """
        file = self.snapshotfile if file is None else file
        torch.save(self.qnetwork_local.state_dict(), file)

    def load_snapshot(self, file=None):
        """ Loads the neural network weights from a file into both networks """
        file = self.snapshotfile if file is None else file
        # Read the file once; load_state_dict copies the values into each
        # network, so the same dict can safely be used for both.
        state_dict = torch.load(file)
        self.qnetwork_local.load_state_dict(state_dict)
        self.qnetwork_target.load_state_dict(state_dict)

예제 #26

파일 보기

파일: dqn_agent.py 프로젝트: luoyif/WildFireModel

class Agent():
    """
    DQN agent, including:
        DQN hyperparameters
        Local and target state-action value networks
        Replay memory buffer (ReplayBuffer)
    """
    def __init__(self,
                 state_size,
                 action_size,
                 dqn_type='DQN',
                 replay_memory_size=1e5,
                 batch_size=64,
                 gamma=0.99,
                 learning_rate=1e-3,
                 target_tau=2e-3,
                 update_rate=4,
                 seed=0):
        """
        DQN Agent Parameters
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            dqn_type (string): can be either 'DQN' for vanilla DQN learning (default) or 'DDQN' for double-DQN.
            replay_memory_size (int): size of the replay memory buffer (typically 5e4 to 5e6)
            batch_size (int): size of the memory batch used for model updates (typically 32, 64 or 128)
            gamma (float): parameter setting the discounted value of future rewards (typically .95 to .995)
            learning_rate (float): specifies the rate of model learning (typically 1e-4 to 1e-3)
            target_tau (float): interpolation factor used for the soft update of the target network
            update_rate (int): a learning step is attempted every `update_rate` calls to step()
            seed (int): random seed
        """
        self.dqn_type = dqn_type
        self.state_size = state_size
        self.action_size = action_size
        self.buffer_size = int(replay_memory_size)
        self.batch_size = batch_size
        self.gamma = gamma
        self.learn_rate = learning_rate
        self.tau = target_tau
        self.update_rate = update_rate
        # random.seed() returns None, so seed the RNG first and keep the seed
        # value itself on the instance.
        random.seed(seed)
        self.seed = seed

        # DQN Agent Q-Network
        # For DQN training, two neural network models are employed:
        # (a) A network that is updated every (step % update_rate == 0)
        # (b) A target network, with weights updated to follow the network
        #     at a slower (target_tau) rate.
        # The slower modulation of the target network weights is meant to
        # stabilize learning.
        self.network = QNetwork(state_size, action_size, seed).to(device)
        self.target_network = QNetwork(state_size, action_size,
                                       seed).to(device)
        self.optimizer = optim.Adam(self.network.parameters(),
                                    lr=self.learn_rate,
                                    betas=BETAS)

        # Replay memory
        self.memory = ReplayBuffer(action_size, self.buffer_size,
                                   self.batch_size, seed)

        # Initialize time step (for updating every `update_rate` steps)
        self.t_step = 0

    ########################################################
    # STEP() method
    #
    def step(self, state, action, reward, next_state, done, update=True):
        """Store one transition and, every `update_rate` calls, learn from a batch.

        Params
        ======
            state, action, reward, next_state, done: the transition handed to
                the replay memory
            update (bool): when False the batch is still sampled but no
                learning step is performed
        """
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every `update_rate` time steps.
        self.t_step = (self.t_step + 1) % self.update_rate
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > self.batch_size:
                experiences = self.memory.sample()
                if update:
                    self.learn(experiences, self.gamma)

    ########################################################
    # ACT() method
    #
    def act(self, state, eps=0.0):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        # Evaluate in eval mode without tracking gradients, then switch the
        # network back to training mode.
        self.network.eval()
        with torch.no_grad():
            action_values = self.network(state)
        self.network.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    ########################################################
    # LEARN() method
    # Update value parameters using given batch of experience tuples.
    def learn(self, experiences, gamma, DQN=True):
        """Run one optimization step on a batch, then soft-update the target network.

        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
            DQN (bool): unused; the algorithm is selected by `self.dqn_type`
        """

        states, actions, rewards, next_states, dones = experiences

        # Get Q values from current observations (s, a) using the model network
        Qsa = self.network(states).gather(1, actions)

        if self.dqn_type == 'DDQN':
            # Double DQN
            # ************************
            # The online network selects the greedy next action ...
            Qsa_prime_actions = self.network(next_states).detach().max(
                1)[1].unsqueeze(1)
            # ... and the target network evaluates it. gather() picks, for
            # every row, the value of that row's selected action; plain
            # indexing with the action tensor would select rows instead.
            Qsa_prime_targets = self.target_network(
                next_states).detach().gather(1, Qsa_prime_actions)

        else:
            # Regular (Vanilla) DQN
            # ************************
            # Get max Q values for (s',a') from target model
            Qsa_prime_target_values = self.target_network(next_states).detach()
            Qsa_prime_targets = Qsa_prime_target_values.max(1)[0].unsqueeze(1)

        # Compute Q targets for current states; (1 - dones) removes the
        # bootstrap term where done is 1.
        Qsa_targets = rewards + (gamma * Qsa_prime_targets * (1 - dones))

        # Compute loss (error)
        loss = F.mse_loss(Qsa, Qsa_targets)

        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.network, self.target_network, self.tau)

    ########################################################
    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def save_the_model(self, iteration, f_name):
        """Save the online network's state dict under ./save/dqn/.

        The file is named 'dqn_param_<iteration>_<f_name>_model.pth'.
        """
        # exist_ok avoids the check-then-create race of a separate exists() test
        os.makedirs('./save/dqn/', exist_ok=True)
        f_name = 'dqn_param_' + str(iteration) + '_' + f_name + '_model.pth'
        torch.save(self.network.state_dict(), './save/dqn/' + f_name)
        print('DQN Model Saved')

    def load_the_model(self, iteration, f_name):
        """Load the online network's state dict written by save_the_model().

        Only `self.network` is loaded; `self.target_network` is left as is.
        """
        f_path = './save/dqn/dqn_param_' + str(
            iteration) + '_' + f_name + '_model.pth'
        self.network.load_state_dict(torch.load(f_path))
        print('DQN Model Loaded')

예제 #27

파일 보기

class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self,
                 state_size,
                 action_size,
                 seed,
                 duel=True,
                 qnetwork_weights=None):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            duel (bool): forwarded to QNetwork as its `duel` argument
            qnetwork_weights: optional state dict loaded into both the local
                and the target network
        """
        self.state_size = state_size
        self.action_size = action_size
        self.duel = duel

        # 0 - walk forward
        # 1 - walk backward
        # 2 - turn left
        # 3 - turn right

        # random.seed() returns None, so seed the RNG first and keep the seed
        # value itself on the instance.
        random.seed(seed)
        self.seed = seed

        # Q-Network
        # Forward the caller's `duel` choice instead of always passing True.
        self.qnetwork_local = QNetwork(state_size,
                                       action_size,
                                       seed,
                                       duel=duel).to(device)
        self.qnetwork_target = QNetwork(state_size,
                                        action_size,
                                        seed,
                                        duel=duel).to(device)

        # load with trained weights if needed
        if qnetwork_weights is not None:
            self.qnetwork_local.load_state_dict(qnetwork_weights)
            self.qnetwork_target.load_state_dict(qnetwork_weights)

        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        """Store one transition and, every UPDATE_EVERY calls, learn from a sampled batch."""
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        # Evaluate in eval mode without tracking gradients, then switch the
        # network back to training mode.
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy()).astype(int)
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """

        # experiences is already sent to GPU in the replay buffer class, so no need to worry about it here
        states, actions, rewards, next_states, dones = experiences

        # Get max predicted Q values (for next states) from target model
        Q_targets_next = self.qnetwork_target(next_states).detach().max(
            1)[0].unsqueeze(1)

        # Compute Q targets for current states; (1 - dones) removes the
        # bootstrap term where done is 1.
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # calculate MSE
        loss = F.mse_loss(Q_expected, Q_targets)

        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

예제 #28

파일 보기

class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, seed, filepath):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            filepath (str): checkpoint to restore from; skipped when falsy
        """
        self.state_size = state_size
        self.action_size = action_size
        self.avarage_score = 0
        self.start_epoch = 0
        # NOTE(review): self.seed is drawn before random.seed(seed) is called,
        # so it depends on the RNG state at construction time rather than on
        # `seed` alone. Order kept as in the original; confirm it is intended.
        self.seed = random.randint(0, seed)
        random.seed(seed)
        print("seed ", seed, "  self.seed ", self.seed)
        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       self.seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        self.seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)
        if filepath:
            self.load_model(filepath)

        # Replay memory
        print("buffer size ", BUFFER_SIZE)
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   self.seed)
        print("memory ", self.memory)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        """Store one transition, learn once the buffer is large enough, and
        hard-copy the local weights into the target every UPDATE_EVERY learning steps."""
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)
        if len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn_DDQN(experiences, GAMMA)
            self.t_step = (self.t_step + 1) % UPDATE_EVERY
            if self.t_step == 0:
                self.update_network(self.qnetwork_local, self.qnetwork_target)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        # Evaluate in eval mode without tracking gradients, then switch the
        # network back to training mode.
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def _minimize_loss(self, Q_expected, Q_targets):
        """Take one optimizer step on the MSE between Q_expected and Q_targets."""
        loss = F.mse_loss(Q_expected, Q_targets)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def learn_DDQN(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples (Double DQN).

        The local network selects the greedy next action and the target
        network evaluates it. The target network is not updated here.

        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences
        # Greedy next actions according to the local model
        Q_targets_next_argmax = self.qnetwork_local(next_states).squeeze(
            0).detach().max(1)[1].unsqueeze(1)
        # Value of those actions according to the target model. Detached so
        # that backward() does not propagate into the target network, whose
        # parameters the optimizer does not update.
        Q_targets_next = self.qnetwork_target(next_states).squeeze(
            0).detach().gather(1, Q_targets_next_argmax)

        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).squeeze(0).gather(1, actions)

        # Compute and minimize the loss
        self._minimize_loss(Q_expected, Q_targets)

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples (vanilla DQN).

        The target network is not updated here.

        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Get max predicted Q values (for next states) from target model
        Q_targets_next0 = self.qnetwork_target(next_states).squeeze(0).detach()
        Q_targets_next = Q_targets_next0.max(1)[0].unsqueeze(1)

        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).squeeze(0).gather(1, actions)

        # Compute and minimize the loss
        self._minimize_loss(Q_expected, Q_targets)

    def save_model(self, filepath, epoch, score, last=False):
        """Write a full training checkpoint to `filepath`.

        Params
        ======
            filepath (str): destination of the checkpoint
            epoch: value stored under the 'epoch' key
            score: value stored under the 'avarage_score' key
            last: when truthy, the bare state dict is additionally saved to
                '<last>_state_dict_<epoch>.pt' (the value is used as prefix)
        """
        hidden = self.qnetwork_local.hidden_layers
        # Input width of every hidden layer followed by the output width of
        # the last one.
        layer_sizes = [each.in_features for each in hidden]
        layer_sizes.append(hidden[-1].out_features)
        checkpoint = {
            'input_size': self.state_size,
            'output_size': self.action_size,
            'hidden_layers': layer_sizes,
            'state_dict': self.qnetwork_local.state_dict(),
            'optimizer_state_dict': self.optimizer.state_dict(),
            'epoch': epoch,
            'avarage_score': score,
        }
        torch.save(checkpoint, filepath)
        if last:
            torch.save(self.qnetwork_local.state_dict(),
                       '{}_state_dict_{}.pt'.format(last, epoch))

    def load_model(self, filepath):
        """Restore networks, optimizer and bookkeeping from a checkpoint file.

        Both networks are rebuilt from the sizes stored in the checkpoint and
        loaded with the same weights. Prints a message and changes nothing
        when `filepath` is not an existing file.
        """
        print("seed ", self.seed)
        if os.path.isfile(filepath):
            print("=> loading checkpoint '{}'".format(filepath))
            # map_location lets a checkpoint saved on another device be
            # loaded on the current one.
            checkpoint = torch.load(filepath, map_location=device)
            print("checkpoint['hidden_layers'] ", checkpoint['hidden_layers'])
            self.qnetwork_local = QNetwork(
                checkpoint['input_size'], checkpoint['output_size'], self.seed,
                checkpoint['hidden_layers']).to(device)
            self.qnetwork_local.load_state_dict(checkpoint['state_dict'])
            self.qnetwork_target = QNetwork(
                checkpoint['input_size'], checkpoint['output_size'], self.seed,
                checkpoint['hidden_layers']).to(device)
            self.qnetwork_target.load_state_dict(checkpoint['state_dict'])
            # The local network object was replaced above, so the optimizer
            # must be rebuilt over the new parameters; otherwise it would
            # keep updating the discarded network.
            self.optimizer = optim.Adam(self.qnetwork_local.parameters(),
                                        lr=LR)
            if 'optimizer_state_dict' in checkpoint:
                self.optimizer.load_state_dict(
                    checkpoint['optimizer_state_dict'])
                # Make sure every optimizer state tensor lives on `device`
                for state in self.optimizer.state.values():
                    for k, v in state.items():
                        if isinstance(v, torch.Tensor):
                            state[k] = v.to(device)
                print(self.optimizer)
            if 'epoch' in checkpoint:
                self.start_epoch = checkpoint['epoch']
            if 'avarage_score' in checkpoint:
                self.avarage_score = checkpoint['avarage_score']

            print(self.qnetwork_target)
            print(self.optimizer)
        else:
            print("=> no checkpoint found at '{}'".format(filepath))

    def update_network(self, local_model, target_model):
        """Hard update: copy every parameter of local_model into target_model."""
        for target, local in zip(target_model.parameters(),
                                 local_model.parameters()):
            target.data.copy_(local.data)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

예제 #29

파일 보기

class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self,
                 state_size,
                 action_size,
                 seed=0,
                 gamma=0.99,
                 learning_rate=5e-4,
                 use_RB=True,
                 RB_size=int(1e5),
                 RB_batch_size=64,
                 use_TM=True,
                 TM_update_every=4,
                 use_DDQN=True,
                 use_PER=False,
                 PER_epsilon=0.01,
                 PER_alpha=0.5,
                 PER_beta=0.4,
                 PER_beta_increment=0.001,
                 use_DUELING=True):
        """Initialize an Agent object.
        
        Params
        ======
            state_size                  (int)   : dimension of each state
            action_size                 (int)   : dimension of each action
            seed                        (int)   : random seed
            gamma                       (float) : discount factor
            learning_rate               (float) : learning rate of the model

            use_RB                      (bool)  : Use a replay buffer
            RB_size                     (int)   : replay buffer size
            RB_batch_size               (int)   : minibatch size of the learning

            use_TM                      (bool)  : Use a target model
            TM_update_every             (int)   : update target model every t steps

            use_DDQN                    (bool)  : Use Double DQN, only valid if use target model
            
            use_PER                     (bool)  : Use a prioritized replay buffer
            PER_epsilon                 (float) : Small value added to priorities to avoid zero probabilities
            PER_alpha                   (float) : Power used to compute the sampling probabilities
                                                  [0-1] : 0=> Uniform sampling 1=>Fully prioritized
            PER_beta                    (float) : Used in importance-sampling - Initial value increased to 1
            PER_beta_increment          (float) : To increment beta at each sampling

            use_DUELING                 (bool)  : Use DUELING network
        """
        # Control some parameters
        # PER requires a replay buffer (to make sure we remember to update RB params)
        assert use_RB or not use_PER, "Use replay buffer if use PER"
        # DDQN requires a target model
        assert use_TM or not use_DDQN, "Use target model if use DDQN"

        self.state_size = state_size
        self.action_size = action_size

        self.gamma = gamma

        # Q-Network
        self.qnetwork_policy = QNetwork(state_size,
                                        action_size,
                                        seed,
                                        use_DUELING=use_DUELING).to(device)
        self.optimizer = optim.Adam(self.qnetwork_policy.parameters(),
                                    lr=learning_rate)

        self.use_DDQN = use_DDQN
        self.use_TM = use_TM
        if use_TM:
            self.qnetwork_target = QNetwork(state_size,
                                            action_size,
                                            seed,
                                            use_DUELING=use_DUELING).to(device)
            self.TM_update_every = TM_update_every

        # Initialize time step
        self.t_step = 0

        # Replay memory
        self.use_RB = use_RB
        self.RB_batch_size = RB_batch_size
        self.use_PER = use_PER
        if use_PER:
            self.memory = ReplayBufferPER(RB_size,
                                          RB_batch_size,
                                          seed,
                                          epsilon=PER_epsilon,
                                          alpha=PER_alpha,
                                          beta=PER_beta,
                                          beta_increment=PER_beta_increment)
        elif use_RB:
            self.memory = ReplayBuffer(RB_size, RB_batch_size, seed)

        # Init the seed
        random.seed(seed)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        # Epsilon-greedy action selection
        if random.random() > eps:
            state = torch.from_numpy(state).float().unsqueeze(0).to(device)
            # The network is left in eval mode here; _QValues() switches it
            # back to train mode before computing the expected Q values.
            self.qnetwork_policy.eval()
            with torch.no_grad():
                action_values = self.qnetwork_policy(state)
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def step(self, state, action, reward, next_state, done):
        """Record one transition and run a learning step when possible."""
        # Save experience in replay memory if any
        if self.use_PER:
            # Need to compute the error of this experience
            Q_target, Q_expected = self._QValues([(state, action, reward,
                                                   next_state, done)])
            error = (Q_target - Q_expected).cpu().squeeze().data.item()

            self.memory.add(error, (state, action, reward, next_state, done))
        elif self.use_RB:
            self.memory.add((state, action, reward, next_state, done))
        else:
            self.experiences = [(state, action, reward, next_state, done)]

        # One more step.
        self.t_step += 1

        # Learn if there is no replay buffer, or once it holds more samples
        # than one batch
        if not self.use_RB or len(self.memory) > self.RB_batch_size:
            self._learn()

    def _QValues(self, batch):
        """Execute a forward path for the QNetworks to get the QValues (expected and target)
           So the TD error can be computed or used to learn

           Params
           ======

           batch : Array of tuple <state, action, reward, next_state, done>

           Returns
           =======

           (Q_targets, Q_expected) tensors
        """

        # Regroup the experiences field by field. zip() is used rather than
        # np.array(batch).transpose(): the tuples mix arrays and scalars, and
        # building such a ragged array raises ValueError on recent NumPy.
        columns = list(zip(*batch))

        states = torch.Tensor(np.vstack(columns[0])).float().to(device)
        actions = torch.Tensor(np.vstack(columns[1])).long().to(device)
        rewards = torch.Tensor(np.vstack(columns[2])).float().to(device)
        next_states = torch.Tensor(np.vstack(columns[3])).float().to(device)
        dones = torch.Tensor(np.vstack(
            columns[4]).astype(int)).float().to(device)

        # The policy model's next-state values are needed when it is the only
        # model, or to select the actions for DDQN
        if not self.use_TM or self.use_DDQN:
            self.qnetwork_policy.eval()
            with torch.no_grad():
                action_values_policy = self.qnetwork_policy(next_states)

        if self.use_TM:
            self.qnetwork_target.eval()
            with torch.no_grad():
                action_values_target = self.qnetwork_target(next_states)

        if self.use_TM:
            if self.use_DDQN:
                # DDQN: actions chosen by the policy model, valued by the
                # target model
                Q_targets_next = action_values_target.gather(
                    dim=1,
                    index=action_values_policy.max(dim=1, keepdim=True)[1])
            else:
                Q_targets_next = action_values_target.max(dim=1,
                                                          keepdim=True)[0]
        else:
            Q_targets_next = action_values_policy.max(dim=1, keepdim=True)[0]

        # Need to be at zero if we were done
        Q_targets_next *= torch.ones_like(dones) - dones

        # Compute the Q targets for current states
        Q_targets = rewards + self.gamma * Q_targets_next

        # Get the Q values from policy model
        self.qnetwork_policy.train()
        Q_expected = self.qnetwork_policy(states).gather(dim=1, index=actions)

        return Q_targets, Q_expected

    def _learn(self):
        """Update value parameters using given a batch of experience tuples."""

        if self.use_PER:
            experiences, indexes, IS_weights = self.memory.sample()
            IS_weights = torch.Tensor(np.vstack(IS_weights)).float().to(device)
        elif self.use_RB:
            experiences = self.memory.sample()
        else:
            experiences = self.experiences

        # Get the Qvalues for those experiences
        Q_targets, Q_expected = self._QValues(experiences)

        if self.use_PER:
            # Update priorities of the replay buffer
            errors = (Q_targets - Q_expected).cpu().squeeze().data.numpy()
            self.memory.update_priorities(indexes, errors)

            # Update Qs with the importance-sampling weight correction.
            # Both sides are scaled by sqrt(w), so the squared error below is
            # scaled by w.
            Q_expected *= IS_weights**0.5
            Q_targets *= IS_weights**0.5

        # Loss computation
        loss = F.mse_loss(Q_expected, Q_targets)

        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        if self.use_TM:
            # Hard copy of the policy weights whenever the step counter wraps
            self.t_step %= self.TM_update_every
            if self.t_step == 0:
                self.qnetwork_target.load_state_dict(
                    self.qnetwork_policy.state_dict())

    def save_weights(self, file='checkpoint.pth'):
        """Save the agent network weights in a checkpoint file"""
        torch.save(self.qnetwork_policy.state_dict(), file)

    def load_weights(self, file='checkpoint.pth'):
        """Load the agent network weights from a checkpoint file

        Only the policy network is loaded.
        """
        self.qnetwork_policy.load_state_dict(torch.load(file))

예제 #30

파일 보기

class Agent:
    """Interacts with and learns from the environment.

    Keeps a local Q-network that is optimised, a separate target Q-network
    that follows it through soft updates, and a replay buffer of experiences.
    """

    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        # random.seed() returns None, so seed the RNG and store the seed value
        # itself rather than the call's result.
        random.seed(seed)
        self.seed = seed

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        """Record one transition and learn every UPDATE_EVERY calls.

        Learning only happens once the memory holds more than BATCH_SIZE
        entries.
        """
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0 and len(self.memory) > BATCH_SIZE:
            # Enough samples are available: get a random subset and learn
            experiences = self.memory.sample()
            self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        # Evaluate without gradient tracking, then restore training mode
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Get max predicted Q values (for next states) from target model
        Q_targets_next = self.qnetwork_target(next_states).detach().max(
            1)[0].unsqueeze(1)
        # Compute Q targets for current states; (1 - dones) removes the
        # bootstrap term where done is 1
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def save(self, filename):
        """Saves the agent to the local workplace

        The checkpoint stores the input/output sizes, the ``out_features`` of
        each hidden layer of the local network, and its state dict.

        Params
        ======
            filename (string): where to save the weights
        """
        checkpoint = {
            'input_size':
            self.state_size,
            'output_size':
            self.action_size,
            'hidden_layers':
            [each.out_features for each in self.qnetwork_local.hidden_layers],
            'state_dict':
            self.qnetwork_local.state_dict()
        }

        torch.save(checkpoint, filename)

    def load_weights(self, filename):
        """ Load weights to update agent's Q-Network.
        Expected is a format like the one produced by self.save()

        If the checkpoint's input size, output size or hidden layer sizes do
        not match this agent, an error is printed and nothing is loaded.
        Otherwise the weights are loaded into both the local and the target
        network, which remain two separate modules.

        Params
        ======
            filename (string): where to load data from.
        """
        # Deserialize onto the CPU so a checkpoint written on a GPU can still
        # be read on a CPU-only machine; load_state_dict copies the values
        # into each network's existing parameters on their own device.
        checkpoint = torch.load(filename, map_location='cpu')
        if checkpoint['input_size'] != self.state_size:
            print(
                f"Error when loading weights from checkpoint {filename}: input size {checkpoint['input_size']} doesn't match state size of agent {self.state_size}"
            )
            return None
        if checkpoint['output_size'] != self.action_size:
            print(
                f"Error when loading weights from checkpoint {filename}: output size {checkpoint['output_size']} doesn't match action space size of agent {self.action_size}"
            )
            return None
        my_hidden_layers = [
            each.out_features for each in self.qnetwork_local.hidden_layers
        ]
        if checkpoint['hidden_layers'] != my_hidden_layers:
            print(
                f"Error when loading weights from checkpoint {filename}: hidden layers {checkpoint['hidden_layers']} don't match agent's hidden layers {my_hidden_layers}"
            )
            return None
        self.qnetwork_local.load_state_dict(checkpoint['state_dict'])
        # Copy the weights into the target network instead of rebinding it to
        # the local network: aliasing the two would make soft_update a no-op
        # and the target would no longer lag behind the local network.
        self.qnetwork_target.load_state_dict(checkpoint['state_dict'])