def __init__(self, env_id, input_shape, n_actions, icm, n_threads=8):
        """Build the shared A3C (optionally + ICM) networks, spawn one
        worker process per thread, and block until every worker finishes.

        Args:
            env_id: gym environment id forwarded to each worker.
            input_shape: observation shape for the networks.
            n_actions: size of the discrete action space.
            icm: if truthy, also create a shared ICM module and optimizer.
            n_threads: number of worker processes to launch.
        """
        # Workers identify themselves by name; worker '1' does the logging.
        names = [str(i) for i in range(1, n_threads + 1)]

        global_actor_critic = ActorCritic(input_shape, n_actions)
        global_actor_critic.share_memory()  # weights shared across processes
        global_optim = SharedAdam(global_actor_critic.parameters())

        if icm:
            global_icm = ICM(input_shape, n_actions)
            global_icm.share_memory()
            global_icm_optim = SharedAdam(global_icm.parameters())
        else:
            global_icm = None
            global_icm_optim = None

        self.ps = [
            mp.Process(target=worker,
                       args=(name, input_shape, n_actions, global_actor_critic,
                             global_icm, global_optim, global_icm_optim,
                             env_id, n_threads, icm)) for name in names
        ]

        # Side effects belong in plain loops, not list comprehensions.
        for p in self.ps:
            p.start()
        # NOTE: joining here blocks construction until training completes.
        for p in self.ps:
            p.join()
Пример #2
0
def main():
    """Train an I2A actor-critic with A2C while jointly distilling its
    policy into the small rollout policy used by the imagination core.

    Relies on module-level config (num_envs, mode, num_pixels, task_rewards,
    make_env) and saves the trained weights to 'i2a_<mode>'.
    """

    envs = [make_env() for i in range(num_envs)]
    envs = SubprocVecEnv(envs)  # run the environments in worker processes

    state_shape = envs.observation_space.shape
    num_actions = envs.action_space.n
    num_rewards = len(task_rewards[mode])

    full_rollout = True

    # Pretrained environment model that powers the imagined rollouts.
    env_model = EnvModel(envs.observation_space.shape, num_pixels, num_rewards)
    env_model.load_state_dict(torch.load("env_model_" + mode))

    # Model-free policy distilled from the I2A agent; it picks the actions
    # taken inside imagined rollouts.
    distil_policy = ActorCritic(envs.observation_space.shape,
                                envs.action_space.n)
    distil_optimizer = optim.Adam(distil_policy.parameters())

    imagination = ImaginationCore(1,
                                  state_shape,
                                  num_actions,
                                  num_rewards,
                                  env_model,
                                  distil_policy,
                                  full_rollout=full_rollout)

    actor_critic = I2A(state_shape,
                       num_actions,
                       num_rewards,
                       256,
                       imagination,
                       full_rollout=full_rollout)
    #rmsprop hyperparams:
    lr = 7e-4
    eps = 1e-5
    alpha = 0.99
    optimizer = optim.RMSprop(actor_critic.parameters(),
                              lr,
                              eps=eps,
                              alpha=alpha)

    #if USE_CUDA:
    #    env_model     = env_model.cuda()
    #    distil_policy = distil_policy.cuda()
    #    actor_critic  = actor_critic.cuda()

    # A2C hyper-parameters.
    gamma = 0.99
    entropy_coef = 0.01
    value_loss_coef = 0.5
    max_grad_norm = 0.5
    num_steps = 5
    num_frames = int(10e5)

    rollout = RolloutStorage(num_steps, num_envs, envs.observation_space.shape)
    #rollout.cuda()

    all_rewards = []
    all_losses = []

    state = envs.reset()
    current_state = torch.FloatTensor(np.float32(state))

    rollout.states[0].copy_(current_state)

    episode_rewards = torch.zeros(num_envs, 1)
    final_rewards = torch.zeros(num_envs, 1)

    for i_update in tqdm(range(num_frames)):

        # Collect num_steps of experience from every environment.
        for step in range(num_steps):
            #if USE_CUDA:
            #    current_state = current_state.cuda()
            action = actor_critic.act(autograd.Variable(current_state))

            next_state, reward, done, _ = envs.step(
                action.squeeze(1).cpu().data.numpy())

            reward = torch.FloatTensor(reward).unsqueeze(1)
            episode_rewards += reward
            # masks is 0 where an episode just finished, 1 otherwise.
            masks = torch.FloatTensor(1 - np.array(done)).unsqueeze(1)
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            #if USE_CUDA:
            #    masks = masks.cuda()

            current_state = torch.FloatTensor(np.float32(next_state))
            rollout.insert(step, current_state, action.data, reward, masks)

        # Bootstrap the n-step returns from the value of the last state.
        _, next_value = actor_critic(
            autograd.Variable(rollout.states[-1], volatile=True))
        next_value = next_value.data

        returns = rollout.compute_returns(next_value, gamma)

        logit, action_log_probs, values, entropy = actor_critic.evaluate_actions(
            autograd.Variable(rollout.states[:-1]).view(-1, *state_shape),
            autograd.Variable(rollout.actions).view(-1, 1))

        distil_logit, _, _, _ = distil_policy.evaluate_actions(
            autograd.Variable(rollout.states[:-1]).view(-1, *state_shape),
            autograd.Variable(rollout.actions).view(-1, 1))

        # Distillation: pull distil_policy towards the (detached) I2A policy.
        # NOTE(review): cross-entropy distillation is usually minimised as
        # -(p * log q); confirm the sign used here against the reference
        # implementation before changing it.
        distil_loss = 0.01 * (F.softmax(logit).detach() *
                              F.log_softmax(distil_logit)).sum(1).mean()

        values = values.view(num_steps, num_envs, 1)
        action_log_probs = action_log_probs.view(num_steps, num_envs, 1)
        advantages = autograd.Variable(returns) - values

        value_loss = advantages.pow(2).mean()
        action_loss = -(autograd.Variable(advantages.data) *
                        action_log_probs).mean()

        # A2C update of the I2A agent.
        optimizer.zero_grad()
        loss = value_loss * value_loss_coef + action_loss - entropy * entropy_coef
        loss.backward()
        nn.utils.clip_grad_norm(actor_critic.parameters(), max_grad_norm)
        optimizer.step()

        # Distillation update of the rollout policy.
        distil_optimizer.zero_grad()
        distil_loss.backward()
        # BUG FIX: this previously called optimizer.step() again, which
        # re-applied the actor-critic gradients a second time and never
        # updated distil_policy at all.
        distil_optimizer.step()

        if i_update % 100 == 0:
            all_rewards.append(final_rewards.mean())
            all_losses.append(loss.item())

            #clear_output(True)
            plt.figure(figsize=(20, 5))
            plt.subplot(131)
            plt.title('epoch %s. reward: %s' %
                      (i_update, np.mean(all_rewards[-10:])))
            plt.plot(all_rewards)
            plt.subplot(132)
            plt.title('loss %s' % all_losses[-1])
            plt.plot(all_losses)
            plt.show()

        rollout.after_update()

    torch.save(actor_critic.state_dict(), "i2a_" + mode)
Пример #3
0
class PPO:
    """Clipped-surrogate PPO trainer for a continuous-action policy.

    Keeps a trained network (`policy`) and a frozen behaviour copy
    (`policy_old`) that samples the actions stored in `memory`; the copy is
    re-synced at the end of every update().
    """

    def __init__(self, device, state_dim, action_dim, action_std, lr, betas,
                 gamma, K_epochs, eps_clip):
        self.lr = lr
        self.device = device
        self.betas = betas  # NOTE: stored but unused; Adam below only gets lr
        self.gamma = gamma
        self.eps_clip = eps_clip
        self.K_epochs = K_epochs

        self.policy = ActorCritic(state_dim, action_dim, action_std).to(device)
        #self.optimizer = RAdam(self.policy.parameters(), lr=lr, betas=betas)
        self.optimizer = optim.Adam(self.policy.parameters(), lr=lr)

        # Frozen behaviour policy; weights copied from self.policy.
        self.policy_old = ActorCritic(state_dim, action_dim,
                                      action_std).to(device)
        self.policy_old.load_state_dict(self.policy.state_dict())

        self.MseLoss = nn.MSELoss()  # NOTE: defined but not used in update()

    def select_action(self, state, memory):
        """Sample an action from the frozen policy; `act` records into memory."""
        if np.any(np.isnan(state)):
            print('in select action: state is nan', state)
        state = torch.FloatTensor(state.reshape(1, -1)).to(self.device)
        return self.policy_old.act(state, memory).cpu().data.numpy().flatten()

    def update(self, memory):
        """Run K_epochs of random-minibatch PPO updates on `memory`."""
        # Monte Carlo estimate of rewards:
        rewards = []
        discounted_reward = 0
        for reward, is_terminal in zip(reversed(memory.rewards),
                                       reversed(memory.is_terminals)):
            if is_terminal:
                discounted_reward = 0
            discounted_reward = reward + (self.gamma * discounted_reward)
            rewards.insert(0, discounted_reward)

        # Normalizing the rewards:
        rewards = torch.tensor(rewards).to(self.device)
        rewards = (rewards - rewards.mean()) / (rewards.std() + 1e-5)

        # convert list to tensor
        # NOTE(review): the `[rand_ids, :]` indexing below assumes these stay
        # 2-D after torch.squeeze — confirm for 1-D action/logprob shapes.
        old_states_ = torch.squeeze(
            torch.stack(memory.states).to(self.device)).detach()
        old_actions_ = torch.squeeze(
            torch.stack(memory.actions).to(self.device)).detach()
        old_logprobs_ = torch.squeeze(torch.stack(memory.logprobs)).to(
            self.device).detach()

        batch_size = old_states_.shape[0]
        mini_batch_size = batch_size // 8  # 64

        # Optimize policy for K epochs:
        for _ in range(self.K_epochs):
            # Evaluating old actions and values :
            for i in range(batch_size // mini_batch_size):
                # Random minibatch sampled with replacement.
                rand_ids = np.random.randint(0, batch_size, mini_batch_size)
                old_states = old_states_[rand_ids, :]
                old_actions = old_actions_[rand_ids, :]
                old_logprobs = old_logprobs_[rand_ids, :]
                rewards_batch = rewards[rand_ids]

                logprobs, state_values, dist_entropy = self.policy.evaluate(
                    old_states, old_actions)

                # Finding the ratio (pi_theta / pi_theta__old):
                ratios = torch.exp(logprobs - old_logprobs.detach())

                # Finding Surrogate Loss:
                advantages = rewards_batch - state_values.detach()
                ## torch.clamp(ratios, 1 - self.eps_clip, 1 + self.eps_clip)
                #surr = -torch.min(ratios, 1) * advantages  # as per the paper

                len_adv = advantages.shape[0]
                advantages = advantages.reshape((len_adv, 1))
                surr1 = ratios * advantages
                # NOTE(review): standard PPO uses the clamped ratio (see the
                # commented clamp above); the constant 1 here makes the
                # surrogate min(ratio * adv, adv) — confirm this is intended.
                surr2 = 1 * advantages  ## as per the paper

                surr = -torch.min(surr1, surr2).mean()
                w_crit_loss = 1
                # Critic loss is a plain squared error; dist_entropy is
                # computed but unused (entropy bonus commented out).
                loss = surr + w_crit_loss * (rewards_batch - state_values).pow(
                    2).mean()  #- 0.01 * dist_entropy

                # take gradient step
                self.optimizer.zero_grad()
                loss.mean().backward()
                self.optimizer.step()

        # Copy new weights into old policy:
        self.policy_old.load_state_dict(self.policy.state_dict())
Пример #4
0
def train(args, dynet):
    """Actor-critic training loop over the dynamic-network environment.

    Uses n-step returns with Generalized Advantage Estimation and a manual
    global-gradient-norm clip at 40.

    Args:
        args: namespace providing seed, embedding_size, lstm_size, ac_lr,
            num_steps, gamma and tau.
        dynet: dynamic network whose module library sizes the controller.
    """
    torch.manual_seed(args.seed)

    embedding_size = args.embedding_size

    lstm_size = args.lstm_size

    # One slot per existing library module plus one extra slot.
    # NOTE(review): the meaning of the +1 slot is not visible here — confirm.
    num_modules = len(dynet.library) + 1

    libr = librarian.SimpleLibrarian(num_modules, embedding_size)
    # BUG FIX: 'print type(libr)' is Python-2-only syntax and a SyntaxError
    # under Python 3; the call form works on both.
    print(type(libr))
    model = ActorCritic(num_modules, libr, lstm_size)

    env = learning_env.Environment(args, dynet, libr)

    optimizer = optim.Adam(model.parameters(), lr=args.ac_lr)

    model.train()

    state = env.reset()
    #state = torch.from_numpy(state)
    done = True

    episode_length = 0
    while True:
        episode_length += 1
        # Sync with the shared model
        # model.load_state_dict(shared_model.state_dict())
        if done:
            # Reset the LSTM state at episode boundaries.
            cx = Variable(torch.zeros(1, lstm_size))
            hx = Variable(torch.zeros(1, lstm_size))
        else:
            # Detach the recurrent state from the previous update's graph.
            cx = Variable(cx.data)
            hx = Variable(hx.data)

        values = []
        log_probs = []
        rewards = []
        entropies = []

        for step in range(args.num_steps):
            value, logit, (hx, cx) = model((state.unsqueeze(0)), (hx, cx))
            prob = F.softmax(logit)
            log_prob = F.log_softmax(logit)
            entropy = -(log_prob * prob).sum(1)
            entropies.append(entropy)

            # Old-PyTorch sampling API (multinomial without num_samples).
            action = prob.multinomial().data
            log_prob = log_prob.gather(1, Variable(action))

            state, reward, done, _ = env.step(action.numpy()[0, 0])
            done = done or episode_length >= args.num_steps
            reward = max(min(reward, 1), -1)  # clip reward to [-1, 1]

            if done:
                episode_length = 0
                state = env.reset()

            #state = torch.from_numpy(state)
            values.append(value)
            log_probs.append(log_prob)
            rewards.append(reward)

            if done:
                break

        # Bootstrap the return from the critic unless the episode ended.
        R = torch.zeros(1, 1)
        if not done:
            value, _, _ = model((state.unsqueeze(0)), (hx, cx))
            R = value.data

        values.append(Variable(R))
        policy_loss = 0
        value_loss = 0
        R = Variable(R)
        gae = torch.zeros(1, 1)
        for i in reversed(range(len(rewards))):
            R = args.gamma * R + rewards[i]
            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = rewards[i] + args.gamma * \
                values[i + 1].data - values[i].data
            gae = gae * args.gamma * args.tau + delta_t

            policy_loss = policy_loss - \
                log_probs[i] * Variable(gae) - 0.01 * entropies[i]

        optimizer.zero_grad()
        (policy_loss + 0.5 * value_loss).backward()

        # Manual global-norm clipping at 40. ROBUSTNESS: guard the all-zero
        # gradient case, which previously raised ZeroDivisionError.
        global_norm = 0
        for param in model.parameters():
            global_norm += param.grad.data.pow(2).sum()
        global_norm = math.sqrt(global_norm)
        if global_norm > 0:
            ratio = 40 / global_norm
            if ratio < 1:
                for param in model.parameters():
                    param.grad.data.mul_(ratio)
        optimizer.step()
Пример #5
0
class PPO(nn.Module):
    """Proximal Policy Optimization (clipped surrogate) keeping a trained
    network (`model`) alongside a frozen behaviour snapshot (`model_old`)
    that generated the buffered data."""

    def __init__(self,
                 state_dim,
                 action_dim,
                 eps=0.2,
                 gamma=0.99,
                 lambda_=0.95,
                 K_epoch=80,
                 batch_size=64):
        super(PPO, self).__init__()
        # Hyper-parameters.
        self.eps = eps                # clip range for the probability ratio
        self.gamma = gamma            # discount factor
        self.lambda_ = lambda_        # GAE smoothing factor
        self.K_epoch = K_epoch        # optimisation epochs per update
        self.batch_size = batch_size  # minibatch size requested from buffer

        # Trainable network plus a frozen snapshot used for sampling.
        self.model = ActorCritic(state_dim, action_dim)
        self.model_old = ActorCritic(state_dim, action_dim)
        for frozen_param in self.model_old.parameters():
            frozen_param.requires_grad = False
        self.copy_weights()

    def forward(self, x):
        """Run the frozen behaviour policy; cache and return (pi, v)."""
        pi, v = self.model_old(x)
        self.pi = pi
        self.v = v
        return self.pi, self.v

    def copy_weights(self):
        """Overwrite the frozen snapshot with the current trained weights."""
        snapshot = self.model.state_dict()
        self.model_old.load_state_dict(snapshot)

    def update(self, buffer, optimizer):
        """One PPO update: K_epoch passes of clipped-surrogate plus value
        loss over minibatches drawn from `buffer`, then re-sync and reset."""
        self.model.train()
        self.model_old.eval()
        self.advantage_fcn(buffer.data)  # attaches 'advantage' to the data

        batch_loss, batch_clip_loss, batch_vf_loss = [], [], []
        for _ in range(self.K_epoch):
            minibatches = buffer.get_data(self.batch_size)
            for state, action, next_s, reward, log_prob_old, entropy, advantage in minibatches:
                pi, v = self.model(state)
                log_prob_pi = pi.log_prob(action)

                # pi_theta / pi_theta_old, computed in log space.
                prob_ratio = torch.exp(log_prob_pi - log_prob_old)

                unclipped = prob_ratio * advantage
                clipped = self.clip_by_value(prob_ratio) * advantage
                loss_clip = torch.min(unclipped, clipped).mean()

                # One-step bootstrapped value target from the frozen critic.
                _, v_next = self.model_old(next_s)
                v_target = reward + self.gamma * v_next
                loss_vf = ((v - v_target)**2).mean()

                # Maximise the surrogate, minimise the value error.
                loss = -(loss_clip - loss_vf)

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                batch_loss.append(loss.detach().numpy())
                batch_clip_loss.append(loss_clip.detach().numpy())
                batch_vf_loss.append(loss_vf.detach().numpy())

        self.copy_weights()
        buffer.reset()

    def advantage_fcn(self, buffer, normalize=True):
        """Compute GAE advantages over the whole buffer and store them
        under buffer['advantage'] (shape (N, 1)).

        NOTE(review): the recursion runs straight through the buffer with no
        episode-boundary masking — confirm episodes are handled upstream.
        """
        _, v_st1 = self.model(torch.stack(buffer['next_s']))
        _, v_s = self.model(torch.stack(buffer['s']))
        deltas = torch.stack(buffer['r']) + self.gamma * v_st1 - v_s

        # Backward recursion over the TD residuals, newest first.
        advantage = []
        running = 0
        for delta_t in deltas.flip(0):
            running = delta_t + self.lambda_ * self.gamma * running
            advantage.insert(0, running)

        advantage = torch.as_tensor(advantage)
        if normalize:
            advantage = (advantage - advantage.mean()) / advantage.std()

        buffer['advantage'] = advantage.unsqueeze(1)

    def clip_by_value(self, x):
        """Clamp x into the interval [1 - eps, 1 + eps]."""
        return x.clamp(1 - self.eps, 1 + self.eps)
Пример #6
0
def worker(name, input_shape, n_actions, global_agent, global_icm,
           optimizer, icm_optimizer, env_id, n_threads, icm=False):
    """A3C worker loop, optionally with ICM intrinsic curiosity.

    Runs episodes on a private copy of `env_id`; every T_MAX steps (or at
    episode end) computes losses locally, hands the local gradients to the
    shared global networks, steps the shared optimizers, and re-syncs the
    local copies from the global weights.

    Args:
        name: string id of this worker; worker '1' also logs and plots.
        input_shape: observation shape for the networks.
        n_actions: size of the discrete action space.
        global_agent: shared ActorCritic updated by all workers.
        global_icm: shared ICM module (None when icm is False).
        optimizer: shared optimizer for global_agent.
        icm_optimizer: shared optimizer for global_icm (None when unused).
        env_id: gym environment id (old gym API: step returns a 4-tuple).
        n_threads: total worker count (only used in the log message).
        icm: whether to train with intrinsic curiosity rewards.
    """
    T_MAX = 20  # learning interval in environment steps

    # Local network; gradients are computed here, applied to the global one.
    local_agent = ActorCritic(input_shape, n_actions)

    if icm:
        local_icm = ICM(input_shape, n_actions)
        algo = 'ICM'
    else:
        # Without ICM the intrinsic reward is a constant zero tensor.
        intrinsic_reward = T.zeros(1)
        algo = 'A3C'

    memory = Memory()

    env = gym.make(env_id)

    t_steps, max_eps, episode, scores, avg_score = 0, 1000, 0, [], 0

    while episode < max_eps:
        obs = env.reset()
        hx = T.zeros(1, 256)  # fresh recurrent state every episode
        score, done, ep_steps = 0, False, 0
        while not done:
            state = T.tensor([obs], dtype=T.float)
            action, value, log_prob, hx = local_agent(state, hx)
            obs_, reward, done, info = env.step(action)
            t_steps += 1
            ep_steps += 1
            score += reward
            reward = 0  # turn off extrinsic rewards
            memory.remember(obs, action, reward, obs_, value, log_prob)
            obs = obs_
            if ep_steps % T_MAX == 0 or done:
                states, actions, rewards, new_states, values, log_probs = \
                        memory.sample_memory()
                if icm:
                    intrinsic_reward, L_I, L_F = \
                            local_icm.calc_loss(states, new_states, actions)

                loss = local_agent.calc_loss(obs, hx, done, rewards, values,
                                             log_probs, intrinsic_reward)

                optimizer.zero_grad()
                hx = hx.detach_()  # cut the recurrent graph between updates
                if icm:
                    icm_optimizer.zero_grad()
                    (L_I + L_F).backward()

                loss.backward()
                T.nn.utils.clip_grad_norm_(local_agent.parameters(), 40)

                # Hand the local gradients to the shared parameters, step the
                # shared optimizer, then re-sync the local copy.
                for local_param, global_param in zip(
                                        local_agent.parameters(),
                                        global_agent.parameters()):
                    global_param._grad = local_param.grad
                optimizer.step()
                local_agent.load_state_dict(global_agent.state_dict())

                # Same hand-off for the curiosity module, if enabled.
                if icm:
                    for local_param, global_param in zip(
                                            local_icm.parameters(),
                                            global_icm.parameters()):
                        global_param._grad = local_param.grad
                    icm_optimizer.step()
                    local_icm.load_state_dict(global_icm.state_dict())
                memory.clear_memory()

        # Only worker '1' tracks and reports progress.
        if name == '1':
            scores.append(score)
            avg_score = np.mean(scores[-100:])
            print('{} episode {} thread {} of {} steps {:.2f}M score {:.2f} '
                  'intrinsic_reward {:.2f} avg score (100) {:.1f}'.format(
                      algo, episode, name, n_threads,
                      t_steps/1e6, score,
                      T.sum(intrinsic_reward),
                      avg_score))
        episode += 1
    if name == '1':
        x = [z for z in range(episode)]
        fname = algo + '_CartPole_no_rewards.png'
        plot_learning_curve(x, scores, fname)