    def rankcaptions(filenames, top_n=5):
        # n_captions = top_n  # the top-ranked captions should all be relevant
        n_captions = 1  # RCL: image/caption mismatch when n_captions is not exactly one
        batch_size = 128
        image_features, captions = loadFeaturesTargets(filenames,
                                                       'val2014',
                                                       n_captions=n_captions)
        stream = DataETL.getFinalStream(image_features,
                                        captions,
                                        ("image_vects", "word_vects"),
                                        ("image_vects_k", "word_vects_k"),
                                        batch_size=batch_size)

        f_emb = ModelIO.load(
            '/home/luke/datasets/coco/predict/fullencoder_maxfeatures.50000')
        im_emb, s_emb = None, None
        print("Computing Image and Text Embeddings")
        for batch in stream.get_epoch_iterator():
            im_vects = batch[0]
            s_vects = batch[1]
            batch_im_emb, batch_s_emb = f_emb(im_vects, s_vects)
            im_emb = vStackMatrices(im_emb, batch_im_emb)
            s_emb = vStackMatrices(s_emb, batch_s_emb)

        # make sure there is a matching filename for each of the n_captions
        image_fns = fillOutFilenames(filenames, n_captions=n_captions)

        print("Computing Cosine Distances and Ranking Captions")
        relevant_captions = ModelEval.getRelevantCaptions(im_emb,
                                                          s_emb,
                                                          image_fns,
                                                          captions,
                                                          z=n_captions,
                                                          top_n=top_n)
        dict2json(relevant_captions,
                  "rankcaptions_fullencoder_maxfeatures.50000.json",
                  cls=DecimalEncoder)
        return relevant_captions
    def rank_function(self=None):
        teX, teY, _ = cocoXYFilenames(n_captions=5)
        sources = ('X', 'Y')
        sources_k = ('X_k', 'Y_k')
        stream = DataETL.getFinalStream(teX, teY, sources=sources,
                            sources_k=sources_k, batch_size=1000,
                            shuffle=False)
        images, captions, _0, _1 = next(stream.get_epoch_iterator())

        predict_dir = '/home/luke/datasets/coco/predict/'
        # encoder_name = '+coco_encoder_lstm_dim.300'
        encoder_name = 'sbu.100000+coco_encoder_lstm_dim.300_adadelta'
        # encoder_name = 'fullencoder_maxfeatures.50000_epochsampler'
        f_emb = ModelIO.load(predict_dir + encoder_name)
        image_embs, caption_embs = f_emb(images, captions)
        ModelEval.ImageSentenceRanking(image_embs, caption_embs)
class REINFORCE:
    def __init__(self,
                 state_dim,
                 action_dim,
                 gamma=0.99,
                 hidden_dim=64,
                 policy_lr=0.001,
                 baseline_lr=0.001):
        self._V = StateValueFunction(state_dim, hidden_dim=hidden_dim)
        self._pi = Policy(state_dim, action_dim, hidden_dim=hidden_dim)
        # self._V.cuda()
        # self._pi.cuda()
        self._gamma = gamma
        self._loss_function = nn.MSELoss()
        self._V_optimizer = optim.Adam(self._V.parameters(), lr=baseline_lr)
        self._pi_optimizer = optim.Adam(self._pi.parameters(), lr=policy_lr)
        self._action_dim = action_dim
        # --- ModelIO ---
        self._modelio = ModelIO(model_path=Path(__file__).resolve().parent /
                                'models')

    def get_action(self, s):
        # Gaussian exploration around the policy mean with a fixed std of 0.1
        mu_action = self._pi(tt(s))
        dist = torch.distributions.Normal(mu_action, 0.1)
        action_sampled = dist.sample()
        # log-probability of the sampled action, kept in the graph for the
        # policy-gradient update
        log_prob = dist.log_prob(action_sampled).sum()
        action_clipped = np.clip(action_sampled.detach().numpy(),
                                 a_min=-1.0, a_max=1.0)
        return action_clipped, log_prob

    def train(self, env, episodes, time_steps):
        stats = EpisodeStats(episode_lengths=np.zeros(episodes),
                             episode_rewards=np.zeros(episodes))

        for i_episode in range(1, episodes + 1):
            # Generate an episode.
            # An episode is a list of (state, action, log_prob, reward) tuples
            episode = []
            s = env.reset()
            for t in range(time_steps):
                a, log_prob_a = self.get_action(s)
                ns, r, d, _ = env.step(a)

                stats.episode_rewards[i_episode - 1] += r
                stats.episode_lengths[i_episode - 1] = t

                episode.append((s, a, log_prob_a, r))

                if d:
                    break
                s = ns

            # collect all rewards at one place
            T = len(episode)
            G = 0.0

            for t in reversed(range(T)):
                s, a, log_prob, r = episode[t]
                G = self._gamma * G + r

                baseline = self._V(tt(s))
                # treat the advantage as a constant in the policy update
                advantage = (G - baseline).detach()
                self._train_baseline(G, baseline)
                self._train_policy(advantage, t, log_prob)

            print("\r{} Steps in Episode {}/{}. Reward {}".format(
                len(episode), i_episode, episodes,
                sum(e[3] for e in episode)))
        return stats

    def _train_baseline(self, G, baseline):
        self._V_optimizer.zero_grad()
        loss = self._loss_function(tt(np.array([G])), baseline)
        loss.backward(retain_graph=True)
        self._V_optimizer.step()

    def _train_policy(self, error, t, log_prob_a):
        self._pi_optimizer.zero_grad()
        neg_log_prob_a = -log_prob_a
        target = np.power(self._gamma, t) * error * neg_log_prob_a
        target.backward(retain_graph=True)
        self._pi_optimizer.step()

    def save_models(self, model_name):
        self._modelio.save(model=self._pi,
                           model_name=f'r_c_policy_{model_name}.pt')
        self._modelio.save(model=self._V,
                           model_name=f'r_c_baseline_{model_name}.pt')

    def load_models(self, model_name):
        # if self._model
        self._modelio.load(model=self._pi,
                           model_name=f'r_c_policy_{model_name}.pt')
        self._modelio.load(model=self._V,
                           model_name=f'r_c_baseline_{model_name}.pt')
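
# --- Hypothetical usage sketch (not part of the original example) ---
# Assumes a continuous-control environment with the old 4-tuple gym step API
# that train() unpacks, and actions in [-1, 1] to match the clipping in
# get_action (e.g. MountainCarContinuous-v0); all numbers are illustrative.
import gym

env = gym.make("MountainCarContinuous-v0")
agent = REINFORCE(state_dim=env.observation_space.shape[0],
                  action_dim=env.action_space.shape[0],
                  gamma=0.99)
stats = agent.train(env, episodes=500, time_steps=999)
agent.save_models(model_name="mountaincar")
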
        mlb = MultiLabelBinarizer()
        mlb.fit(vect.transform(captions))
        return vect, mlb
    # if not multilabel:
    return vect

dataset_name = 'coco_train2014'
n_sbu = None
if n_sbu:
    dataset_name += "+sbu%d" % n_sbu
# global vectorizer
vect_name = 'tokenizer_%s' % dataset_name
mlb_name = 'mlb_%s' % dataset_name
try:
    if mlb_name:
        mlb = ModelIO.load(mlb_name)
        print("MLB loaded from file")
    vect = ModelIO.load(vect_name)
    # vect = ModelIO.load('tokenizer_reddit') # gloveglove
    print("Tokenizer loaded from file.")
except Exception:
    if mlb_name:
        vect, mlb = prepVect(n_sbu=n_sbu, n_captions=1, multilabel=True)
        ModelIO.save(vect, vect_name)
        ModelIO.save(mlb, mlb_name)
        print("Saved %s, %s for future use." % (vect_name, mlb_name))
    else:
        vect = prepVect(n_sbu=n_sbu, n_captions=1)
        ModelIO.save(vect, vect_name)
        print("Saved %s for future use." % vect_name)
class DDPG:
    def __init__(self,
                 state_dim,
                 action_dim,
                 gamma=0.99,
                 noise_std=0.02,
                 hidden_dim=64,
                 actor_lr=0.001,
                 critic_lr=0.001,
                 verbose=False):
        self.gamma = gamma
        # self.tau = 0.01
        self.tau = 0.001
        self.actor = Actor(state_dim,
                           noise_std=noise_std,
                           hidden_dim=hidden_dim)
        self.actor_target = Actor(state_dim,
                                  noise_std=noise_std,
                                  hidden_dim=hidden_dim)
        self.critic = Critic(state_dim,
                             action_dim,
                             hidden_dim=hidden_dim)
        self.critic_target = Critic(state_dim,
                                    action_dim,
                                    hidden_dim=hidden_dim)
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.critic_target.load_state_dict(self.critic.state_dict())
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(),
                                                lr=actor_lr)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(),
                                                 lr=critic_lr)
        self.buffer = ReplayBuffer(max_size=1e5)
        self.logging_period = 10 if verbose else 100
        # --- ModelIO ---
        self.modelio = ModelIO(model_path=Path(__file__).resolve().parent /
                               'models')

    def update_target(self, target, source):
        for target_param, param in zip(target.parameters(),
                                       source.parameters()):
            target_param.data.copy_(param.data * self.tau + target_param.data *
                                    (1.0 - self.tau))

    def get_action(self, state, is_testing=False):
        """
        Used at test time (not during training).
        """
        action = self.actor(state, is_testing).detach().numpy()
        return action

    def train(self, env, episodes, timesteps):
        stats = EpisodeStats(episode_lengths=np.zeros(episodes),
                             episode_rewards=np.zeros(episodes))
        for i_episode in range(1, episodes + 1):
            state = env.reset()
            for t in range(timesteps):
                # --- choose action
                action = self.actor(state).detach()
                env_action = torch.clamp(action, min=-2.0,
                                         max=2.0).detach().numpy()

                next_state, reward, done, _, _ = env.step(env_action)

                # --- saving stats
                stats.episode_rewards[i_episode - 1] += reward
                stats.episode_lengths[i_episode - 1] = t

                # --- save the transition
                self.buffer.add_transition(
                    state=torch.from_numpy(state).float().to(device),
                    action=action,
                    next_state=torch.from_numpy(next_state).float().to(device),
                    reward=reward,
                    done=done)

                # --- sample a batch of transitions
                batch = self.buffer.next_random_batch(batch_size=32)

                # --- train
                self.train_batch(batch)

                # --- update target networks
                self.update_target(target=self.actor_target, source=self.actor)
                self.update_target(target=self.critic_target,
                                   source=self.critic)

                if done:
                    break
                state = next_state

            # logging
            if i_episode % self.logging_period == 0:
                print((f"{int(stats.episode_lengths[i_episode - 1])} Steps in"
                       f"Episode {i_episode}/{episodes}. "
                       f"Reward {stats.episode_rewards[i_episode-1]}"))

            # periodic snapshots of the actor/critic weights
            snapshot_episodes = (400, 800, 1200, 1600)
            if i_episode in snapshot_episodes:
                self.save_models(model_name=i_episode)

        return stats

    def train_batch(self, batch):
        states, actions, next_states, rewards, dones = batch
        batch_rewards = torch.FloatTensor(rewards).to(device)
        batch_states = torch.stack(states).to(device)
        batch_actions = torch.stack(actions).to(device)
        batch_next_states = torch.stack(next_states).to(device)

        batch_dones = torch.FloatTensor(dones).to(device)

        batch_na = self.actor_target(batch_next_states)
        batch_q_ns_na = self.critic_target(batch_next_states,
                                           batch_na.detach().view(-1, 1))
        # bootstrap only from non-terminal next states; keep the target fixed
        update_targets = (batch_rewards.view(-1, 1) + self.gamma *
                          (1.0 - batch_dones.view(-1, 1)) *
                          batch_q_ns_na).detach()
        batch_q_s_a = self.critic(batch_states, batch_actions.view(-1, 1))
        critic_loss = F.mse_loss(batch_q_s_a, update_targets)

        actor_loss = -self.critic(batch_states,
                                  self.actor(batch_states).view(-1, 1)).mean()
        # actor_loss = self.critic(batch_states,
        #                           self.actor(batch_states).view(-1, 1)).mean()

        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

    def save_models(self, model_name):
        self.modelio.save(model=self.actor,
                          model_name=f'ddpg_c_actor_{model_name}.pt')
        self.modelio.save(model=self.critic,
                          model_name=f'ddpg_c_critic_{model_name}.pt')

    def load_models(self, model_name):
        # if self._model
        self.modelio.load(model=self.actor,
                          model_name=f'ddpg_c_actor_{model_name}.pt')
        self.modelio.load(model=self.actor_target,
                          model_name=f'ddpg_c_actor_{model_name}.pt')
        self.modelio.load(model=self.critic,
                          model_name=f'ddpg_c_critic_{model_name}.pt')
        self.modelio.load(model=self.critic_target,
                          model_name=f'ddpg_c_critic_{model_name}.pt')
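
# --- Hypothetical usage sketch (not part of the original example) ---
# Assumes an environment matching the API train() uses above: reset() returns
# the observation alone, step() returns a 5-tuple, and actions are clamped to
# [-2, 2] (Pendulum-like, state_dim=3, action_dim=1). make_env is a
# hypothetical placeholder for such a constructor; all numbers are
# illustrative.
env = make_env()
agent = DDPG(state_dim=3, action_dim=1, gamma=0.99, verbose=True)
stats = agent.train(env, episodes=1600, timesteps=200)
agent.save_models(model_name="final")
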
Example #8
class ActorCritic:
    def __init__(self, state_dim, action_dim, gamma, d2c=None):
        self._V = StateValueFunction(state_dim)
        self._pi = Policy(state_dim, action_dim)
        self.d2c = d2c  # discrete to continuous actions
        # self._V.cuda()
        # self._pi.cuda()
        self._gamma = gamma
        self._loss_function = nn.MSELoss()
        self._V_optimizer = optim.Adam(self._V.parameters(), lr=0.001)
        self._pi_optimizer = optim.Adam(self._pi.parameters(), lr=0.0001)
        self._action_dim = action_dim
        # --- ModelIO ---
        self._modelio = ModelIO(model_path=Path(__file__).resolve().parent /
                                'models')
        self._baseline_model_name = 'ac_baseline.pt'
        self._policy_model_name = 'ac_policy.pt'

    def get_action(self, s):
        probs = self._pi(tt(s))
        action = np.random.choice(a=self._action_dim,
                                  p=np.squeeze(probs.detach().numpy()))
        log_prob = torch.log(probs.squeeze(0)[action])

        # converting the discrete action [0,1,2,...]
        # to an action in the continuous
        # range (actionspace.low <--> actionspace.high)
        if self.d2c:
            action = self.d2c(action)

        return action, log_prob

    def train(self, env, episodes, time_steps):
        stats = EpisodeStats(episode_lengths=np.zeros(episodes),
                             episode_rewards=np.zeros(episodes))

        for i_episode in range(1, episodes + 1):
            # Generate an episode, updating the critic and the actor online
            # at every step (no episode array is stored here)
            s = env.reset()
            compounded_decay = 1.0
            for t in range(time_steps):
                a, log_prob_a = self.get_action(s)
                ns, r, d, _ = env.step(a)

                stats.episode_rewards[i_episode - 1] += r
                stats.episode_lengths[i_episode - 1] = t

                target = r
                if not d:
                    target = target + self._gamma * self._V(
                        tt(ns)).cpu().detach()
                baseline = self._V(tt(s))
                # treat the advantage as a constant in the policy update
                advantage = (target - baseline).detach()
                self._train_baseline(target, baseline)
                self._train_policy(advantage, compounded_decay, log_prob_a)
                compounded_decay *= self._gamma

                if d:
                    break
                s = ns

            print(
                f"{int(stats.episode_lengths[i_episode-1])} Steps in Episode "
                f"{i_episode}/{episodes}. "
                f"Reward {stats.episode_rewards[i_episode-1]}")
        return stats

    def _train_policy(self, advantage, comp_decay, log_prob_a):
        self._pi_optimizer.zero_grad()
        neg_log_prob_a = -log_prob_a
        target_objective = comp_decay * advantage * neg_log_prob_a
        target_objective.backward()
        self._pi_optimizer.step()

    def _train_baseline(self, target, baseline):
        self._V_optimizer.zero_grad()
        loss = self._loss_function(tt(np.array([target])), baseline)
        loss.backward(retain_graph=True)
        self._V_optimizer.step()

    def save_models(self):
        self._modelio.save(model=self._pi, model_name=self._policy_model_name)
        self._modelio.save(model=self._V, model_name=self._baseline_model_name)

    def load_models(self):
        # if self._model
        self._modelio.load(model=self._pi, model_name=self._policy_model_name)
        self._modelio.load(model=self._V, model_name=self._baseline_model_name)
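
# --- Hypothetical usage sketch (not part of the original example) ---
# The policy here is discrete, and d2c maps a discrete action index onto a
# continuous command. make_env is a hypothetical placeholder for a
# Pendulum-like environment with the old 4-tuple gym step API; the 5-way
# torque grid and all numbers are illustrative.
env = make_env()
torques = np.linspace(-2.0, 2.0, num=5)
agent = ActorCritic(state_dim=3, action_dim=5, gamma=0.99,
                    d2c=lambda a: np.array([torques[a]]))
stats = agent.train(env, episodes=1000, time_steps=200)
agent.save_models()
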
Example #9
class DQN:
    def __init__(self, state_dim, action_dim, gamma, d2c=None):
        self._q = Q(state_dim, action_dim)
        self._q_target = Q(state_dim, action_dim)

        # self._q.cuda()
        # self._q_target.cuda()

        self._gamma = gamma
        self._loss_function = nn.MSELoss()
        self._q_optimizer = optim.Adam(self._q.parameters(), lr=0.0001)
        self._action_dim = action_dim
        self._replay_buffer = ReplayBuffer(5000)
        self._d2c = d2c
        # --- ModelIO ---
        self._modelio = ModelIO(model_path=Path(__file__).resolve().parent /
                                'models')
        self._q_model_name = 'q.pt'
        self._target_model_name = 'target.pt'

    def get_action(self, x, epsilon):
        u = np.argmax(self._q(tt(x)).cpu().detach().numpy())
        r = np.random.uniform()
        if r < epsilon:
            u = np.random.randint(self._action_dim)
        if self._d2c:
            u = self._d2c(u)
        return u

    def train(self, env, episodes, time_steps, epsilon):
        stats = EpisodeStats(episode_lengths=np.zeros(episodes),
                             episode_rewards=np.zeros(episodes))

        for i_episode in range(1, episodes + 1):
            state = env.reset()
            for t in range(time_steps):
                action = self.get_action(state, epsilon)
                next_state, reward, done, _ = env.step(action)

                stats.episode_rewards[i_episode - 1] += reward
                stats.episode_lengths[i_episode - 1] = t
                # calculate priority of the transition (td-error)
                q_s_a = self._q(tt(state)).cpu().detach().numpy()[int(
                    np.squeeze(action))]
                target = reward + self._gamma * np.max(
                    self._q_target(tt(next_state)).cpu().detach().numpy())
                priority = -abs(target - q_s_a) + np.random.randn() * 1e-2
                # add the experience into the buffer
                self._replay_buffer.add_transition(state=state,
                                                   action=action,
                                                   next_state=next_state,
                                                   reward=reward,
                                                   done=done,
                                                   priority=priority)
                # sample a batch of experiences
                samples = self._replay_buffer.next_batch(batch_size=64)

                # update q network parameters
                self._train_batch(samples)
                # update target network periodically/slowly
                soft_update(target=self._q_target, source=self._q, tau=0.01)

                if done:
                    break
                state = next_state
            print(
                f"{int(stats.episode_lengths[i_episode-1])} Steps in Episode "
                f"{i_episode}/{episodes}. "
                f"Reward {stats.episode_rewards[i_episode-1]}")

        return stats

    def _train_batch(self, batch):

        states, actions, next_states, rewards, dones = batch
        batch_size = len(rewards)
        # calculating q(s,a)
        batch_actions = np.array(actions).squeeze()
        batch_qs = self._q(tt(np.array(states)))
        batch_qs = batch_qs[np.arange(batch_size), batch_actions]
        # calculating r + gamma * max_a' q(s', a') for non-terminal transitions
        targets = tt(np.array(rewards))
        # dones marks terminal transitions, so invert it to get the mask of
        # states that should bootstrap from the target network
        non_terminal_idx = np.logical_not(np.array(dones))
        batch_next_qs = self._q_target(tt(np.array(next_states)))
        batch_max_next_qs, _ = batch_next_qs.max(1)
        targets[non_terminal_idx] = targets[
            non_terminal_idx] + self._gamma * batch_max_next_qs[
                non_terminal_idx]

        self._q_optimizer.zero_grad()
        loss = self._loss_function(batch_qs, targets)
        loss.backward()
        self._q_optimizer.step()

    def save_models(self):
        self._modelio.save(model=self._q, model_name=self._q_model_name)
        self._modelio.save(model=self._q_target,
                           model_name=self._target_model_name)

    def load_models(self):
        # if self._model
        self._modelio.load(model=self._q, model_name=self._q_model_name)
        self._modelio.load(model=self._q_target,
                           model_name=self._target_model_name)
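
# --- Hypothetical usage sketch (not part of the original example) ---
# Assumes a discrete-action environment with the old 4-tuple gym step API,
# e.g. a CartPole-style task (state_dim=4, action_dim=2). make_env is a
# hypothetical placeholder; epsilon and the episode budget are illustrative.
env = make_env()
agent = DQN(state_dim=4, action_dim=2, gamma=0.99)
stats = agent.train(env, episodes=300, time_steps=200, epsilon=0.1)
agent.save_models()
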
Example #10
class PPO:
    def __init__(self,
                 state_dim,
                 action_dim,
                 action_std=0.1,
                 gamma=0.99,
                 hidden_dim=64,
                 actor_lr=0.001,
                 critic_lr=0.001,
                 K_epochs=5,
                 eps_clip=0.2,
                 entropy_coeff=0.02,
                 verbose=False):
        self.gamma = gamma
        self.eps_clip = eps_clip
        self.K_epochs = K_epochs
        self.entropy_coeff = entropy_coeff
        self.verbose = verbose

        self.critic = Critic(state_dim, hidden_dim=hidden_dim).to(device)
        self.actor = Actor(state_dim,
                           action_dim,
                           action_std=action_std,
                           hidden_dim=hidden_dim).to(device)
        self.actor_old = Actor(state_dim,
                               action_dim,
                               action_std=action_std,
                               hidden_dim=hidden_dim).to(device)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(),
                                                lr=actor_lr)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(),
                                                 lr=critic_lr)
        self.actor_old.load_state_dict(self.actor.state_dict())
        self.buffer = Buffer()
        # --- ModelIO ---
        self.modelio = ModelIO(model_path=Path(__file__).resolve().parent /
                               'models')

    def get_action(self, state):
        """
        Used at test time (not during training).
        """
        action, _ = self.actor_old(state)
        action = torch.clamp(action, min=-1.0, max=1.0).detach().numpy()
        return action

    def train(self, env, episodes, timesteps, update_timestep):
        stats = EpisodeStats(episode_lengths=np.zeros(episodes),
                             episode_rewards=np.zeros(episodes))
        timestep = 0
        for i_episode in range(1, episodes + 1):
            state = env.reset()
            for t in range(timesteps):
                timestep += 1

                # Running policy_old:
                action, action_logprob = self.actor_old(state)
                env_action = torch.clamp(action, min=-1.0,
                                         max=1.0).detach().numpy()
                next_state, reward, done, _ = env.step(env_action)

                # saving stats
                stats.episode_rewards[i_episode - 1] += reward
                stats.episode_lengths[i_episode - 1] = t

                # Saving the experience in buffer:
                self.buffer.states.append(
                    torch.from_numpy(state).float().to(device))
                self.buffer.actions.append(action)
                self.buffer.logprobs.append(action_logprob)
                self.buffer.rewards.append(reward)
                self.buffer.is_terminals.append(done)

                # update if it's time
                if timestep % update_timestep == 0:
                    self.update()
                    self.buffer.clear_buffer()
                    timestep = 0

                if done:
                    break
                state = next_state

            # logging
            logging_period = 10 if self.verbose else 1000
            if i_episode % logging_period == 0:
                print((f"{int(stats.episode_lengths[i_episode - 1])} Steps in "
                       f"Episode {i_episode}/{episodes}. "
                       f"Reward {stats.episode_rewards[i_episode-1]}"))

        return stats

    def update(self):
        # Monte Carlo estimate of the return over all steps
        # (possibly across episodes):
        rewards = np.zeros_like(self.buffer.rewards, dtype=np.float32)
        discounted_reward = 0
        for i, (reward, is_terminal) in enumerate(
                zip(reversed(self.buffer.rewards),
                    reversed(self.buffer.is_terminals))):
            if is_terminal:
                discounted_reward = 0
            discounted_reward = reward + (self.gamma * discounted_reward)
            rewards[-(i + 1)] = discounted_reward

        # Normalizing the rewards:
        rewards = torch.tensor(rewards).to(device)
        rewards = (rewards - rewards.mean()) / (rewards.std() + 1e-5)

        # convert list to tensor
        old_states = torch.stack(self.buffer.states).to(device).detach()
        old_actions = torch.stack(self.buffer.actions).to(device).detach()
        old_logprobs = torch.stack(self.buffer.logprobs).to(device).detach()

        # Optimize policy for K epochs:
        for _ in range(self.K_epochs):
            # Evaluating old actions and values:
            logprobs, dist_entropy = self.actor.evaluate(
                old_states, old_actions)

            # getting the state_values from the critic
            state_values = self.critic(old_states)

            # Finding the ratio (pi_theta / pi_theta__old):
            ratios = torch.exp(logprobs - old_logprobs.detach())

            # Finding Surrogate Loss:
            advantages = rewards - state_values.detach()
            surr1 = ratios * advantages
            surr2 = torch.clamp(ratios, 1 - self.eps_clip,
                                1 + self.eps_clip) * advantages
            actor_loss = -torch.min(surr1,
                                    surr2) - self.entropy_coeff * dist_entropy
            critic_loss = 0.5 * F.mse_loss(state_values, rewards)

            # take gradient step (actor)
            self.actor_optimizer.zero_grad()
            actor_loss.mean().backward()
            self.actor_optimizer.step()

            # take gradient step (critic)
            self.critic_optimizer.zero_grad()
            critic_loss.backward()
            self.critic_optimizer.step()

        # Copy new weights into old policy:
        self.actor_old.load_state_dict(self.actor.state_dict())

    def save_models(self, model_name):
        self.modelio.save(model=self.actor,
                          model_name=f'ppo_c_actor_{model_name}.pt')
        self.modelio.save(model=self.critic,
                          model_name=f'ppo_c_critic_{model_name}.pt')

    def load_models(self, model_name):
        # if self._model
        self.modelio.load(model=self.actor,
                          model_name=f'ppo_c_actor_{model_name}.pt')
        self.modelio.load(model=self.actor_old,
                          model_name=f'ppo_c_actor_{model_name}.pt')
        self.modelio.load(model=self.critic,
                          model_name=f'ppo_c_critic_{model_name}.pt')
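
# --- Hypothetical usage sketch (not part of the original example) ---
# Assumes a gym-style continuous-control environment with the old 4-tuple
# step API and actions in [-1, 1] (matching the clamp in train()). make_env
# is a hypothetical placeholder; update_timestep sets how many environment
# steps are collected between PPO updates, and all numbers are illustrative.
env = make_env()
agent = PPO(state_dim=env.observation_space.shape[0],
            action_dim=env.action_space.shape[0],
            action_std=0.1, verbose=True)
stats = agent.train(env, episodes=2000, timesteps=200, update_timestep=2000)
agent.save_models(model_name="final")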