def _train_nstep(self):
        '''
            Episodic training loop for synchronous training over multiple environments
        '''
        start = time.time()
        num_updates = self.total_steps // (self.num_envs * self.nsteps)
        s = 0
        R_std = np.ones((len(self.env)))
        rolling = rolling_stats(R_std)
        # main loop
        for t in range(1,num_updates+1):
            states, next_states, actions, extr_rewards, intr_rewards, hidden_batch, dones, infos, extr_values, intr_values = self.runner.run()
            R_extr = self.multistep_target(extr_rewards, extr_values, dones, clip=False)
            R_intr = self.multistep_target(intr_rewards, intr_values, np.zeros_like(dones), clip=False)
            R_mean, R_std = rolling.update(R_intr.mean(axis=0))
            self.runner.R_std = R_std
                
            # stack all states, next_states, actions and Rs across all workers into a single batch
            states, next_states, actions, R_extr, R_intr = fold_batch(states), fold_batch(next_states), fold_batch(actions), fold_batch(R_extr), fold_batch(R_intr)
            l = self.model.backprop(states, next_states, R_extr, R_intr, actions, hidden_batch[0], dones)

            if self.render_freq > 0 and t % (self.validate_freq * self.render_freq) == 0:
                render = True
            else:
                render = False
     
            if self.validate_freq > 0 and t % self.validate_freq == 0:
                self.validation_summary(t,l,start,render)
                start = time.time()
            
            if self.save_freq > 0 and  t % self.save_freq == 0:
                s += 1
                self.saver.save(self.sess, str(self.model_dir + self.current_time + '/' + str(s) + ".ckpt") )
                print('saved model')
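The examples below all rely on a pair of small array helpers, fold_batch and unfold_batch, that are not included in these excerpts. Judging from how they are called, a minimal sketch consistent with that usage (the real utility functions may differ) is:

import numpy as np

def fold_batch(x):
    # collapse the leading [time, num_envs] axes into a single batch axis:
    # [time, num_envs, *shape] -> [time * num_envs, *shape]
    x = np.asarray(x)
    return x.reshape(x.shape[0] * x.shape[1], *x.shape[2:])

def unfold_batch(x, time, num_envs):
    # inverse of fold_batch: [time * num_envs, *shape] -> [time, num_envs, *shape]
    x = np.asarray(x)
    return x.reshape(time, num_envs, *x.shape[1:])

fold_batch flattens a rollout into one batch for a single backprop call; unfold_batch restores the per-step layout, as in unfold_batch(intr_rewards, self.num_steps, len(self.env)) in the next example.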
Example #2
        def run(self):
            rollout = []
            for t in range(self.num_steps):
                policies, values_extr, values_intr = self.model.forward(
                    self.states)
                actions = [
                    np.random.choice(policies.shape[1], p=policies[i])
                    for i in range(policies.shape[0])
                ]
                next_states, extr_rewards, dones, infos = self.env.step(
                    actions)
                rollout.append(
                    (self.states, next_states, actions, extr_rewards,
                     values_extr, values_intr, policies, dones))
                self.states = next_states

            states, next_states, actions, extr_rewards, values_extr, values_intr, policies, dones = stack_many(
                zip(*rollout))
            intr_rewards = self.model.intrinsic_reward(fold_batch(states),
                                                       fold_batch(actions),
                                                       fold_batch(next_states),
                                                       self.state_mean,
                                                       self.state_std)
            intr_rewards = unfold_batch(intr_rewards, self.num_steps,
                                        len(self.env))
            return states, next_states, actions, extr_rewards, intr_rewards, values_extr, values_intr, policies, dones
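stack_many is another helper these runners assume: it takes the transposed rollout (zip(*rollout)) and stacks each field along a new leading time axis. A plausible sketch under that assumption:

import numpy as np

def stack_many(columns):
    # turn an iterable of per-field sequences, e.g. zip(*rollout),
    # into a tuple of arrays stacked along a new leading time axis
    return tuple(np.stack(col) for col in columns)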
Example #3
    def _train_nstep(self):
        batch_size = (self.num_envs * self.nsteps)
        start = time.time()
        num_updates = self.total_steps // batch_size
        s = 0
        self.populate_memory()

        # main loop
        for t in range(1,num_updates+1):
            states, actions, rewards, hidden_init, prev_acts_rewards, dones, last_values = self.rollout()

            R = self.nstep_return(rewards, last_values, dones, clip=False)
            # stack all states, actions and Rs across all workers into a single batch
            prev_acts_rewards, actions, rewards, R = fold_batch(prev_acts_rewards), fold_batch(actions), fold_batch(rewards), fold_batch(R)
            
            reward_states, sample_rewards = self.sample_reward()
            replay_states, replay_actions, replay_R, Qaux_target, replay_hidden, replay_actsrews, replay_dones = self.sample_replay()
                        
            l = self.model.backprop(states, R, actions, hidden_init, dones, prev_acts_rewards,
                reward_states, sample_rewards, Qaux_target, replay_actions, replay_states, replay_R, replay_hidden, replay_dones, replay_actsrews)
            
            if self.render_freq > 0 and t % ((self.validate_freq // batch_size) * self.render_freq) == 0:
                render = True
            else:
                render = False
     
            if self.validate_freq > 0 and t % (self.validate_freq //batch_size) == 0:
                self.validation_summary(t,l,start,render)
                start = time.time()
            
            if self.save_freq > 0 and  t % (self.save_freq // batch_size) == 0:
                s += 1
                self.saver.save(s)
                print('saved model')
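nstep_return here (and multistep_target in the first example) appears to compute bootstrapped n-step return targets over the [time, num_envs] rollout, working backwards from the last value estimates and cutting the bootstrap at episode boundaries. A minimal sketch of that computation, with the clip flag assumed to clip raw rewards to [-1, 1]:

import numpy as np

def nstep_return(rewards, last_values, dones, gamma=0.99, clip=False):
    # rewards, dones: [time, num_envs]; last_values: [num_envs]
    # R_t = r_t + gamma * R_{t+1} * (1 - done_t), seeded with the bootstrap values
    if clip:
        rewards = np.clip(rewards, -1, 1)
    T = len(rewards)
    R = np.zeros_like(rewards, dtype=np.float64)
    running = last_values
    for t in reversed(range(T)):
        running = rewards[t] + gamma * running * (1.0 - dones[t])
        R[t] = running
    return R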
Example #4
    def sample_replay(self):
        workers = np.random.choice(self.num_envs, replace=False, size=2) # randomly sample 2 of the n workers
        sample_start = np.random.randint(1, len(self.replay) - self.nsteps -2)
        replay_sample = []
        for i in range(sample_start, sample_start+self.nsteps):
            replay_sample.append(self.replay[i])
                
        replay_states = np.stack([replay_sample[i][0][workers] for i in range(len(replay_sample))])
        replay_actions = np.stack([replay_sample[i][1][workers] for i in range(len(replay_sample))])
        replay_rewards = np.stack([replay_sample[i][2][workers] for i in range(len(replay_sample))])
        replay_values = np.stack([replay_sample[i][3][workers] for i in range(len(replay_sample))])
        replay_dones = np.stack([replay_sample[i][4][workers] for i in range(len(replay_sample))])
        #print('replay dones shape', replay_dones.shape)
        #print('replay_values shape', replay_values.shape)
        
        next_state = self.replay[sample_start+self.nsteps][0][workers] # get state 
        _, replay_last_values = self.model.evaluate(next_state)
        replay_R = GAE(replay_rewards, replay_values, replay_last_values, replay_dones, gamma=0.99, lambda_=0.95) + replay_values

        if self.model.pixel_control:
            prev_states = self.replay[sample_start-1][0][workers]
            Qaux_value = self.model.get_pixel_control(next_state)
            pixel_rewards = self.pixel_rewards(prev_states, replay_states)
            Qaux_target = self.auxiliary_target(pixel_rewards, np.max(Qaux_value, axis=1), replay_dones)
        else:
            Qaux_target = np.zeros((len(replay_states),1,1,1)) # produce fake Qaux to save writing unnecessary code
        
        return fold_batch(replay_states), fold_batch(replay_actions), fold_batch(replay_R), fold_batch(Qaux_target), fold_batch(replay_dones)
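GAE above is called as GAE(rewards, values, last_values, dones, gamma, lambda_) and returns advantages, which the caller adds back onto the values to form the replay value targets. A standard generalised advantage estimation sketch matching that calling convention:

import numpy as np

def GAE(rewards, values, last_values, dones, gamma=0.99, lambda_=0.95):
    # rewards, values, dones: [time, num_envs]; last_values: [num_envs]
    # delta_t = r_t + gamma * V_{t+1} * (1 - done_t) - V_t
    # A_t = delta_t + gamma * lambda * (1 - done_t) * A_{t+1}
    T = len(rewards)
    advs = np.zeros_like(rewards, dtype=np.float64)
    lastgaelam = np.zeros_like(last_values, dtype=np.float64)
    for t in reversed(range(T)):
        next_values = last_values if t == T - 1 else values[t + 1]
        nonterminal = 1.0 - dones[t]
        delta = rewards[t] + gamma * next_values * nonterminal - values[t]
        lastgaelam = delta + gamma * lambda_ * nonterminal * lastgaelam
        advs[t] = lastgaelam
    return advs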
Example #5
 def update(self, x):
     if self.lastFrame:  # assume image obs
         return self.rolling.update(fold_batch(
             x[...,
               -1:]))  #[time,batch,height,width,stack] -> [height, width,1]
     else:
         return self.rolling.update(
             fold_batch(x))  #[time,batch,*shape] -> [*shape]
Example #6
    def _train_nstep(self):
        '''
            Template for a multi-step training loop for synchronous training over multiple environments
        '''
        start = time.time()
        batch_size = self.num_envs * self.nsteps
        num_updates = self.total_steps // batch_size
        # main loop
        for t in range(self.t, num_updates + 1):
            states, actions, rewards, dones, infos, values, last_values = self.runner.run(
            )
            if self.return_type == 'nstep':
                R = self.nstep_return(rewards,
                                      last_values,
                                      dones,
                                      gamma=self.gamma)
            elif self.return_type == 'GAE':
                R = self.GAE(rewards,
                             values,
                             last_values,
                             dones,
                             gamma=self.gamma,
                             lambda_=self.lambda_) + values
            elif self.return_type == 'lambda':
                R = self.lambda_return(rewards,
                                       values,
                                       last_values,
                                       dones,
                                       gamma=self.gamma,
                                       lambda_=self.lambda_,
                                       clip=False)
            # stack all states, actions and Rs from all workers into a single batch
            states, actions, R = fold_batch(states), fold_batch(
                actions), fold_batch(R)
            l = self.model.backprop(states, R, actions)

            if self.render_freq > 0 and t % (
                (self.validate_freq // batch_size) * self.render_freq) == 0:
                render = True
            else:
                render = False

            if self.validate_freq > 0 and t % (self.validate_freq //
                                               batch_size) == 0:
                self.validation_summary(t, l, start, render)
                start = time.time()

            if self.save_freq > 0 and t % (self.save_freq // batch_size) == 0:
                self.s += 1
                self.save(self.s)
                print('saved model')

            if self.target_freq > 0 and t % (
                    self.target_freq // batch_size
            ) == 0:  # update target network (for value based learning e.g. DQN)
                self.update_target()

            self.t += 1
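The 'lambda' branch calls lambda_return(rewards, values, last_values, dones, gamma, lambda_, clip), presumably a TD(lambda) style mixture of n-step returns. A sketch under that assumption:

import numpy as np

def lambda_return(rewards, values, last_values, dones, gamma=0.99, lambda_=0.95, clip=False):
    # G_t = r_t + gamma * (1 - done_t) * ((1 - lambda) * V_{t+1} + lambda * G_{t+1})
    if clip:
        rewards = np.clip(rewards, -1, 1)
    T = len(rewards)
    R = np.zeros_like(rewards, dtype=np.float64)
    G = last_values
    for t in reversed(range(T)):
        next_values = last_values if t == T - 1 else values[t + 1]
        G = rewards[t] + gamma * (1.0 - dones[t]) * ((1.0 - lambda_) * next_values + lambda_ * G)
        R[t] = G
    return R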
Example #7
    def _train_nstep(self):
        batch_size = (self.num_envs * self.nsteps)
        start = time.time()
        num_updates = self.total_steps // batch_size
        s = 0
        # main loop
        for t in range(1, num_updates + 1):
            states, actions, rewards, hidden_batch, dones, infos, values, last_values = self.runner.run(
            )

            if self.return_type == 'nstep':
                R = self.nstep_return(rewards,
                                      last_values,
                                      dones,
                                      gamma=self.gamma)
            elif self.return_type == 'GAE':
                R = self.GAE(rewards,
                             values,
                             last_values,
                             dones,
                             gamma=self.gamma,
                             lambda_=self.lambda_) + values
            elif self.return_type == 'lambda':
                R = self.lambda_return(rewards,
                                       values,
                                       last_values,
                                       dones,
                                       gamma=self.gamma,
                                       lambda_=self.lambda_)

            # stack all states, actions and Rs across all workers into a single batch
            states, actions, R = fold_batch(states), fold_batch(
                actions), fold_batch(R)
            l = self.model.backprop(states, R, actions, hidden_batch[0], dones)

            if self.render_freq > 0 and t % (
                (self.validate_freq // batch_size) * self.render_freq) == 0:
                render = True
            else:
                render = False

            if self.validate_freq > 0 and t % (self.validate_freq //
                                               batch_size) == 0:
                self.validation_summary(t, l, start, render)
                start = time.time()

            if self.save_freq > 0 and t % (self.save_freq // batch_size) == 0:
                s += 1
                self.saver.save(self.sess,
                                str(self.model_dir + str(s) + ".ckpt"))
                print('saved model')
Example #8
    def train_nstep(self):
        batch_size = self.num_envs * self.nsteps
        num_updates = self.total_steps // batch_size
        # main loop
        start = time.time()
        for t in range(self.t, num_updates + 1):
            states, locs, actions, rewards, dones, infos, values, last_values = self.rollout(
            )
            if self.return_type == 'nstep':
                R = self.nstep_return(rewards,
                                      last_values,
                                      dones,
                                      gamma=self.gamma)
            elif self.return_type == 'GAE':
                R = self.GAE(rewards,
                             values,
                             last_values,
                             dones,
                             gamma=self.gamma,
                             lambda_=self.lambda_) + values
            elif self.return_type == 'lambda':
                R = self.lambda_return(rewards,
                                       values,
                                       last_values,
                                       dones,
                                       gamma=self.gamma,
                                       lambda_=self.lambda_,
                                       clip=False)
            # stack all states, actions and Rs from all workers into a single batch
            states, locs, actions, R = fold_batch(states), fold_batch(
                locs), fold_batch(actions), fold_batch(R)
            #print('locs', locs.shape)
            l = self.model.backprop(states, locs, R, actions)

            if self.validate_freq > 0 and t % (self.validate_freq //
                                               batch_size) == 0:
                self.validation_summary(t, l, start, False)
                start = time.time()

            if self.save_freq > 0 and t % (self.save_freq // batch_size) == 0:
                self.s += 1
                self.save(self.s)
                print('saved model')

            if self.target_freq > 0 and t % (
                    self.target_freq // batch_size
            ) == 0:  # update target network (for value based learning e.g. DQN)
                self.update_target()

            self.t += 1
Example #9
    def _train_nstep(self):
        batch_size = self.num_envs * self.nsteps
        num_updates = self.total_steps // batch_size
        s = 0
        self.state_min = 0
        self.state_max = 0
        self.populate_memory()
        # main loop
        start = time.time()
        for t in range(1, num_updates + 1):
            states, actions, rewards, values, dones, infos, last_values = self.runner.run(
            )

            # R = self.nstep_return(rewards, last_values, dones, clip=False)
            R = self.GAE(
                rewards, values, last_values, dones, gamma=0.99,
                lambda_=0.95) + values

            # stack all states, actions and Rs across all workers into a single batch
            states, actions, rewards, R = fold_batch(states), fold_batch(
                actions), fold_batch(rewards), fold_batch(R)

            #self.state_mean, self.state_std = self.obs_running.update(states[...,-1:]) # update state normalisation statistics
            self.update_minmax(states)

            reward_states, sample_rewards = self.sample_reward()
            replay_states, replay_actions, replay_R, Qaux_target, replay_dones = self.sample_replay(
            )

            l = self.model.backprop(states, R, actions, dones, reward_states,
                                    sample_rewards, Qaux_target,
                                    replay_actions, replay_states, replay_R,
                                    replay_dones)

            if self.render_freq > 0 and t % (
                (self.validate_freq // batch_size) * self.render_freq) == 0:
                render = True
            else:
                render = False

            if self.validate_freq > 0 and t % (self.validate_freq //
                                               batch_size) == 0:
                self.validation_summary(t, l, start, render)
                start = time.time()

            if self.save_freq > 0 and t % (self.save_freq // batch_size) == 0:
                s += 1
                self.saver.save(self.sess,
                                str(self.model_dir + '/' + str(s) + ".ckpt"))
                print('saved model')
Example #10
    def _train_nstep(self):
        num_updates = self.total_steps // (self.num_envs * self.nsteps)
        s = 0
        self.state_mean, self.state_std = self.state_obs.update(
            self.init_state_obs(10000 // self.num_envs))
        self.states = self.env.reset()
        print(self.state_mean.shape, self.state_std.shape)
        start = time.time()
        # main loop
        batch_size = self.num_envs * self.nsteps
        for t in range(1, num_updates + 1):
            states, next_states, actions, rewards, dones, values = self.rollout(
            )
            _, last_values = self.model.evaluate(next_states[-1])

            R = self.nstep_return(rewards, last_values, dones)
            Adv = R - values
            #delta = rewards + self.gamma * values[:-1] - values[1:]
            #Adv = self.multistep_target(delta, values[-1], dones, gamma=self.gamma*self.lambda_)

            # stack all states, next_states, actions and Rs across all workers into a single batch
            states, next_states, actions, R, Adv = fold_batch(
                states), fold_batch(next_states), fold_batch(
                    actions), fold_batch(R), fold_batch(Adv)
            mean, std = self.state_mean, self.state_std

            l = self.model.backprop(states, next_states, R, Adv, actions, mean,
                                    std)

            # self.state_mean, self.state_std = self.state_obs.update(states)

            if self.render_freq > 0 and t % (self.validate_freq *
                                             self.render_freq) == 0:
                render = True
            else:
                render = False

            if self.validate_freq > 0 and t % (self.validate_freq //
                                               batch_size) == 0:
                self.validation_summary(t, l, start, render)
                start = time.time()

            if self.save_freq > 0 and t % (self.save_freq // batch_size) == 0:
                s += 1
                self.saver.save(
                    self.sess,
                    str(self.model_dir + self.current_time + '/' + str(s) +
                        ".ckpt"))
                print('saved model')
Example #11
        def run(self):
            rollout = []
            for t in range(self.num_steps):
                Qsa = self.Q.forward(self.states)
                actions = np.argmax(Qsa, axis=1)
                random = np.random.uniform(size=(self.num_envs))
                random_actions = np.random.randint(self.action_size,
                                                   size=(self.num_envs))
                actions = np.where(random < self.epsilon, random_actions,
                                   actions)
                next_states, rewards, dones, infos = self.env.step(actions)
                rollout.append((self.states, actions, rewards, dones, infos))
                self.states = next_states
                self.schedule.step()
                #print('epsilon', self.epsilon)

            states, actions, rewards, dones, infos = zip(*rollout)
            states, actions, rewards, dones = np.stack(states), np.stack(
                actions), np.stack(rewards), np.stack(dones)
            TargetQsa = unfold_batch(self.TargetQ.forward(fold_batch(states)),
                                     self.num_steps,
                                     self.num_envs)  # Q(s,a; theta-1)
            values = np.sum(TargetQsa * one_hot(actions, self.action_size),
                            axis=-1)  # Q(s_t, a_t; theta-1) for the actions taken

            last_actions = np.argmax(self.Q.forward(next_states), axis=1)
            last_TargetQsa = self.TargetQ.forward(
                next_states)  # Q(s,a; theta-1)
            last_values = np.sum(
                last_TargetQsa * one_hot(last_actions, self.action_size),
                axis=-1)  # Q(s, argmax_a Q(s,a; theta); theta-1)
            return states, actions, rewards, dones, infos, values, last_values
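The Q-learning runner above selects target Q-values with one_hot(actions, self.action_size); with TargetQsa shaped [time, num_envs, action_size], the product summed over the last axis picks out the target Q-value of each action actually taken. A minimal NumPy one_hot consistent with that usage (the original helper is not shown):

import numpy as np

def one_hot(indices, depth):
    # map integer indices of any shape to one-hot vectors with a trailing
    # axis of size depth, e.g. [time, num_envs] -> [time, num_envs, depth]
    return np.eye(depth)[np.asarray(indices, dtype=np.int64)]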
Example #12
 def _train_nstep(self):
     batch_size = self.num_envs * self.nsteps
     num_updates = self.total_steps // batch_size
     alpha_step = 1/num_updates
     s = 0
     mini_batch_size = self.nsteps//self.num_minibatches
     start = time.time()
     # main loop
     for t in range(1,num_updates+1):
         states, actions, rewards, values, last_values, old_policies, dones, infos = self.runner.run()
         Adv = self.GAE(rewards, values, last_values, dones, gamma=0.99, lambda_=self.lambda_)
         R = Adv + values
         l = 0
         idxs = np.arange(len(states))
         for epoch in range(self.num_epochs):
             np.random.shuffle(idxs)
             for batch in range(0,len(states), mini_batch_size):
                 batch_idxs = idxs[batch:batch + mini_batch_size]
                 # stack all states, actions and Rs across all workers into a single batch
                 mb_states, mb_actions, mb_R, mb_Adv, mb_old_policies = fold_batch(states[batch_idxs]), \
                                                 fold_batch(actions[batch_idxs]), fold_batch(R[batch_idxs]), \
                                                 fold_batch(Adv[batch_idxs]), fold_batch(old_policies[batch_idxs])
                 
                 l += self.model.backprop(mb_states, mb_R, mb_Adv, mb_actions, mb_old_policies, self.alpha)
         
         l /= (self.num_epochs*self.num_minibatches)
        
         #self.alpha -= alpha_step
         
         if self.render_freq > 0 and t % ((self.validate_freq  // batch_size) * self.render_freq) == 0:
             render = True
         else:
             render = False
     
         if self.validate_freq > 0 and t % (self.validate_freq // batch_size) == 0:
             self.validation_summary(t,l,start,render)
             start = time.time()
         
         if self.save_freq > 0 and  t % (self.save_freq // batch_size) == 0:
             s += 1
             self.saver.save(self.sess, str(self.model_dir + '/' + str(s) + ".ckpt") )
             print('saved model')
Example #13
    def _train_nstep(self):
        start = time.time()
        num_updates = self.total_steps // (self.num_envs * self.nsteps)
        s = 0
        #self.validate(self.val_envs[0], 1, 1000)
        self.populate_memory()
        # main loop
        for t in range(1, num_updates + 1):
            states, actions, rewards, values, dones, last_values, prev_state, Qaux = self.runner.run(
            )
            self.state_mean, self.state_std = self.obs_running.update(
                fold_batch(states).mean(axis=0)[:, :, -1:])
            pixel_rewards = self.pixel_rewards(prev_state, states)
            pix_rew_mean, pix_rew_std = self.aux_reward_rolling.update(
                self.auxiliary_target(pixel_rewards, np.max(Qaux, axis=-1),
                                      dones).mean())
            Qaux_target = self.auxiliary_target(pixel_rewards / pix_rew_std,
                                                np.max(Qaux, axis=-1), dones)

            # R = self.nstep_return(rewards, last_values, dones, clip=False)
            Adv = self.GAE(rewards,
                           values,
                           last_values,
                           dones,
                           gamma=0.99,
                           lambda_=0.95)
            R = Adv + values
            #self.print_stats('R', R)
            #self.print_stats('Adv', Adv)
            # stack all states, actions and Rs across all workers into a single batch
            states, actions, rewards, R, Adv, Qaux_target = fold_batch(
                states), fold_batch(actions), fold_batch(rewards), fold_batch(
                    R), fold_batch(Adv), fold_batch(Qaux_target)

            reward_states, sample_rewards = self.sample_reward()
            #replay_states, replay_actions, replay_R, Qaux_target, replay_dones = self.sample_replay()

            l = self.model.backprop(states, R, Adv, actions, dones,
                                    reward_states, sample_rewards, Qaux_target)

            if self.render_freq > 0 and t % (self.validate_freq *
                                             self.render_freq) == 0:
                render = True
            else:
                render = False

            if self.validate_freq > 0 and t % self.validate_freq == 0:
                self.validation_summary(t, l, start, render)
                start = time.time()

            if self.save_freq > 0 and t % self.save_freq == 0:
                s += 1
                self.saver.save(
                    self.sess,
                    str(self.model_dir + self.current_time + '/' + str(s) +
                        ".ckpt"))
                print('saved model')
Example #14
    def sample_replay(self):
        sample_start = np.random.randint(1, len(self.replay) - self.nsteps - 2)
        replay_sample = []
        for i in range(sample_start, sample_start + self.nsteps):
            replay_sample.append(self.replay[i])

        replay_states = np.stack(
            [replay_sample[i][0] for i in range(len(replay_sample))])
        replay_actions = np.stack(
            [replay_sample[i][1] for i in range(len(replay_sample))])
        replay_rewards = np.stack(
            [replay_sample[i][2] for i in range(len(replay_sample))])
        replay_values = np.stack(
            [replay_sample[i][3] for i in range(len(replay_sample))])
        replay_dones = np.stack(
            [replay_sample[i][4] for i in range(len(replay_sample))])
        #print('replay_hiddens dones shape', replay_dones.shape)

        next_state = self.replay[sample_start + self.nsteps][0]  # get state
        _, replay_last_values = self.model.forward(next_state)
        replay_R = self.GAE(replay_rewards,
                            replay_values,
                            replay_last_values,
                            replay_dones,
                            gamma=0.99,
                            lambda_=0.95) + replay_values

        if self.model.pixel_control:
            prev_states = self.replay[sample_start - 1][0]
            Qaux_value = self.model.get_pixel_control(next_state)
            pixel_rewards = self.pixel_rewards(prev_states, replay_states)
            Qaux_target = self.auxiliary_target(pixel_rewards,
                                                np.max(Qaux_value, axis=-1),
                                                replay_dones)
        else:
            Qaux_target = np.zeros(
                (len(replay_states), 1, 1,
                 1))  # produce fake Qaux to save writing unnecessary code

        return fold_batch(replay_states), fold_batch(
            replay_actions), fold_batch(replay_R), fold_batch(
                Qaux_target), fold_batch(replay_dones)
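auxiliary_target(pixel_rewards, np.max(Qaux_value, axis=-1), replay_dones) builds the pixel-control targets used by these UNREAL-style snippets but is not shown in the excerpts. A rough sketch, assuming it is an n-step discounted return computed per spatial cell and bootstrapped from the max auxiliary Q-values:

import numpy as np

def auxiliary_target(pixel_rewards, last_values, dones, gamma=0.99):
    # pixel_rewards: [time, num_envs, H', W'], last_values: [num_envs, H', W'],
    # dones: [time, num_envs]; returns discounted per-cell targets for the Qaux head
    T = len(pixel_rewards)
    targets = np.zeros_like(pixel_rewards, dtype=np.float64)
    running = last_values
    for t in reversed(range(T)):
        nonterminal = (1.0 - dones[t]).reshape(-1, 1, 1)  # broadcast over spatial cells
        running = pixel_rewards[t] + gamma * running * nonterminal
        targets[t] = running
    return targets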
Example #15
    def _train_nstep(self):
        num_updates = self.total_steps // (self.num_envs * self.nsteps)
        s = 0
        self.runner.state_mean, self.runner.state_std = self.state_obs.update(self.init_state_obs(10000//self.num_envs))
        self.runner.states = self.env.reset()
        rolling = rolling_stats()
        start = time.time()
        # main loop
        for t in range(1,num_updates+1):
            states, next_states, actions, rewards, dones, values = self.runner.run()
            _, last_values = self.model.forward(next_states[-1])

            R_mean, R_std = rolling.update(self.nstep_return(rewards, last_values, dones).ravel().mean(axis=0))
            rewards /= R_std
            #print('rewards', rewards)

            R = self.nstep_return(rewards, last_values, dones)
            Adv = R - values
            #delta = rewards + self.gamma * values[:-1] - values[1:]
            #Adv = self.multistep_target(delta, values[-1], dones, gamma=self.gamma*self.lambda_)
                
            # stack all states, next_states, actions and Rs across all workers into a single batch
            states, next_states, actions, R, Adv = fold_batch(states), fold_batch(next_states), fold_batch(actions), fold_batch(R), fold_batch(Adv)
            mean, std = np.stack([self.runner.state_mean for i in range(4)], -1), np.stack([self.runner.state_std for i in range(4)], -1)
            
            l = self.model.backprop(states, next_states, R, Adv, actions, mean, std)
            
            #self.runner.state_mean, self.runner.state_std = self.state_obs.update(states)
            
            if self.render_freq > 0 and t % (self.validate_freq * self.render_freq) == 0:
                render = True
            else:
                render = False
     
            if self.validate_freq > 0 and t % self.validate_freq == 0:
                self.validation_summary(t,l,start,render)
                start = time.time()
            
            if self.save_freq > 0 and  t % self.save_freq == 0:
                s += 1
                self.saver.save(self.sess, str(self.model_dir + self.current_time + '/' + str(s) + ".ckpt") )
                print('saved model')
Example #16
 def init_state_obs(self, num_steps):
     rollout = []
     states = self.env.reset()
     for i in range(1, num_steps + 1):
         rand_actions = np.random.randint(0,
                                          self.model.action_size,
                                          size=self.num_envs)
         #print('rand_actions.shape', rand_actions.shape)
         next_states, rewards, dones, infos = self.env.step(rand_actions)
         rollout.append([states, next_states, rand_actions, rewards])
         states = next_states
         if i % self.nsteps == 0:
             mb_states, mb_next_states, mb_actions, mb_rewards = stack_many(
                 zip(*rollout))
             #print('states, next_states, actions, rewards', mb_states.shape, mb_next_states.shape, mb_actions.shape, mb_rewards.shape)
             self.runner.state_mean, self.runner.state_std = self.state_rolling.update(
                 mb_states)
             self.forward_model.backprop(mb_states[0],
                                         fold_batch(mb_next_states),
                                         fold_batch(mb_actions),
                                         fold_batch(mb_rewards),
                                         len(mb_states))
             rollout = []
Example #17
        def sample_replay(self):
            states, actions, rewards, dones, next_states = self.replay.sample(
                self.num_steps)
            TargetQsa = unfold_batch(self.TargetQ.forward(fold_batch(states)),
                                     self.num_steps,
                                     self.num_envs)  # Q(s,a; theta-1)
            values = np.sum(TargetQsa * one_hot(actions, self.action_size),
                            axis=-1)  # Q(s_t, a_t; theta-1) for the actions taken

            last_actions = np.argmax(self.Q.forward(next_states), axis=1)
            last_TargetQsa = self.TargetQ.forward(
                next_states)  # Q(s,a; theta-1)
            last_values = np.sum(
                last_TargetQsa * one_hot(last_actions, self.action_size),
                axis=-1)  # Q(s, argmax_a Q(s,a; theta); theta-1)
            return states, actions, rewards, dones, 0, values, last_values
Example #18
    def _train_nstep(self):
        batch_size = (self.num_envs * self.nsteps)
        num_updates = self.total_steps // batch_size
        s = 0
        rolling = RunningMeanStd()
        self.state_rolling = rolling_obs(shape=())
        self.init_state_obs(128 * 50)
        self.runner.states = self.env.reset()
        forward_filter = RewardForwardFilter(self.gamma)
        # main loop
        start = time.time()
        for t in range(1, num_updates + 1):
            states, next_states, actions, extr_rewards, intr_rewards, extr_values, intr_values, dones, infos = self.runner.run(
            )
            policy, last_extr_values, last_intr_values = self.model.forward(
                next_states[-1])

            self.runner.state_mean, self.runner.state_std = self.state_rolling.update(
                next_states)  # update state normalisation statistics

            int_rff = np.array([
                forward_filter.update(intr_rewards[i])
                for i in range(len(intr_rewards))
            ])
            R_intr_mean, R_intr_std = rolling.update(int_rff.ravel())
            intr_rewards /= R_intr_std

            if self.return_type == 'GAE':
                R_extr = self.GAE(extr_rewards,
                                  extr_values,
                                  last_extr_values,
                                  dones,
                                  gamma=0.999,
                                  lambda_=self.lambda_) + extr_values
                R_intr = self.GAE(intr_rewards,
                                  intr_values,
                                  last_intr_values,
                                  np.zeros_like(dones),
                                  gamma=0.99,
                                  lambda_=self.lambda_) + intr_values
            else:
                R_extr = self.nstep_return(extr_rewards,
                                           last_extr_values,
                                           dones,
                                           gamma=0.999,
                                           clip=False)
                R_intr = self.nstep_return(
                    intr_rewards,
                    last_intr_values,
                    np.zeros_like(dones),
                    gamma=0.99,
                    clip=False)  # non episodic intr reward signal

            Adv = self.model.extr_coeff * (
                R_extr - extr_values) + self.model.intr_coeff * (R_intr -
                                                                 intr_values)

            # stack all states, next_states, actions and Rs across all workers into a single batch
            states, next_states, actions, R_extr, R_intr, Adv = fold_batch(
                states), fold_batch(next_states), fold_batch(
                    actions), fold_batch(R_extr), fold_batch(
                        R_intr), fold_batch(Adv)

            l = self.model.backprop(states, next_states, R_extr, R_intr, Adv,
                                    actions, self.runner.state_mean,
                                    self.runner.state_std)

            #start= time.time()
            if self.render_freq > 0 and t % (
                (self.validate_freq // batch_size) * self.render_freq) == 0:
                render = True
            else:
                render = False

            if self.validate_freq > 0 and t % (self.validate_freq //
                                               batch_size) == 0:
                self.validation_summary(t, l, start, render)
                start = time.time()

            if self.save_freq > 0 and t % (self.save_freq // batch_size) == 0:
                s += 1
                self.saver.save(self.sess,
                                str(self.model_dir + '/' + str(s) + ".ckpt"))
                print('saved model')
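RewardForwardFilter keeps a discounted running sum of the intrinsic rewards so that RunningMeanStd normalises by the scale of intrinsic returns rather than raw rewards, mirroring the RND reference implementation. A sketch consistent with the update(rews) calls above:

class RewardForwardFilter:
    # per-environment discounted running sum of rewards; its outputs feed a
    # RunningMeanStd that estimates the standard deviation of intrinsic returns
    def __init__(self, gamma):
        self.gamma = gamma
        self.rewems = None

    def update(self, rews):
        if self.rewems is None:
            self.rewems = rews
        else:
            self.rewems = self.rewems * self.gamma + rews
        return self.rewems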
Example #19
    def _train_nstep(self):
        batch_size = (self.num_envs * self.nsteps)
        num_updates = self.total_steps // batch_size
        s = 0
        rolling = RunningMeanStd(shape=())
        self.state_rolling = rolling_obs(shape=(), lastFrame=False)
        self.init_state_obs(128 * 50)
        self.runner.states = self.env.reset()
        forward_filter = RewardForwardFilter(self.gamma)

        # main loop
        start = time.time()
        for t in range(1, num_updates + 1):
            states, next_states, actions, extr_rewards, intr_rewards, values_extr, values_intr, old_policies, dones = self.runner.run(
            )
            self.runner.state_mean, self.runner.state_std = self.state_rolling.update(
                next_states)  # update state normalisation statistics

            policy, extr_last_values, intr_last_values = self.model.forward(
                next_states[-1])
            int_rff = np.array([
                forward_filter.update(intr_rewards[i])
                for i in range(len(intr_rewards))
            ])
            R_intr_mean, R_intr_std = rolling.update(int_rff.ravel())
            intr_rewards /= R_intr_std

            Adv_extr = self.GAE(extr_rewards,
                                values_extr,
                                extr_last_values,
                                dones,
                                gamma=0.999,
                                lambda_=self.lambda_)
            Adv_intr = self.GAE(
                intr_rewards,
                values_intr,
                intr_last_values,
                np.zeros_like(dones),
                gamma=0.99,
                lambda_=self.lambda_)  # non episodic intr reward signal
            R_extr = Adv_extr + values_extr
            R_intr = Adv_intr + values_intr
            total_Adv = self.model.extr_coeff * Adv_extr + self.model.intr_coeff * Adv_intr

            # perform minibatch gradient descent for K epochs
            l = 0
            idxs = np.arange(len(states))
            for epoch in range(self.num_epochs):
                mini_batch_size = self.nsteps // self.num_minibatches
                np.random.shuffle(idxs)
                for batch in range(0, len(states), mini_batch_size):
                    batch_idxs = idxs[batch:batch + mini_batch_size]
                    # stack all states, next_states, actions and Rs across all workers into a single batch
                    mb_states, mb_nextstates, mb_actions, mb_Rextr, mb_Rintr, mb_Adv, mb_old_policies = fold_batch(states[batch_idxs]), fold_batch(next_states[batch_idxs]), \
                                                    fold_batch(actions[batch_idxs]), fold_batch(R_extr[batch_idxs]), fold_batch(R_intr[batch_idxs]), \
                                                    fold_batch(total_Adv[batch_idxs]), fold_batch(old_policies[batch_idxs])

                    #mb_nextstates = mb_nextstates[np.where(np.random.uniform(size=(batch_size)) < self.pred_prob)]

                    mean, std = self.runner.state_mean, self.runner.state_std
                    l += self.model.backprop(mb_states, mb_nextstates,
                                             mb_Rextr, mb_Rintr, mb_Adv,
                                             mb_actions, mb_old_policies, mean,
                                             std)

            l /= (self.num_epochs * self.num_minibatches)

            if self.render_freq > 0 and t % (
                (self.validate_freq // batch_size) * self.render_freq) == 0:
                render = True
            else:
                render = False

            if self.validate_freq > 0 and t % (self.validate_freq //
                                               batch_size) == 0:
                self.validation_summary(t, l, start, render)
                start = time.time()

            if self.save_freq > 0 and t % (self.save_freq // batch_size) == 0:
                s += 1
                self.saver.save(self.sess,
                                str(self.model_dir + '/' + str(s) + ".ckpt"))
                print('saved model')
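RunningMeanStd (wrapped by rolling_stats / rolling_obs in other examples) maintains running normalisation statistics. Some call sites unpack (mean, std) from update() while others read .var directly afterwards, so a sketch compatible with both, using the usual parallel-variance combination (an assumption about the actual class), is:

import numpy as np

class RunningMeanStd:
    # running mean/variance over batches of samples, merged with the
    # parallel-variance formula; update() stores .mean/.var and also returns (mean, std)
    def __init__(self, shape=(), epsilon=1e-4):
        self.mean = np.zeros(shape, dtype=np.float64)
        self.var = np.ones(shape, dtype=np.float64)
        self.count = epsilon

    def update(self, x):
        x = np.asarray(x, dtype=np.float64)
        batch_mean, batch_var, batch_count = x.mean(axis=0), x.var(axis=0), x.shape[0]
        delta = batch_mean - self.mean
        total = self.count + batch_count
        new_mean = self.mean + delta * batch_count / total
        m2 = self.var * self.count + batch_var * batch_count + np.square(delta) * self.count * batch_count / total
        self.mean, self.var, self.count = new_mean, m2 / total, total
        return self.mean, np.sqrt(self.var)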
Example #20
    def _train_nstep(self):
        start = time.time()
        num_updates = self.total_steps // (self.num_envs * self.nsteps)
        alpha_step = 1 / num_updates
        s = 0
        rolling = RunningMeanStd(shape=())
        self.state_rolling = rolling_obs(shape=())
        self.init_state_obs(129)
        #self.runner.state_mean, self.runner.state_std = self.state_rolling.mean, np.sqrt(self.state_rolling.var)
        self.runner.states = self.env.reset()
        forward_filter = RewardForwardFilter(self.gamma)

        # main loop
        for t in range(1, num_updates + 1):
            states, next_states, actions, extr_rewards, intr_rewards, values_extr, values_intr, old_policies, dones = self.runner.run(
            )
            policy, extr_last_values, intr_last_values = self.model.forward(
                next_states[-1])
            int_rff = np.array([
                forward_filter.update(intr_rewards[i])
                for i in range(len(intr_rewards))
            ])
            #R_intr_mean, R_intr_std = rolling.update(self.discount(intr_rewards, self.gamma).ravel().mean()) #
            rolling.update(int_rff.ravel())
            R_intr_std = np.sqrt(rolling.var)
            intr_rewards /= R_intr_std
            #print('intr reward', intr_rewards)

            forward_loss = self.forward_model.backprop(
                states[0], fold_batch(next_states), fold_batch(actions),
                fold_batch(extr_rewards), self.nsteps)

            Adv_extr = self.GAE(extr_rewards,
                                values_extr,
                                extr_last_values,
                                dones,
                                gamma=0.999,
                                lambda_=self.lambda_)
            Adv_intr = self.GAE(
                intr_rewards,
                values_intr,
                intr_last_values,
                np.zeros_like(dones),
                gamma=0.99,
                lambda_=self.lambda_)  # non episodic intr reward signal
            R_extr = Adv_extr + values_extr
            R_intr = Adv_intr + values_intr
            total_Adv = self.model.extr_coeff * Adv_extr + self.model.intr_coeff * Adv_intr

            #self.runner.state_mean, self.runner.state_std = state_rolling.update(fold_batch(next_states)[:,:,:,-1:]) # update state normalisation statistics
            self.runner.state_mean, self.runner.state_std = self.state_rolling.update(
                next_states)  # update state normalisation statistics

            # perform minibatch gradient descent for K epochs
            l = 0
            idxs = np.arange(len(states))
            for epoch in range(self.num_epochs):
                batch_size = self.nsteps // self.num_minibatches
                np.random.shuffle(idxs)
                for batch in range(0, len(states), batch_size):
                    batch_idxs = idxs[batch:batch + batch_size]
                    # stack all states, next_states, actions and Rs across all workers into a single batch
                    mb_states, mb_nextstates, mb_actions, mb_Rextr, mb_Rintr, mb_Adv, mb_old_policies = fold_batch(states[batch_idxs]), fold_batch(next_states[batch_idxs]), \
                                                    fold_batch(actions[batch_idxs]), fold_batch(R_extr[batch_idxs]), fold_batch(R_intr[batch_idxs]), \
                                                    fold_batch(total_Adv[batch_idxs]), fold_batch(old_policies[batch_idxs])

                    mb_nextstates = mb_nextstates[np.where(
                        np.random.uniform(
                            size=(batch_size)) < self.pred_prob)][:, :, :, -1:]
                    #mb_nextstates = (mb_nextstates  - self.runner.state_mean[np.newaxis,:,:,np.newaxis]) / self.runner.state_std[np.newaxis,:,:,np.newaxis]
                    mean, std = self.runner.state_mean, self.runner.state_std
                    l += self.model.backprop(mb_states, mb_nextstates,
                                             mb_Rextr, mb_Rintr, mb_Adv,
                                             mb_actions, mb_old_policies,
                                             self.alpha, mean, std)

            l /= (self.num_epochs * self.num_minibatches)

            # Imagined future rollout

            hidden = self.forward_model.get_initial_hidden(self.num_envs)
            obs = next_states[-1]
            encoded_last_state = self.forward_model.encode_state(
                next_states[-1])  # o_t -> s_t
            actions = [
                np.random.choice(policy.shape[1], p=policy[i])
                for i in range(policy.shape[0])
            ]
            imagined_rollout = []
            with tf.variable_scope('forward_model/latent-space-rnn',
                                   reuse=tf.AUTO_REUSE):
                for i in range(self.nsteps):
                    next_obs, extr_rewards, encoded_last_state, hidden = self.forward_model.predict_next(
                        encoded_last_state, hidden, actions)
                    #print('imagined obs', next_obs.shape)
                    intr_rewards = self.model.intrinsic_reward(
                        next_obs[..., -1:], self.runner.state_mean,
                        self.runner.state_std)
                    policies, extr_values, intr_values = self.model.forward(
                        obs)
                    actions = [
                        np.random.choice(policy.shape[1], p=policy[i])
                        for i in range(policy.shape[0])
                    ]
                    imagined_rollout.append([
                        obs, next_obs, actions, extr_rewards[:, 0],
                        intr_rewards, extr_values, intr_values, policies
                    ])
                    obs = next_obs

            obs, next_obs, actions, extr_rewards, intr_rewards, extr_values, intr_values, old_policies = stack_many(
                zip(*imagined_rollout))
            #print('imagined obs', obs.shape)
            #print('imagined extr rew', extr_rewards.shape)
            #print('imagined extr_values', extr_values.shape)
            #print('imagined intr_values', intr_values.shape)

            intr_rewards /= R_intr_std

            policies, extr_last_values, intr_last_values = self.model.forward(
                next_obs[-1])
            Adv_extr = self.GAE(extr_rewards,
                                extr_values,
                                extr_last_values,
                                np.zeros_like(dones),
                                gamma=0.999,
                                lambda_=self.lambda_)
            Adv_intr = self.GAE(
                intr_rewards,
                intr_values,
                intr_last_values,
                np.zeros_like(dones),
                gamma=0.99,
                lambda_=self.lambda_)  # non episodic intr reward signal
            R_extr = Adv_extr + extr_values  # bootstrap from the imagined rollout's own value estimates
            R_intr = Adv_intr + intr_values
            total_Adv = self.model.extr_coeff * Adv_extr + self.model.intr_coeff * Adv_intr

            for batch in range(0, len(obs), batch_size):
                batch_idxs = idxs[batch:batch + batch_size]
                # stack all states, next_states, actions and Rs across all workers into a single batch
                mb_states, mb_nextstates, mb_actions, mb_Rextr, mb_Rintr, mb_Adv, mb_old_policies = fold_batch(obs[batch_idxs]), fold_batch(next_obs[batch_idxs]), \
                                                fold_batch(actions[batch_idxs]), fold_batch(R_extr[batch_idxs]), fold_batch(R_intr[batch_idxs]), \
                                                fold_batch(total_Adv[batch_idxs]), fold_batch(old_policies[batch_idxs])

                mb_nextstates = mb_nextstates[np.where(
                    np.random.uniform(
                        size=(batch_size)) < self.pred_prob)][..., -1:]
                #mb_nextstates = (mb_nextstates  - self.runner.state_mean[np.newaxis,:,:,np.newaxis]) / self.runner.state_std[np.newaxis,:,:,np.newaxis]
                mean, std = self.runner.state_mean, self.runner.state_std
                l += self.model.backprop(mb_states, mb_nextstates, mb_Rextr,
                                         mb_Rintr, mb_Adv, mb_actions,
                                         mb_old_policies, self.alpha, mean,
                                         std)

            if self.render_freq > 0 and t % (self.validate_freq *
                                             self.render_freq) == 0:
                render = True
            else:
                render = False

            if self.validate_freq > 0 and t % self.validate_freq == 0:
                self.validation_summary(t, l, start, render)
                start = time.time()

            if self.save_freq > 0 and t % self.save_freq == 0:
                s += 1
                self.saver.save(
                    self.sess,
                    str(self.model_dir + self.current_time + '/' + str(s) +
                        ".ckpt"))
                print('saved model')
Example #21
 def forward(self, state, hidden, validate=False):
     if validate:
         return self.validate_policy.forward(state, hidden)
     else:
         return self.train_policy.forward(fold_batch(state), hidden)
Example #22
    def _train_nstep(self):
        batch_size = self.num_envs * self.nsteps
        num_updates = self.total_steps // batch_size
        #validate_freq = self.validate_freq // batch_size
        #save_freq = self.save_freq // batch_size

        s = 0
        rolling = RunningMeanStd()
        self.init_state_obs(20*50*25)
        forward_filter = RewardForwardFilter(0.99)
        self.runner.states = self.env.reset()
        # main loop
        start = time.time()
        for t in range(1,num_updates+1):
            states, next_states, actions, extr_rewards, intr_rewards, extr_values, intr_values, dones, infos = self.runner.run()
            policy, last_extr_values, last_intr_values = self.model.forward(next_states[-1])
            self.update_minmax(states)

            Qaux_value = self.model.get_pixel_control(next_states[-1])
            pixel_rewards = self.pixel_rewards(states)
            Qaux_target = fold_batch(self.auxiliary_target(pixel_rewards, np.max(Qaux_value, axis=-1), dones))

            #onehot_rewards = fold_batch(one_hot(extr_rewards.astype(np.int32), 3))
            reward_states, sample_rewards = self.sample_reward(states, extr_rewards)

            self.runner.state_mean, self.runner.state_std = self.state_rolling.update(next_states) # update state normalisation statistics
            
            
            r_intr = np.array([forward_filter.update(intr_rewards[i]) for i in range(len(intr_rewards))]) # update intrinsic return estimate
            R_intr_mean, R_intr_std = rolling.update(r_intr.ravel())
            intr_rewards /= R_intr_std # normalise intr rewards 
            #print('intr_reward', intr_rewards)

            R_extr = self.GAE(extr_rewards, extr_values, last_extr_values, dones, gamma=0.999, lambda_=self.lambda_, clip=False) + extr_values
            R_intr = self.GAE(intr_rewards, intr_values, last_intr_values, np.zeros_like(dones), gamma=0.99, lambda_=self.lambda_, clip=False) + intr_values
            #R_mean, R_std = rolling.update(R_intr.ravel())
            
            
            Adv = self.model.extr_coeff * (R_extr - extr_values) + self.model.intr_coeff * (R_intr - intr_values)

            # stack all states, next_states, actions and Rs across all workers into a single batch
            next_states = next_states[...,-1:] if len(next_states.shape) == 5 else next_states
            states, next_states, actions, R_extr, R_intr, Adv = fold_batch(states), fold_batch(next_states), fold_batch(actions), fold_batch(R_extr), fold_batch(R_intr), fold_batch(Adv) 
        
            l = self.model.backprop(states, next_states, R_extr, R_intr, Adv, actions, Qaux_target, reward_states, sample_rewards, self.runner.state_mean, self.runner.state_std)
            #print('backprop time', time.time() -start)
            
            #start= time.time()
            if self.render_freq > 0 and t % ((self.validate_freq // batch_size) * self.render_freq) == 0:
                render = True
            else:
                render = False
     
            if self.validate_freq > 0 and t % (self.validate_freq // batch_size) == 0:
                self.validation_summary(t,l,start,render)
                start = time.time()
            
            if self.save_freq > 0 and  t % (self.save_freq // batch_size) == 0:
                s += 1
                self.saver.save(self.sess, str(self.model_dir + '/' + str(s) + ".ckpt") )
                print('saved model')