Example #1
    def run(self):
        self.replay_buffer=ReplayBuffer(self.config["replay_buffer_size"])
        device = torch.device(self.config["learner_device"])
        self.learning_model.to(device)
        self.target_model.to(device)
        optimizer = torch.optim.Adam(
            self.learning_model.parameters(), lr=self.config["lr"]
        )
        
        self.train_batcher.update(self._state_dict(self.learning_model,torch.device("cpu")))
        self.evaluation_batcher.update(self._state_dict(self.learning_model,torch.device("cpu")))

        n_episodes=self.config["n_envs"]*self.config["n_threads"]
        self.train_batcher.reset(agent_info=DictTensor({"epsilon":torch.ones(n_episodes)*self.config["epsilon_greedy"]}))
        logging.info("Sampling initial transitions")
        for k in range(self.config["initial_buffer_epochs"]):
            self.train_batcher.execute()        
            trajectories=self.train_batcher.get()
            self.replay_buffer.push(trajectories)
        
        n_episodes=self.config["n_evaluation_rollouts"]
        self.evaluation_batcher.execute(agent_info=DictTensor({"epsilon":torch.zeros(n_episodes)}), n_episodes=n_episodes)
        
        logging.info("Starting Learning")
        _start_time=time.time()
        
        logging.info("Learning")
        while time.time()-_start_time <self.config["time_limit"]:
            
            self.train_batcher.execute()
            trajectories=self.train_batcher.get()
            self.replay_buffer.push(trajectories)
            self.logger.add_scalar("replay_buffer_size",self.replay_buffer.size(),self.iteration)
            # avg_reward = 0
           
            for k in range(self.config["qvalue_epochs"]):
                optimizer.zero_grad()
                dt = self.get_loss(device)
                
                [self.logger.add_scalar(k,dt[k].item(),self.iteration) for k in dt.keys()]
                
                floss=dt["q_loss"]
                floss.backward()
                if self.config["clip_grad"] > 0:
                    n = torch.nn.utils.clip_grad_norm_(
                        self.learning_model.parameters(), self.config["clip_grad"]
                    )
                    self.logger.add_scalar("grad_norm", n.item(), self.iteration)
                self.iteration+=1
                optimizer.step()
            
                tau=self.config["tau"]
                self.soft_update_params(self.learning_model,self.target_model,tau)
                


            self.train_batcher.update(self._state_dict(self.learning_model,torch.device("cpu")))            
            self.evaluate()
            self.iteration+=1
Example #2
    def __call__(self, state, observation, agent_info=None, history=None):
        """
        Executing one step of the agent
        """
        # Read the initial_state flags and the batch size from the observation
        initial_state = observation["initial_state"]
        B = observation.n_elems()

        if agent_info is None:
            agent_info = DictTensor(
                {"stochastic": torch.tensor([True]).repeat(B)})

        model_initial_state = self.model.initial_state(B)
        agent_state = None
        agent_step = None
        if state is None:
            assert initial_state.all()
            agent_state = model_initial_state
            agent_step = torch.zeros(B).long()
        else:
            _is = (initial_state.float().unsqueeze(-1).repeat(
                1,
                model_initial_state.size()[1]))
            agent_state = _is * model_initial_state + (
                1 - _is) * state["agent_state"]
            agent_step = (
                initial_state.float() * torch.zeros(B) +
                (1 - initial_state.float()) * state["agent_step"]).long()

        score_action, value, next_state = self.model(
            agent_state, observation["frame"], observation["last_action"])
        action_proba = torch.softmax(score_action, dim=1)
        dist = torch.distributions.Categorical(action_proba)
        action_sampled = dist.sample()
        action_max = action_proba.max(1)[1]
        smask = agent_info["stochastic"].float()
        action = (action_sampled * smask + (1 - smask) * action_max).long()

        new_state = DictTensor({
            "agent_state": next_state,
            "agent_step": agent_step + 1
        })

        agent_do = DictTensor({
            "action": action,
            "action_probabilities": action_proba
        })

        state = DictTensor({
            "agent_state": agent_state,
            "agent_step": agent_step
        })
        return state, agent_do, new_state
Example #3
 def reset(self,agent_info=DictTensor({}), env_info=DictTensor({})):
     n_workers = len(self.workers)
     assert isinstance(agent_info,DictTensor) and (agent_info.empty() or agent_info.n_elems()==self.n_envs*n_workers)
     assert isinstance(env_info,DictTensor) and (env_info.empty() or env_info.n_elems()==self.n_envs*n_workers)
     pos=0
     for k in range(n_workers):
         n=self.n_envs
         wi=None if agent_info is None else agent_info.slice(pos,pos+n)
         ei=None if env_info is None else env_info.slice(pos,pos+n)
         self.workers[k].reset(
             agent_info=wi, env_info=ei
         )
         pos+=n
Example #4
    def __call__(self, state, observation, agent_info=None, history=None):
        """
        Executing one step of the agent
        """
        # Read the initial_state flags and the batch size from the observation
        initial_state = observation["initial_state"]
        B = observation.n_elems()

        if agent_info is None:
            agent_info = DictTensor(
                {"stochastic": torch.tensor([True]).repeat(B)})

        # Create the initial state of the recurrent policy
        agent_initial = self.model.initial_state(B)
        if (state is None):  # If the batcher is starting
            state = DictTensor({
                "agent_state": agent_initial,
                "agent_step": torch.zeros(B).long()
            })
        else:
            #Maybe some observations are initial states of new episodes. For these states, we must reinitialize the internal state of the policy
            istate = DictTensor({
                "agent_state": agent_initial,
                "agent_step": torch.zeros(B).long()
            })
            state = masked_dicttensor(istate, state, initial_state)

        new_z, action_proba = self.model(state["agent_state"],
                                         observation["frame"])

        #We sample an action following the distribution
        dist = torch.distributions.Categorical(action_proba)
        action_sampled = dist.sample()

        #Depending on the agent_info variable that tells us if we are in 'stochastic' or 'deterministic' mode, we keep the sampled action, or compute the action with the max score
        action_max = action_proba.max(1)[1]
        smask = agent_info["stochastic"].float()
        action = masked_tensor(action_max, action_sampled,
                               agent_info["stochastic"])

        new_state = DictTensor({
            "agent_state": new_z,
            "agent_step": state["agent_step"] + 1
        })

        agent_do = DictTensor({
            "action": action,
            "action_probabilities": action_proba
        })

        return state, agent_do, new_state
Example #5
    def __call__(self, state, observation, agent_info=None, history=None):
        B = observation.n_elems()

        agent_state = None
        if state is None:
            agent_state = DictTensor({"timestep": torch.zeros(B).long()})
        else:
            agent_state = state

        scores = torch.randn(B, self.n_actions)
        probabilities = torch.softmax(scores, dim=1)
        actions = torch.distributions.Categorical(probabilities).sample()
        new_state = DictTensor({"timestep": agent_state["timestep"] + 1})
        return agent_state, DictTensor({"action": actions}), new_state
Example #6
    def step(self, policy_output):
        assert policy_output.n_elems() == self.envs_running.size()[0]
        outputs = policy_output.unfold()
        alls = []
        env_run = {}
        for b in range(len(outputs)):
            idx_env = self.envs_running[b]
            action = policy_output["action"][b]
            last_action = action
            if (isinstance(self.gym_envs[0].action_space,
                           gym.spaces.Discrete)):
                action = action.item()
                last_action = last_action.unsqueeze(0)
            else:
                action = action.tolist()
                last_action = last_action.unsqueeze(0)

            initial_state = torch.tensor([False])
            act = action

            frame, reward, done, unused_info = self.gym_envs[idx_env].step(act)
            reward = torch.tensor([reward])
            frame = format_frame(frame)
            if isinstance(frame, torch.Tensor):
                frame = {"frame": frame}
            if not done:
                env_run[b] = idx_env

            done = torch.tensor([done])
            r = DictTensor({
                "reward": reward,
                "done": done,
                "initial_state": initial_state,
                "last_action": last_action,
                **frame,
            })
            alls.append(r)

        d = DictTensor.cat(alls)

        keys = []
        values = []
        for key, value in env_run.items():
            keys.append(key)
            values.append(value)
        dd = d.index(torch.tensor(keys).long())
        old_envs_running = self.envs_running
        self.envs_running = torch.tensor(values)
        return (d, old_envs_running), (dd, self.envs_running)
Example #7
    def run(self):
        device = torch.device(self.config["learner_device"])
        self.learning_model.to(device)
        optimizer = torch.optim.Adam(
            self.learning_model.parameters(), lr=self.config["lr"]
        )
        cpu_parameters=self._state_dict(self.learning_model,torch.device("cpu"))
        self.train_batcher.update(cpu_parameters)
        self.evaluation_batcher.update(cpu_parameters)
        n_episodes=self.config["n_evaluation_rollouts"]
        self.evaluation_batcher.execute(agent_info=DictTensor({"stochastic":torch.ones(n_episodes)}), n_episodes=n_episodes)

        # Initialize the train batcher
        n_episodes=self.config["n_envs"]*self.config["n_threads"]
        self.train_batcher.reset(agent_info=DictTensor({"stochastic":torch.ones(n_episodes)}))

        _start_time=time.time()
        while time.time()-_start_time<self.config["time_limit"]:
            self.train_batcher.execute()
            trajectories=self.train_batcher.get()
            avg_reward = 0
            for K in range(self.config["k_epochs"]):
                optimizer.zero_grad()
                dt = self.get_loss(trajectories)
                [
                    self.logger.add_scalar("loss/" + k, dt[k].item(), self.iteration)
                    for k in dt.keys()
                ]

                # Computation of final loss
                ld = self.config["coef_critic"] * dt["value_loss"]
                lr = self.config["coef_ppo"] * dt["ppo_loss"]
                le = self.config["coef_entropy"] * dt["entropy_loss"]

                floss = ld - le - lr
                floss.backward()
                if self.config["clip_grad"] > 0:
                    n = torch.nn.utils.clip_grad_norm_(
                        self.learning_model.parameters(), self.config["clip_grad"]
                    )
                    self.logger.add_scalar("grad_norm", n.item(), self.iteration)
                optimizer.step()
                self.evaluate()
                self.iteration+=1
            cpu_parameters=self._state_dict(self.learning_model,torch.device("cpu"))
            self.train_batcher.update(cpu_parameters)
            self.evaluate()
            self.iteration+=1
Example #8
 def sample(self,n=1):
     limit=self.pos
     if self.full:
         limit=self.N
     transitions=torch.randint(0,high=limit,size=(n,))
     d={k:self.buffer[k][transitions] for k in self.buffer}
     return DictTensor(d)
Example #9
    def reset(self, env_info: DictTensor = DictTensor({})):
        """ reset the environments instances

        :param env_info: a DictTensor of size n_envs, such that each value will be transmitted to each environment instance
        :type env_info: DictTensor, optional
        """
        pass
Example #10
    def get_loss(self, device):
        transitions=self.replay_buffer.sample(n=self.config["n_batches"])
        transitions = transitions.to(device)
        B=transitions.n_elems()
        Bv=torch.arange(B)
        action = transitions["action"]
        reward = transitions["_reward"]
        frame = transitions["frame"]
        _frame = transitions["_frame"]
        _done = transitions["_done"].float()

        # Double Q-learning target: the greedy action in the next state is chosen
        # with the learning model, but its value is taken from the target model
        q=self.learning_model(frame)
        qa=q[Bv,action]
        qp = self.learning_model(_frame)
        actionp=qp.max(1)[1]
        _q_target = self.target_model(_frame).detach()
        _q_target_a= _q_target[Bv,actionp]
        # Bellman target: reward + discount * (1 - done) * Q_target(s', argmax_a Q(s', a))
        _target_value=_q_target_a*(1-_done)*self.config["discount_factor"]+reward
        td = (_target_value-qa)**2
        dt = DictTensor(
            {
                "q_loss": td.mean(),
            }
        )
        return dt
Example #11
    def get_q_loss(self, transitions,device):

        transitions = transitions.to(device)
        B=transitions.n_elems()
        Bv=torch.arange(B)
        action = transitions["action"]
        reward = transitions["_reward"]
        frame = transitions["frame"]
        _frame = transitions["_frame"]
        _done = transitions["_done"].float()

        # action for s_prime
        mean_prime,var_prime=self.learning_model(_frame)
        _id = torch.eye(self.action_dim).unsqueeze(0).repeat(B, 1, 1)
        # _nvar = var_prime.unsqueeze(-1).repeat(1, 1, self.action_dim)
        # _nvar = _nvar * _id
        distribution=torch.distributions.Normal(mean_prime, var_prime)
        next_action=distribution.sample().detach()

        #Compute targets
        q1=self.target_q1(_frame,next_action).detach().squeeze(-1)
        q2=self.target_q2(_frame,next_action).detach().squeeze(-1)
        q = torch.min(q1,q2)
        lp= distribution.log_prob(next_action).detach().sum(-1)
        q = q  - self.config["lambda_entropy"]*lp
        target_value=q*(1.-_done)*self.config["discount_factor"]+reward

        q1_loss=(target_value.detach()-self.q1(frame,action).squeeze(-1))**2
        q2_loss=(target_value.detach()-self.q2(frame,action).squeeze(-1))**2
        dt ={
                "q1_loss": q1_loss.mean(),
                "q2_loss": q2_loss.mean(),
            }
        return DictTensor(dt),transitions
Example #12
    def __call__(self, state, observation,agent_info=None,history=None):
        """
        Executing one step of the agent
        """
        # Read the initial_state flags and the batch size from the observation
        initial_state = observation["initial_state"]
        B = observation.n_elems()
        
        if agent_info is None:
            agent_info=DictTensor({"epsilon":torch.zeros(B)})
        
        agent_step = None
        if state is None:
            assert initial_state.all()
            agent_step = torch.zeros(B).long()
        else:
            agent_step = (
                initial_state.float() * torch.zeros(B)
                + (1 - initial_state.float()) * state["agent_step"]
            ).long()

        q = self.model(
            observation["frame"]
        )

        qs,action = q.max(1)
        # Random actions used for the epsilon-greedy exploration
        raction = torch.randint(low=0, high=self.n_actions, size=(action.size()[0],))
        epsilon=agent_info["epsilon"]
        mask=torch.rand(action.size()[0]).lt(epsilon).float()
        action=mask*raction+(1-mask)*action
        action=action.long()


        new_state = DictTensor(
            {"agent_step": agent_step + 1}
        )
       
        agent_do = DictTensor(
            {"action": action, "q": q}
        )

        state = DictTensor({"agent_step": agent_step})

        return state, agent_do, new_state
Example #13
    def execute(self, n_episodes, agent_info=DictTensor({}),env_info=DictTensor({})):
        n_workers = len(self.workers)
        assert n_episodes % (self.n_envs*n_workers) == 0
        assert isinstance(agent_info,DictTensor) and (agent_info.empty() or agent_info.n_elems()==n_episodes)
        assert isinstance(env_info,DictTensor) and (env_info.empty() or env_info.n_elems()==n_episodes)

        self.n_per_worker = [int(n_episodes / n_workers) for w in range(n_workers)]
        pos=0
        for k in range(n_workers):
            n=self.n_per_worker[k]
            assert n%self.n_envs==0
            wi=agent_info.slice(pos,pos+n)
            ei=env_info.slice(pos,pos+n)
            self.workers[k].acquire_episodes(
                n_episodes=self.n_per_worker[k], agent_info=wi, env_info=ei
            )
            pos+=n
        assert pos==n_episodes
Example #14
    def get_loss(self,trajectories):
            #First, we retrieve the reward of each transition
            #The reward is at t+1 in each iteration (since it is obtained after the action), so we use the '_reward' field in the trajectory
            # The 'reward' field corresponds to the reward at time t
            reward=trajectories["_reward"]

            #We get the mask that tells which transition is in a trajectory (1) or not (0)
            mask=trajectories.mask()

            #We remove the reward values that are not in the trajectories
            reward=reward*mask
            max_length=trajectories.lengths.max().item()
            #Now, we want to compute the action probabilities over the trajectories such that we will be able to do 'backward'
            action_probabilities=[]
            for t in range(max_length):
                proba=self.learning_model(trajectories["frame"][:,t])
                action_probabilities.append(proba.unsqueeze(1)) # We append the probabilities and introduce the temporal dimension (2nd dimension)
            action_probabilities=torch.cat(action_probabilities,dim=1) #Now, we have a B x T x n_actions tensor

            #We compute the critic value for t=0 to T (i.e including the very last observation)
            critic=[]
            for t in range(max_length):
                b=self.critic_model(trajectories["frame"][:,t])
                critic.append(b.unsqueeze(1))
            critic=torch.cat(critic+[b.unsqueeze(1)],dim=1).squeeze(-1) #Now, we have a B x (T+1) tensor
            #We also need to compute the critic value for the last observation of the trajectories (to compute the TD)
            # It may be the last element of the trajectories (if the episode is not finished), or the last frame of the episode
            idx=torch.arange(trajectories.n_elems())
            last_critic=self.critic_model(trajectories["_frame"][idx,trajectories.lengths-1]).squeeze(-1)
            critic[idx,trajectories.lengths]=last_critic


            #We compute the temporal difference (a small numeric illustration follows this function)
            target=reward+self.config["discount_factor"]*(1-trajectories["_done"].float())*critic[:,1:].detach()
            td=critic[:,:-1]-target

            critic_loss=td**2
            #We sum the loss for each episode (considering the mask)
            critic_loss= (critic_loss*mask).sum(1)/mask.sum(1)
            #We average the loss over all the trajectories
            avg_critic_loss = critic_loss.mean()

            #We do the same on the reinforce loss
            action_distribution=torch.distributions.Categorical(action_probabilities)
            log_proba=action_distribution.log_prob(trajectories["action"])
            a2c_loss = -log_proba * td.detach()
            a2c_loss = (a2c_loss*mask).sum(1)/mask.sum(1)
            avg_a2c_loss=a2c_loss.mean()

            #We compute the entropy loss
            entropy=action_distribution.entropy()
            entropy=(entropy*mask).sum(1)/mask.sum(1)
            avg_entropy=entropy.mean()

            return DictTensor({"critic_loss":avg_critic_loss,"a2c_loss":avg_a2c_loss,"entropy_loss":avg_entropy})
Example #15
    def __call__(self,state,observation,agent_info=None,history=None):
        B=observation.n_elems()
        agent_state=None

        #Initialize agent_info is not specified
        if agent_info is None:
            agent_info=DictTensor({"agent_id":torch.tensor([0]).repeat(B)})

        #initialize the state of the agent if not specified
        if state is None:
            agent_state=DictTensor({"timestep":torch.zeros(B).long()})
        else:
            agent_state=state

        scores=torch.randn(B,self.n_actions)
        probabilities=torch.softmax(scores,dim=1)
        actions=torch.distributions.Categorical(probabilities).sample()
        new_state=DictTensor({"timestep":agent_state["timestep"]+1})
        # We also decide to output the action probabilities
        return agent_state,DictTensor({"action":actions,"action_probabilities":probabilities,"agent_id":agent_info["agent_id"]}),new_state
Example #16
    def get_loss(self,trajectories):
            #First, we want to compute the cumulated reward per trajectory
            #The reward is at t+1 in each iteration (since it is obtained after the action), so we use the '_reward' field in the trajectory
            # The 'reward' field corresponds to the reward at time t
            reward=trajectories["_reward"]

            #We get the mask that tells which transition is in a trajectory (1) or not (0)
            mask=trajectories.mask()

            #We remove the reward values that are not in the trajectories
            reward=reward*mask

            #We compute the future cumulated reward at each timestep (by reverse computation; a small standalone check follows this example)
            max_length=trajectories.lengths.max().item()
            cumulated_reward=torch.zeros_like(reward)
            cumulated_reward[:,max_length-1]=reward[:,max_length-1]
            for t in range(max_length-2,-1,-1):
                cumulated_reward[:,t]=reward[:,t]+self.config["discount_factor"]*cumulated_reward[:,t+1]

            #Now, we want to compute the action probabilities over the trajectories such that we will be able to do 'backward'
            action_probabilities=[]
            for t in range(max_length):
                proba=self.learning_model(trajectories["frame"][:,t])
                action_probabilities.append(proba.unsqueeze(1)) # We append the probabilities and introduce the temporal dimension (2nd dimension)
            action_probabilities=torch.cat(action_probabilities,dim=1) #Now, we have a B x T x n_actions tensor

            #We compute the baseline
            baseline=[]
            for t in range(max_length):
                b=self.baseline_model(trajectories["frame"][:,t])
                baseline.append(b.unsqueeze(1))
            baseline=torch.cat(baseline,dim=1).squeeze(-1) #Now, we have a B x T tensor

            #We compute the baseline loss
            baseline_loss=(baseline-cumulated_reward)**2
            #We sum the loss for each episode (considering the mask)
            baseline_loss= (baseline_loss*mask).sum(1)/mask.sum(1)
            #We average the loss over all the trajectories
            avg_baseline_loss = baseline_loss.mean()

            #We do the same on the reinforce loss
            action_distribution=torch.distributions.Categorical(action_probabilities)
            log_proba=action_distribution.log_prob(trajectories["action"])
            reinforce_loss = log_proba * (cumulated_reward-baseline).detach()
            reinforce_loss = (reinforce_loss*mask).sum(1)/mask.sum(1)
            avg_reinforce_loss=reinforce_loss.mean()

            #We compute the entropy loss
            entropy=action_distribution.entropy()
            entropy=(entropy*mask).sum(1)/mask.sum(1)
            avg_entropy=entropy.mean()

            return DictTensor({"avg_reward":cumulated_reward[:,0].mean(),"baseline_loss":avg_baseline_loss,"reinforce_loss":avg_reinforce_loss,"entropy_loss":avg_entropy})
Example #17
    def __call__(self, state, observation, agent_info=None, history=None):
        """
        Executing one step of the agent
        """
        # Read the initial_state flags and the batch size from the observation
        initial_state = observation["initial_state"]
        B = observation.n_elems()

        if agent_info is None:
            agent_info = DictTensor(
                {"stochastic": torch.tensor([True]).repeat(B)})

        agent_step = None
        if state is None:
            assert initial_state.all()
            agent_step = torch.zeros(B).long()
        else:
            agent_step = (
                initial_state.float() * torch.zeros(B) +
                (1 - initial_state.float()) * state["agent_step"]).long()

        _mean, _var = self.model(observation["frame"])
        _id = torch.eye(self.action_dim).unsqueeze(0).repeat(B, 1, 1)
        # _nvar = _var.unsqueeze(-1).repeat(1, 1, self.action_dim)
        # _nvar = _nvar * _id
        distribution = torch.distributions.Normal(_mean, _var)
        action_sampled = distribution.sample()
        action_max = _mean
        smask = agent_info["stochastic"].float().unsqueeze(-1).repeat(
            1, self.action_dim)
        action = (action_sampled * smask + (1.0 - smask) * action_max)
        new_state = DictTensor({"agent_step": agent_step + 1})

        agent_do = DictTensor({"action": action, "mean": _mean, "std": _var})
        state = DictTensor({"agent_step": agent_step})
        return state, agent_do, new_state
Example #18
    def reset(self, env_info=DictTensor({})):
        N = self.n_envs()
        self.envs_running = torch.arange(N)
        reward = torch.zeros(N)

        last_action = None
        if (isinstance(self.gym_envs[0].action_space, gym.spaces.Discrete)):
            last_action = torch.zeros(N, dtype=torch.int64)
        else:
            a = self.gym_envs[0].action_space.sample()
            a = torch.tensor(a).unsqueeze(0).repeat(N, 1)
            last_action = a

        done = torch.zeros(N).bool()
        initial_state = torch.ones(N).bool()
        frames = None
        if (env_info.empty()):
            frames = [format_frame(e.reset()) for e in self.gym_envs]
        else:
            frames = []
            for n in range(len(self.gym_envs)):
                v = {k: env_info[k][n].tolist() for k in env_info.keys()}
                frames.append(format_frame(self.gym_envs[n].reset(env_info=v)))
        _frames = []
        for f in frames:
            if isinstance(f, torch.Tensor):
                _frames.append({"frame": f})
            else:
                _frames.append(f)
        frames = [DictTensor(_f) for _f in _frames]
        frames = DictTensor.cat(frames)
        frames.set("reward", reward)
        frames.set("done", done)
        frames.set("initial_state", initial_state)
        frames.set("last_action", last_action)
        return frames, self.envs_running
Example #19
    def get_policy_loss(self,transitions):
        frame = transitions["frame"]
        B=transitions.n_elems()
        #Now, compute the policy term
        mean,var=self.learning_model(frame)
        #print(var.mean().item())
        #print(mean)
        _id = torch.eye(self.action_dim).unsqueeze(0).repeat(B, 1, 1)
        # _nvar = var.unsqueeze(-1).repeat(1, 1, self.action_dim)
        # _nvar = _nvar * _id
        distribution=torch.distributions.Normal(mean, var)
        entropy=distribution.entropy().mean()
        action_tilde=distribution.rsample()
        #print(action_tilde)
        q1 = self.q1(frame,action_tilde).squeeze(-1)
        q2 = self.q2(frame,action_tilde).squeeze(-1)
        q=torch.min(q1,q2)
        loss=q-self.config["lambda_entropy"]*distribution.log_prob(action_tilde).sum(-1)

        dt={"policy_loss":-loss.mean(),"entropy":entropy.detach(),"avg_var":var.mean().detach(),"avg_mean":mean.mean().detach()}
        dt=DictTensor(dt)
        return dt
Example #20
    def run(self):
        # Instantiate the learning model and the baseline model
        self.learning_model=AgentModel(self.obs_dim,self.n_actions,32)
        self.critic_model=BaselineModel(self.obs_dim,32)

        #We create a batcher dedicated to evaluation
        model=copy.deepcopy(self.learning_model)
        self.evaluation_batcher=EpisodeBatcher(
            n_timesteps=self.config["max_episode_steps"],
            n_slots=self.config["n_evaluation_episodes"],
            create_agent=self._create_agent,
            create_env=self._create_env,
            env_args={
                "n_envs": self.config["n_envs"],
                "max_episode_steps": self.config["max_episode_steps"],
                "env_name":self.config["env_name"]
            },
            agent_args={"n_actions": self.n_actions, "model": model},
            n_threads=self.config["n_evaluation_threads"],
            seeds=[self.config["env_seed"]+k*10 for k in range(self.config["n_evaluation_threads"])],
        )

        #Creation of the batcher for sampling pieces of trajectories (i.e. a Batcher)
        #The batcher will sample n_threads*n_envs pieces of trajectories at each call
        # It is configured with n_timesteps=self.config["a2c_timesteps"], the length of the trajectory pieces used for each update
        model=copy.deepcopy(self.learning_model)
        self.train_batcher=Batcher(
            n_timesteps=self.config["a2c_timesteps"],
            n_slots=self.config["n_envs"]*self.config["n_threads"],
            create_agent=self._create_agent,
            create_env=self._create_train_env,
            env_args={
                "n_envs": self.config["n_envs"],
                "max_episode_steps": self.config["max_episode_steps"],
                "env_name":self.config["env_name"]
            },
            agent_args={"n_actions": self.n_actions, "model": model},
            n_threads=self.config["n_threads"],
            seeds=[self.config["env_seed"]+k*10 for k in range(self.config["n_threads"])],
        )

        #Creation of the optimizer
        optimizer = torch.optim.Adam(nn.Sequential(self.learning_model,self.critic_model).parameters(), lr=self.config["lr"])

        #Training Loop:
        _start_time=time.time()
        self.iteration=0

        #We launch the evaluation batcher (in deterministic mode)
        n_episodes=self.config["n_evaluation_episodes"]
        agent_info=DictTensor({"stochastic":torch.tensor([False]).repeat(n_episodes)})
        self.evaluation_batcher.execute(n_episodes=n_episodes,agent_info=agent_info)
        self.evaluation_iteration=self.iteration

        #Initialize the training batcher so that agents start to acquire pieces of episodes
        self.train_batcher.update(self.learning_model.state_dict())
        n_episodes=self.config["n_envs"]*self.config["n_threads"]
        agent_info=DictTensor({"stochastic":torch.tensor([True]).repeat(n_episodes)})
        self.train_batcher.reset(agent_info=agent_info)

        while(time.time()-_start_time<self.config["time_limit"]):
            #Call the batcher to get a sample of trajectories

            #2) We get the pieces of episodes. Since the env is an infinite env, we will always receive a new piece of episode
            self.train_batcher.execute()
            trajectories=self.train_batcher.get(blocking=True)

            #3) Now, we compute the loss
            dt=self.get_loss(trajectories)
            [self.logger.add_scalar(k,dt[k].item(),self.iteration) for k in dt.keys()]

            # Computation of final loss
            ld = self.config["critic_coef"] * dt["critic_loss"]
            lr = self.config["a2c_coef"] * dt["a2c_loss"]
            le = self.config["entropy_coef"] * dt["entropy_loss"]

            floss = ld - le - lr
            floss= floss/n_episodes*trajectories.n_elems()

            optimizer.zero_grad()
            floss.backward()
            optimizer.step()

            #Update the train batcher with the updated model
            self.train_batcher.update(self.learning_model.state_dict())
            self.iteration+=1

            #We check the evaluation batcher
            evaluation_trajectories=self.evaluation_batcher.get(blocking=False)
            if evaluation_trajectories is not None: #trajectories are available
                #Compute the cumulated reward
                cumulated_reward=(evaluation_trajectories["_reward"]*evaluation_trajectories.mask()).sum(1).mean()
                self.logger.add_scalar("evaluation_reward",cumulated_reward.item(),self.evaluation_iteration)
                print("At iteration %d, reward is %f"%(self.evaluation_iteration,cumulated_reward.item()))
                #We reexecute the evaluation batcher (with same value of agent_info and same number of episodes)
                self.evaluation_batcher.update(self.learning_model.state_dict())
                self.evaluation_iteration=self.iteration
                self.evaluation_batcher.reexecute()

        self.train_batcher.close()
        self.evaluation_batcher.get() # To wait for the last trajectories
        self.evaluation_batcher.close()
        self.logger.update_csv() # To save as a CSV file in logdir
        self.logger.close()
Example #21
    # The **EpisodeBatcher** will sample full episodes (until the environment returns done==True)
    # If one considers an rlstructures.VecEnv env and n_threads (or processes), then the batcher will sample n_episodes = N * env.n_envs() * n_threads episodes at each execution (where N is chosen by the user)
    # *seeds* is a list of environment seeds, one seed per process
    # The batcher has to be configured 'at the right size' since all the processes are sharing a common *Buffer* to store trajectories
    # The simplest case is:
    # *n_slots = env.n_envs() * n_threads*
    # *n_timesteps* is the number of timesteps that will be acquired at each call
    # (a hedged EpisodeBatcher sketch follows the Batcher usage example below)

    batcher = Batcher(n_timesteps=10,
                      n_slots=16,
                      n_threads=4,
                      seeds=[1, 2, 3, 4],
                      create_agent=create_agent,
                      agent_args={"n_actions": 2},
                      create_env=create_env,
                      env_args={"max_episode_steps": 100})

    # A trajectory batcher has to be *reset*
    # Then calling *execute* will acquire the next T steps
    # The *execute* method will return *None* if all environments have stopped
    batcher.reset(agent_info=DictTensor({"agent_id": torch.arange(16)}),
                  env_info=DictTensor({"env_id": torch.arange(16)}))
    import time

    batcher.execute()
    t = batcher.get()
    while t is not None:
        print(t.lengths)
        batcher.execute()
        t = batcher.get()
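    # The comments above describe the **EpisodeBatcher**; the following is a hedged
    # sketch (assuming the same create_agent/create_env factories as above, and that
    # EpisodeBatcher is imported like Batcher): it is configured in the same way, but
    # is driven with *execute(n_episodes=...)* and returns full episodes.
    # n_episodes must be a multiple of env.n_envs() * n_threads.
    episode_batcher = EpisodeBatcher(n_timesteps=100,
                                     n_slots=16,
                                     n_threads=4,
                                     seeds=[1, 2, 3, 4],
                                     create_agent=create_agent,
                                     agent_args={"n_actions": 2},
                                     create_env=create_env,
                                     env_args={"max_episode_steps": 100})
    episode_batcher.execute(n_episodes=16,
                            agent_info=DictTensor({"agent_id": torch.arange(16)}))
    episodes = episode_batcher.get()  # waits until the 16 episodes are complete
    print(episodes.lengths)
    episode_batcher.close()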
Example #22
    def run(self):
        # Instantiate the learning model and the baseline model
        self.learning_model = AgentModel(self.obs_dim, self.n_actions, 16)
        self.baseline_model = BaselineModel(self.obs_dim, 16)

        #We create a batcher dedicated to evaluation
        model = copy.deepcopy(self.learning_model)
        self.evaluation_batcher = EpisodeBatcher(
            n_timesteps=self.config["max_episode_steps"],
            n_slots=self.config["n_evaluation_episodes"],
            create_agent=self._create_agent,
            create_env=self._create_env,
            env_args={
                "n_envs": self.config["n_envs"],
                "max_episode_steps": self.config["max_episode_steps"],
                "env_name": self.config["env_name"]
            },
            agent_args={
                "n_actions": self.n_actions,
                "model": model
            },
            n_threads=self.config["n_evaluation_threads"],
            seeds=[
                self.config["env_seed"] + k * 10
                for k in range(self.config["n_evaluation_threads"])
            ],
        )

        #Creation of the batcher for sampling complete episodes (i.e Episode Batcher)
        #The batcher will sample n_threads*n_envs trajectories at each call
        # To have a fast batcher, we have to configure it with n_timesteps=self.config["max_episode_steps"]
        model = copy.deepcopy(self.learning_model)
        self.train_batcher = EpisodeBatcher(
            n_timesteps=self.config["max_episode_steps"],
            n_slots=self.config["n_envs"] * self.config["n_threads"],
            create_agent=self._create_agent,
            create_env=self._create_env,
            env_args={
                "n_envs": self.config["n_envs"],
                "max_episode_steps": self.config["max_episode_steps"],
                "env_name": self.config["env_name"]
            },
            agent_args={
                "n_actions": self.n_actions,
                "model": model
            },
            n_threads=self.config["n_threads"],
            seeds=[
                self.config["env_seed"] + k * 10
                for k in range(self.config["n_threads"])
            ],
        )

        #Creation of the optimizer
        optimizer = torch.optim.Adam(nn.Sequential(
            self.learning_model, self.baseline_model).parameters(),
                                     lr=self.config["lr"])

        #Training Loop:
        _start_time = time.time()
        self.iteration = 0

        #We launch the evaluation batcher (in deterministic mode)
        n_episodes = self.config["n_evaluation_episodes"]
        agent_info = DictTensor(
            {"stochastic": torch.tensor([False]).repeat(n_episodes)})
        self.evaluation_batcher.execute(n_episodes=n_episodes,
                                        agent_info=agent_info)
        self.evaluation_iteration = self.iteration

        while (time.time() - _start_time < self.config["time_limit"]):
            #Update the batcher with the last version of the learning model
            self.train_batcher.update(self.learning_model.state_dict())

            #Call the batcher to get a sample of trajectories
            #1) The policy will be executed in 'stochastic' mode
            n_episodes = self.config["n_envs"] * self.config["n_threads"]
            agent_info = DictTensor(
                {"stochastic": torch.tensor([True]).repeat(n_episodes)})
            self.train_batcher.execute(n_episodes=n_episodes,
                                       agent_info=agent_info)

            #2) We get the trajectories (and wait until the trajectories have been sampled)
            trajectories = self.train_batcher.get(blocking=True)

            #3) Now, we compute the loss
            dt = self.get_loss(trajectories)
            [
                self.logger.add_scalar(k, dt[k].item(), self.iteration)
                for k in dt.keys()
            ]

            # Computation of final loss
            ld = self.config["baseline_coef"] * dt["baseline_loss"]
            lr = self.config["reinforce_coef"] * dt["reinforce_loss"]
            le = self.config["entropy_coef"] * dt["entropy_loss"]

            floss = ld - le - lr

            optimizer.zero_grad()
            floss.backward()
            optimizer.step()

            #Update the train batcher with the updated model
            self.train_batcher.update(self.learning_model.state_dict())
            print("At iteration %d, avg (discounted) reward is %f" %
                  (self.iteration, dt["avg_reward"].item()))
            print("\t Avg trajectory length is %f" %
                  (trajectories.lengths.float().mean().item()))
            print(
                "\t Curves can be visualized using 'tensorboard --logdir=%s'" %
                self.config["logdir"])
            self.iteration += 1

            #We check the evaluation batcher
            evaluation_trajectories = self.evaluation_batcher.get(
                blocking=False)
            if evaluation_trajectories is not None:  #trajectories are available
                #Compute the cumulated reward
                cumulated_reward = (
                    evaluation_trajectories["_reward"] *
                    evaluation_trajectories.mask()).sum(1).mean()
                self.logger.add_scalar("evaluation_reward",
                                       cumulated_reward.item(),
                                       self.evaluation_iteration)
                #We reexecute the evaluation batcher (with same value of agent_info and same number of episodes)
                self.evaluation_batcher.update(
                    self.learning_model.state_dict())
                self.evaluation_iteration = self.iteration
                self.evaluation_batcher.reexecute()

        self.train_batcher.close()
        self.evaluation_batcher.get()  # To wait for the last trajectories
        self.evaluation_batcher.close()
        self.logger.update_csv()  # To save as a CSV file in logdir
        self.logger.close()
Example #23
    def step(self, policy_output):
        assert policy_output.n_elems() == self.n_envs()
        outputs = policy_output.unfold()
        alls = []
        alls_after = []
        env_run = {}
        for b in range(len(outputs)):
            action = policy_output["action"][b]
            last_action = action
            if (isinstance(self.gym_envs[0].action_space,
                           gym.spaces.Discrete)):
                action = action.item()
                last_action = last_action.unsqueeze(0)
            else:
                action = action.tolist()
                last_action = last_action.unsqueeze(0)

            initial_state = torch.tensor([False])
            act = action

            frame, reward, done, unused_info = self.gym_envs[b].step(act)
            reward = torch.tensor([reward])
            frame = format_frame(frame)
            if isinstance(frame, torch.Tensor):
                frame = {"frame": frame}

            done = torch.tensor([done])
            r = DictTensor({
                "reward": reward,
                "done": done,
                "initial_state": initial_state,
                "last_action": last_action,
                **frame,
            })
            alls.append(r)

            if done:
                if "set" in dir(self.gym_envs[b]):
                    self.gym_envs[b].set(self.env_info[b])

                if self.env_info.empty():
                    frame = self.gym_envs[b].reset()
                else:
                    v = {k: self.env_info[k][b].tolist() for k in self.env_info.keys()}
                    frame = self.gym_envs[b].reset(env_info=v)

                frame = format_frame(frame)
                if isinstance(frame, torch.Tensor):
                    frame = {"frame": frame}

                last_action = None
                if (isinstance(self.gym_envs[0].action_space,
                               gym.spaces.Discrete)):
                    last_action = torch.zeros(1, dtype=torch.int64)
                else:
                    a = self.gym_envs[0].action_space.sample()
                    a = torch.tensor([a])
                    last_action = a

                initial_state = torch.tensor([True])
                reward = torch.tensor([0.0])
                r = DictTensor({
                    "reward": reward,
                    "done": done,
                    "initial_state": initial_state,
                    "last_action": last_action,
                    **frame,
                })
                alls_after.append(r)
            else:
                alls_after.append(r)

        next_observation = DictTensor.cat(alls)
        next_observation_next_slot = DictTensor.cat(alls_after)
        return (
            (next_observation, torch.arange(self.n_envs())),
            (next_observation_next_slot, torch.arange(self.n_envs())),
        )
Example #24
def acquire_slot(
    buffer,
    env,
    agent,
    agent_state,
    observation,
    agent_info,
    env_running,
):
    """
    Run the agent to fill one slot in the buffer

    Args:
        buffer (SlottedTemporalBuffer): the buffer to store the information
        env (VecEnv): the environment
        agent (Agent): the agent
        agent_state (DictTensor): the current state of the agent
        observation (DictTensor): the current observation
        agent_info (DictTensor): the agent information passed to the agent
        env_running (torch.LongTensor): the mapping between batch dim (in agent_state and observation) and the env idx

    Return:
        env_to_slot (dict): a mapping from env_idx to slot indexes
        agent_state,observation,env_running: the state at the end of the execution

        if env_running.size()[0]==0: there is nothing more to run
    """
    with torch.no_grad():
        require_history = agent.require_history()

        B = env_running.size()[0]
        id_slots = buffer.get_free_slots(B)
        env_to_slot = {
            env_running[i].item(): id_slots[i]
            for i in range(len(id_slots))
        }
        t = 0
        for t in range(buffer.s_slots):
            # print(t,buffer.s_slots)
            _id_slots = [
                env_to_slot[env_running[i].item()]
                for i in range(env_running.size()[0])
            ]
            history = None
            if require_history:
                history = buffer.get_single_slots(_id_slots, erase=False)
            old_agent_state, agent_output, new_agent_state = agent(
                agent_state, observation, agent_info, history=history)

            # print(old_agent_state,agent_output,new_agent_state)
            (nobservation,
             env_running), (nnobservation,
                            nenv_running) = env.step(agent_output)
            position_in_slot = torch.tensor([t]).repeat(len(_id_slots))

            to_write = (observation + agent_output + old_agent_state +
                        new_agent_state.prepend_key("_") +
                        nobservation.prepend_key("_") +
                        DictTensor({"position_in_slot": position_in_slot}))
            id_slots = [
                env_to_slot[env_running[i].item()]
                for i in range(env_running.size()[0])
            ]
            assert id_slots == _id_slots
            buffer.write(id_slots, to_write)

            # Now, let us prepare the next step

            observation = nnobservation
            idxs = [
                k for k in range(env_running.size()[0])
                if env_running[k].item() in nenv_running
            ]
            if len(idxs) == 0:
                return env_to_slot, None, None, None, nenv_running
            idxs = torch.tensor(idxs)

            agent_state = new_agent_state.index(idxs)
            agent_info = agent_info.index(idxs)
            env_running = nenv_running
            assert len(agent_state.keys()) == 0 or (agent_state.n_elems()
                                                    == observation.n_elems())

            if nenv_running.size()[0] == 0:
                return env_to_slot, agent_state, observation, agent_info, env_running
        return env_to_slot, agent_state, observation, agent_info, env_running
Example #25
from rlstructures.env_wrappers import GymEnv
from rlstructures import DictTensor
import torch

envs=[MyEnv() for k in range(4)]
env=GymEnv(envs,seed=80)

# Each instance of the gym.Env will be initialized with seed+i such that the multiple instances will have different seeds

#Interaction with the environment is easy, but made by using DictTensor

obs,who_is_still_running=env.reset()
print(obs)
n_running=who_is_still_running.size()[0]
while n_running>0: #While some envs are still running
    action=DictTensor({"action":torch.tensor([0]).repeat(n_running)})
    (obs,who_was_running),(obs2,who_is_still_running) = env.step(action)
    n_running=who_is_still_running.size()[0]
    print(obs2)

# Note that the gym wrappers work with continuous and discrete action spaces, but may not work with environments where the action space is more complicated.
# If you are facing gym envs with a complex action space, you may develop your own wrapper.
# A good starting point is the rlstructures.GymEnv code, which is very simple and can be used to define a new wrapper.
# All the other rlstructures components will work with complex action spaces without modification.

# Trajectories in RLStructures

# When acquiring trajectories through the *batcher.get* execution, one receives a **TemporalDictTensor**
# * Each element of the trajectories (at time t) is a complete transition

# To illustrate the structure let us consider an example:
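# As a minimal sketch (assuming TemporalDictTensor is importable from rlstructures
# like DictTensor; the field names are illustrative and follow the conventions used in
# the other examples of this document), such a structure can be built by hand:
# variables at time t ("frame", "action") and the "_"-prefixed variables at time t+1
# ("_frame", "_reward", "_done").
from rlstructures import TemporalDictTensor
import torch

trajectories = TemporalDictTensor(
    {
        "frame": torch.randn(3, 5, 4),       # B=3 trajectories, T=5 timesteps, 4-dim observations
        "action": torch.zeros(3, 5).long(),
        "_frame": torch.randn(3, 5, 4),
        "_reward": torch.randn(3, 5),
        "_done": torch.zeros(3, 5).bool(),
    },
    lengths=torch.tensor([5, 2, 4]),         # each trajectory may have its own length
)
print(trajectories.lengths)
# The transition at time t=0 of trajectory 0: values at t, then the "_"-prefixed values at t+1
print(trajectories["frame"][0, 0], trajectories["action"][0, 0])
print(trajectories["_reward"][0, 0], trajectories["_done"][0, 0])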
#
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
#

###### DictTensor

# A DictTensor is a dictionary of pytorch tensors. It assumes that the
# first dimension of each tensor contained in the DictTensor is the batch dimension.
# The easiest way to build a DictTensor is to use a dictionary of tensors as input.

from rlstructures import DictTensor
import torch
d = DictTensor({"x": torch.randn(3, 5), "y": torch.randn(3, 8)})

# The number of elements in the batch is accessible through n_elems()
print(d.n_elems(), " <- number of elements in the batch")

# Many methods can be used over DictTensor (see DictTensor documentation)

d["x"]  # Returns the tensor 'x' in the DictTensor
d.keys()  # Returns the names of the variables of the DictTensor

# An empty DictTensor can be defined as follows:
d = DictTensor({})

###### TemporalDictTensor

# A TemporalDictTensor is a sequence of DictTensors. In memory, it is stored as a
# dictionary of tensors whose first dimension is the batch dimension and whose second
# dimension is the time dimension, together with a lengths tensor giving the number
# of valid timesteps of each sequence.
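# A minimal sketch (assuming TemporalDictTensor is exported by rlstructures like
# DictTensor): each tensor is B x T x ..., and lengths gives the number of valid
# timesteps of each sequence.
from rlstructures import TemporalDictTensor
import torch

tdt = TemporalDictTensor({"x": torch.randn(3, 6, 5)}, lengths=torch.tensor([6, 3, 4]))
print(tdt.n_elems(), " <- number of sequences in the batch")
print(tdt.lengths, " <- number of valid timesteps per sequence")
print(tdt.mask())  # a 3 x 6 mask over the valid timesteps
print(tdt["x"][:, 0].size())  # the value of "x" at timestep 0 for the whole batch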
Example #27
    def get_loss(self, trajectories):
        device=self.config["learner_device"]
        trajectories = trajectories.to(device)
        max_length = trajectories.lengths.max().item()
        assert trajectories.lengths.eq(max_length).all()
        actions = trajectories["action"]
        actions_probabilities = trajectories["action_probabilities"]
        reward = trajectories["_reward"]
        frame = trajectories["frame"]
        last_action = trajectories["last_action"]
        done = trajectories["_done"].float()
        # Re compute model on trajectories
        n_action_scores = []
        n_values = []
        hidden_state = trajectories["agent_state"][:, 0]
        for T in range(max_length):
            hidden_state = masked_tensor(hidden_state,trajectories["agent_state"][:, T],trajectories["initial_state"][:, T])
            _as, _v, hidden_state = self.learning_model(
                hidden_state, frame[:, T], last_action[:, T]
            )
            n_action_scores.append(_as.unsqueeze(1))
            n_values.append(_v.unsqueeze(1))
        n_action_scores = torch.cat(n_action_scores, dim=1)

        n_values = torch.cat(
            [*n_values, torch.zeros(trajectories.n_elems(), 1, 1).to(device)], dim=1
        ).squeeze(-1)

        # Compute value function for last state
        _idx = torch.arange(trajectories.n_elems()).to(device)
        _hidden_state = hidden_state.detach() #trajectories["_agent_state"][_idx, trajectories.lengths - 1]
        _frame = trajectories["_frame"][_idx, trajectories.lengths - 1]
        _last_action = trajectories["_last_action"][_idx, trajectories.lengths - 1]
        _, _v, _ = self.learning_model(_hidden_state, _frame, _last_action)
        n_values[_idx, trajectories.lengths] = _v.squeeze(-1)

        advantage = self.get_gae(
            trajectories,
            n_values,
            discount_factor=self.config["discount_factor"],
            _lambda=self.config["gae_lambda"],
        )

        value_loss = advantage ** 2
        avg_value_loss = value_loss.mean()

        n_action_probabilities = torch.softmax(n_action_scores, dim=2)
        n_action_distribution = torch.distributions.Categorical(n_action_probabilities)
        log_a=torch.distributions.Categorical(actions_probabilities).log_prob(actions)
        log_na=n_action_distribution.log_prob(actions)
        ratios=torch.exp(log_na-log_a)
        surr1 = ratios * advantage
        # Clipped surrogate: the ratio is clamped to [1 - eps_clip, 1 + eps_clip]
        surr2 = torch.clamp(ratios,1-self.config["eps_clip"],1+self.config["eps_clip"])*advantage

        ppo_loss = torch.min(surr1,surr2)
        avg_ppo_loss = ppo_loss.mean()

        entropy_loss = n_action_distribution.entropy()
        avg_entropy_loss = entropy_loss.mean()


        dt = DictTensor(
            {
                "entropy_loss": avg_entropy_loss,
                "ppo_loss": avg_ppo_loss,
                "value_loss": avg_value_loss,
            }
        )
        return dt
Example #28
 def get_single(self, slots, position):
     assert isinstance(slots, list)
     assert isinstance(slots[0], int)
     idx = torch.tensor(slots).to(self._device).long()
     d = {k: self.buffers[k][idx, position] for k in self.buffers}
     return DictTensor(d)
Example #29
    def run(self):
        self.replay_buffer=ReplayBuffer(self.config["replay_buffer_size"])
        device = torch.device(self.config["learner_device"])
        self.learning_model.to(device)

        self.q1.to(device)
        self.q2.to(device)
        self.target_q1.to(device)
        self.target_q2.to(device)
        optimizer = torch.optim.Adam(
            self.learning_model.parameters(), lr=self.config["lr"]
        )
        optimizer_q1 = torch.optim.Adam(
            self.q1.parameters(), lr=self.config["lr"]
        )
        optimizer_q2 = torch.optim.Adam(
            self.q2.parameters(), lr=self.config["lr"]
        )

        self.train_batcher.update(self._state_dict(self.learning_model,torch.device("cpu")))
        self.evaluation_batcher.update(self._state_dict(self.learning_model,torch.device("cpu")))

        n_episodes=self.config["n_envs"]*self.config["n_threads"]
        self.train_batcher.reset(agent_info=DictTensor({"stochastic":torch.zeros(n_episodes).eq(0.0)}))
        logging.info("Sampling initial transitions")
        n_iterations=int(self.config["n_starting_transitions"]/(n_episodes*self.config["batch_timesteps"]))
        for k in range(n_iterations):
            self.train_batcher.execute()
            trajectories=self.train_batcher.get()
            self.replay_buffer.push(trajectories)
        print("replay_buffer_size = ",self.replay_buffer.size())

        n_episodes=self.config["n_evaluation_rollouts"]
        stochastic=torch.tensor([self.config["evaluation_mode"]=="stochastic"]).repeat(n_episodes)
        self.evaluation_batcher.execute(agent_info=DictTensor({"stochastic":stochastic}), n_episodes=n_episodes)

        logging.info("Starting Learning")
        _start_time=time.time()

        logging.info("Learning")
        while time.time()-_start_time <self.config["time_limit"]:
            self.train_batcher.execute()
            trajectories=self.train_batcher.get()
            self.replay_buffer.push(trajectories)
            self.logger.add_scalar("replay_buffer_size",self.replay_buffer.size(),self.iteration)
            # avg_reward = 0

            for k in range(self.config["n_batches_per_epochs"]):
                transitions=self.replay_buffer.sample(n=self.config["size_batches"])

                #print(dt)
                dt,transitions = self.get_q_loss(transitions,device)
                [self.logger.add_scalar(k,dt[k].item(),self.iteration) for k in dt.keys()]
                optimizer_q1.zero_grad()
                dt["q1_loss"].backward()
                optimizer_q1.step()

                optimizer_q2.zero_grad()
                dt["q2_loss"].backward()
                optimizer_q2.step()

                optimizer.zero_grad()
                dt = self.get_policy_loss(transitions)
                [self.logger.add_scalar(k,dt[k].item(),self.iteration) for k in dt.keys()]
                dt["policy_loss"].backward()
                optimizer.step()

                tau=self.config["tau"]
                self.soft_update_params(self.q1,self.target_q1,tau)
                self.soft_update_params(self.q2,self.target_q2,tau)

                self.iteration+=1

            self.train_batcher.update(self._state_dict(self.learning_model,torch.device("cpu")))
            self.evaluate()
Example #30
    def get(self):
        with torch.no_grad():
            obs,is_running=self.env.reset(self.env_info)
            n_elems=obs.n_elems()
            observations=[{k:obs[k] for k in obs.keys()}]
            states=[]
            agent_state=None
            agent_info=self.agent_info
            if agent_info is None:
                agent_info=DictTensor({})
            t=0
            length=torch.zeros(is_running.size()[0]).long()
            first_state=None
            first_info=agent_info
            while is_running.size()[0]>0:
                old_agent_state, agent_output, new_agent_state = self.agent(
                    agent_state, obs,agent_info
                )

                if (len(states)==0):
                    first_state=old_agent_state
                    s={k:old_agent_state[k] for k in old_agent_state.keys()}
                    s={**s,**{k:agent_output[k] for k in agent_output.keys()}}
                    s={**s,**{"_"+k:new_agent_state[k] for k in new_agent_state.keys()}}
                    states.append(s)
                else:
                    s={k:old_agent_state[k] for k in old_agent_state.keys()}
                    s={**s,**{k:agent_output[k] for k in agent_output.keys()}}
                    s={**s,**{"_"+k:new_agent_state[k] for k in new_agent_state.keys()}}

                    ns={k:states[0][k].clone() for k in states[0]}

                    for k in states[0]:
                        ns[k][is_running]=(s[k])
                    states.append(ns)

                (l_o,l_is_running),(obs,is_running)=self.env.step(agent_output)

                for k in l_o.keys():
                    observations[t]["_"+k]=observations[0][k].clone()
                for k in l_o.keys():
                    observations[t]["_"+k][l_is_running]=(l_o[k])
                length[l_is_running]+=1
                t+=1
                if (is_running.size()[0]>0):
                    observations.append({})
                    for k in obs.keys():
                        observations[t][k]=observations[0][k].clone()
                    for k in obs.keys():
                        observations[t][k][is_running]=(obs[k])

                    ag={k:first_state[k].clone() for k in first_state.keys()}
                    for k in ag:
                        ag[k][l_is_running]=new_agent_state[k]
                    agent_state=DictTensor({k:ag[k][is_running] for k in ag})

                    ai={k:first_info[k].clone() for k in first_info.keys()}
                    agent_info=DictTensor({k:ai[k][is_running] for k in ai})

            f_observations={}
            for k in observations[0]:
                _all=[o[k].unsqueeze(1) for o in observations]
                f_observations[k]=torch.cat(_all,dim=1)
            f_states={}
            for k in states[0]:
                _all=[o[k].unsqueeze(1) for o in states]
                f_states[k]=torch.cat(_all,dim=1)
            return TemporalDictTensor({**f_observations,**f_states},lengths=length)