Example #1
    def reset(self):
        #Creation of the batchers
        model = copy.deepcopy(self.learning_model)
        self.train_batcher = EpisodeBatcher(
            n_timesteps=self.config["max_episode_steps"],
            n_slots=self.config["n_envs"] * self.config["n_threads"],
            create_agent=self._create_agent,
            create_env=self._create_env,
            env_args={
                "n_envs": self.config["n_envs"],
                "max_episode_steps": self.config["max_episode_steps"],
                **{
                    k: self.config[k]
                    for k in self.config if k.startswith("environment/")
                }
            },
            agent_args={
                "n_actions": self.n_actions,
                "model": model
            },
            n_threads=self.config["n_threads"],
            seeds=[
                self.config["env_seed"] + k * 10
                for k in range(self.config["n_threads"])
            ],
        )

        #Creation of the evaluation batcher
        model = copy.deepcopy(self.learning_model)
        self.evaluation_batcher = EpisodeBatcher(
            n_timesteps=self.config["max_episode_steps"],
            n_slots=self.config["n_evaluation_rollouts"],
            create_agent=self._create_agent,
            create_env=self._create_env,
            env_args={
                "n_envs": self.config["n_envs"],
                "max_episode_steps": self.config["max_episode_steps"],
                **{
                    k: self.config[k]
                    for k in self.config if k.startswith("environment/")
                }
            },
            agent_args={
                "n_actions": self.n_actions,
                "model": model
            },
            n_threads=self.config["n_evaluation_threads"],
            seeds=self.config["env_seed"] * 10,
        )
        self.register_batcher(self.train_batcher)
        self.register_batcher(self.evaluation_batcher)
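# A minimal sketch of the configuration dictionary that the reset() above assumes.
# The keys are the ones accessed in the code; the values below are hypothetical.
example_config = {
    "max_episode_steps": 100,
    "n_envs": 4,
    "n_threads": 4,
    "n_evaluation_rollouts": 16,
    "n_evaluation_threads": 2,
    "env_seed": 42,
    # any key starting with "environment/" is forwarded to the environment constructor
    "environment/env_name": "CartPole-v0",
}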
Example #2
    def reset(self):
        self.q1 = self._create_q()
        self.q2 = self._create_q()
        self.target_q1=self._create_q()
        self.target_q2=self._create_q()
        self.target_q1.load_state_dict(self.q1.state_dict())
        self.target_q2.load_state_dict(self.q2.state_dict())

        model=copy.deepcopy(self.learning_model)
        self.train_batcher=Batcher(
            n_timesteps=self.config["batch_timesteps"],
            n_slots=self.config["n_envs"]*self.config["n_threads"],
            create_agent=self._create_agent,
            create_env=self._create_env,
            env_args={
                "mode":"train",
                "n_envs": self.config["n_envs"],
                "max_episode_steps": self.config["max_episode_steps"],
                **{k:self.config[k] for k in self.config if k.startswith("environment/")}
            },
            agent_args={"action_dim": self.action_dim, "policy": model},
            n_threads=self.config["n_threads"],
            seeds=self.config["env_seed"],
        )

        model=copy.deepcopy(self.learning_model)
        self.evaluation_batcher=EpisodeBatcher(
            n_timesteps=self.config["max_episode_steps"],
            n_slots=self.config["n_evaluation_rollouts"],
            create_agent=self._create_agent,
            create_env=self._create_env,
            env_args={
                "mode":"evaluation",
                "max_episode_steps": self.config["max_episode_steps"],
                "n_envs": self.config["n_envs"],
                **{k:self.config[k] for k in self.config if k.startswith("environment/")}

            },
            agent_args={"action_dim": self.action_dim, "policy": model},
            n_threads=self.config["n_evaluation_threads"],
            seeds=self.config["env_seed"]*10,
        )

        self.register_batcher(self.train_batcher)
        self.register_batcher(self.evaluation_batcher)
Example #3
    def run(self):
        # Instantiate the learning model and the baseline model
        self.learning_model=AgentModel(self.obs_dim,self.n_actions,32)
        self.critic_model=BaselineModel(self.obs_dim,32)

        #We create a batcher dedicated to evaluation
        model=copy.deepcopy(self.learning_model)
        self.evaluation_batcher=EpisodeBatcher(
            n_timesteps=self.config["max_episode_steps"],
            n_slots=self.config["n_evaluation_episodes"],
            create_agent=self._create_agent,
            create_env=self._create_env,
            env_args={
                "n_envs": self.config["n_envs"],
                "max_episode_steps": self.config["max_episode_steps"],
                "env_name":self.config["env_name"]
            },
            agent_args={"n_actions": self.n_actions, "model": model},
            n_threads=self.config["n_evaluation_threads"],
            seeds=[self.config["env_seed"]+k*10 for k in range(self.config["n_evaluation_threads"])],
        )

        #Creation of the batcher for sampling pieces of trajectories (i.e. a Batcher)
        #The batcher will sample n_threads*n_envs pieces of trajectories at each call
        # Each piece is n_timesteps=self.config["a2c_timesteps"] transitions long
        model=copy.deepcopy(self.learning_model)
        self.train_batcher=Batcher(
            n_timesteps=self.config["a2c_timesteps"],
            n_slots=self.config["n_envs"]*self.config["n_threads"],
            create_agent=self._create_agent,
            create_env=self._create_train_env,
            env_args={
                "n_envs": self.config["n_envs"],
                "max_episode_steps": self.config["max_episode_steps"],
                "env_name":self.config["env_name"]
            },
            agent_args={"n_actions": self.n_actions, "model": model},
            n_threads=self.config["n_threads"],
            seeds=[self.config["env_seed"]+k*10 for k in range(self.config["n_threads"])],
        )

        #Creation of the optimizer
        optimizer = torch.optim.Adam(nn.Sequential(self.learning_model,self.critic_model).parameters(), lr=self.config["lr"])

        #Training Loop:
        _start_time=time.time()
        self.iteration=0

        #We launch the evaluation batcher (in deterministic mode)
        n_episodes=self.config["n_evaluation_episodes"]
        agent_info=DictTensor({"stochastic":torch.tensor([False]).repeat(n_episodes)})
        self.evaluation_batcher.execute(n_episodes=n_episodes,agent_info=agent_info)
        self.evaluation_iteration=self.iteration

        #Initialize the training batcher such that agents will start to acquire pieces of episodes
        self.train_batcher.update(self.learning_model.state_dict())
        n_episodes=self.config["n_envs"]*self.config["n_threads"]
        agent_info=DictTensor({"stochastic":torch.tensor([True]).repeat(n_episodes)})
        self.train_batcher.reset(agent_info=agent_info)

        while(time.time()-_start_time<self.config["time_limit"]):
            #Call the batcher to get a sample of trajectories

            #2) We get the pieces of episodes. Since the env is an infinite env, we always receive new pieces of episodes
            self.train_batcher.execute()
            trajectories=self.train_batcher.get(blocking=True)

            #3) Now, we compute the loss
            dt=self.get_loss(trajectories)
            [self.logger.add_scalar(k,dt[k].item(),self.iteration) for k in dt.keys()]

            # Computation of final loss
            ld = self.config["critic_coef"] * dt["critic_loss"]
            lr = self.config["a2c_coef"] * dt["a2c_loss"]
            le = self.config["entropy_coef"] * dt["entropy_loss"]

            floss = ld - le - lr
            floss= floss/n_episodes*trajectories.n_elems()

            optimizer.zero_grad()
            floss.backward()
            optimizer.step()

            #Update the train batcher with the updated model
            self.train_batcher.update(self.learning_model.state_dict())
            self.iteration+=1

            #We check the evaluation batcher
            evaluation_trajectories=self.evaluation_batcher.get(blocking=False)
            if evaluation_trajectories is not None: #trajectories are available
                #Compute the cumulated reward
                cumulated_reward=(evaluation_trajectories["_reward"]*evaluation_trajectories.mask()).sum(1).mean()
                self.logger.add_scalar("evaluation_reward",cumulated_reward.item(),self.evaluation_iteration)
                print("At iteration %d, reward is %f"%(self.evaluation_iteration,cumulated_reward.item()))
                #We reexecute the evaluation batcher (with same value of agent_info and same number of episodes)
                self.evaluation_batcher.update(self.learning_model.state_dict())
                self.evaluation_iteration=self.iteration
                self.evaluation_batcher.reexecute()

        self.train_batcher.close()
        self.evaluation_batcher.get() # To wait for the last trajectories
        self.evaluation_batcher.close()
        self.logger.update_csv() # To save as a CSV file in logdir
        self.logger.close()
Example #4
class A2C:
    def __init__(self, config,create_env,create_train_env,create_agent):
        self.config = config

        # Creation of the Logger (that saves in tensorboard and CSV)
        self.logger = TFLogger(log_dir=self.config["logdir"], hps=self.config)

        self._create_env=create_env
        self._create_train_env=create_train_env
        self._create_agent=create_agent

        #Creation of one env instance to get the dimensionality of observations and the number of actions
        env = self._create_env(self.config["n_envs"], seed=0,env_name=self.config["env_name"])
        self.n_actions = env.action_space.n
        self.obs_dim = env.reset()[0]["frame"].size()[1]
        del env

    def run(self):
        # Instantiate the learning model and the baseline model
        self.learning_model=AgentModel(self.obs_dim,self.n_actions,32)
        self.critic_model=BaselineModel(self.obs_dim,32)

        #We create a batcher dedicated to evaluation
        model=copy.deepcopy(self.learning_model)
        self.evaluation_batcher=EpisodeBatcher(
            n_timesteps=self.config["max_episode_steps"],
            n_slots=self.config["n_evaluation_episodes"],
            create_agent=self._create_agent,
            create_env=self._create_env,
            env_args={
                "n_envs": self.config["n_envs"],
                "max_episode_steps": self.config["max_episode_steps"],
                "env_name":self.config["env_name"]
            },
            agent_args={"n_actions": self.n_actions, "model": model},
            n_threads=self.config["n_evaluation_threads"],
            seeds=[self.config["env_seed"]+k*10 for k in range(self.config["n_evaluation_threads"])],
        )

        #Creation of the batcher for sampling pieces of trajectories (i.e. a Batcher)
        #The batcher will sample n_threads*n_envs pieces of trajectories at each call
        # Each piece is n_timesteps=self.config["a2c_timesteps"] transitions long
        model=copy.deepcopy(self.learning_model)
        self.train_batcher=Batcher(
            n_timesteps=self.config["a2c_timesteps"],
            n_slots=self.config["n_envs"]*self.config["n_threads"],
            create_agent=self._create_agent,
            create_env=self._create_train_env,
            env_args={
                "n_envs": self.config["n_envs"],
                "max_episode_steps": self.config["max_episode_steps"],
                "env_name":self.config["env_name"]
            },
            agent_args={"n_actions": self.n_actions, "model": model},
            n_threads=self.config["n_threads"],
            seeds=[self.config["env_seed"]+k*10 for k in range(self.config["n_threads"])],
        )

        #Creation of the optimizer
        optimizer = torch.optim.Adam(nn.Sequential(self.learning_model,self.critic_model).parameters(), lr=self.config["lr"])

        #Training Loop:
        _start_time=time.time()
        self.iteration=0

        #We launch the evaluation batcher (in deterministic mode)
        n_episodes=self.config["n_evaluation_episodes"]
        agent_info=DictTensor({"stochastic":torch.tensor([False]).repeat(n_episodes)})
        self.evaluation_batcher.execute(n_episodes=n_episodes,agent_info=agent_info)
        self.evaluation_iteration=self.iteration

        #Initialize the training batcher such that agents will start to acquire pieces of episodes
        self.train_batcher.update(self.learning_model.state_dict())
        n_episodes=self.config["n_envs"]*self.config["n_threads"]
        agent_info=DictTensor({"stochastic":torch.tensor([True]).repeat(n_episodes)})
        self.train_batcher.reset(agent_info=agent_info)

        while(time.time()-_start_time<self.config["time_limit"]):
            #Call the batcher to get a sample of trajectories

            #2) We get the pieces of episodes. Since the env is an infinite env, we always receive new pieces of episodes
            self.train_batcher.execute()
            trajectories=self.train_batcher.get(blocking=True)

            #3) Now, we compute the loss
            dt=self.get_loss(trajectories)
            [self.logger.add_scalar(k,dt[k].item(),self.iteration) for k in dt.keys()]

            # Computation of final loss
            ld = self.config["critic_coef"] * dt["critic_loss"]
            lr = self.config["a2c_coef"] * dt["a2c_loss"]
            le = self.config["entropy_coef"] * dt["entropy_loss"]

            floss = ld - le - lr
            floss= floss/n_episodes*trajectories.n_elems()

            optimizer.zero_grad()
            floss.backward()
            optimizer.step()

            #Update the train batcher with the updated model
            self.train_batcher.update(self.learning_model.state_dict())
            self.iteration+=1

            #We check the evaluation batcher
            evaluation_trajectories=self.evaluation_batcher.get(blocking=False)
            if evaluation_trajectories is not None: #trajectories are available
                #Compute the cumulated reward
                cumulated_reward=(evaluation_trajectories["_reward"]*evaluation_trajectories.mask()).sum(1).mean()
                self.logger.add_scalar("evaluation_reward",cumulated_reward.item(),self.evaluation_iteration)
                print("At iteration %d, reward is %f"%(self.evaluation_iteration,cumulated_reward.item()))
                #We reexecute the evaluation batcher (with same value of agent_info and same number of episodes)
                self.evaluation_batcher.update(self.learning_model.state_dict())
                self.evaluation_iteration=self.iteration
                self.evaluation_batcher.reexecute()

        self.train_batcher.close()
        self.evaluation_batcher.get() # To wait for the last trajectories
        self.evaluation_batcher.close()
        self.logger.update_csv() # To save as a CSV file in logdir
        self.logger.close()


    def get_loss(self,trajectories):
            #First, we want to compute the cumulated reward per trajectory
            #The reward is at t+1 at each timestep (since it is obtained after the action), so we use the '_reward' field of the trajectory
            # The 'reward' field corresponds to the reward at time t
            reward=trajectories["_reward"]

            #We get the mask that tells which transition is in a trajectory (1) or not (0)
            mask=trajectories.mask()

            #We remove the reward values that are not in the trajectories
            reward=reward*mask
            max_length=trajectories.lengths.max().item()
            #Now, we want to compute the action probabilities over the trajectories such that we will be able to do 'backward'
            action_probabilities=[]
            for t in range(max_length):
                proba=self.learning_model(trajectories["frame"][:,t])
                action_probabilities.append(proba.unsqueeze(1)) # We append the probability and introduce the temporal dimension (second dimension)
            action_probabilities=torch.cat(action_probabilities,dim=1) #Now, we have a B x T x n_actions tensor

            #We compute the critic values for t=0 to T-1, and append a placeholder column so that the tensor has T+1 timesteps
            critic=[]
            for t in range(max_length):
                b=self.critic_model(trajectories["frame"][:,t])
                critic.append(b.unsqueeze(1))
            critic=torch.cat(critic+[b.unsqueeze(1)],dim=1).squeeze(-1) #Now, we have a B x (T+1) tensor
            #We also need to compute the critic value for the last observation of the trajectories (to compute the TD)
            # It may be the last element of the trajectory (if the episode is not finished), or the last frame of the episode
            idx=torch.arange(trajectories.n_elems())
            last_critic=self.critic_model(trajectories["_frame"][idx,trajectories.lengths-1]).squeeze(-1)
            critic[idx,trajectories.lengths]=last_critic


            #We compute the temporal difference
            target=reward+self.config["discount_factor"]*(1-trajectories["_done"].float())*critic[:,1:].detach()
            td=critic[:,:-1]-target

            critic_loss=td**2
            #We sum the loss for each episode (considering the mask)
            critic_loss= (critic_loss*mask).sum(1)/mask.sum(1)
            #We average the loss over all the trajectories
            avg_critic_loss = critic_loss.mean()

            #We do the same on the reinforce loss
            action_distribution=torch.distributions.Categorical(action_probabilities)
            log_proba=action_distribution.log_prob(trajectories["action"])
            a2c_loss = -log_proba * td.detach()
            a2c_loss = (a2c_loss*mask).sum(1)/mask.sum(1)
            avg_a2c_loss=a2c_loss.mean()

            #We compute the entropy loss
            entropy=action_distribution.entropy()
            entropy=(entropy*mask).sum(1)/mask.sum(1)
            avg_entropy=entropy.mean()

            return DictTensor({"critic_loss":avg_critic_loss,"a2c_loss":avg_a2c_loss,"entropy_loss":avg_entropy})
Example #5
class SAC(BaseExperiment):
    def __init__(self, config, create_env, create_agent):
        super().__init__(config,create_env,create_agent)
        env = self._create_env(
            self.config["n_envs"], seed=0,**{k:self.config[k] for k in self.config if k.startswith("environment/")}
        )

        self.action_dim = env.action_space.sample().shape[0]
        self.obs_dim = env.reset()[0]["frame"].size()[1]
        del env

    def check_arguments(self,args):
        assert args["n_evaluation_rollouts"]%(args["n_envs"]*args["n_evaluation_threads"])==0
        assert args["evaluation_mode"]=="deterministic" or args["evaluation_mode"]=="stochastic"
        return True

    def save(self):
        super().save()
        reward=self.evaluate(relaunch=False)
        while(reward is None):
            reward=self.evaluate(relaunch=False)
        f=open(self.config["logdir"]+"/out.out","wb")
        pickle.dump({"reward":reward},f)
        time.sleep(np.random.rand()*10)
        f.close()

    def reset(self):
        self.q1 = self._create_q()
        self.q2 = self._create_q()
        self.target_q1=self._create_q()
        self.target_q2=self._create_q()
        self.target_q1.load_state_dict(self.q1.state_dict())
        self.target_q2.load_state_dict(self.q2.state_dict())

        model=copy.deepcopy(self.learning_model)
        self.train_batcher=Batcher(
            n_timesteps=self.config["batch_timesteps"],
            n_slots=self.config["n_envs"]*self.config["n_threads"],
            create_agent=self._create_agent,
            create_env=self._create_env,
            env_args={
                "mode":"train",
                "n_envs": self.config["n_envs"],
                "max_episode_steps": self.config["max_episode_steps"],
                **{k:self.config[k] for k in self.config if k.startswith("environment/")}
            },
            agent_args={"action_dim": self.action_dim, "policy": model},
            n_threads=self.config["n_threads"],
            seeds=self.config["env_seed"],
        )

        model=copy.deepcopy(self.learning_model)
        self.evaluation_batcher=EpisodeBatcher(
            n_timesteps=self.config["max_episode_steps"],
            n_slots=self.config["n_evaluation_rollouts"],
            create_agent=self._create_agent,
            create_env=self._create_env,
            env_args={
                "mode":"evaluation",
                "max_episode_steps": self.config["max_episode_steps"],
                "n_envs": self.config["n_envs"],
                **{k:self.config[k] for k in self.config if k.startswith("environment/")}

            },
            agent_args={"action_dim": self.action_dim, "policy": model},
            n_threads=self.config["n_evaluation_threads"],
            seeds=self.config["env_seed"]*10,
        )

        self.register_batcher(self.train_batcher)
        self.register_batcher(self.evaluation_batcher)


    def _state_dict(self,model,device):
        sd = model.state_dict()
        for k, v in sd.items():
            sd[k] = v.to(device)
        return sd

    def soft_update_params(self,net, target_net, tau):
        for param, target_param in zip(net.parameters(), target_net.parameters()):
            target_param.data.copy_(tau * param.data +(1 - tau) * target_param.data)

    def run(self):
        self.replay_buffer=ReplayBuffer(self.config["replay_buffer_size"])
        device = torch.device(self.config["learner_device"])
        self.learning_model.to(device)

        self.q1.to(device)
        self.q2.to(device)
        self.target_q1.to(device)
        self.target_q2.to(device)
        optimizer = torch.optim.Adam(
            self.learning_model.parameters(), lr=self.config["lr"]
        )
        optimizer_q1 = torch.optim.Adam(
            self.q1.parameters(), lr=self.config["lr"]
        )
        optimizer_q2 = torch.optim.Adam(
            self.q2.parameters(), lr=self.config["lr"]
        )

        self.train_batcher.update(self._state_dict(self.learning_model,torch.device("cpu")))
        self.evaluation_batcher.update(self._state_dict(self.learning_model,torch.device("cpu")))

        n_episodes=self.config["n_envs"]*self.config["n_threads"]
        self.train_batcher.reset(agent_info=DictTensor({"stochastic":torch.zeros(n_episodes).eq(0.0)}))
        logging.info("Sampling initial transitions")
        n_iterations=int(self.config["n_starting_transitions"]/(n_episodes*self.config["batch_timesteps"]))
        for k in range(n_iterations):
            self.train_batcher.execute()
            trajectories=self.train_batcher.get()
            self.replay_buffer.push(trajectories)
        print("replay_buffer_size = ",self.replay_buffer.size())

        n_episodes=self.config["n_evaluation_rollouts"]
        stochastic=torch.tensor([self.config["evaluation_mode"]=="stochastic"]).repeat(n_episodes)
        self.evaluation_batcher.execute(agent_info=DictTensor({"stochastic":stochastic}), n_episodes=n_episodes)

        logging.info("Starting Learning")
        _start_time=time.time()

        logging.info("Learning")
        while time.time()-_start_time <self.config["time_limit"]:
            self.train_batcher.execute()
            trajectories=self.train_batcher.get()
            self.replay_buffer.push(trajectories)
            self.logger.add_scalar("replay_buffer_size",self.replay_buffer.size(),self.iteration)
            # avg_reward = 0

            for k in range(self.config["n_batches_per_epochs"]):
                transitions=self.replay_buffer.sample(n=self.config["size_batches"])

                #print(dt)
                dt,transitions = self.get_q_loss(transitions,device)
                [self.logger.add_scalar(k,dt[k].item(),self.iteration) for k in dt.keys()]
                optimizer_q1.zero_grad()
                dt["q1_loss"].backward()
                optimizer_q1.step()

                optimizer_q2.zero_grad()
                dt["q2_loss"].backward()
                optimizer_q2.step()

                optimizer.zero_grad()
                dt = self.get_policy_loss(transitions)
                [self.logger.add_scalar(k,dt[k].item(),self.iteration) for k in dt.keys()]
                dt["policy_loss"].backward()
                optimizer.step()

                tau=self.config["tau"]
                self.soft_update_params(self.q1,self.target_q1,tau)
                self.soft_update_params(self.q2,self.target_q2,tau)

                self.iteration+=1

            self.train_batcher.update(self._state_dict(self.learning_model,torch.device("cpu")))
            self.evaluate()


    def evaluate(self,relaunch=True):
        evaluation_trajectories = self.evaluation_batcher.get(blocking=False)

        if (evaluation_trajectories is None):
            return

        avg_reward = (
                    (
                        evaluation_trajectories["_reward"]
                        * evaluation_trajectories.mask()
                    )
                    .sum(1)
                    .mean()
                    .item()
        )
        self.logger.add_scalar("avg_reward/"+self.config["evaluation_mode"], avg_reward, self.iteration)
        if (self.config["verbose"]):
                print("Iteration "+str(self.iteration)+", Reward =  "+str(avg_reward))

        if (relaunch):
            cpu_parameters=self._state_dict(self.learning_model,torch.device("cpu"))
            self.evaluation_batcher.update(cpu_parameters)
            self.evaluation_batcher.reexecute()
        return avg_reward

    def get_q_loss(self, transitions,device):

        transitions = transitions.to(device)
        B=transitions.n_elems()
        Bv=torch.arange(B)
        action = transitions["action"]
        reward = transitions["_reward"]
        frame = transitions["frame"]
        _frame = transitions["_frame"]
        _done = transitions["_done"].float()

        # action for s_prime
        mean_prime,var_prime=self.learning_model(_frame)
        _id = torch.eye(self.action_dim).unsqueeze(0).repeat(B, 1, 1)
        # _nvar = var_prime.unsqueeze(-1).repeat(1, 1, self.action_dim)
        # _nvar = _nvar * _id
        distribution=torch.distributions.Normal(mean_prime, var_prime)
        next_action=distribution.sample().detach()

        #Compute targets
        q1=self.target_q1(_frame,next_action).detach().squeeze(-1)
        q2=self.target_q2(_frame,next_action).detach().squeeze(-1)
        q = torch.min(q1,q2)
        lp= distribution.log_prob(next_action).detach().sum(-1)
        q = q  - self.config["lambda_entropy"]*lp
        target_value=q*(1.-_done)*self.config["discount_factor"]+reward

        q1_loss=(target_value.detach()-self.q1(frame,action).squeeze(-1))**2
        q2_loss=(target_value.detach()-self.q2(frame,action).squeeze(-1))**2
        dt ={
                "q1_loss": q1_loss.mean(),
                "q2_loss": q2_loss.mean(),
            }
        return DictTensor(dt),transitions

    def get_policy_loss(self,transitions):
        frame = transitions["frame"]
        B=transitions.n_elems()
        #Now, compute the policy term
        mean,var=self.learning_model(frame)
        #print(var.mean().item())
        #print(mean)
        _id = torch.eye(self.action_dim).unsqueeze(0).repeat(B, 1, 1)
        # _nvar = var.unsqueeze(-1).repeat(1, 1, self.action_dim)
        # _nvar = _nvar * _id
        distribution=torch.distributions.Normal(mean, var)
        entropy=distribution.entropy().mean()
        action_tilde=distribution.rsample()
        #print(action_tilde)
        q1 = self.q1(frame,action_tilde).squeeze(-1)
        q2 = self.q2(frame,action_tilde).squeeze(-1)
        q=torch.min(q1,q2)
        loss=q-self.config["lambda_entropy"]*distribution.log_prob(action_tilde).sum(-1)

        dt={"policy_loss":-loss.mean(),"entropy":entropy.detach(),"avg_var":var.mean().detach(),"avg_mean":mean.mean().detach()}
        dt=DictTensor(dt)
        return dt
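# The ReplayBuffer used above is not shown in this example. Based on the calls it
# receives (push(...), sample(n=...), size()), a minimal stand-in over plain dicts of
# tensors could look like the sketch below. It is an assumed illustration, not the
# library class (which presumably also flattens pushed trajectories into transitions).
import torch

class MinimalReplayBuffer:
    def __init__(self, capacity):
        self.capacity = capacity
        self.storage = None  # dict of tensors, one row per stored transition

    def push(self, transitions):
        # transitions: a dict of tensors with a leading batch dimension
        if self.storage is None:
            self.storage = {k: v.clone() for k, v in transitions.items()}
        else:
            self.storage = {
                k: torch.cat([self.storage[k], transitions[k]], dim=0)[-self.capacity:]
                for k in self.storage
            }

    def size(self):
        return 0 if self.storage is None else next(iter(self.storage.values())).size(0)

    def sample(self, n):
        idx = torch.randint(0, self.size(), (n,))
        return {k: v[idx] for k, v in self.storage.items()}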
Example #6
class PPO(BaseExperiment):
    def __init__(self, config, create_env, create_agent):
        super().__init__(config,create_env,create_agent)
        env = self._create_env(
            self.config["n_envs"], seed=0,**{k:self.config[k] for k in self.config if k.startswith("environment/")}
        )
        self.n_actions = env.action_space.n
        self.obs_dim = env.reset()[0]["frame"].size()[1]
        del env

    def check_arguments(self,args):
        assert args["n_evaluation_rollouts"]%(args["n_envs"]*args["n_evaluation_threads"])==0
        assert args["evaluation_mode"]=="deterministic" or args["evaluation_mode"]=="stochastic"
        return True


    def reset(self):
        #Creation of the batchers
        model=copy.deepcopy(self.learning_model)
        print(self.config)
        self.train_batcher=Batcher(
            n_timesteps=self.config["learning_timesteps"],
            n_slots=self.config["n_envs"]*self.config["n_threads"],
            create_agent=self._create_agent,
            create_env=self._create_env,
            env_args={
                "mode":"train",
                "n_envs": self.config["n_envs"],
                "max_episode_steps": self.config["max_episode_steps"],
                **{k:self.config[k] for k in self.config if k.startswith("environment/")}
            },
            agent_args={"n_actions": self.n_actions, "model": model},
            n_threads=self.config["n_threads"],
            seeds=self.config["env_seed"],
        )

        model=copy.deepcopy(self.learning_model)
        self.evaluation_batcher=EpisodeBatcher(
            n_timesteps=self.config["max_episode_steps"],
            n_slots=self.config["n_evaluation_rollouts"],
            create_agent=self._create_agent,
            create_env=self._create_env,
            env_args={
                "mode":"evaluation",
                "max_episode_steps": self.config["max_episode_steps"],
                "n_envs": self.config["n_envs"],
                **{k:self.config[k] for k in self.config if k.startswith("environment/")}

            },
            agent_args={"n_actions": self.n_actions, "model": model},
            n_threads=self.config["n_evaluation_threads"],
            seeds=self.config["env_seed"]*10,
        )

        self.register_batcher(self.train_batcher)
        self.register_batcher(self.evaluation_batcher)


    def _state_dict(self,model,device):
        sd = model.state_dict()
        for k, v in sd.items():
            sd[k] = v.to(device)
        return sd

    def run(self):
        device = torch.device(self.config["learner_device"])
        self.learning_model.to(device)
        optimizer = torch.optim.Adam(
            self.learning_model.parameters(), lr=self.config["lr"]
        )
        cpu_parameters=self._state_dict(self.learning_model,torch.device("cpu"))
        self.train_batcher.update(cpu_parameters)
        self.evaluation_batcher.update(cpu_parameters)
        n_episodes=self.config["n_evaluation_rollouts"]
        self.evaluation_batcher.execute(agent_info=DictTensor({"stochastic":torch.ones(n_episodes)}), n_episodes=n_episodes)

        # Initialize the train batcher
        n_episodes=self.config["n_envs"]*self.config["n_threads"]
        self.train_batcher.reset(agent_info=DictTensor({"stochastic":torch.ones(n_episodes)}))

        _start_time=time.time()
        while time.time()-_start_time<self.config["time_limit"]:
            self.train_batcher.execute()
            trajectories=self.train_batcher.get()
            avg_reward = 0
            for K in range(self.config["k_epochs"]):
                optimizer.zero_grad()
                dt = self.get_loss(trajectories)
                [
                    self.logger.add_scalar("loss/" + k, dt[k].item(), self.iteration)
                    for k in dt.keys()
                ]

                # Computation of final loss
                ld = self.config["coef_critic"] * dt["value_loss"]
                lr = self.config["coef_ppo"] * dt["ppo_loss"]
                le = self.config["coef_entropy"] * dt["entropy_loss"]

                floss = ld - le - lr
                floss.backward()
                if self.config["clip_grad"] > 0:
                    n = torch.nn.utils.clip_grad_norm_(
                        self.learning_model.parameters(), self.config["clip_grad"]
                    )
                    self.logger.add_scalar("grad_norm", n.item(), self.iteration)
                optimizer.step()
                self.evaluate()
                self.iteration+=1
            cpu_parameters=self._state_dict(self.learning_model,torch.device("cpu"))
            self.train_batcher.update(cpu_parameters)
            self.evaluate()
            self.iteration+=1


    def evaluate(self,relaunch=True):
        evaluation_trajectories = self.evaluation_batcher.get(blocking=False)
        if (evaluation_trajectories is None):
            return

        avg_reward = (
                    (
                        evaluation_trajectories["_reward"]
                        * evaluation_trajectories.mask()
                    )
                    .sum(1)
                    .mean()
                    .item()
        )
        self.logger.add_scalar("avg_reward/"+self.config["evaluation_mode"], avg_reward, self.iteration)
        if (self.config["verbose"]):
                print("Iteration "+str(self.iteration)+", Reward =  "+str(avg_reward))

        if (relaunch):
            cpu_parameters=self._state_dict(self.learning_model,torch.device("cpu"))
            self.evaluation_batcher.update(cpu_parameters)
            self.evaluation_batcher.reexecute()
        return avg_reward


    def get_loss(self, trajectories):
        device=self.config["learner_device"]
        trajectories = trajectories.to(device)
        max_length = trajectories.lengths.max().item()
        assert trajectories.lengths.eq(max_length).all()
        actions = trajectories["action"]
        actions_probabilities = trajectories["action_probabilities"]
        reward = trajectories["_reward"]
        frame = trajectories["frame"]
        last_action = trajectories["last_action"]
        done = trajectories["_done"].float()
        # Re compute model on trajectories
        n_action_scores = []
        n_values = []
        hidden_state = trajectories["agent_state"][:, 0]
        for T in range(max_length):
            hidden_state = masked_tensor(hidden_state,trajectories["agent_state"][:, T],trajectories["initial_state"][:, T])
            _as, _v, hidden_state = self.learning_model(
                hidden_state, frame[:, T], last_action[:, T]
            )
            n_action_scores.append(_as.unsqueeze(1))
            n_values.append(_v.unsqueeze(1))
        n_action_scores = torch.cat(n_action_scores, dim=1)

        n_values = torch.cat(
            [*n_values, torch.zeros(trajectories.n_elems(), 1, 1).to(device)], dim=1
        ).squeeze(-1)

        # Compute value function for last state
        _idx = torch.arange(trajectories.n_elems()).to(device)
        _hidden_state = hidden_state.detach() #trajectories["_agent_state"][_idx, trajectories.lengths - 1]
        _frame = trajectories["_frame"][_idx, trajectories.lengths - 1]
        _last_action = trajectories["_last_action"][_idx, trajectories.lengths - 1]
        _, _v, _ = self.learning_model(_hidden_state, _frame, _last_action)
        n_values[_idx, trajectories.lengths] = _v.squeeze(-1)

        advantage = self.get_gae(
            trajectories,
            n_values,
            discount_factor=self.config["discount_factor"],
            _lambda=self.config["gae_lambda"],
        )

        value_loss = advantage ** 2
        avg_value_loss = value_loss.mean()

        n_action_probabilities = torch.softmax(n_action_scores, dim=2)
        n_action_distribution = torch.distributions.Categorical(n_action_probabilities)
        log_a=torch.distributions.Categorical(actions_probabilities).log_prob(actions)
        log_na=n_action_distribution.log_prob(actions)
        ratios=torch.exp(log_na-log_a)
        surr1 = ratios * advantage
        # clip the probability ratio to [1-eps_clip, 1+eps_clip] (PPO clipped objective)
        surr2 = torch.clamp(ratios, 1 - self.config["eps_clip"], 1 + self.config["eps_clip"]) * advantage

        ppo_loss = torch.min(surr1,surr2)
        avg_ppo_loss = ppo_loss.mean()

        entropy_loss = n_action_distribution.entropy()
        avg_entropy_loss = entropy_loss.mean()


        dt = DictTensor(
            {
                "entropy_loss": avg_entropy_loss,
                "ppo_loss": avg_ppo_loss,
                "value_loss": avg_value_loss,
            }
        )
        return dt

    def get_gae(self, trajectories, values, discount_factor=1, _lambda=0):
        r = trajectories["_reward"]
        v = values[:, 1:].detach()
        d = trajectories["_done"].float()
        delta = r + discount_factor * v * (1.0 - d) - values[:, :-1]
        T = trajectories.lengths.max().item()
        gae = delta[:, -1]
        gaes = [gae]
        for t in range(T - 2, -1, -1):
            gae = delta[:, t] + discount_factor * _lambda * (1 - d[:, t]) * gae
            gaes.append(gae)
        gaes = list([g.unsqueeze(-1) for g in reversed(gaes)])
        fgae = torch.cat(gaes, dim=1)
        return fgae
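# A standalone illustration of the recursion implemented in get_gae above, on toy
# tensors (batch of 1, 3 timesteps). The values and coefficients are hypothetical.
import torch

r = torch.tensor([[1.0, 1.0, 1.0]])          # rewards r_t
d = torch.tensor([[0.0, 0.0, 1.0]])          # done flags
v = torch.tensor([[0.5, 0.6, 0.7, 0.0]])     # values V(s_0)..V(s_T), T+1 entries
gamma, lam = 0.99, 0.95

delta = r + gamma * v[:, 1:] * (1.0 - d) - v[:, :-1]
gae = delta[:, -1]
gaes = [gae]
for t in range(r.size(1) - 2, -1, -1):
    gae = delta[:, t] + gamma * lam * (1.0 - d[:, t]) * gae
    gaes.append(gae)
advantage = torch.cat([g.unsqueeze(-1) for g in reversed(gaes)], dim=1)  # shape [1, 3]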
Example #7
    def run(self):
        # Instantiate the learning model and the baseline model
        self.learning_model = AgentModel(self.obs_dim, self.n_actions, 16)
        self.baseline_model = BaselineModel(self.obs_dim, 16)

        #We create a batcher dedicated to evaluation
        model = copy.deepcopy(self.learning_model)
        self.evaluation_batcher = EpisodeBatcher(
            n_timesteps=self.config["max_episode_steps"],
            n_slots=self.config["n_evaluation_episodes"],
            create_agent=self._create_agent,
            create_env=self._create_env,
            env_args={
                "n_envs": self.config["n_envs"],
                "max_episode_steps": self.config["max_episode_steps"],
                "env_name": self.config["env_name"]
            },
            agent_args={
                "n_actions": self.n_actions,
                "model": model
            },
            n_threads=self.config["n_evaluation_threads"],
            seeds=[
                self.config["env_seed"] + k * 10
                for k in range(self.config["n_evaluation_threads"])
            ],
        )

        #Creation of the batcher for sampling complete episodes (i.e. an EpisodeBatcher)
        #The batcher will sample n_threads*n_envs trajectories at each call
        # To have a fast batcher, we have to configure it with n_timesteps=self.config["max_episode_steps"]
        model = copy.deepcopy(self.learning_model)
        self.train_batcher = EpisodeBatcher(
            n_timesteps=self.config["max_episode_steps"],
            n_slots=self.config["n_envs"] * self.config["n_threads"],
            create_agent=self._create_agent,
            create_env=self._create_env,
            env_args={
                "n_envs": self.config["n_envs"],
                "max_episode_steps": self.config["max_episode_steps"],
                "env_name": self.config["env_name"]
            },
            agent_args={
                "n_actions": self.n_actions,
                "model": model
            },
            n_threads=self.config["n_threads"],
            seeds=[
                self.config["env_seed"] + k * 10
                for k in range(self.config["n_threads"])
            ],
        )

        #Creation of the optimizer
        optimizer = torch.optim.Adam(nn.Sequential(
            self.learning_model, self.baseline_model).parameters(),
                                     lr=self.config["lr"])

        #Training Loop:
        _start_time = time.time()
        self.iteration = 0

        #We launch the evaluation batcher (in deterministic mode)
        n_episodes = self.config["n_evaluation_episodes"]
        agent_info = DictTensor(
            {"stochastic": torch.tensor([False]).repeat(n_episodes)})
        self.evaluation_batcher.execute(n_episodes=n_episodes,
                                        agent_info=agent_info)
        self.evaluation_iteration = self.iteration

        while (time.time() - _start_time < self.config["time_limit"]):
            #Update the batcher with the last version of the learning model
            self.train_batcher.update(self.learning_model.state_dict())

            #Call the batcher to get a sample of trajectories
            #1) The policy will be executed in 'stochastic' mode
            n_episodes = self.config["n_envs"] * self.config["n_threads"]
            agent_info = DictTensor(
                {"stochastic": torch.tensor([True]).repeat(n_episodes)})
            self.train_batcher.execute(n_episodes=n_episodes,
                                       agent_info=agent_info)

            #2) We get the trajectories (and wait until the trajectories have been sampled)
            trajectories = self.train_batcher.get(blocking=True)

            #3) Now, we compute the loss
            dt = self.get_loss(trajectories)
            [
                self.logger.add_scalar(k, dt[k].item(), self.iteration)
                for k in dt.keys()
            ]

            # Computation of final loss
            ld = self.config["baseline_coef"] * dt["baseline_loss"]
            lr = self.config["reinforce_coef"] * dt["reinforce_loss"]
            le = self.config["entropy_coef"] * dt["entropy_loss"]

            floss = ld - le - lr

            optimizer.zero_grad()
            floss.backward()
            optimizer.step()

            #Update the train batcher with the updated model
            self.train_batcher.update(self.learning_model.state_dict())
            print("At iteration %d, avg (discounted) reward is %f" %
                  (self.iteration, dt["avg_reward"].item()))
            print("\t Avg trajectory length is %f" %
                  (trajectories.lengths.float().mean().item()))
            print(
                "\t Curves can be visualized using 'tensorboard --logdir=%s'" %
                self.config["logdir"])
            self.iteration += 1

            #We check the evaluation batcher
            evaluation_trajectories = self.evaluation_batcher.get(
                blocking=False)
            if evaluation_trajectories is not None:  #trajectories are available
                #Compute the cumulated reward
                cumulated_reward = (
                    evaluation_trajectories["_reward"] *
                    evaluation_trajectories.mask()).sum(1).mean()
                self.logger.add_scalar("evaluation_reward",
                                       cumulated_reward.item(),
                                       self.evaluation_iteration)
                #We reexecute the evaluation batcher (with same value of agent_info and same number of episodes)
                self.evaluation_batcher.update(
                    self.learning_model.state_dict())
                self.evaluation_iteration = self.iteration
                self.evaluation_batcher.reexecute()

        self.train_batcher.close()
        self.evaluation_batcher.get()  # To wait for the last trajectories
        self.evaluation_batcher.close()
        self.logger.update_csv()  # To save as a CSV file in logdir
        self.logger.close()
Example #8
class Reinforce:
    def __init__(self, config, create_env, create_agent):
        self.config = config

        # Creation of the Logger (that saves in tensorboard and CSV)
        self.logger = TFLogger(log_dir=self.config["logdir"], hps=self.config)

        self._create_env = create_env
        self._create_agent = create_agent

        #Creation of one env instance to get the dimensionality of observations and the number of actions
        env = self._create_env(self.config["n_envs"],
                               seed=0,
                               env_name=self.config["env_name"])
        self.n_actions = env.action_space.n
        self.obs_dim = env.reset()[0]["frame"].size()[1]
        del env

    def run(self):
        # Instantiate the learning model and the baseline model
        self.learning_model = AgentModel(self.obs_dim, self.n_actions, 16)
        self.baseline_model = BaselineModel(self.obs_dim, 16)

        #We create a batcher dedicated to evaluation
        model = copy.deepcopy(self.learning_model)
        self.evaluation_batcher = EpisodeBatcher(
            n_timesteps=self.config["max_episode_steps"],
            n_slots=self.config["n_evaluation_episodes"],
            create_agent=self._create_agent,
            create_env=self._create_env,
            env_args={
                "n_envs": self.config["n_envs"],
                "max_episode_steps": self.config["max_episode_steps"],
                "env_name": self.config["env_name"]
            },
            agent_args={
                "n_actions": self.n_actions,
                "model": model
            },
            n_threads=self.config["n_evaluation_threads"],
            seeds=[
                self.config["env_seed"] + k * 10
                for k in range(self.config["n_evaluation_threads"])
            ],
        )

        #Creation of the batcher for sampling complete episodes (i.e. an EpisodeBatcher)
        #The batcher will sample n_threads*n_envs trajectories at each call
        # To have a fast batcher, we have to configure it with n_timesteps=self.config["max_episode_steps"]
        model = copy.deepcopy(self.learning_model)
        self.train_batcher = EpisodeBatcher(
            n_timesteps=self.config["max_episode_steps"],
            n_slots=self.config["n_envs"] * self.config["n_threads"],
            create_agent=self._create_agent,
            create_env=self._create_env,
            env_args={
                "n_envs": self.config["n_envs"],
                "max_episode_steps": self.config["max_episode_steps"],
                "env_name": self.config["env_name"]
            },
            agent_args={
                "n_actions": self.n_actions,
                "model": model
            },
            n_threads=self.config["n_threads"],
            seeds=[
                self.config["env_seed"] + k * 10
                for k in range(self.config["n_threads"])
            ],
        )

        #Creation of the optimizer
        optimizer = torch.optim.Adam(nn.Sequential(
            self.learning_model, self.baseline_model).parameters(),
                                     lr=self.config["lr"])

        #Training Loop:
        _start_time = time.time()
        self.iteration = 0

        #We launch the evaluation batcher (in deterministic mode)
        n_episodes = self.config["n_evaluation_episodes"]
        agent_info = DictTensor(
            {"stochastic": torch.tensor([False]).repeat(n_episodes)})
        self.evaluation_batcher.execute(n_episodes=n_episodes,
                                        agent_info=agent_info)
        self.evaluation_iteration = self.iteration

        while (time.time() - _start_time < self.config["time_limit"]):
            #Update the batcher with the last version of the learning model
            self.train_batcher.update(self.learning_model.state_dict())

            #Call the batcher to get a sample of trajectories
            #1) The policy will be executed in 'stochastic' mode
            n_episodes = self.config["n_envs"] * self.config["n_threads"]
            agent_info = DictTensor(
                {"stochastic": torch.tensor([True]).repeat(n_episodes)})
            self.train_batcher.execute(n_episodes=n_episodes,
                                       agent_info=agent_info)

            #2) We get the trajectories (and wait until the trajectories have been sampled)
            trajectories = self.train_batcher.get(blocking=True)

            #3) Now, we compute the loss
            dt = self.get_loss(trajectories)
            [
                self.logger.add_scalar(k, dt[k].item(), self.iteration)
                for k in dt.keys()
            ]

            # Computation of final loss
            ld = self.config["baseline_coef"] * dt["baseline_loss"]
            lr = self.config["reinforce_coef"] * dt["reinforce_loss"]
            le = self.config["entropy_coef"] * dt["entropy_loss"]

            floss = ld - le - lr

            optimizer.zero_grad()
            floss.backward()
            optimizer.step()

            #Update the train batcher with the updated model
            self.train_batcher.update(self.learning_model.state_dict())
            print("At iteration %d, avg (discounted) reward is %f" %
                  (self.iteration, dt["avg_reward"].item()))
            print("\t Avg trajectory length is %f" %
                  (trajectories.lengths.float().mean().item()))
            print(
                "\t Curves can be visualized using 'tensorboard --logdir=%s'" %
                self.config["logdir"])
            self.iteration += 1

            #We check the evaluation batcher
            evaluation_trajectories = self.evaluation_batcher.get(
                blocking=False)
            if evaluation_trajectories is not None:  #trajectories are available
                #Compute the cumulated reward
                cumulated_reward = (
                    evaluation_trajectories["_reward"] *
                    evaluation_trajectories.mask()).sum(1).mean()
                self.logger.add_scalar("evaluation_reward",
                                       cumulated_reward.item(),
                                       self.evaluation_iteration)
                #We reexecute the evaluation batcher (with same value of agent_info and same number of episodes)
                self.evaluation_batcher.update(
                    self.learning_model.state_dict())
                self.evaluation_iteration = self.iteration
                self.evaluation_batcher.reexecute()

        self.train_batcher.close()
        self.evaluation_batcher.get()  # To wait for the last trajectories
        self.evaluation_batcher.close()
        self.logger.update_csv()  # To save as a CSV file in logdir
        self.logger.close()

    def get_loss(self, trajectories):
        #First, we want to compute the cumulated reward per trajectory
        #The reward is at t+1 at each timestep (since it is obtained after the action), so we use the '_reward' field of the trajectory
        # The 'reward' field corresponds to the reward at time t
        reward = trajectories["_reward"]

        #We get the mask that tells which transition is in a trajectory (1) or not (0)
        mask = trajectories.mask()

        #We remove the reward values that are not in the trajectories
        reward = reward * mask

        #We compute the future cumulated reward at each timestep (by reverse computation)
        max_length = trajectories.lengths.max().item()
        cumulated_reward = torch.zeros_like(reward)
        cumulated_reward[:, max_length - 1] = reward[:, max_length - 1]
        for t in range(max_length - 2, -1, -1):
            cumulated_reward[:, t] = reward[:, t] + self.config[
                "discount_factor"] * cumulated_reward[:, t + 1]

        #Now, we want to compute the action probabilities over the trajectories such that we will be able to do 'backward'
        action_probabilities = []
        for t in range(max_length):
            proba = self.learning_model(trajectories["frame"][:, t])
            action_probabilities.append(
                proba.unsqueeze(1)
            )  # We append the probability and introduce the temporal dimension (second dimension)
        action_probabilities = torch.cat(
            action_probabilities,
            dim=1)  #Now, we have a B x T x n_actions tensor

        #We compute the baseline
        baseline = []
        for t in range(max_length):
            b = self.baseline_model(trajectories["frame"][:, t])
            baseline.append(b.unsqueeze(1))
        baseline = torch.cat(baseline,
                             dim=1).squeeze(-1)  #Now, we have a B x T tensor

        #We compute the baseline loss
        baseline_loss = (baseline - cumulated_reward)**2
        #We sum the loss for each episode (considering the mask)
        baseline_loss = (baseline_loss * mask).sum(1) / mask.sum(1)
        #We average the loss over all the trajectories
        avg_baseline_loss = baseline_loss.mean()

        #We do the same on the reinforce loss
        action_distribution = torch.distributions.Categorical(
            action_probabilities)
        log_proba = action_distribution.log_prob(trajectories["action"])
        reinforce_loss = log_proba * (cumulated_reward - baseline).detach()
        reinforce_loss = (reinforce_loss * mask).sum(1) / mask.sum(1)
        avg_reinforce_loss = reinforce_loss.mean()

        #We compute the entropy loss
        entropy = action_distribution.entropy()
        entropy = (entropy * mask).sum(1) / mask.sum(1)
        avg_entropy = entropy.mean()

        return DictTensor({
            "avg_reward": cumulated_reward[:, 0].mean(),
            "baseline_loss": avg_baseline_loss,
            "reinforce_loss": avg_reinforce_loss,
            "entropy_loss": avg_entropy
        })
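# A standalone illustration of the reverse discounted-return computation used in
# get_loss above, on a toy reward tensor (discount_factor assumed to be 0.9).
import torch

reward = torch.tensor([[1.0, 0.0, 2.0]])     # masked rewards, shape B x T
discount_factor = 0.9
cumulated_reward = torch.zeros_like(reward)
cumulated_reward[:, -1] = reward[:, -1]
for t in range(reward.size(1) - 2, -1, -1):
    cumulated_reward[:, t] = reward[:, t] + discount_factor * cumulated_reward[:, t + 1]
# cumulated_reward == [[2.62, 1.8, 2.0]]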
Example #9
class DQN(BaseExperiment):
    def __init__(self, config, create_env, create_agent):
        super().__init__(config,create_env,create_agent)
        env = self._create_env(
            self.config["n_envs"], seed=0,**{k:self.config[k] for k in self.config if k.startswith("environment/")}
        )
        self.n_actions = env.action_space.n
        self.obs_dim = env.reset()[0]["frame"].size()[1]        
        del env

    def check_arguments(self,args):
        assert args["n_evaluation_rollouts"]%(args["n_envs"]*args["n_evaluation_threads"])==0
        return True
        
       

    def reset(self):
        self.target_model=copy.deepcopy(self.learning_model)

        model=copy.deepcopy(self.learning_model)
        
        self.train_batcher=Batcher(
            n_timesteps=self.config["batch_timesteps"],
            n_slots=self.config["n_envs"]*self.config["n_threads"],
            create_agent=self._create_agent,
            create_env=self._create_env,
            env_args={
                "mode":"train",
                "n_envs": self.config["n_envs"],
                "max_episode_steps": self.config["max_episode_steps"],
                **{k:self.config[k] for k in self.config if k.startswith("environment/")}
            },
            agent_args={"n_actions": self.n_actions, "model": model},
            n_threads=self.config["n_threads"],
            seeds=self.config["env_seed"],            
        )
        
        model=copy.deepcopy(self.learning_model)
        self.evaluation_batcher=EpisodeBatcher( 
            n_timesteps=self.config["max_episode_steps"],
            n_slots=self.config["n_evaluation_rollouts"],   
            create_agent=self._create_agent,
            create_env=self._create_env,
            env_args={        
                "mode":"evaluation",                      
                "max_episode_steps": self.config["max_episode_steps"],
                "n_envs": self.config["n_envs"],
                **{k:self.config[k] for k in self.config if k.startswith("environment/")}

            },
            agent_args={"n_actions": self.n_actions, "model": model},
            n_threads=self.config["n_evaluation_threads"],
            seeds=self.config["env_seed"]*10,
        )

        self.register_batcher(self.train_batcher)
        self.register_batcher(self.evaluation_batcher)
        

    def _state_dict(self,model,device):
        sd = model.state_dict()
        for k, v in sd.items():
            sd[k] = v.to(device)
        return sd

    def run(self):
        self.replay_buffer=ReplayBuffer(self.config["replay_buffer_size"])
        device = torch.device(self.config["learner_device"])
        self.learning_model.to(device)
        self.target_model.to(device)
        optimizer = torch.optim.Adam(
            self.learning_model.parameters(), lr=self.config["lr"]
        )
        
        self.train_batcher.update(self._state_dict(self.learning_model,torch.device("cpu")))
        self.evaluation_batcher.update(self._state_dict(self.learning_model,torch.device("cpu")))

        n_episodes=self.config["n_envs"]*self.config["n_threads"]
        self.train_batcher.reset(agent_info=DictTensor({"epsilon":torch.ones(n_episodes)*self.config["epsilon_greedy"]}))
        logging.info("Sampling initial transitions")
        for k in range(self.config["initial_buffer_epochs"]):
            self.train_batcher.execute()        
            trajectories=self.train_batcher.get()
            self.replay_buffer.push(trajectories)
        
        n_episodes=self.config["n_evaluation_rollouts"]
        self.evaluation_batcher.execute(agent_info=DictTensor({"epsilon":torch.zeros(n_episodes)}), n_episodes=n_episodes)
        
        logging.info("Starting Learning")
        _start_time=time.time()
        
        logging.info("Learning")
        while time.time()-_start_time <self.config["time_limit"]:
            
            self.train_batcher.execute()
            trajectories=self.train_batcher.get()
            self.replay_buffer.push(trajectories)
            self.logger.add_scalar("replay_buffer_size",self.replay_buffer.size(),self.iteration)
            # avg_reward = 0
           
            for k in range(self.config["qvalue_epochs"]):
                optimizer.zero_grad()
                dt = self.get_loss(device)
                
                [self.logger.add_scalar(k,dt[k].item(),self.iteration) for k in dt.keys()]
                
                floss=dt["q_loss"]
                floss.backward()
                if self.config["clip_grad"] > 0:
                    n = torch.nn.utils.clip_grad_norm_(
                        self.learning_model.parameters(), self.config["clip_grad"]
                    )
                    self.logger.add_scalar("grad_norm", n.item(), self.iteration)
                self.iteration+=1
                optimizer.step()
            
                tau=self.config["tau"]
                self.soft_update_params(self.learning_model,self.target_model,tau)
                


            self.train_batcher.update(self._state_dict(self.learning_model,torch.device("cpu")))            
            self.evaluate()
            self.iteration+=1

    def soft_update_params(self,net, target_net, tau):
        for param, target_param in zip(net.parameters(), target_net.parameters()):
            target_param.data.copy_(tau * param.data +(1 - tau) * target_param.data)


    def evaluate(self,relaunch=True):        
        evaluation_trajectories = self.evaluation_batcher.get(blocking=False)
        
        if (evaluation_trajectories is None):
            return
        
        avg_reward = (
                    (
                        evaluation_trajectories["_reward"]
                        * evaluation_trajectories.mask()
                    )
                    .sum(1)
                    .mean()
                    .item()
        )
        self.logger.add_scalar("avg_reward", avg_reward, self.iteration)
        if (self.config["verbose"]):
                print("Iteration "+str(self.iteration)+", Reward =  "+str(avg_reward)+", Buffer size = "+str(self.replay_buffer.size()))
        
        if (relaunch):
            self.evaluation_batcher.update(self._state_dict(self.learning_model,torch.device("cpu")))
            self.evaluation_batcher.reexecute()
        return avg_reward



    def get_loss(self, device):
        transitions=self.replay_buffer.sample(n=self.config["n_batches"])
        transitions = transitions.to(device)
        B=transitions.n_elems()
        Bv=torch.arange(B)
        action = transitions["action"]
        reward = transitions["_reward"]
        frame = transitions["frame"]
        _frame = transitions["_frame"]
        _done = transitions["_done"].float()

        q=self.learning_model(frame)
        qa=q[Bv,action]
        qp = self.learning_model(_frame)
        actionp=qp.max(1)[1]
        _q_target = self.target_model(_frame).detach()
        _q_target_a= _q_target[Bv,actionp]
        _target_value=_q_target_a*(1-_done)*self.config["discount_factor"]+reward
        td = (_target_value-qa)**2
        dt = DictTensor(
            {
                "q_loss": td.mean(),
            }
        )
        return dt
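# Illustration of the double-DQN target computed in get_loss above, on toy values
# (discount_factor assumed to be 0.9): the action for s' is selected with the online
# network and evaluated with the target network.
import torch

reward = torch.tensor([0.5])
done = torch.tensor([0.0])
q_online_next = torch.tensor([[1.0, 2.0]])     # learning_model(_frame)
q_target_next = torch.tensor([[0.8, 1.5]])     # target_model(_frame)
best_action = q_online_next.max(1)[1]          # -> action 1
target_value = q_target_next[torch.arange(1), best_action] * (1 - done) * 0.9 + reward  # = 1.85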
if __name__ == "__main__":
    import torch.multiprocessing as mp
    mp.set_start_method("spawn")

    # The **EpisodeBatcher** will sample full episodes (until the environment returns done==True)
    # If one considers an rlstructures.VecEnv env and n_threads (or processes), then the batcher will sample n_episodes = N * env.n_envs() * n_threads episodes at each execution (where N is chosen by the user)
    # *seeds* is a list of environment seeds, one seed per process
    # The batcher has to be configured 'at the right size' since all the processes share a common *Buffer* to store trajectories
    # The simplest case is to choose *n_slots=maximum number of acquired episodes* and *n_timesteps=max size of the episodes*. We will describe later the exact meaning of these arguments.

    batcher=EpisodeBatcher(
            n_timesteps=100,
            n_slots=128,
            n_threads=4,
            seeds=[1,2,3,4],
            create_agent=create_agent,
            agent_args={"n_actions":2},
            create_env=create_env,
            env_args={"max_episode_steps":100}
    )

    # Execution will start the acquisition process (such that other computations can be done in parallel with the episode acquisition)

    n_episodes=32

    # Since we will sample 32 episodes, we need to configure the 32 agents and 32 environments that will interact

    agent_info=DictTensor({"agent_id":torch.arange(32)})
    env_info=DictTensor({"env_id":torch.arange(32)})

    #Running the batcher. It is a non-blocking function that launches the acquisition
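    # (A hedged continuation based on the execute()/get() pattern shown in the earlier
    # examples; how env_info is forwarded is not shown there, so it is left out here.)
    batcher.execute(n_episodes=n_episodes, agent_info=agent_info)
    trajectories = batcher.get()  # waits for the acquired episodes and returns them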