Example #1
    def __init__(self, envs, acmodel, device, num_frames_per_proc, discount,
                 lr, gae_lambda, entropy_coef, value_loss_coef, max_grad_norm,
                 recurrence, preprocess_obss, reshape_reward):
        """
        Initializes a `BaseAlgo` instance.

        Parameters
        ----------
        envs : list
            a list of environments that will be run in parallel
        acmodel : torch.nn.Module
            the model
        device : torch.device
            the device on which tensors and the model are placed
        num_frames_per_proc : int
            the number of frames collected by every process for an update
        discount : float
            the discount for future rewards
        lr : float
            the learning rate for optimizers
        gae_lambda : float
            the lambda coefficient in the GAE formula
            ([Schulman et al., 2015](https://arxiv.org/abs/1506.02438))
        entropy_coef : float
            the weight of the entropy cost in the final objective
        value_loss_coef : float
            the weight of the value loss in the final objective
        max_grad_norm : float
            gradient will be clipped to be at most this value
        recurrence : int
            the number of steps the gradient is propagated back in time
        preprocess_obss : function
            a function that takes observations returned by the environment
            and converts them into the format that the model can handle
        reshape_reward : function
            a function that shapes the reward, takes an
            (observation, action, reward, done) tuple as an input
        """

        # Store parameters

        self.env = ParallelEnv(envs)
        self.acmodel = acmodel
        self.device = device
        self.num_frames_per_proc = num_frames_per_proc
        self.discount = discount
        self.lr = lr
        self.gae_lambda = gae_lambda
        self.entropy_coef = entropy_coef
        self.value_loss_coef = value_loss_coef
        self.max_grad_norm = max_grad_norm
        self.recurrence = recurrence
        self.preprocess_obss = preprocess_obss or default_preprocess_obss
        self.reshape_reward = reshape_reward

        # Control parameters

        assert self.acmodel.recurrent or self.recurrence == 1, \
            "recurrence must be 1 when the model is not recurrent"
        assert self.num_frames_per_proc % self.recurrence == 0, \
            "num_frames_per_proc must be divisible by recurrence"

        # Configure acmodel

        self.acmodel.to(self.device)
        self.acmodel.train()

        # Store helper values

        self.num_procs = len(envs)
        self.num_frames = self.num_frames_per_proc * self.num_procs

        # Initialize experience values

        shape = (self.num_frames_per_proc, self.num_procs)
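        # i.e. (frames collected per process, number of parallel processes)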

        self.obs = self.env.reset()
        self.obss = [None] * shape[0]
        if self.acmodel.recurrent:
            self.memory = torch.zeros(shape[1],
                                      self.acmodel.memory_size,
                                      device=self.device)
            self.memories = torch.zeros(*shape,
                                        self.acmodel.memory_size,
                                        device=self.device)
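        # mask is 0 where an episode just ended and 1 otherwise; it is used to
        # reset recurrent memory and episode accumulators at episode boundaries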
        self.mask = torch.ones(shape[1], device=self.device)
        self.masks = torch.zeros(*shape, device=self.device)
        self.actions = torch.zeros(*shape, device=self.device, dtype=torch.int)
        self.values = torch.zeros(*shape, device=self.device)
        self.rewards = torch.zeros(*shape, device=self.device)
        self.advantages = torch.zeros(*shape, device=self.device)
        self.log_probs = torch.zeros(*shape, device=self.device)

        # Initialize log values

        self.log_episode_return = torch.zeros(self.num_procs,
                                              device=self.device)
        self.log_episode_reshaped_return = torch.zeros(self.num_procs,
                                                       device=self.device)
        self.log_episode_num_frames = torch.zeros(self.num_procs,
                                                  device=self.device)

        self.log_done_counter = 0
        self.log_return = [0] * self.num_procs
        self.log_reshaped_return = [0] * self.num_procs
        self.log_num_frames = [0] * self.num_procs
        self.step_counter = [0] * self.num_procs
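
A call-site sketch makes the parameter list above concrete. The following is a minimal, hypothetical invocation: it assumes the surrounding project exposes `BaseAlgo`, the `utils.make_env` helper seen in the later examples, and an `ACModel` actor-critic class; the environment id and hyperparameter values are illustrative, not taken from the original code.

import torch

# Hypothetical usage; utils.make_env and ACModel are assumed project helpers
envs = [utils.make_env("MiniGrid-Empty-5x5-v0", 1 + 10000 * i)
        for i in range(16)]
acmodel = ACModel(envs[0].observation_space, envs[0].action_space)

algo = BaseAlgo(envs, acmodel, torch.device("cpu"),
                num_frames_per_proc=128, discount=0.99, lr=0.001,
                gae_lambda=0.95, entropy_coef=0.01, value_loss_coef=0.5,
                max_grad_norm=0.5, recurrence=1,
                preprocess_obss=None, reshape_reward=None)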
Example #2
    def __init__(self,
                 envs,
                 model,
                 device=None,
                 num_frames_per_proc=None,
                 discount=0.99,
                 lr=0.001,
                 recurrence=4,
                 adam_eps=1e-8,
                 buffer_size=10000,
                 preprocess_obss=None,
                 reshape_reward=None):
        """
        Initializes a `BaseAlgo` instance.

        Parameters:
        ----------
        envs : list
            a list of environments that will be run in parallel
        acmodel : torch.Module
            the model
        num_frames_per_proc : int
            the number of frames collected by every process for an update
        discount : float
            the discount for future rewards
        lr : float
            the learning rate for optimizers
        gae_lambda : float
            the lambda coefficient in the GAE formula
            ([Schulman et al., 2015](https://arxiv.org/abs/1506.02438))
        entropy_coef : float
            the weight of the entropy cost in the final objective
        value_loss_coef : float
            the weight of the value loss in the final objective
        max_grad_norm : float
            gradient will be clipped to be at most this value
        recurrence : int
            the number of steps the gradient is propagated back in time
        preprocess_obss : function
            a function that takes observations returned by the environment
            and converts them into the format that the model can handle
        reshape_reward : function
            a function that shapes the reward, takes an
            (observation, action, reward, done) tuple as an input
        """

        # Store parameters

        num_frames_per_proc = num_frames_per_proc or 128  # TODO: confirm 128 is the right default

        self.env = ParallelEnv(envs)
        self.model = model
        self.eval_model = deepcopy(model)
        self.device = device
        self.num_frames_per_proc = num_frames_per_proc
        self.discount = discount
        self.lr = lr
        self.recurrence = recurrence
        self.preprocess_obss = preprocess_obss or default_preprocess_obss
        self.reshape_reward = reshape_reward

        self.reward_size = self.model.reward_size
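        # i.e. the number of reward components; rewards here are vector-valued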

        # Control parameters

        assert self.model.recurrent or self.recurrence == 1, \
            "recurrence must be 1 when the model is not recurrent"
        assert self.num_frames_per_proc % self.recurrence == 0, \
            "num_frames_per_proc must be divisible by recurrence"

        # Configure model

        self.model.to(self.device)
        self.model.train()

        # Store helper values

        self.num_procs = len(envs)
        self.num_frames = self.num_frames_per_proc * self.num_procs

        # Initialize experience values

        shape = (self.num_frames_per_proc, self.num_procs)

        self.obs = self.env.reset()
        self.obss = [None] * shape[0]
        if self.model.recurrent:
            self.memory = torch.zeros(shape[1],
                                      self.model.memory_size,
                                      device=self.device)
            self.memories = torch.zeros(*shape,
                                        self.model.memory_size,
                                        device=self.device)
        self.mask = torch.ones(shape[1], device=self.device)
        self.masks = torch.zeros(*shape, device=self.device)
        # alternative with one mask per reward component (unused):
        # self.masks = torch.zeros(*shape, self.reward_size, device=self.device)
        self.actions = torch.zeros(*shape, device=self.device, dtype=torch.int)
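        # values presumably holds one estimate per action and per reward
        # component; expected_values collapses this to one value per component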
        self.values = torch.zeros(*shape,
                                  self.env.action_space.n,
                                  self.reward_size,
                                  device=self.device)
        self.expected_values = torch.zeros(*shape,
                                           self.reward_size,
                                           device=self.device)
        self.rewards = torch.zeros(*shape,
                                   self.reward_size,
                                   device=self.device)
        self.log_probs = torch.zeros(*shape, device=self.device)

        # Initialize the Pareto weights uniformly across the reward components

        self.weights = torch.ones(
            shape[1], self.reward_size, device=self.device) / self.reward_size

        # Initialize log values

        self.log_episode_return = torch.zeros(self.num_procs,
                                              self.reward_size,
                                              device=self.device)
        self.log_episode_reshaped_return = torch.zeros(self.num_procs,
                                                       self.reward_size,
                                                       device=self.device)
        self.log_episode_num_frames = torch.zeros(self.num_procs,
                                                  device=self.device)

        self.log_done_counter = 0
        self.log_return = [0] * self.num_procs
        self.log_reshaped_return = [0] * self.num_procs
        self.log_num_frames = [0] * self.num_procs

        self.buffer = ReplayBuffer(capacity=buffer_size)

        self.eps = 0.05  # exploration epsilon (presumably for epsilon-greedy action selection)

        # Alternative optimizer (commented out); adam_eps is only used here:
        # self.optimizer = torch.optim.Adam(self.model.parameters(), lr, eps=adam_eps)
        self.optimizer = torch.optim.RMSprop(params=self.model.parameters(),
                                             lr=self.lr)
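
`ReplayBuffer` is referenced above but not defined in this snippet. Below is a minimal deque-based sketch compatible with the `ReplayBuffer(capacity=buffer_size)` call; the push/sample interface is an assumption, not the original class.

import random
from collections import deque

class ReplayBuffer:
    """Fixed-capacity FIFO transition buffer (assumed interface)."""

    def __init__(self, capacity):
        # oldest transitions are discarded once capacity is reached
        self.buffer = deque(maxlen=capacity)

    def push(self, transition):
        self.buffer.append(transition)

    def sample(self, batch_size):
        # uniform sampling without replacement
        return random.sample(self.buffer, batch_size)

    def __len__(self):
        return len(self.buffer)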
Example #3
def run_eval():
    envs = []
    for i in range(args.procs):  # the log tensors below are sized with args.procs
        env = utils.make_env(args.env, args.seed + 10000 * i)
        env.is_teaching = False
        env.end_pos = args.eval_goal
        envs.append(env)
    env = ParallelEnv(envs)

    # Load agent

    model_dir = utils.get_model_dir(args.model)
    agent = utils.Agent(env.observation_space, env.action_space, model_dir,
                        device, args.argmax, args.procs)

    # Initialize logs

    logs = {"num_frames_per_episode": [], "return_per_episode": []}

    # Run agent

    start_time = time.time()

    obss = env.reset()

    log_done_counter = 0
    log_episode_return = torch.zeros(args.procs, device=device)
    log_episode_num_frames = torch.zeros(args.procs, device=device)
    positions = []
    while log_done_counter < args.episodes:
        actions = agent.get_actions(obss)
        obss, rewards, dones, infos = env.step(actions)
        positions.extend([info["agent_pos"] for info in infos])
        agent.analyze_feedbacks(rewards, dones)

        log_episode_return += torch.tensor(rewards, device=device, dtype=torch.float)
        log_episode_num_frames += torch.ones(args.procs, device=device)

        for i, done in enumerate(dones):
            if done:
                log_done_counter += 1
                logs["return_per_episode"].append(log_episode_return[i].item())
                logs["num_frames_per_episode"].append(log_episode_num_frames[i].item())

        # Zero the accumulators of finished episodes so the next episode starts fresh
        mask = 1 - torch.tensor(dones, device=device, dtype=torch.float)
        log_episode_return *= mask
        log_episode_num_frames *= mask

    end_time = time.time()

    # Print logs

    num_frames = sum(logs["num_frames_per_episode"])
    fps = num_frames/(end_time - start_time)
    duration = int(end_time - start_time)
    return_per_episode = utils.synthesize(logs["return_per_episode"])
    num_frames_per_episode = utils.synthesize(logs["num_frames_per_episode"])

    print("Eval: F {} | FPS {:.0f} | D {} | R:μσmM {:.2f} {:.2f} {:.2f} {:.2f} | F:μσmM {:.1f} {:.1f} {} {}"
          .format(num_frames, fps, duration,
                  *return_per_episode.values(),
                  *num_frames_per_episode.values()))
    return return_per_episode
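
`utils.synthesize` is not shown here. Judging from the "R:μσmM" format string above, it reduces a list of per-episode values to mean, standard deviation, min, and max in that order; a sketch under that assumption:

import collections
import numpy

def synthesize(array):
    # summary statistics in the order expected by the μσmM format string
    d = collections.OrderedDict()
    d["mean"] = numpy.mean(array)
    d["std"] = numpy.std(array)
    d["min"] = numpy.amin(array)
    d["max"] = numpy.amax(array)
    return d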
Example #4
# Set device

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#print(f"Device: {device}\n")
print("Device: {device}\n")

# Load environments

envs = []
for i in range(args.procs):  # the Agent below is constructed for args.procs environments
    env = utils.make_env(args.env, args.seed + 10000 * i)
    env.is_teaching = False
    env.end_pos = [3, 1]
    envs.append(env)
env = ParallelEnv(envs)
print("Environments loaded\n")

# Load agent

model_dir = utils.get_model_dir(args.model)
agent = utils.Agent(env.observation_space, env.action_space, model_dir, device,
                    args.argmax, args.procs)
print("Agent loaded\n")

# Initialize logs

logs = {"num_frames_per_episode": [], "return_per_episode": []}

# Run agent
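
The snippet ends at the "# Run agent" comment. What typically follows mirrors the loop in run_eval from Example #3; a condensed sketch, assuming the same args.episodes budget:

obss = env.reset()
log_done_counter = 0
log_episode_return = torch.zeros(args.procs, device=device)

while log_done_counter < args.episodes:
    actions = agent.get_actions(obss)
    obss, rewards, dones, infos = env.step(actions)
    agent.analyze_feedbacks(rewards, dones)

    log_episode_return += torch.tensor(rewards, device=device, dtype=torch.float)
    for i, done in enumerate(dones):
        if done:
            log_done_counter += 1
            logs["return_per_episode"].append(log_episode_return[i].item())

    # reset accumulators of finished episodes
    log_episode_return *= 1 - torch.tensor(dones, device=device, dtype=torch.float)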
Example #5
# Set device

device = torch.device("cpu")#torch.device("cuda" if torch.cuda.is_available() else "cpu")
txt_logger.info(f"Device: {device}\n")

# Load environments

envs = []
for i in range(args.procs):
    envs.append(utils.make_env(args.env, args.seed + 10000 * i))

eval_envs = []
eval_env_name = args.eval_env if args.eval_env else args.env
for i in range(args.procs):
    eval_envs.append(utils.make_env(eval_env_name, args.seed + 10000 * i))
parallel_eval_env = ParallelEnv(eval_envs)  # created here because the attribute is set directly on algo instead of re-instantiating it
parallel_env = ParallelEnv(envs)
txt_logger.info("Environments loaded\n")

# Load training status

try:
    if args.load_status:
        status = utils.get_status(model_dir)
    else:
        status = {"num_frames": 0, "update": 0}
except OSError:
    # no saved status yet; start from scratch
    status = {"num_frames": 0, "update": 0}

txt_logger.info("Training status loaded\n")