    def __init__(self, model_aup, model_aux, env_type, z_dim, aup_train_steps,
                 **kwargs):
        load_kwargs(self, kwargs)
        assert self.training_envs is not None

        self.aup_train_steps = aup_train_steps
        for m in model_aux.model_aux:
            m.to(self.compute_device)
        self.model_aux = model_aux.model_aux
        self.optimizer_aux = model_aux.optimizer_aux
        self.model_aup = model_aup.to(self.compute_device)
        self.optimizer_aup = optim.Adam(self.model_aup.parameters(),
                                        lr=self.learning_rate_aup)
        checkpointing.load_checkpoint(self.logdir, self, aup=True)
        self.exp = env_type
        """ AUP-specific parameters """
        self.z_dim = z_dim
        self.use_scale = False
        if env_type == 'append-still':
            n_steps = 5e6
        else:
            n_steps = 4e6

        self.lamb_schedule = LinearSchedule(n_steps,
                                            initial_p=1e-3,
                                            final_p=1e-1)
Example #2
    def __init__(self, model, **kwargs):
        load_kwargs(self, kwargs)
        assert self.training_envs is not None

        self.model = model.to(self.compute_device)
        self.optimizer = optim.Adam(self.model.parameters(),
                                    lr=self.learning_rate)

        self.load_checkpoint()
Example #3
    def __init__(self, *levels, logger, curriculum_params={}, **kwargs):
        super().__init__(*levels, repeat_levels=True, **kwargs)
        self.logger = logger
        self.curriculum_stage = 0
        self.max_stage = len(levels) - 1
        self.curr_currently_playing = 0
        self.just_advanced = False
        self.perf_records = defaultdict(
            lambda: [0.0])  # map level to history of performance
        self.best = defaultdict(lambda: 0)
        load_kwargs(self, curriculum_params)
Example #4
    def __init__(self, training_model, target_model, **kwargs):
        load_kwargs(self, kwargs)
        assert self.training_envs is not None

        self.training_model = training_model.to(self.compute_device)
        self.target_model = target_model.to(self.compute_device)
        self.optimizer = optim.Adam(self.training_model.parameters(),
                                    lr=self.learning_rate)
        self.replay_buffer = ReplayBuffer(self.replay_size)

        checkpointing.load_checkpoint(self.logdir, self)
        print('loaded to {} steps'.format(self.num_steps))
Example #5
    def __init__(self, training_model, target_model, **kwargs):
        load_kwargs(self, kwargs)
        assert self.training_envs is not None

        self.training_model = training_model.to(self.compute_device)
        self.target_model = target_model.to(self.compute_device)
        self.optimizer = optim.Adam(self.training_model.parameters(),
                                    lr=self.learning_rate)
        self.replay_buffer = MultistepReplayBuffer(self.replay_size,
                                                   len(self.training_envs),
                                                   self.multi_step_learning,
                                                   self.gamma)

        self.load_checkpoint()
        self.epsilon = self.epsilon_schedule(self.num_steps)
Example #6
    def __init__(self, training_model, target_model, **kwargs):
        load_kwargs(self, kwargs)
        assert self.training_envs is not None

        self.training_model = training_model.to(self.compute_device)
        self.target_model = target_model.to(self.compute_device)
        self.optimizer = optim.Adam(self.training_model.parameters(),
                                    lr=self.learning_rate)
        self.replay_buffer = ReplayBuffer(self.replay_size)
        self.agent_trajectories = defaultdict(lambda: np.empty(
            self.multi_step_learning,
            dtype=[('obs', object), ('action', int), ('reward', float)]))

        self.load_checkpoint()
        self.epsilon = self.epsilon_schedule(self.num_steps)
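As an aside, the agent_trajectories buffer above is a structured NumPy array with one record per step of the multi-step window. A hypothetical illustration of how such a record array behaves (field names copied from the example, data made up):

import numpy as np

window = 3  # stands in for self.multi_step_learning
traj = np.empty(window, dtype=[('obs', object), ('action', int), ('reward', float)])
for step in range(window):
    traj[step] = ('obs_%d' % step, step % 2, 1.0)  # placeholder transition
print(traj['action'])        # all actions in the window, e.g. [0 1 0]
print(traj['reward'].sum())  # total reward over the window: 3.0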
Example #7
    def __init__(self, model_aux, env_type, z_dim, n_rfn, buf_size, vae_epochs,
                 random_projection, aux_train_steps, **kwargs):

        load_kwargs(self, kwargs)
        assert self.training_envs is not None

        self.model_aux = [
            model_aux.to(self.compute_device) for _ in range(n_rfn)
        ]
        self.optimizer_aux = [
            optim.Adam(m.parameters(), lr=self.learning_rate_aux)
            for m in self.model_aux
        ]
        self.aux_train_steps = aux_train_steps
        checkpointing.load_checkpoint(self.logdir, self, aux=True)
        print(self.model_aux)
        self.exp = env_type
        skip_vae_training = self.num_steps >= (aux_train_steps - 1)
        print('loaded to {} steps'.format(self.num_steps))
        print('Final train step is {} steps'.format(aux_train_steps))

        self.z_dim = z_dim
        self.state_encoder = None
        self.n_random_reward_fns = n_rfn
        self.is_random_projection = random_projection
        self.random_buffer_size = buf_size
        self.train_encoder_epochs = vae_epochs

        if not self.is_random_projection and not skip_vae_training:
            self.state_encoder = [
                self.train_state_encoder(envs=self.training_envs)
                for _ in range(n_rfn)
            ]
        if random_projection:
            self.state_encoder = [None for _ in range(n_rfn)]

        for model in self.model_aux:
            model.register_reward_function(
                dim=self.z_dim,
                projection=self.is_random_projection,
                device=self.compute_device)
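Every example above calls a load_kwargs helper to copy configuration keyword arguments onto the instance (hence the `assert self.training_envs is not None` checks that follow it). The helper itself is not shown; a minimal hypothetical sketch consistent with that usage, assuming the class declares its defaults (such as `training_envs = None`) as class attributes, would be:

def load_kwargs(obj, kwargs):
    """Hypothetical sketch: set recognized keyword arguments as attributes.

    Assumes defaults like ``training_envs`` or ``learning_rate`` are declared
    on the class, so unknown keys can be rejected early.
    """
    for key, value in kwargs.items():
        if not key.startswith('_') and hasattr(obj, key):
            setattr(obj, key, value)
        else:
            raise ValueError("unrecognized parameter: {!r}".format(key))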