Example #1
    def initial_phase_training(self, max_epochs=1000, sup_batch_size=64):
        # change optimizer to Adam for unsupervised learning
        self.action_rep.optim = torch.optim.Adam(self.action_rep.parameters(),
                                                 lr=1e-3)
        initial_losses = []

        print("Inital training phase started...")
        for counter in range(max_epochs):
            losses = []
            states, actions, rewards, next_states, terminals = self.replay_memory.sample(
                sup_batch_size, random_machine=self.np_random)
            states = torch.from_numpy(states).to(device)
            actions_combined = torch.from_numpy(actions).to(device)
            # separate the discrete action index from the continuous action parameters
            action = actions_combined[:, 0].long()
            action_para = actions_combined[:, self.num_actions + 1:]
            next_states = torch.from_numpy(next_states).to(device)

            loss = self.self_supervised_update(states, action, action_para,
                                               next_states)
            losses.append(loss)

            initial_losses.append(np.mean(losses))
            if counter % 1 == 0:
                print("Epoch {} loss:: {}".format(
                    counter, np.mean(initial_losses[-10:])))

            # Terminate initial phase once action representations have converged.
            # if len(initial_losses) >= 20 and np.mean(initial_losses[-10:]) + 1e-5 >= np.mean(initial_losses[-20:]):
            #     print("Converged...")
            #     break

        print('... Initial training phase terminated!')
        self.initial_phase = False
        hard_update_target_network(self.action_rep, self.target_action_rep)
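Every example on this page ends by calling hard_update_target_network to synchronise a target network with its online counterpart. The helper itself is not shown here; the following is a minimal sketch of the usual implementation, assuming it simply copies every parameter from the source module into the target module (matching the source-first argument order used above).

import torch.nn as nn


def hard_update_target_network(source: nn.Module, target: nn.Module) -> None:
    # Copy every parameter of the online (source) network into the target
    # network, so both networks hold identical weights afterwards.
    for target_param, source_param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(source_param.data)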
Example #2
    def __init__(self,
                 *args,
                 **kwargs):
        super().__init__(*args, **kwargs)
        self.actor = MultiPassQActor(self.observation_space.shape[0], self.num_actions, self.action_parameter_sizes,
                                     **kwargs['actor_kwargs']).to(device)
        self.actor_target = MultiPassQActor(self.observation_space.shape[0], self.num_actions, self.action_parameter_sizes,
                                            **kwargs['actor_kwargs']).to(device)
        hard_update_target_network(self.actor, self.actor_target)
        self.actor_target.eval()
        self.actor_optimiser = optim.Adam(self.actor.parameters(), lr=self.learning_rate_actor)
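Because this subclass forwards kwargs['actor_kwargs'] directly, that keyword must be supplied when the agent is constructed. A hypothetical construction call is sketched below; the MultiPassPDQNAgent class name, the env object, and the hidden-layer keyword are illustrative assumptions, not taken from this page.

# Hypothetical usage (class name, spaces and actor keyword arguments are assumptions):
agent = MultiPassPDQNAgent(
    observation_space=env.observation_space,
    action_space=env.action_space,
    actor_kwargs={'hidden_layers': (128, 64)},
    actor_param_kwargs={'hidden_layers': (128, 64)},
    seed=1)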
Example #3
    def set_action_parameter_passthrough_weights(self, initial_weights, initial_bias=None):
        passthrough_layer = self.actor_param.action_parameters_passthrough_layer

        assert initial_weights.shape == passthrough_layer.weight.data.size()
        passthrough_layer.weight.data = torch.Tensor(initial_weights).float().to(self.device)
        if initial_bias is not None:

            assert initial_bias.shape == passthrough_layer.bias.data.size()
            passthrough_layer.bias.data = torch.Tensor(initial_bias).float().to(self.device)
        passthrough_layer.requires_grad = False
        passthrough_layer.weight.requires_grad = False
        passthrough_layer.bias.requires_grad = False
        hard_update_target_network(self.actor_param, self.actor_param_target)
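A hypothetical call to this method is sketched below, assuming an agent built as in Example #4 and assuming the pass-through layer is a Linear layer mapping the state to the concatenated action parameters, so its weight has shape (action_parameter_size, state_dim); the zero initialisation is purely illustrative.

import numpy as np

# Hypothetical usage: initialise the pass-through layer so that, initially,
# the predicted action parameters do not depend on the state.
state_dim = agent.observation_space.shape[0]
param_dim = agent.action_parameter_size
initial_weights = np.zeros((param_dim, state_dim), dtype=np.float32)
initial_bias = np.zeros(param_dim, dtype=np.float32)  # e.g. mid-range parameter values
agent.set_action_parameter_passthrough_weights(initial_weights, initial_bias)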
Example #4
    def __init__(
            self,
            observation_space,
            action_space,
            actor_class=QActor,
            actor_kwargs={},
            actor_param_class=ParamActor,
            actor_param_kwargs={},
            epsilon_initial=1.0,
            epsilon_final=0.05,
            epsilon_steps=10000,
            batch_size=64,
            gamma=0.99,
            tau_actor=0.01,  # Polyak averaging factor for copying target weights
            tau_actor_param=0.001,
            replay_memory_size=1000000,
            learning_rate_actor=0.0001,
            learning_rate_actor_param=0.00001,
            initial_memory_threshold=0,
            use_ornstein_noise=False,  # if false, uses epsilon-greedy with uniform-random action-parameter exploration
            loss_func=F.mse_loss,  # F.mse_loss
            clip_grad=10,
            inverting_gradients=False,
            zero_index_gradients=False,
            indexed=False,
            weighted=False,
            average=False,
            random_weighted=False,
            device="cuda" if torch.cuda.is_available() else "cpu",
            seed=None):
        super(PDQNAgent, self).__init__(observation_space, action_space)
        self.device = torch.device(device)
        self.num_actions = self.action_space.spaces[0].n
        self.action_parameter_sizes = np.array([
            self.action_space.spaces[i].shape[0]
            for i in range(1, self.num_actions + 1)
        ])
        self.action_parameter_size = int(self.action_parameter_sizes.sum())
        self.action_max = torch.from_numpy(np.ones(
            (self.num_actions, ))).float().to(device)
        self.action_min = -self.action_max.detach()
        self.action_range = (self.action_max - self.action_min).detach()
        print([
            self.action_space.spaces[i].high
            for i in range(1, self.num_actions + 1)
        ])
        self.action_parameter_max_numpy = np.concatenate([
            self.action_space.spaces[i].high
            for i in range(1, self.num_actions + 1)
        ]).ravel()
        self.action_parameter_min_numpy = np.concatenate([
            self.action_space.spaces[i].low
            for i in range(1, self.num_actions + 1)
        ]).ravel()
        self.action_parameter_range_numpy = (self.action_parameter_max_numpy -
                                             self.action_parameter_min_numpy)
        self.action_parameter_max = torch.from_numpy(
            self.action_parameter_max_numpy).float().to(device)
        self.action_parameter_min = torch.from_numpy(
            self.action_parameter_min_numpy).float().to(device)
        self.action_parameter_range = torch.from_numpy(
            self.action_parameter_range_numpy).float().to(device)
        self.epsilon = epsilon_initial
        self.epsilon_initial = epsilon_initial
        self.epsilon_final = epsilon_final
        self.epsilon_steps = epsilon_steps
        self.indexed = indexed
        self.weighted = weighted
        self.average = average
        self.random_weighted = random_weighted
        assert (weighted ^ average ^ random_weighted
                ) or not (weighted or average or random_weighted)

        self.action_parameter_offsets = self.action_parameter_sizes.cumsum()
        self.action_parameter_offsets = np.insert(
            self.action_parameter_offsets, 0, 0)

        self.batch_size = batch_size
        self.gamma = gamma
        self.replay_memory_size = replay_memory_size
        self.initial_memory_threshold = initial_memory_threshold
        self.learning_rate_actor = learning_rate_actor
        self.learning_rate_actor_param = learning_rate_actor_param
        self.inverting_gradients = inverting_gradients
        self.tau_actor = tau_actor
        self.tau_actor_param = tau_actor_param
        self._step = 0
        self._episode = 0
        self.updates = 0
        self.clip_grad = clip_grad
        self.zero_index_gradients = zero_index_gradients

        self.np_random = None
        self.seed = seed
        self._seed(seed)

        self.use_ornstein_noise = use_ornstein_noise
        self.noise = OrnsteinUhlenbeckActionNoise(
            self.action_parameter_size,
            random_machine=self.np_random,
            mu=0.,
            theta=0.15,
            sigma=0.0001)  #, theta=0.01, sigma=0.01)

        print(self.num_actions + self.action_parameter_size)
        self.replay_memory = Memory(replay_memory_size,
                                    observation_space.shape,
                                    (1 + self.action_parameter_size, ),
                                    next_actions=False)
        self.actor = actor_class(self.observation_space.shape[0],
                                 self.num_actions, self.action_parameter_size,
                                 **actor_kwargs).to(device)
        self.actor_target = actor_class(self.observation_space.shape[0],
                                        self.num_actions,
                                        self.action_parameter_size,
                                        **actor_kwargs).to(device)
        hard_update_target_network(self.actor, self.actor_target)
        self.actor_target.eval()

        self.actor_param = actor_param_class(self.observation_space.shape[0],
                                             self.num_actions,
                                             self.action_parameter_size,
                                             **actor_param_kwargs).to(device)
        self.actor_param_target = actor_param_class(
            self.observation_space.shape[0], self.num_actions,
            self.action_parameter_size, **actor_param_kwargs).to(device)
        hard_update_target_network(self.actor_param, self.actor_param_target)
        self.actor_param_target.eval()

        self.loss_func = loss_func  # l1_smooth_loss performs better but original paper used MSE

        # Original DDPG paper [Lillicrap et al. 2016] used a weight decay of 0.01 for Q (critic)
        # but setting weight_decay=0.01 on the critic_optimiser seems to perform worse...
        # using AMSgrad ("fixed" version of Adam, amsgrad=True) doesn't seem to help either...
        self.actor_optimiser = optim.Adam(
            self.actor.parameters(),
            lr=self.learning_rate_actor)  #, betas=(0.95, 0.999))
        self.actor_param_optimiser = optim.Adam(
            self.actor_param.parameters(), lr=self.learning_rate_actor_param
        )  #, betas=(0.95, 0.999)) #, weight_decay=critic_l2_reg)
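The tau_actor and tau_actor_param factors above are only used during soft target updates, which are not shown on this page. A minimal sketch of a Polyak-averaging helper in the usual DDPG form, theta_target <- tau * theta + (1 - tau) * theta_target, using the same source-first argument convention as hard_update_target_network:

import torch.nn as nn


def soft_update_target_network(source: nn.Module, target: nn.Module, tau: float) -> None:
    # Move each target parameter a fraction tau towards its online counterpart.
    for target_param, source_param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(tau * source_param.data + (1.0 - tau) * target_param.data)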
Example #5
    def __init__(
            self,
            observation_space,
            action_space,
            actor_class=Actor,
            reduced_action_dim=3,
            parameter_action_dim=4,
            actor_kwargs={},
            critic_class=Critic,
            critic_kwargs={},
            epsilon_initial=1.0,
            epsilon_final=0.01,
            epsilon_steps=10000,
            batch_size=64,
            gamma=0.99,
            beta=0.5,  # averaging factor between off-policy and on-policy targets during n-step updates
            tau_actor=0.001,  # Polyak averaging factor for updating target weights
            tau_critic=0.001,
            replay_memory=None,  # memory buffer object
            replay_memory_size=1000000,
            learning_rate_actor=0.00001,
            learning_rate_critic=0.001,
            initial_memory_threshold=0,
            clip_grad=10,
            adam_betas=(0.95, 0.999),
            use_ornstein_noise=False,  # if false, uses epsilon-greedy with uniform-random action-parameter exploration
            loss_func=F.mse_loss,  # F.smooth_l1_loss
            inverting_gradients=False,
            n_step_returns=False,
            initial_phase=True,
            embed_lr=1e-4,
            initial_phase_epochs=2000,
            seed=None):
        super(PADDPGAgent, self).__init__(observation_space, action_space)

        self.num_actions = self.action_space.spaces[0].n
        self.action_parameter_sizes = np.array([
            self.action_space.spaces[i].shape[0]
            for i in range(1, self.num_actions + 1)
        ])
        self.action_parameter_size = int(self.action_parameter_sizes.sum())
        self.action_max = torch.from_numpy(np.ones(
            (self.num_actions, ))).float().to(device)
        self.action_min = -self.action_max.detach()
        self.action_range = (self.action_max - self.action_min).detach()
        self.action_parameter_max_numpy = np.concatenate([
            self.action_space.spaces[i].high
            for i in range(1, self.num_actions + 1)
        ]).ravel()
        self.action_parameter_min_numpy = np.concatenate([
            self.action_space.spaces[i].low
            for i in range(1, self.num_actions + 1)
        ]).ravel()
        self.action_parameter_range_numpy = (self.action_parameter_max_numpy -
                                             self.action_parameter_min_numpy)
        self.action_parameter_max = torch.from_numpy(
            self.action_parameter_max_numpy).float().to(device)
        self.action_parameter_min = torch.from_numpy(
            self.action_parameter_min_numpy).float().to(device)
        self.action_parameter_range = torch.from_numpy(
            self.action_parameter_range_numpy).float().to(device)

        self.epsilon = epsilon_initial
        self.epsilon_initial = epsilon_initial
        self.epsilon_final = epsilon_final
        self.epsilon_steps = epsilon_steps

        self.clip_grad = clip_grad
        self.batch_size = batch_size
        self.gamma = gamma
        self.beta = beta
        self.replay_memory_size = replay_memory_size
        self.initial_memory_threshold = initial_memory_threshold
        self.learning_rate_actor = learning_rate_actor
        self.learning_rate_critic = learning_rate_critic
        self.inverting_gradients = inverting_gradients
        self.tau_actor = tau_actor
        self.tau_critic = tau_critic
        self._step = 0
        self._episode = 0
        self.updates = 0

        self.np_random = None
        self.seed = seed
        self._seed(seed)

        # initial embedding (action-representation) setup
        self.action_rep = ActionRepresentation.Action_representation(
            state_dim=self.observation_space.shape[0],
            action_dim=self.num_actions,
            reduced_action_dim=self.num_actions,
            parameter_action_dim=self.action_parameter_size)
        self.target_action_rep = ActionRepresentation.Action_representation(
            state_dim=self.observation_space.shape[0],
            action_dim=self.num_actions,
            reduced_action_dim=self.num_actions,
            parameter_action_dim=self.action_parameter_size)
        hard_update_target_network(self.action_rep, self.target_action_rep)
        self.initial_phase = initial_phase
        self.reduced_action_dim = reduced_action_dim
        self.parameter_action_dim = parameter_action_dim
        self.embed_lr = embed_lr
        self.initial_phase_epochs = initial_phase_epochs

        self.use_ornstein_noise = use_ornstein_noise
        self.noise = OrnsteinUhlenbeckActionNoise(
            self.action_parameter_size,
            random_machine=self.np_random,
            mu=0.,
            theta=0.15,
            sigma=0.0001)
        self.noise1 = OrnsteinUhlenbeckActionNoise(self.num_actions)
        print(self.num_actions + self.action_parameter_size)
        self.n_step_returns = n_step_returns
        if replay_memory is None:
            self.replay_memory = MemoryNStepReturns(
                replay_memory_size,
                observation_space.shape,
                (1 + self.num_actions + self.action_parameter_size, ),
                next_actions=False,
                n_step_returns=self.n_step_returns)
        else:
            self.replay_memory = replay_memory
        self.actor = actor_class(self.observation_space.shape[0],
                                 self.num_actions, self.action_parameter_size,
                                 **actor_kwargs).to(device)
        self.actor_target = actor_class(self.observation_space.shape[0],
                                        self.num_actions,
                                        self.action_parameter_size,
                                        **actor_kwargs).to(device)
        hard_update_target_network(self.actor, self.actor_target)
        self.actor_target.eval()

        self.critic = critic_class(self.observation_space.shape[0],
                                   self.num_actions,
                                   self.action_parameter_size,
                                   **critic_kwargs).to(device)
        self.critic_target = critic_class(self.observation_space.shape[0],
                                          self.num_actions,
                                          self.action_parameter_size,
                                          **critic_kwargs).to(device)
        hard_update_target_network(self.critic, self.critic_target)
        self.critic_target.eval()

        self.loss_func = loss_func  # l1_smooth_loss performs better but original paper used MSE

        self.actor_optimiser = optim.Adam(self.actor.parameters(),
                                          lr=self.learning_rate_actor,
                                          betas=adam_betas)
        self.critic_optimiser = optim.Adam(self.critic.parameters(),
                                           lr=self.learning_rate_critic,
                                           betas=adam_betas)
        self.action_rep_optimiser = optim.SGD(self.action_rep.parameters(),
                                              lr=self.embed_lr)
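Example #1 and Example #5 belong to the same agent: the constructor sets initial_phase, initial_phase_epochs and the SGD optimiser for the action representation, while initial_phase_training switches to Adam and fits the embedding before regular RL updates begin. A hypothetical driver showing that call order (the surrounding environment interaction is an assumption, not shown on this page):

# Hypothetical call order: the replay memory must already hold enough
# transitions before the self-supervised embedding phase is run.
if agent.initial_phase:
    agent.initial_phase_training(max_epochs=agent.initial_phase_epochs,
                                 sup_batch_size=agent.batch_size)
# ...regular actor/critic updates follow once agent.initial_phase is False.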