예제 #1
0
    def __init__(self, obs_dim, act_dim, env, memory_size=50000, batch_size=64,
                 lr_critic=1e-4, lr_actor=1e-4, gamma=0.99, tau=0.001, n_steps=1):
        """Create the actor/critic networks, their targets, optimizers, noise and replay buffer.

        Args:
            obs_dim: Observation size; used as the actor's ``input_size`` and
                the critic's ``state_size``.
            act_dim: Action size; used as the actor's ``output_size``, the
                critic's ``action_size`` and the noise ``dimension``.
            env: Environment object; stored and handed to the replay buffer.
            memory_size: Capacity passed to the replay buffer.
            batch_size: Stored on the instance for later use.
            lr_critic: Learning rate of the critic's Adam optimizer.
            lr_actor: Learning rate of the actor's Adam optimizer.
            gamma: Discount factor; also used to precompute ``gamma ** n_steps``.
            tau: Stored on the instance (presumably the target-network
                soft-update rate -- confirm against the update code).
            n_steps: Number of steps used for the ``n_step_gamma`` exponent.
        """
        # Plain hyper-parameters and handles.
        self.env = env
        self.obs_dim = obs_dim
        self.act_dim = act_dim
        self.memory_size = memory_size
        self.batch_size = batch_size
        self.tau = tau
        self.gamma = gamma
        self.n_steps = n_steps
        # Discount applied across an n-step transition.
        self.n_step_gamma = self.gamma ** self.n_steps

        # Actor and its target; the target starts as an exact copy.
        actor_kwargs = dict(input_size=obs_dim, output_size=act_dim)
        self.actor = actor(**actor_kwargs)
        self.actor_target = actor(**actor_kwargs)
        self.actor_target.load_state_dict(self.actor.state_dict())

        # Critic (single scalar output) and its target, initialised the same way.
        critic_kwargs = dict(state_size=obs_dim, action_size=act_dim, output_size=1)
        self.critic = critic(**critic_kwargs)
        self.critic_target = critic(**critic_kwargs)
        self.critic_target.load_state_dict(self.critic.state_dict())

        # Only the online networks are optimised; the targets have no optimizer here.
        self.optimizer_actor = optim.Adam(self.actor.parameters(), lr=lr_actor)
        self.optimizer_critic = optim.Adam(self.critic.parameters(), lr=lr_critic)

        # The critic is trained with a mean-squared-error loss.
        self.critic_loss = nn.MSELoss()

        # Exploration noise. Alternative previously used here:
        # OrnsteinUhlenbeckProcess(dimension=act_dim, num_steps=5000)
        self.noise = GaussianNoise(dimension=act_dim, num_epochs=5000)

        # Replay buffer. Alternative previously used here:
        # Replay(self.memory_size, window_length=1)
        # NOTE(review): n_steps is not forwarded to the buffer -- confirm that
        # n-step returns are assembled elsewhere.
        self.replayBuffer = Replay(self.memory_size, self.env)
        critic_t.load_state_dict(critic.state_dict())

    else:
        critic = Critic(state_dim, action_dim, max_action, args)
        critic_t = Critic(state_dim, action_dim, max_action, args)
        critic_t.load_state_dict(critic.state_dict())

    print("OK 3")
    # actor
    actor = Actor(state_dim, action_dim, max_action, args)
    actor_t = Actor(state_dim, action_dim, max_action, args)
    actor_t.load_state_dict(actor.state_dict())

    # action noise
    if not args.ou_noise:
        a_noise = GaussianNoise(action_dim, sigma=args.gauss_sigma)
    else:
        a_noise = OrnsteinUhlenbeckProcess(action_dim,
                                           mu=args.ou_mu,
                                           theta=args.ou_theta,
                                           sigma=args.ou_sigma)

    if USE_CUDA:
        critic.cuda()
        critic_t.cuda()
        actor.cuda()
        actor_t.cuda()

    print("OK 4")
    # CEM
    es = sepCEM(actor.get_size(),
예제 #3
0
    def __init__(self, obs_dim, act_dim, env=None, memory_size=50000, batch_size=64,
                 lr_critic=1e-4, lr_actor=1e-4, gamma=0.99, tau=0.001, prioritized_replay=True,
                 critic_dist_info=None, n_steps=1):
        """Create an actor-critic agent whose critic outputs a value distribution.

        Args:
            obs_dim: Observation size; used as the actor's ``input_size`` and
                the critic's ``state_size``.
            act_dim: Action size; used as the actor's ``output_size``, the
                critic's ``action_size`` and the noise ``dimension``.
            env: Environment object; only handed to ``Replay`` when
                ``prioritized_replay`` is false.
            memory_size: Capacity passed to the replay buffer.
            batch_size: Stored on the instance for later use.
            lr_critic: Learning rate of the critic's Adam optimizer.
            lr_actor: Learning rate of the actor's Adam optimizer.
            gamma: Discount factor; also used to precompute ``gamma ** n_steps``.
            tau: Stored on the instance (presumably the target-network
                soft-update rate -- confirm against the update code).
            prioritized_replay: If true, use ``PrioritizedReplayBuffer`` with a
                linear beta schedule; otherwise use ``Replay``.
            critic_dist_info: Required dict describing the critic's output
                distribution. ``'type'`` must be ``'categorical'`` (which also
                needs ``'v_min'``, ``'v_max'`` and ``'n_atoms'``) or
                ``'mixture_of_gaussian'`` (no extra setup implemented yet).
            n_steps: Number of steps used for the ``n_step_gamma`` exponent and
                forwarded to ``Replay``.

        Raises:
            TypeError: If ``critic_dist_info`` is None.
            ValueError: If ``critic_dist_info['type']`` is not a supported type.
        """
        # The signature defaults to None, but the distribution description is
        # mandatory; fail with a clear message instead of an opaque
        # "'NoneType' object is not subscriptable".
        if critic_dist_info is None:
            raise TypeError(
                "critic_dist_info must be a dict describing the critic output "
                "distribution (with at least a 'type' key), got None")

        self.gamma = gamma
        self.n_steps = n_steps
        # Discount applied across an n-step transition.
        self.n_step_gamma = self.gamma ** self.n_steps
        self.batch_size = batch_size
        self.obs_dim = obs_dim
        self.act_dim = act_dim
        self.memory_size = memory_size
        self.tau = tau
        self.env = env

        dist_type = critic_dist_info['type']
        self.dist_type = dist_type
        if dist_type == 'categorical':
            self.v_min = critic_dist_info['v_min']
            self.v_max = critic_dist_info['v_max']
            self.n_atoms = critic_dist_info['n_atoms']
            # Spacing between neighbouring atoms of the value support.
            self.delta = (self.v_max - self.v_min) / float(self.n_atoms - 1)
            # Column vector (n_atoms, 1) of evenly spaced values from v_min to v_max.
            self.bin_centers = np.array([
                self.v_min + i * self.delta for i in range(self.n_atoms)
            ]).reshape(-1, 1)
        elif dist_type == 'mixture_of_gaussian':
            # TODO: no mixture-specific attributes are set up yet.
            pass
        else:
            # Previously this only printed a message and carried on with a
            # half-configured agent; fail fast instead.
            raise ValueError(
                "Error: Unsupported distribution type: {!r}".format(dist_type))

        # Actor and its target; the target starts as an exact copy.
        self.actor = actor(input_size=obs_dim, output_size=act_dim)
        self.actor_target = actor(input_size=obs_dim, output_size=act_dim)
        self.actor_target.load_state_dict(self.actor.state_dict())

        # Distributional critic and its target, initialised the same way.
        self.critic = critic(state_size=obs_dim,
                             action_size=act_dim,
                             dist_info=critic_dist_info)
        self.critic_target = critic(state_size=obs_dim,
                                    action_size=act_dim,
                                    dist_info=critic_dist_info)
        self.critic_target.load_state_dict(self.critic.state_dict())

        # Only the online networks are optimised.
        self.optimizer_actor = optim.Adam(self.actor.parameters(), lr=lr_actor)
        self.optimizer_critic = optim.Adam(self.critic.parameters(),
                                           lr=lr_critic)

        # Critic loss.
        self.critic_loss = nn.CrossEntropyLoss()

        # Exploration noise. Alternative previously used here:
        # OrnsteinUhlenbeckProcess(dimension=act_dim, num_steps=5000)
        self.noise = GaussianNoise(dimension=act_dim, num_epochs=5000)

        # Replay buffer.
        self.prioritized_replay = prioritized_replay
        if self.prioritized_replay:
            # OpenAI prioritized replay memory.
            # NOTE(review): unlike Replay below, this buffer is not given
            # n_steps/gamma -- confirm n-step returns are handled elsewhere.
            self.replayBuffer = PrioritizedReplayBuffer(self.memory_size,
                                                        alpha=0.6)
            prioritized_replay_beta0 = 0.4  # type: float
            prioritized_replay_beta_iters = 100000
            # Beta goes linearly from beta0 to 1.0 over the given number of iterations.
            self.beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                                initial_p=prioritized_replay_beta0,
                                                final_p=1.0)
            self.prioritized_replay_eps = 1e-6
        else:
            # Self-implemented memory buffer.
            self.replayBuffer = Replay(
                self.memory_size,
                self.env,
                n_steps=self.n_steps,
                gamma=self.gamma)