def __init__(self, obs_dim, act_dim, env, memory_size=50000, batch_size=64,
             lr_critic=1e-4, lr_actor=1e-4, gamma=0.99, tau=0.001, n_steps=1):
    """Create the actor/critic networks, optimizers, noise source and replay memory.

    Args:
        obs_dim: passed to ``actor`` as ``input_size`` and to ``critic`` as
            ``state_size``.
        act_dim: passed to ``actor`` as ``output_size``, to ``critic`` as
            ``action_size`` and to the noise process as ``dimension``.
        env: stored on the instance and handed to ``Replay``.
        memory_size: first argument given to ``Replay`` (presumably its
            capacity -- confirm against ``Replay``).
        batch_size: stored on the instance; not used in this method.
        lr_critic: learning rate of the critic's Adam optimizer.
        lr_actor: learning rate of the actor's Adam optimizer.
        gamma: stored on the instance; ``gamma ** n_steps`` is cached as
            ``n_step_gamma``.
        tau: stored on the instance; not used in this method.
        n_steps: stored on the instance; exponent used for ``n_step_gamma``.
    """
    # Plain configuration values.
    self.env = env
    self.obs_dim = obs_dim
    self.act_dim = act_dim
    self.memory_size = memory_size
    self.batch_size = batch_size
    self.tau = tau
    self.gamma = gamma
    self.n_steps = n_steps
    self.n_step_gamma = gamma ** n_steps

    # Actor and its target copy; the target starts with identical weights.
    actor_kwargs = dict(input_size=obs_dim, output_size=act_dim)
    self.actor = actor(**actor_kwargs)
    self.actor_target = actor(**actor_kwargs)
    self.actor_target.load_state_dict(self.actor.state_dict())

    # Critic and its target copy, synchronised the same way.
    critic_kwargs = dict(state_size=obs_dim, action_size=act_dim, output_size=1)
    self.critic = critic(**critic_kwargs)
    self.critic_target = critic(**critic_kwargs)
    self.critic_target.load_state_dict(self.critic.state_dict())

    # One Adam optimizer per trainable network (targets get none here).
    self.optimizer_actor = optim.Adam(self.actor.parameters(), lr=lr_actor)
    self.optimizer_critic = optim.Adam(self.critic.parameters(), lr=lr_critic)

    # The critic is trained with a mean-squared-error loss.
    self.critic_loss = nn.MSELoss()

    # Exploration noise.
    # Alternative: OrnsteinUhlenbeckProcess(dimension=act_dim, num_steps=5000)
    self.noise = GaussianNoise(dimension=act_dim, num_epochs=5000)

    # Replay memory.
    # Alternative: Replay(self.memory_size, window_length=1)
    self.replayBuffer = Replay(memory_size, env)
critic_t.load_state_dict(critic.state_dict()) else: critic = Critic(state_dim, action_dim, max_action, args) critic_t = Critic(state_dim, action_dim, max_action, args) critic_t.load_state_dict(critic.state_dict()) print("OK 3") # actor actor = Actor(state_dim, action_dim, max_action, args) actor_t = Actor(state_dim, action_dim, max_action, args) actor_t.load_state_dict(actor.state_dict()) # action noise if not args.ou_noise: a_noise = GaussianNoise(action_dim, sigma=args.gauss_sigma) else: a_noise = OrnsteinUhlenbeckProcess(action_dim, mu=args.ou_mu, theta=args.ou_theta, sigma=args.ou_sigma) if USE_CUDA: critic.cuda() critic_t.cuda() actor.cuda() actor_t.cuda() print("OK 4") # CEM es = sepCEM(actor.get_size(),
def __init__(self, obs_dim, act_dim, env=None, memory_size=50000, batch_size=64,
             lr_critic=1e-4, lr_actor=1e-4, gamma=0.99, tau=0.001, prioritized_replay=True,
             critic_dist_info=None, n_steps=1):
    """Create the networks, optimizers, noise source and replay memory.

    Args:
        obs_dim: passed to ``actor`` as ``input_size`` and to ``critic`` as
            ``state_size``.
        act_dim: passed to ``actor`` as ``output_size``, to ``critic`` as
            ``action_size`` and to the noise process as ``dimension``.
        env: stored on the instance; handed to ``Replay`` when
            ``prioritized_replay`` is false.
        memory_size: first argument given to the replay buffer (presumably
            its capacity -- confirm against the buffer classes).
        batch_size: stored on the instance; not used in this method.
        lr_critic: learning rate of the critic's Adam optimizer.
        lr_actor: learning rate of the actor's Adam optimizer.
        gamma: stored on the instance; ``gamma ** n_steps`` is cached as
            ``n_step_gamma``.
        tau: stored on the instance; not used in this method.
        prioritized_replay: if true, use ``PrioritizedReplayBuffer`` together
            with a ``LinearSchedule`` stored as ``beta_schedule``; otherwise
            use ``Replay``.
        critic_dist_info: dictionary describing the critic's output
            distribution. Required despite the ``None`` default. Its
            ``'type'`` entry must be ``'categorical'`` or
            ``'mixture_of_gaussian'``. For ``'categorical'`` the entries
            ``'v_min'``, ``'v_max'`` and ``'n_atoms'`` are also read.
        n_steps: stored on the instance; exponent used for ``n_step_gamma``
            and forwarded to ``Replay``.

    Raises:
        ValueError: if ``critic_dist_info`` is ``None`` or names an
            unsupported distribution type.
    """
    # Fail early with a clear message instead of the opaque
    # "'NoneType' object is not subscriptable" the default used to produce.
    if critic_dist_info is None:
        raise ValueError(
            "critic_dist_info is required: pass a dict whose 'type' is "
            "'categorical' or 'mixture_of_gaussian'"
        )

    self.gamma = gamma
    self.n_steps = n_steps
    self.n_step_gamma = self.gamma ** self.n_steps
    self.batch_size = batch_size
    self.obs_dim = obs_dim
    self.act_dim = act_dim
    self.memory_size = memory_size
    self.tau = tau
    self.env = env

    # Critic output distribution.
    self.dist_type = critic_dist_info['type']
    if self.dist_type == 'categorical':
        self.v_min = critic_dist_info['v_min']
        self.v_max = critic_dist_info['v_max']
        self.n_atoms = critic_dist_info['n_atoms']
        # Spacing between neighbouring atoms on [v_min, v_max].
        self.delta = (self.v_max - self.v_min) / float(self.n_atoms - 1)
        # Column vector of shape (n_atoms, 1) holding the atom positions.
        self.bin_centers = np.array(
            [self.v_min + i * self.delta for i in range(self.n_atoms)]
        ).reshape(-1, 1)
    elif self.dist_type == 'mixture_of_gaussian':
        # TODO: no mixture-specific attributes are set up yet.
        pass
    else:
        # Previously this only printed a message and carried on building
        # the networks with a distribution type nothing here understands.
        raise ValueError(
            "Unsupported distribution type: %r" % (self.dist_type,)
        )

    # Actor and its target copy; the target starts with identical weights.
    self.actor = actor(input_size=obs_dim, output_size=act_dim)
    self.actor_target = actor(input_size=obs_dim, output_size=act_dim)
    self.actor_target.load_state_dict(self.actor.state_dict())

    # Critic and its target copy, synchronised the same way.
    self.critic = critic(state_size=obs_dim, action_size=act_dim,
                         dist_info=critic_dist_info)
    self.critic_target = critic(state_size=obs_dim, action_size=act_dim,
                                dist_info=critic_dist_info)
    self.critic_target.load_state_dict(self.critic.state_dict())

    # One Adam optimizer per trainable network (targets get none here).
    self.optimizer_actor = optim.Adam(self.actor.parameters(), lr=lr_actor)
    self.optimizer_critic = optim.Adam(self.critic.parameters(), lr=lr_critic)

    # Critic loss.
    self.critic_loss = nn.CrossEntropyLoss()

    # Exploration noise.
    # Alternative: OrnsteinUhlenbeckProcess(dimension=act_dim, num_steps=5000)
    self.noise = GaussianNoise(dimension=act_dim, num_epochs=5000)

    # Replay buffer.
    self.prioritized_replay = prioritized_replay
    if self.prioritized_replay:
        # OpenAI prioritized replay memory.
        self.replayBuffer = PrioritizedReplayBuffer(self.memory_size, alpha=0.6)
        prioritized_replay_beta0 = 0.4
        prioritized_replay_beta_iters = 100000
        # Schedule running from 0.4 to 1.0 over 100000 steps (presumably
        # the importance-sampling exponent -- confirm where it is used).
        self.beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                            initial_p=prioritized_replay_beta0,
                                            final_p=1.0)
        self.prioritized_replay_eps = 1e-6
    else:
        # Self-implemented memory buffer.
        self.replayBuffer = Replay(self.memory_size, self.env,
                                   n_steps=self.n_steps, gamma=self.gamma)