def __init__(self, state_size, action_size, memory_size, batch_size=128,
             tan=0.001, actor_lr=0.001, critic_lr=0.001, epsilon=1.):
    self.state_size = state_size
    self.action_size = action_size
    self.batch_size = batch_size
    self.tan = tan  # soft-update coefficient (conventionally called "tau")
    self.warmup = WARM_UP
    self.epsilon = epsilon
    self.epsilon_decay = hyperparameters['D_EPSILON']

    self.actor = Actor(state_size, action_size)
    self.actor_target = Actor(state_size, action_size)
    self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=actor_lr)

    self.critic = Critic(state_size, action_size)
    self.critic_target = Critic(state_size, action_size)
    self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=critic_lr)

    self.memory = Memory(memory_size)
    self.criterion = nn.MSELoss()
    self.random_process = OrnsteinUhlenbeckProcess(size=action_size,
                                                   theta=0.15, mu=0., sigma=0.2)

    copy_parameter(self.actor, self.actor_target)
    copy_parameter(self.critic, self.critic_target)
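# The constructor above (and the hierarchical agent further down) calls a
# `copy_parameter` helper that is never shown in these snippets. A minimal
# sketch, assuming it simply hard-copies the source weights into the target
# network (argument order matching the calls: source first, target second):
def copy_parameter(src, dst):
    """Initialise `dst` with the parameters of `src` (hard target copy)."""
    dst.load_state_dict(src.state_dict())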
def __init__(self, nb_states, nb_actions, args):
    if args.seed > 0:
        self.seed(args.seed)

    self.nb_states = nb_states
    self.nb_actions = nb_actions

    actor_net_cfg = {
        'hidden1': 32,
        'hidden2': 32,
        'hidden3': 32,
        'init_w': args.init_w
    }
    critic_net_cfg = {
        'hidden1': 64,
        'hidden2': 64,
        'hidden3': 64,
        'init_w': args.init_w
    }

    self.actor = Actor(self.nb_states, self.nb_actions, **actor_net_cfg)
    self.actor_target = Actor(self.nb_states, self.nb_actions, **actor_net_cfg)
    self.actor_optim = Adam(self.actor.parameters(), lr=args.prate)

    self.critic = Critic(self.nb_states, self.nb_actions, **critic_net_cfg)
    self.critic_target = Critic(self.nb_states, self.nb_actions, **critic_net_cfg)
    self.critic_optim = Adam(self.critic.parameters(), lr=args.rate)

    hard_update(self.actor_target, self.actor)  # Make sure target is with the same weight
    hard_update(self.critic_target, self.critic)

    # Create replay buffer
    self.memory = SequentialMemory(limit=args.rmsize, window_length=args.window_length)
    self.random_process = OrnsteinUhlenbeckProcess(size=nb_actions, theta=args.ou_theta,
                                                   mu=args.ou_mu, sigma=args.ou_sigma)

    # Hyper-parameters
    self.batch_size = args.bsize
    self.tau = args.tau
    self.discount = args.discount
    self.depsilon = 1.0 / args.epsilon

    self.epsilon = 1.0
    self.s_t = None  # Most recent state
    self.a_t = None  # Most recent action
    self.is_training = True
    self.best_reward = -10
def __init__(self, env, mem_size=7 * int(1e3), lr_critic=1e-3, lr_actor=1e-4,
             epsilon=1., max_epi=1500, epsilon_decay=1. / (1e5), gamma=.99,
             target_update_frequency=200, batch_size=64, random_process=True,
             max_step=None):
    self.CUDA = torch.cuda.is_available()

    self.orig_env = env  # for recording
    if max_step is not None:
        self.orig_env._max_episode_steps = max_step
    self.env = self.orig_env
    self.N_S = self.env.observation_space.shape[0]
    self.N_A = self.env.action_space.shape[0]
    self.MAX_EPI = max_epi
    self.LOW = self.env.action_space.low
    self.HIGH = self.env.action_space.high

    self.actor = Actor(self.N_S, self.N_A)
    self.critic = Critic(self.N_S, self.N_A)
    self.target_actor = Actor(self.N_S, self.N_A)
    self.target_critic = Critic(self.N_S, self.N_A)
    self.target_actor.eval()
    self.target_critic.eval()
    self.target_actor.load_state_dict(self.actor.state_dict())
    self.target_critic.load_state_dict(self.critic.state_dict())

    if self.CUDA:
        self.actor.cuda()
        self.critic.cuda()
        self.target_actor.cuda()
        self.target_critic.cuda()

    self.exp = Experience(mem_size)

    self.optim_critic = optim.Adam(self.critic.parameters(), lr=lr_critic)
    # Note: the negative learning rate presumably pairs with an actor loss of +Q
    # so that the Adam step ascends Q. Recent PyTorch versions reject lr < 0, so
    # the usual alternative is a positive lr with an explicit loss of -Q(s, mu(s)).
    self.optim_actor = optim.Adam(self.actor.parameters(), lr=-lr_actor)

    self.random_process = OrnsteinUhlenbeckProcess(size=self.N_A, theta=.15, mu=0, sigma=.2)

    self.EPSILON = epsilon
    self.EPSILON_DECAY = epsilon_decay
    self.GAMMA = gamma
    self.TARGET_UPDATE_FREQUENCY = target_update_frequency
    self.BATCH_SIZE = batch_size

    title = {common.S_EPI: [], common.S_TOTAL_R: []}
    self.data = pd.DataFrame(title)
    self.RAND_PROC = random_process  # flag: whether to add exploration noise
class Predator: def __init__(self, s_dim, a_dim, num_agent, **kwargs): self.s_dim = s_dim self.a_dim = a_dim self.config = kwargs['config'] self.num_agent = num_agent self.actor = Actor(s_dim, a_dim) self.actor_target = Actor(s_dim, a_dim) self.critic = Critic(s_dim, a_dim, num_agent) self.critic_target = Critic(s_dim, a_dim, num_agent) self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=self.config.a_lr) self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=self.config.c_lr) self.a_loss = 0 self.c_loss = 0 if self.config.use_cuda: self.actor.cuda() self.actor_target.cuda() self.critic.cuda() self.critic_target.cuda() hard_update(self.actor, self.actor_target) hard_update(self.critic, self.critic_target) self.random_process = OrnsteinUhlenbeckProcess( size=self.a_dim, theta=self.config.ou_theta, mu=self.config.ou_mu, sigma=self.config.ou_sigma) def get_batches(self): experiences = random.sample(self.replay_buffer, self.batch_size) state_batches = np.array([_[0] for _ in experiences]) action_batches = np.array([_[1] for _ in experiences]) reward_batches = np.array([_[2] for _ in experiences]) next_state_batches = np.array([_[3] for _ in experiences]) done_batches = np.array([_[4] for _ in experiences]) return state_batches, action_batches, reward_batches, next_state_batches, done_batches def random_action(self): action = np.random.uniform(low=-1., high=1., size=(self.num_agent, self.a_dim)) return action def reset(self): self.random_process.reset_states()
def __init__(self, env, args): #(self, nb_states, nb_actions, args): if args.seed > 0: self.seed(args.seed) self.env = env self.nb_states = self.env.observation_space.shape[0] self.nb_actions = self.env.action_space.shape[0] # Create Actor and Critic Network net_cfg = { 'hidden1': args.hidden1, 'hidden2': args.hidden2, 'init_w': args.init_w } self.actor = Actor(self.nb_states, self.nb_actions, **net_cfg) self.actor_target = Actor(self.nb_states, self.nb_actions, **net_cfg) self.actor_optim = Adam(self.actor.parameters(), lr=args.prate) self.critic = Critic(self.nb_states, self.nb_actions, **net_cfg) self.critic_target = Critic(self.nb_states, self.nb_actions, **net_cfg) self.critic_optim = Adam(self.critic.parameters(), lr=args.rate) self.load_weights(args.output) hard_update(self.actor_target, self.actor) # Make sure target is with the same weight hard_update(self.critic_target, self.critic) #Create replay buffer self.memory = SequentialMemory(limit=args.rmsize, window_length=args.window_length) self.random_process = OrnsteinUhlenbeckProcess(size=self.nb_actions, theta=args.ou_theta, mu=args.ou_mu, sigma=args.ou_sigma) # Hyper-parameters self.batch_size = args.bsize self.tau = args.tau self.discount = args.discount self.depsilon = 1.0 / args.epsilon # self.epsilon = 1.0 self.s_t = None # Most recent state self.a_t = None # Most recent action self.is_training = True # if USE_CUDA: self.cuda()
def __init__(self, env, actor_model, critic_model, memory=10000, batch_size=64,
             gamma=0.99, tau=0.001, actor_lr=1e-4, critic_lr=1e-3, critic_decay=1e-2,
             ou_theta=0.15, ou_sigma=0.2, render=None, evaluate=None,
             save_path=None, save_every=10, render_every=10, train_per_step=True):
    self.env = env
    self.actor = actor_model
    self.actor_target = actor_model.clone()
    self.critic = critic_model
    self.critic_target = critic_model.clone()

    if use_cuda:
        for net in [self.actor, self.actor_target, self.critic, self.critic_target]:
            net.cuda()

    self.memory = ReplayMemory(memory)
    self.batch_size = batch_size
    self.gamma = gamma
    self.tau = tau
    self.random_process = OrnsteinUhlenbeckProcess(env.action_space.shape[0],
                                                   theta=ou_theta, sigma=ou_sigma)
    self.optim_critic = optim.Adam(self.critic.parameters(),
                                   lr=critic_lr, weight_decay=critic_decay)
    self.optim_actor = optim.Adam(self.actor.parameters(), lr=actor_lr)

    self.render = render
    self.render_every = render_every
    self.evaluate = evaluate
    self.save_path = save_path
    self.save_every = save_every
    self.train_per_step = train_per_step
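# The wrapper above expects the actor/critic models to expose a `clone()` method
# that builds an identical target network. That method is not shown here; a
# minimal sketch, assuming the models are ordinary nn.Module subclasses and that
# a deep copy is sufficient, is a small mixin like the following (name and
# approach are illustrative, not taken from the original code base):
import copy

class CloneableMixin:
    def clone(self):
        """Return an independent deep copy of this module (used as a target net)."""
        return copy.deepcopy(self)

# e.g. `class ActorModel(CloneableMixin, nn.Module): ...`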
def __init__(self, nb_states, nb_actions, args):
    if args.seed > 0:
        self.seed(args.seed)

    self.nb_states = nb_states
    self.nb_actions = nb_actions

    # Create Actor and Critic Network
    self.actor = Actor(self.nb_states, self.nb_actions, args.init_w)
    self.actor_target = Actor(self.nb_states, self.nb_actions, args.init_w)
    self.critic = Critic(self.nb_states, self.nb_actions, args.init_w)
    self.critic_target = Critic(self.nb_states, self.nb_actions, args.init_w)
    self.reward_predictor = Critic(self.nb_states, self.nb_actions, args.init_w)

    hard_update(self.actor_target, self.actor)  # Make sure target is with the same weight
    hard_update(self.critic_target, self.critic)

    # Create replay buffer
    self.random_process = OrnsteinUhlenbeckProcess(size=nb_actions, theta=args.ou_theta,
                                                   mu=args.ou_mu, sigma=args.ou_sigma)

    # Hyper-parameters
    self.batch_size = args.bsize
    self.trajectory_length = args.trajectory_length
    self.tau = args.tau
    self.discount = args.discount
    self.depsilon = 1.0 / args.epsilon

    self.epsilon = 1.0
    self.is_training = True

    if USE_CUDA:
        self.cuda()
def __init__(self, nb_states, nb_actions):
    self.nb_states = nb_states
    self.nb_actions = nb_actions

    # Create Actor and Critic Network
    self.actor = Actor(self.nb_states, self.nb_actions)
    self.actor_target = Actor(self.nb_states, self.nb_actions)
    self.actor_optim = Adam(self.actor.parameters(), lr=ACTOR_LR)

    self.critic = Critic(self.nb_states, self.nb_actions)
    self.critic_target = Critic(self.nb_states, self.nb_actions)
    self.critic_optim = Adam(self.critic.parameters(), lr=CRITIC_LR)

    hard_update(self.actor_target, self.actor)  # Make sure target is with the same weight
    hard_update(self.critic_target, self.critic)

    # Create replay buffer
    self.memory = SequentialMemory(limit=MEMORY_SIZE, window_length=HISTORY_LEN)
    self.random_process = OrnsteinUhlenbeckProcess(size=nb_actions, theta=OU_THETA,
                                                   mu=OU_MU, sigma=OU_SIGMA)

    # Hyper-parameters
    self.batch_size = BATCH_SIZE
    self.tau = TAU
    self.discount = GAMMA
    self.depsilon = 1.0 / DEPSILON

    self.epsilon = 1.0
    self.s_t = None  # Most recent state
    self.a_t = None  # Most recent action
    self.is_training = True

    if USE_CUDA:
        self.cuda()
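# Nearly every constructor in these examples initialises its target networks with
# `hard_update` and later blends them with `soft_update`, but the helpers
# themselves are never shown. A common minimal implementation is sketched below.
# Note that the snippets are not consistent about argument order: some call
# hard_update(target, source), others hard_update(source, target), so check the
# convention of the repository you are following.
def hard_update(target, source):
    """Copy source parameters into target (target <- source)."""
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(param.data)

def soft_update(target, source, tau):
    """Polyak averaging: target <- (1 - tau) * target + tau * source."""
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(target_param.data * (1.0 - tau) + param.data * tau)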
class Preyer: def __init__(self, s_dim, a_dim, **kwargs): self.s_dim = s_dim self.a_dim = a_dim self.config = kwargs['config'] self.device = 'cuda' if self.config.use_cuda else 'cpu' self.actor = Actor(s_dim, a_dim) self.actor_target = Actor(s_dim, a_dim) self.critic = Critic(s_dim, a_dim, 1) self.critic_target = Critic(s_dim, a_dim, 1) self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=self.config.a_lr) self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=self.config.c_lr) self.c_loss = 0 self.a_loss = 0 if self.config.use_cuda: self.actor.cuda() self.actor_target.cuda() self.critic.cuda() self.critic_target.cuda() hard_update(self.actor, self.actor_target) hard_update(self.critic, self.critic_target) self.random_process = OrnsteinUhlenbeckProcess( size=self.a_dim, theta=self.config.ou_theta, mu=self.config.ou_mu, sigma=self.config.ou_sigma) self.replay_buffer = list() self.epsilon = 1. self.depsilon = self.epsilon / self.config.epsilon_decay def memory(self, s, a, r, s_, done): self.replay_buffer.append((s, a, r, s_, done)) if len(self.replay_buffer) >= self.config.memory_length: self.replay_buffer.pop(0) def get_batches(self): experiences = random.sample(self.replay_buffer, self.config.batch_size) state_batches = np.array([_[0] for _ in experiences]) action_batches = np.array([_[1] for _ in experiences]) reward_batches = np.array([_[2] for _ in experiences]) next_state_batches = np.array([_[3] for _ in experiences]) done_batches = np.array([_[4] for _ in experiences]) return state_batches, action_batches, reward_batches, next_state_batches, done_batches def choose_action(self, s, noisy=True): if self.config.use_cuda: s = Variable(torch.cuda.FloatTensor(s)) else: s = Variable(torch.FloatTensor(s)) a = self.actor.forward(s).cpu().detach().numpy() if noisy: a += max(self.epsilon, 0.001) * self.random_process.sample() self.epsilon -= self.depsilon a = np.clip(a, -1., 1.) 
return np.array([a]) def random_action(self): action = np.random.uniform(low=-1., high=1., size=(1, self.a_dim)) return action def reset(self): self.random_process.reset_states() def train(self): state_batches, action_batches, reward_batches, next_state_batches, done_batches = self.get_batches( ) state_batches = Variable(torch.Tensor(state_batches).to(self.device)) action_batches = Variable( torch.Tensor(action_batches).reshape(-1, 1).to(self.device)) reward_batches = Variable( torch.Tensor(reward_batches).reshape(-1, 1).to(self.device)) next_state_batches = Variable( torch.Tensor(next_state_batches).to(self.device)) done_batches = Variable( torch.Tensor( (done_batches == False) * 1).reshape(-1, 1).to(self.device)) target_next_actions = self.actor_target.forward( next_state_batches).detach() target_next_q = self.critic_target.forward( next_state_batches, target_next_actions).detach() main_q = self.critic(state_batches, action_batches) # Critic Loss self.critic.zero_grad() baselines = reward_batches + done_batches * self.config.gamma * target_next_q loss_critic = torch.nn.MSELoss()(main_q, baselines) loss_critic.backward() self.critic_optimizer.step() # Actor Loss self.actor.zero_grad() clear_action_batches = self.actor.forward(state_batches) loss_actor = ( -self.critic.forward(state_batches, clear_action_batches)).mean() loss_actor.backward() self.actor_optimizer.step() # This is for logging self.c_loss = loss_critic.item() self.a_loss = loss_actor.item() soft_update(self.actor, self.actor_target, self.config.tau) soft_update(self.critic, self.critic_target, self.config.tau) def getLoss(self): return self.c_loss, self.a_loss
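# A rough interaction loop for the Preyer agent above. This is an illustrative
# sketch only: `env`, `MAX_EPISODES`, and `MAX_STEPS` are assumptions, and
# training is skipped until the replay buffer holds at least one batch.
agent = Preyer(s_dim, a_dim, config=config)
for episode in range(MAX_EPISODES):
    s = env.reset()
    agent.reset()
    for step in range(MAX_STEPS):
        a = agent.choose_action(s, noisy=True)
        s_, r, done, _ = env.step(a)
        agent.memory(s, a, r, s_, done)              # store the transition
        if len(agent.replay_buffer) >= config.batch_size:
            agent.train()                            # one critic + actor update per env step
        s = s_
        if done:
            break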
class Agent():
    def __init__(self, nb_states, nb_actions):
        # keep the sizes so act() can sample random actions
        self.nb_states = nb_states
        self.nb_actions = nb_actions

        self.critic = Critic(nb_states, nb_actions)        # Q
        self.critic_target = Critic(nb_states, nb_actions)
        self.actor = Actor(nb_states, nb_actions)          # policy mu
        self.actor_target = Actor(nb_states, nb_actions)
        hard_update(self.critic_target, self.critic)
        hard_update(self.actor_target, self.actor)

        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=0.001)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=0.0001)
        self.criterion = nn.MSELoss()
        self.random_process = OrnsteinUhlenbeckProcess(size=nb_actions, theta=0.15, mu=0, sigma=0.2)
        self.gamma = 0.99
        self.batch_size = 64

        if USE_CUDA:
            self.actor.cuda()
            self.actor_target.cuda()
            self.critic.cuda()
            self.critic_target.cuda()

    def act(self, obs, epsilon=0.1):  # epsilon -> tuning parameter
        if random.random() < epsilon:
            # choose a random action
            action = np.random.uniform(-1., 1., self.nb_actions)
            return action
        else:
            # the action is the output of the actor network + exploration noise
            action = self.actor(obs).cpu().data.numpy()
            action += self.random_process.sample()
            action = np.clip(action, -1., 1.)  # stay in the interval [-1, 1]
            return action

    def backward(self, transitions):
        # The passed-in `transitions` are ignored: as in the original snippet, a
        # fresh batch is drawn from a module-level replay `memory`. `Transition`
        # is assumed to be a namedtuple(state, action, next_state, reward, done),
        # and FLOAT a module-level tensor-type alias (e.g. torch.cuda.FloatTensor
        # if USE_CUDA else torch.FloatTensor).
        transitions = memory.sample(self.batch_size)
        batch = Transition(*zip(*transitions))

        state_batch = Variable(torch.cat(batch.state)).type(FLOAT)            # size 64 x 3
        action_batch = Variable(torch.cat(batch.action)).type(FLOAT)          # size 64
        next_state_batch = Variable(torch.cat(batch.next_state)).type(FLOAT)  # size 64 x 3
        reward_batch = Variable(torch.cat(batch.reward)).type(FLOAT)          # size 64
        done_batch = Variable(torch.cat(batch.done)).type(FLOAT)

        #### Q - CRITIC UPDATE ####
        # Q(s_t, a_t)
        action_batch.unsqueeze_(1)                                    # size 64 x 1
        state_action_value = self.critic(state_batch, action_batch)  # 64 x 1
        # a_{t+1} = mu_target(s_{t+1})
        next_action = self.actor_target(next_state_batch).detach()   # 64 x nb_actions
        # Q'(s_{t+1}, a_{t+1})
        next_state_action_value = self.critic_target(next_state_batch, next_action).detach()
        next_state_action_value.squeeze_()  # 64
        # mask that zeroes next-state values for terminal states
        # (the original applied np.logical_not to the tensor; 1 - done is equivalent here)
        mask = 1.0 - done_batch  # mask = 1, 1, 1, ...
        # Compare Q(s_t, a_t) with r_t + gamma * Q'(s_{t+1}, a_{t+1})
        expected_state_action_value = reward_batch + (self.gamma * next_state_action_value * mask)
        # Compute the loss (a Huber loss would also work:
        # loss = F.smooth_l1_loss(state_action_value, expected_state_action_value))
        loss = self.criterion(state_action_value, expected_state_action_value)
        # Optimize the critic by updating its weights with Adam
        self.critic_optimizer.zero_grad()
        loss.backward()
        self.critic_optimizer.step()

        #### mu - ACTOR UPDATE ####
        # a_t = mu(s_t)
        action = self.actor(state_batch)
        # J = E[Q(s_t, mu(s_t))] is to be maximised, so policy_loss = -J is minimised
        policy_loss = -self.critic(state_batch, action)
        policy_loss = policy_loss.mean()
        self.actor_optimizer.zero_grad()
        policy_loss.backward()
        self.actor_optimizer.step()

        #### update target networks with Polyak averaging
        soft_update(self.critic_target, self.critic, tau=0.001)
        soft_update(self.actor_target, self.actor, tau=0.001)
        return
class Agent(object): def __init__(self, nb_states, nb_actions, args): if args.seed > 0: self.seed(args.seed) self.nb_states = nb_states self.nb_actions = nb_actions # Create Actor and Critic Network self.actor = Actor(self.nb_states, self.nb_actions, args.init_w) self.actor_target = Actor(self.nb_states, self.nb_actions, args.init_w) self.critic = Critic(self.nb_states, self.nb_actions, args.init_w) self.critic_target = Critic(self.nb_states, self.nb_actions, args.init_w) hard_update(self.actor_target, self.actor) # Make sure target is with the same weight hard_update(self.critic_target, self.critic) #Create replay buffer self.random_process = OrnsteinUhlenbeckProcess(size=nb_actions, theta=args.ou_theta, mu=args.ou_mu, sigma=args.ou_sigma) # Hyper-parameters self.batch_size = args.bsize self.trajectory_length = args.trajectory_length self.tau = args.tau self.discount = args.discount self.depsilon = 1.0 / args.epsilon # self.epsilon = 1.0 self.is_training = True # if USE_CUDA: self.cuda() def eval(self): self.actor.eval() self.actor_target.eval() self.critic.eval() self.critic_target.eval() def random_action(self): action = np.random.uniform(-1., 1., self.nb_actions) return action def select_action(self, state, noise_enable=True, decay_epsilon=True): action, _ = self.actor(to_tensor(np.array([state]))) action = to_numpy(action).squeeze(0) if noise_enable == True: action += self.is_training * max(self.epsilon, 0) * self.random_process.sample() action = np.clip(action, -1., 1.) if decay_epsilon: self.epsilon -= self.depsilon return action def reset_lstm_hidden_state(self, done=True): self.actor.reset_lstm_hidden_state(done) def reset(self): self.random_process.reset_states() def cuda(self): self.actor.cuda() self.actor_target.cuda() self.critic.cuda() self.critic_target.cuda() def load_weights(self, output): if output is None: return False self.actor.load_state_dict(torch.load('{}/actor.pkl'.format(output))) self.critic.load_state_dict(torch.load('{}/critic.pkl'.format(output))) return True def save_model(self, output): if not os.path.exists(output): os.mkdir(output) torch.save(self.actor.state_dict(), '{}/actor.pkl'.format(output)) torch.save(self.critic.state_dict(), '{}/critic.pkl'.format(output))
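# All of these agents draw exploration noise from an OrnsteinUhlenbeckProcess with
# the keras-rl style signature (size, theta, mu, sigma, and optionally dt,
# sigma_min, n_steps_annealing). The class itself never appears in the snippets;
# below is a simplified sketch of the core Euler-Maruyama update
# x_{t+1} = x_t + theta * (mu - x_t) * dt + sigma * sqrt(dt) * N(0, 1).
# Sigma annealing and the `current_sigma` attribute used by some workers are omitted.
import numpy as np

class OrnsteinUhlenbeckProcess:
    def __init__(self, size, theta=0.15, mu=0.0, sigma=0.2, dt=1e-2):
        self.size, self.theta, self.mu, self.sigma, self.dt = size, theta, mu, sigma, dt
        self.reset_states()

    def reset_states(self):
        # Restart the correlated-noise trajectory at the mean.
        self.x_prev = np.ones(self.size) * self.mu

    def sample(self):
        x = (self.x_prev
             + self.theta * (self.mu - self.x_prev) * self.dt
             + self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.size))
        self.x_prev = x
        return x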
def __init__(self, env, config): self.name = 'HierarchicalNet' self.save_folder = None self.test_record = {} self.train_record = {} self.config = config self.env = env self.epsilon = config.EPSILON self.commander_memory = Commander_Memory(config.MEMORY_SIZE,config.BATCH_SIZE) self.unit_memory = Unit_Memory(2*config.MEMORY_SIZE,config.UNIT_BATCH_SIZE) self.commander_actor = Commander_Actor(config.STATE_DIM,config.COMMAND_DIM,config.RNN_INSIZE) self.commander_actor_target = Commander_Actor(config.STATE_DIM,config.COMMAND_DIM,config.RNN_INSIZE) self.commander_critic = Commander_Critic(config.STATE_DIM,config.COMMAND_DIM,config.BATCH_SIZE,config.RNN_INSIZE) self.commander_critic_target = Commander_Critic(config.STATE_DIM,config.COMMAND_DIM,config.BATCH_SIZE,config.RNN_INSIZE) self.unit_actor = Unit_Actor(config.STATE_DIM,config.COMMAND_DIM,config.ACTION_DIM) self.unit_actor_target = Unit_Actor(config.STATE_DIM,config.COMMAND_DIM,config.ACTION_DIM) self.unit_critic = Unit_Critic(config.STATE_DIM,config.COMMAND_DIM,config.ACTION_DIM,config.HIDDEN_SIZE) self.unit_critic_target = Unit_Critic(config.STATE_DIM,config.COMMAND_DIM,config.ACTION_DIM,config.HIDDEN_SIZE) self.commander_actor_h0 = Variable(torch.zeros(2, 1, config.RNN_OUTSIZE),requires_grad=False) if config.GPU >= 0: self.commander_actor.cuda(device=config.GPU) self.commander_actor_target.cuda(device=config.GPU) self.commander_critic.cuda(device=config.GPU) self.commander_critic_target.cuda(device=config.GPU) self.unit_actor.cuda(device=config.GPU) self.unit_actor_target.cuda(device=config.GPU) self.unit_critic.cuda(device=config.GPU) self.unit_critic_target.cuda(device=config.GPU) self.commander_critic.h0 = self.commander_critic.h0.cuda(device=config.GPU) self.commander_critic_target.h0 = self.commander_critic_target.h0.cuda(device=config.GPU) self.commander_actor_h0 = self.commander_actor_h0.cuda(device=config.GPU) copy_parameter(self.commander_actor, self.commander_actor_target) copy_parameter(self.commander_critic, self.commander_critic_target) copy_parameter(self.unit_actor, self.unit_actor_target) copy_parameter(self.unit_critic, self.unit_critic_target) self.commander_actor_optimizer = optim.Adam(self.commander_actor.parameters(),lr=config.ACTOR_LR) self.unit_actor_optimizer = optim.Adam(self.unit_actor.parameters(),lr=config.ACTOR_LR) self.commander_critic_optimizer = optim.Adam(self.commander_critic.parameters(), lr=config.CRITIC_LR) self.unit_critic_optimizer = optim.Adam(self.unit_critic.parameters(), lr=config.CRITIC_LR) self.criterion = nn.MSELoss() self.action_noise = OrnsteinUhlenbeckProcess(size=(config.MYSELF_NUM, config.ACTION_DIM), theta=10, mu=0., sigma=2) self.command_noise = OrnsteinUhlenbeckProcess(size=(1,config.MYSELF_NUM, config.COMMAND_DIM), theta=10, mu=0., sigma=2) # self.action_noise = OrnsteinUhlenbeckProcess(size=(config.MYSELF_NUM, config.ACTION_DIM), theta=30, mu=0., sigma=3) # self.command_noise = OrnsteinUhlenbeckProcess(size=(1,config.MYSELF_NUM, config.COMMAND_DIM), theta=30, mu=0., sigma=3) # normalize state_normalization_myelf = [1,100,100,1,100,100,1] state_normalization_enemy = [1,100,100,100,100,10,100,100,1,1,1,10] self.state_normalization = state_normalization_myelf for i in range(config.K): self.state_normalization += state_normalization_enemy self.state_normalization = np.asarray(self.state_normalization,dtype=np.float32)
def __init__( self, env, mem_size=int(1e6), lr_critic=1e-3, lr_actor=1e-4, max_epi=int(1e4), epsilon_decay=1. / (1e5), gamma=.99, target_update_frequency=200, batch_size=64, random_process_mode='default', max_step=None, actor_update_mode='default', popart=False, actor='standard', critic='43', epsilon_start=1., epsilon_end=.01, epsilon_rate=1. / 200, partition_num=100, env_log_freq=100, model_log_freq=500, target_update_mode='hard', tau=1e-3, grad_clip_mode=None, grad_clip_norm=5., critic_weight_decay=0., exp_trunc=[], exp_percent=[], exp_rebalance_freq=None, exp_type='rank', ): # configuration log frame = inspect.currentframe() args, _, _, values = inspect.getargvalues(frame) self.config = ['{}: {}'.format(arg, values[arg]) for arg in args] self.CUDA = torch.cuda.is_available() self.ENV_NORMALIZED = env.class_name() == 'NormalizedEnv' self.POPART = popart self.actor_update_mode = actor_update_mode self.orig_env = (env) #for recording if max_step is not None: tmp_env = env if isinstance(tmp_env, gym.Wrapper): while (tmp_env.class_name() != 'TimeLimit'): tmp_env = tmp_env.env tmp_env._max_episode_steps = max_step self.env = self.orig_env self.N = 1 if hasattr(self.env.unwrapped, 'N'): self.N = self.env.unwrapped.N self.N_S = self.env.observation_space.shape[0] self.N_A = self.env.action_space.shape[0] self.n_s = self.N_S / self.N self.n_a = self.N_A / self.N self.MAX_EPI = max_epi self.LOW = self.env.action_space.low self.HIGH = self.env.action_space.high self.actor = ActorRegistry[actor](self.n_s, self.n_a) self.critic = CriticRegistry[critic](self.N_S, self.N_A) self.target_actor = ActorRegistry[actor](self.n_s, self.n_a) self.target_critic = CriticRegistry[critic](self.N_S, self.N_A) self.target_actor.eval() self.target_critic.eval() self.target_actor.load_state_dict(self.actor.state_dict()) self.target_critic.load_state_dict(self.critic.state_dict()) if self.CUDA: self.actor.cuda() self.critic.cuda() self.target_actor.cuda() self.target_critic.cuda() # pop-art self.update_counter = 0 self.beta = .1 self.y_mean = 0. self.y_square_mean = 0. 
self.target_y_mean = self.y_mean self.target_y_square_mean = self.y_square_mean # per self.total_step = 0 self.PARTITION_NUM = partition_num self.LEARN_START = mem_size / self.PARTITION_NUM + 1 self.exp_trunc = exp_trunc self.exp_percent = exp_percent self.exp_rebalance_freq = exp_rebalance_freq self.exp_batch_sizes = [] self.exp_type = exp_type #if len(self.exp_trunc)>0: if len(self.exp_trunc) != len(self.exp_percent): raise RuntimeError("different exp_trunc and exp_percent length") self.exp = [] for i in range(len(self.exp_trunc) + 1): tmp_batch_size = int( batch_size * (1 - sum(self.exp_percent))) if i == len( self.exp_trunc) else int(batch_size * self.exp_percent[i]) self.exp_batch_sizes.append(tmp_batch_size) exp_conf = { 'size': mem_size, 'learn_start': self.LEARN_START, 'partition_num': self.PARTITION_NUM, 'total_step': self.MAX_EPI * 50, 'batch_size': tmp_batch_size } self.exp.append(Experience( exp_conf)) if self.exp_type == 'rank' else self.exp.append( PrioritizedReplayBuffer(mem_size, alpha=.7)) #else: # exp_conf = { # 'size': mem_size, # 'learn_start': self.LEARN_START, # 'partition_num': self.PARTITION_NUM, # 'total_step': self.MAX_EPI * 50, # 'batch_size': batch_size, # } # self.exp = Experience(exp_conf) # uniform er #self.exp = Experience(mem_size) self.optim_critic = optim.Adam(self.critic.parameters(), lr=lr_critic, weight_decay=critic_weight_decay) self.optim_actor = optim.Adam(self.actor.parameters(), lr=-lr_actor) self.random_processes = [] for _ in xrange(self.N): random_process = OrnsteinUhlenbeckProcess(\ size=self.n_a, theta=.15, mu=0, sigma=.2) self.random_processes.append(random_process) self.EPSILON_START = epsilon_start self.EPSILON_END = epsilon_end # only default random process mode will use epsilon decay self.EPSILON_DECAY = epsilon_decay # other random process mode will use epsilon rate self.EPSILON_RATE = epsilon_rate self.GAMMA = gamma self.TARGET_UPDATE_FREQUENCY = target_update_frequency self.BATCH_SIZE = batch_size self.target_update_mode = target_update_mode self.tau = tau #title = {common.S_EPI:[], common.S_TOTAL_R:[]} #self.data = pd.DataFrame(title) self.RAND_PROC = random_process_mode self.grad_clip_mode = grad_clip_mode self.grad_clip_norm = grad_clip_norm # logger self.logger = None self.env_log_freq = env_log_freq self.model_log_freq = model_log_freq self.step = 0 # random seed self.seed = int(time.time()) random.seed(self.seed) np.random.seed(self.seed)
def run_agent(model_params, weights, state_transform, data_queue, weights_queue, process, global_step, updates, best_reward, param_noise_prob, save_dir, max_steps=10000000): train_fn, actor_fn, target_update_fn, params_actor, params_crit, actor_lr, critic_lr = \ build_model(**model_params) actor = Agent(actor_fn, params_actor, params_crit) actor.set_actor_weights(weights) env = RunEnv2(state_transform, max_obstacles=config.num_obstacles, skip_frame=config.skip_frames) random_process = OrnsteinUhlenbeckProcess(theta=.1, mu=0., sigma=.2, size=env.noutput, sigma_min=0.05, n_steps_annealing=1e6) # prepare buffers for data states = [] actions = [] rewards = [] terminals = [] total_episodes = 0 start = time() action_noise = True while global_step.value < max_steps: seed = random.randrange(2**32-2) state = env.reset(seed=seed, difficulty=2) random_process.reset_states() total_reward = 0. total_reward_original = 0. terminal = False steps = 0 while not terminal: state = np.asarray(state, dtype='float32') action = actor.act(state) if action_noise: action += random_process.sample() next_state, reward, next_terminal, info = env.step(action) total_reward += reward total_reward_original += info['original_reward'] steps += 1 global_step.value += 1 # add data to buffers states.append(state) actions.append(action) rewards.append(reward) terminals.append(terminal) state = next_state terminal = next_terminal if terminal: break total_episodes += 1 # add data to buffers after episode end states.append(state) actions.append(np.zeros(env.noutput)) rewards.append(0) terminals.append(terminal) states_np = np.asarray(states).astype(np.float32) data = (states_np, np.asarray(actions).astype(np.float32), np.asarray(rewards).astype(np.float32), np.asarray(terminals), ) weight_send = None if total_reward > best_reward.value: weight_send = actor.get_actor_weights() # send data for training data_queue.put((process, data, weight_send, total_reward)) # receive weights and set params to weights weights = weights_queue.get() report_str = 'Global step: {}, steps/sec: {:.2f}, updates: {}, episode len {}, ' \ 'reward: {:.2f}, original_reward {:.4f}; best reward: {:.2f} noise {}'. \ format(global_step.value, 1. * global_step.value / (time() - start), updates.value, steps, total_reward, total_reward_original, best_reward.value, 'actions' if action_noise else 'params') print(report_str) with open(os.path.join(save_dir, 'train_report.log'), 'a') as f: f.write(report_str + '\n') actor.set_actor_weights(weights) action_noise = np.random.rand() < 1 - param_noise_prob if not action_noise: set_params_noise(actor, states_np, random_process.current_sigma) # clear buffers del states[:] del actions[:] del rewards[:] del terminals[:] if total_episodes % 100 == 0: env = RunEnv2(state_transform, max_obstacles=config.num_obstacles, skip_frame=config.skip_frames)
class RDPG_v2: def __init__(self, conf, device): self.conf = conf self.state_dim = conf['state_dim'] self.action_dim = conf['action_dim'] self.device = device # create actor and critic network self.actor = Actor_RDPG(self.state_dim, self.action_dim).to(self.device) self.actor_target = Actor_RDPG(self.state_dim, self.action_dim).to(self.device) self.critic = Critic_RDPG(self.state_dim, self.action_dim).to(self.device) self.critic_target = Critic_RDPG(self.state_dim, self.action_dim).to(self.device) hard_update(self.actor_target, self.actor) # Make sure target is with the same weight hard_update(self.critic_target, self.critic) self.critic_optim = optim.Adam(self.critic.parameters(), lr=q_lr) self.actor_optim = optim.Adam(self.actor.parameters(), lr=policy_lr) #Create replay buffer self.random_process = OrnsteinUhlenbeckProcess(size=self.action_dim, theta=0.15, mu=0.0, sigma=0.2) # args.ou_theta:0.15 (noise theta), args.ou_sigma:0.2 (noise sigma), args.out_mu:0.0 (noise mu) self.epsilon = 1.0 self.depsilon = 1.0 / 50000 self.is_training = True self.tau = 0.001 # moving average for target network def random_action(self): action = np.random.uniform( 0., 1., self.action_dim) # [-1,1] select as a number of action_dim return action def select_action(self, state, noise_enable=True, decay_epsilon=True): action, _ = self.actor( to_tensor(state).reshape(-1).unsqueeze(0) ) # input shape = [batch(=1) X state_dim], action : type (tuple), shape [batch X action_dim] action = action.cpu().detach().numpy().squeeze( 0) # action shape [action_dim,] if noise_enable == True: action += self.is_training * max(self.epsilon, 0) * self.random_process.sample() action = np.clip(action, 0., 1.) # input 중 -1~1 을 벗어나는 값에 대해 -1 or 1 로 대체 if decay_epsilon: self.epsilon -= self.depsilon return action def update_policy(self, memory, gamma=0.99): print("updating...") # Sample batch experiences = memory.sample( self.conf['batch_size'] ) # type: list | shape: (max_epi_length(2000)-1 X batch(32) X 5(??)) if len(experiences) == 0: # not enough samples return dtype = torch.cuda.FloatTensor policy_loss_total = 0 value_loss_total = 0 for t in range(len(experiences) - 1): # iterate over episodes # print("t:", t) target_cx = Variable(torch.zeros(self.conf['batch_size'], 50)).type(dtype) target_hx = Variable(torch.zeros(self.conf['batch_size'], 50)).type(dtype) cx = Variable(torch.zeros(self.conf['batch_size'], 50)).type(dtype) hx = Variable(torch.zeros(self.conf['batch_size'], 50)).type(dtype) # we first get the data out of the sampled experience # shape of state0, action, reward: [batch X state_dim], [batch X 1], [batch X 1] state0 = np.stack([ trajectory.state0 for trajectory in experiences[t] ]) # batch 개수만큼 각 epi 중 t 시점에서 상태만 추출 # action = np.expand_dims(np.stack((trajectory.action for trajectory in experiences[t])), axis=1) action = np.stack( [trajectory.action for trajectory in experiences[t]]) reward = np.expand_dims(np.stack( [trajectory.reward for trajectory in experiences[t]]), axis=1) # reward = np.stack((trajectory.reward for trajectory in experiences[t])) state1 = np.stack( [trajectory.state0 for trajectory in experiences[t + 1]]) target_action, (target_hx, target_cx) = self.actor_target( to_tensor(state1).reshape(self.conf['batch_size'], -1), (target_hx, target_cx)) next_q_value = self.critic_target([ to_tensor(state1).reshape(self.conf['batch_size'], -1), target_action ]) target_q = to_tensor(reward) + gamma * next_q_value # Critic update current_q = self.critic([ to_tensor(state0).reshape(self.conf['batch_size'], -1), 
to_tensor(action) ]) value_loss = F.smooth_l1_loss(current_q, target_q) value_loss /= len(experiences) # divide by trajectory length value_loss_total += value_loss # update per trajectory self.critic.zero_grad() value_loss.backward() # Actor update action, (hx, cx) = self.actor( to_tensor(state0).reshape(self.conf['batch_size'], -1), (hx, cx)) policy_loss = -self.critic([ to_tensor(state0).reshape(self.conf['batch_size'], -1), action ]) policy_loss /= len(experiences) # divide by trajectory length policy_loss_total += policy_loss.mean() policy_loss = policy_loss.mean() self.actor.zero_grad() policy_loss.backward() self.critic_optim.step() self.actor_optim.step() # Target update soft_update(self.actor_target, self.actor, self.tau) soft_update(self.critic_target, self.critic, self.tau) print("update finish!") def reset_lstm_hidden_state(self, done=True): self.actor.reset_lstm_hidden_state(done) def save_model(self, path): torch.save(self.critic.state_dict(), path + '_q') torch.save(self.critic_target.state_dict(), path + '_target_q') torch.save(self.actor.state_dict(), path + '_policy') def load_model(self, path): self.critic.load_state_dict(torch.load(path + '_q')) self.critic_target.load_state_dict(torch.load(path + '_target_q')) self.actor.load_state_dict(torch.load(path + '_policy')) self.critic.eval() self.critic_target.eval() self.actor.eval()
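# Several of these agents (the RDPG_v2 class above and the DDPG variants below)
# lean on small `to_tensor` / `to_numpy` conversion helpers defined in their
# respective util modules rather than shown here. A minimal sketch, ignoring the
# extra gpu_used/volatile arguments some variants accept:
import torch

def to_tensor(ndarray, requires_grad=False):
    """Wrap a numpy array as a float tensor (optionally tracking gradients)."""
    return torch.tensor(ndarray, dtype=torch.float, requires_grad=requires_grad)

def to_numpy(tensor):
    """Detach a tensor from the graph and move it to the CPU as a numpy array."""
    return tensor.detach().cpu().numpy()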
class DDPG(object): def __init__(self, args, nb_states, nb_actions): USE_CUDA = torch.cuda.is_available() if args.seed > 0: self.seed(args.seed) self.nb_states = nb_states self.nb_actions= nb_actions self.gpu_ids = [i for i in range(args.gpu_nums)] if USE_CUDA and args.gpu_nums > 0 else [-1] self.gpu_used = True if self.gpu_ids[0] >= 0 else False net_cfg = { 'hidden1':args.hidden1, 'hidden2':args.hidden2, 'init_w':args.init_w } self.actor = Actor(self.nb_states, self.nb_actions, **net_cfg).double() self.actor_target = Actor(self.nb_states, self.nb_actions, **net_cfg).double() self.actor_optim = Adam(self.actor.parameters(), lr=args.p_lr, weight_decay=args.weight_decay) self.critic = Critic(self.nb_states, self.nb_actions, **net_cfg).double() self.critic_target = Critic(self.nb_states, self.nb_actions, **net_cfg).double() self.critic_optim = Adam(self.critic.parameters(), lr=args.c_lr, weight_decay=args.weight_decay) hard_update(self.actor_target, self.actor) # Make sure target is with the same weight hard_update(self.critic_target, self.critic) #Create replay buffer self.memory = SequentialMemory(limit=args.rmsize, window_length=args.window_length) self.random_process = OrnsteinUhlenbeckProcess(size=self.nb_actions, theta=args.ou_theta, mu=args.ou_mu, sigma=args.ou_sigma) # Hyper-parameters self.batch_size = args.bsize self.tau_update = args.tau_update self.gamma = args.gamma # Linear decay rate of exploration policy self.depsilon = 1.0 / args.epsilon # initial exploration rate self.epsilon = 1.0 self.s_t = None # Most recent state self.a_t = None # Most recent action self.is_training = True self.continious_action_space = False def update_policy(self): pass def cuda_convert(self): if len(self.gpu_ids) == 1: if self.gpu_ids[0] >= 0: with torch.cuda.device(self.gpu_ids[0]): print('model cuda converted') self.cuda() if len(self.gpu_ids) > 1: self.data_parallel() self.cuda() self.to_device() print('model cuda converted and paralleled') def eval(self): self.actor.eval() self.actor_target.eval() self.critic.eval() self.critic_target.eval() def cuda(self): self.actor.cuda() self.actor_target.cuda() self.critic.cuda() self.critic_target.cuda() def data_parallel(self): self.actor = nn.DataParallel(self.actor, device_ids=self.gpu_ids) self.actor_target = nn.DataParallel(self.actor_target, device_ids=self.gpu_ids) self.critic = nn.DataParallel(self.critic, device_ids=self.gpu_ids) self.critic_target = nn.DataParallel(self.critic_target, device_ids=self.gpu_ids) def to_device(self): self.actor.to(torch.device('cuda:{}'.format(self.gpu_ids[0]))) self.actor_target.to(torch.device('cuda:{}'.format(self.gpu_ids[0]))) self.critic.to(torch.device('cuda:{}'.format(self.gpu_ids[0]))) self.critic_target.to(torch.device('cuda:{}'.format(self.gpu_ids[0]))) def observe(self, r_t, s_t1, done): if self.is_training: self.memory.append(self.s_t, self.a_t, r_t, done) self.s_t = s_t1 def random_action(self): action = np.random.uniform(-1.,1.,self.nb_actions) # self.a_t = action return action def select_action(self, s_t, decay_epsilon=True): # proto action action = to_numpy( self.actor(to_tensor(np.array([s_t]), gpu_used=self.gpu_used, gpu_0=self.gpu_ids[0])), gpu_used=self.gpu_used ).squeeze(0) action += self.is_training * max(self.epsilon, 0) * self.random_process.sample() action = np.clip(action, -1., 1.) 
if decay_epsilon: self.epsilon -= self.depsilon # self.a_t = action return action def reset(self, s_t): self.s_t = s_t self.random_process.reset_states() def load_weights(self, dir): if dir is None: return if self.gpu_used: # load all tensors to GPU (gpu_id) ml = lambda storage, loc: storage.cuda(self.gpu_ids) else: # load all tensors to CPU ml = lambda storage, loc: storage self.actor.load_state_dict( torch.load('output/{}/actor.pkl'.format(dir), map_location=ml) ) self.critic.load_state_dict( torch.load('output/{}/critic.pkl'.format(dir), map_location=ml) ) print('model weights loaded') def save_model(self,output): if len(self.gpu_ids) == 1 and self.gpu_ids[0] > 0: with torch.cuda.device(self.gpu_ids[0]): torch.save( self.actor.state_dict(), '{}/actor.pt'.format(output) ) torch.save( self.critic.state_dict(), '{}/critic.pt'.format(output) ) elif len(self.gpu_ids) > 1: torch.save(self.actor.module.state_dict(), '{}/actor.pt'.format(output) ) torch.save(self.actor.module.state_dict(), '{}/critic.pt'.format(output) ) else: torch.save( self.actor.state_dict(), '{}/actor.pt'.format(output) ) torch.save( self.critic.state_dict(), '{}/critic.pt'.format(output) ) def seed(self,seed): torch.manual_seed(seed) if len(self.gpu_ids) > 0: torch.cuda.manual_seed_all(seed)
class BiCNet(): def __init__(self, s_dim, a_dim, n_agents, **kwargs): self.s_dim = s_dim self.a_dim = a_dim self.config = kwargs['config'] self.n_agents = n_agents self.device = 'cuda' if self.config.use_cuda else 'cpu' # Networks self.policy = Actor(s_dim, a_dim, n_agents) self.policy_target = Actor(s_dim, a_dim, n_agents) self.critic = Critic(s_dim, a_dim, n_agents) self.critic_target = Critic(s_dim, a_dim, n_agents) if self.config.use_cuda: self.policy.cuda() self.policy_target.cuda() self.critic.cuda() self.critic_target.cuda() self.policy_optimizer = torch.optim.Adam(self.policy.parameters(), lr=self.config.a_lr) self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=self.config.c_lr) hard_update(self.policy, self.policy_target) hard_update(self.critic, self.critic_target) self.random_process = OrnsteinUhlenbeckProcess( size=self.a_dim, theta=self.config.ou_theta, mu=self.config.ou_mu, sigma=self.config.ou_sigma) self.replay_buffer = list() self.epsilon = 1. self.depsilon = self.epsilon / self.config.epsilon_decay self.c_loss = None self.a_loss = None self.action_log = list() def choose_action(self, obs, noisy=True): obs = torch.Tensor([obs]).to(self.device) action = self.policy(obs).cpu().detach().numpy()[0] self.action_log.append(action) if noisy: for agent_idx in range(self.n_agents): pass # action[agent_idx] += self.epsilon * self.random_process.sample() self.epsilon -= self.depsilon self.epsilon = max(self.epsilon, 0.001) np.clip(action, -1., 1.) return action def reset(self): self.random_process.reset_states() self.action_log.clear() def prep_train(self): self.policy.train() self.critic.train() self.policy_target.train() self.critic_target.train() def prep_eval(self): self.policy.eval() self.critic.eval() self.policy_target.eval() self.critic_target.eval() def random_action(self): return np.random.uniform(low=-1, high=1, size=(self.n_agents, 2)) def memory(self, s, a, r, s_, done): self.replay_buffer.append((s, a, r, s_, done)) if len(self.replay_buffer) >= self.config.memory_length: self.replay_buffer.pop(0) def get_batches(self): experiences = random.sample(self.replay_buffer, self.config.batch_size) state_batches = np.array([_[0] for _ in experiences]) action_batches = np.array([_[1] for _ in experiences]) reward_batches = np.array([_[2] for _ in experiences]) next_state_batches = np.array([_[3] for _ in experiences]) done_batches = np.array([_[4] for _ in experiences]) return state_batches, action_batches, reward_batches, next_state_batches, done_batches def train(self): state_batches, action_batches, reward_batches, next_state_batches, done_batches = self.get_batches( ) state_batches = torch.Tensor(state_batches).to(self.device) action_batches = torch.Tensor(action_batches).to(self.device) reward_batches = torch.Tensor(reward_batches).reshape( self.config.batch_size, self.n_agents, 1).to(self.device) next_state_batches = torch.Tensor(next_state_batches).to(self.device) done_batches = torch.Tensor( (done_batches == False) * 1).reshape(self.config.batch_size, self.n_agents, 1).to(self.device) target_next_actions = self.policy_target.forward(next_state_batches) target_next_q = self.critic_target.forward(next_state_batches, target_next_actions) main_q = self.critic(state_batches, action_batches) ''' How to concat each agent's Q value? 
''' #target_next_q = target_next_q #main_q = main_q.mean(dim=1) ''' Reward Norm ''' # reward_batches = (reward_batches - reward_batches.mean(dim=0)) / reward_batches.std(dim=0) / 1024 # Critic Loss self.critic.zero_grad() baselines = reward_batches + done_batches * self.config.gamma * target_next_q loss_critic = torch.nn.MSELoss()(main_q, baselines.detach()) loss_critic.backward() torch.nn.utils.clip_grad_norm_(self.critic.parameters(), 0.5) self.critic_optimizer.step() # Actor Loss self.policy.zero_grad() clear_action_batches = self.policy.forward(state_batches) loss_actor = -self.critic.forward(state_batches, clear_action_batches).mean() loss_actor += (clear_action_batches**2).mean() * 1e-3 loss_actor.backward() torch.nn.utils.clip_grad_norm_(self.policy.parameters(), 0.5) self.policy_optimizer.step() # This is for logging self.c_loss = loss_critic.item() self.a_loss = loss_actor.item() soft_update(self.policy, self.policy_target, self.config.tau) soft_update(self.critic, self.critic_target, self.config.tau) def get_loss(self): return self.c_loss, self.a_loss def get_action_std(self): return np.array(self.action_log).std(axis=-1).mean()
class DDPG(object): def __init__(self, nb_states, nb_actions, args): if args.seed > 0: self.seed(args.seed) self.nb_states = nb_states self.nb_actions = nb_actions # Create Actor and Critic Network net_cfg = { "hidden1": args.hidden1, "hidden2": args.hidden2, "init_w": args.init_w, } self.actor = Actor(self.nb_states, self.nb_actions, **net_cfg) self.actor_target = Actor(self.nb_states, self.nb_actions, **net_cfg) self.actor_optim = Adam(self.actor.parameters(), lr=args.prate) self.critic = Critic(self.nb_states, self.nb_actions, **net_cfg) self.critic_target = Critic(self.nb_states, self.nb_actions, **net_cfg) self.critic_optim = Adam(self.critic.parameters(), lr=args.rate) hard_update( self.actor_target, self.actor ) # Make sure target is with the same weight hard_update(self.critic_target, self.critic) # Create replay buffer self.memory = SequentialMemory( limit=args.rmsize, window_length=args.window_length ) self.random_process = OrnsteinUhlenbeckProcess( size=nb_actions, theta=args.ou_theta, mu=args.ou_mu, sigma=args.ou_sigma ) # Hyper-parameters self.batch_size = args.bsize self.tau = args.tau self.discount = args.discount self.depsilon = 1.0 / args.epsilon # self.epsilon = 1.0 self.s_t = None # Most recent state self.a_t = None # Most recent action self.is_training = True # if USE_CUDA: self.cuda() def update_policy(self): # Sample batch ( state_batch, action_batch, reward_batch, next_state_batch, terminal_batch, ) = self.memory.sample_and_split(self.batch_size) # Prepare for the target q batch next_q_values = self.critic_target( [ to_tensor(next_state_batch, volatile=True), self.actor_target(to_tensor(next_state_batch, volatile=True)), ] ) # next_q_values.volatile = False target_q_batch = ( to_tensor(reward_batch) + self.discount * to_tensor(terminal_batch.astype(np.float)) * next_q_values ) # Critic update self.critic.zero_grad() q_batch = self.critic([to_tensor(state_batch), to_tensor(action_batch)]) value_loss = criterion(q_batch, target_q_batch) value_loss.backward() self.critic_optim.step() # Actor update self.actor.zero_grad() policy_loss = -self.critic( [to_tensor(state_batch), self.actor(to_tensor(state_batch))] ) policy_loss = policy_loss.mean() policy_loss.backward() self.actor_optim.step() # Target update soft_update(self.actor_target, self.actor, self.tau) soft_update(self.critic_target, self.critic, self.tau) def eval(self): self.actor.eval() self.actor_target.eval() self.critic.eval() self.critic_target.eval() def cuda(self): self.actor.cuda() self.actor_target.cuda() self.critic.cuda() self.critic_target.cuda() def observe(self, r_t, s_t1, done): if self.is_training: self.memory.append(self.s_t, self.a_t, r_t, done) self.s_t = s_t1 def random_action(self): action = np.random.uniform(-1.0, 1.0, self.nb_actions) self.a_t = action return action def select_action(self, s_t, decay_epsilon=True): action = to_numpy(self.actor(to_tensor(np.array([s_t])))).squeeze(0) action += self.is_training * max(self.epsilon, 0) * self.random_process.sample() action = np.clip(action, -1.0, 1.0) if decay_epsilon: self.epsilon -= self.depsilon self.a_t = action return action def reset(self, obs): self.s_t = obs self.random_process.reset_states() def load_weights(self, output): if output is None: return self.actor.load_state_dict(torch.load("{}/actor.pkl".format(output))) self.critic.load_state_dict(torch.load("{}/critic.pkl".format(output))) def save_model(self, output): torch.save(self.actor.state_dict(), "{}/actor.pkl".format(output)) torch.save(self.critic.state_dict(), 
"{}/critic.pkl".format(output)) def seed(self, s): torch.manual_seed(s) if USE_CUDA: torch.cuda.manual_seed(s)
def run_agent(args, model_params, weights, data_queue, weights_queue, process,
              global_step, updates, best_reward, param_noise_prob, save_dir,
              max_steps=10000000):
    train_fn, actor_fn, target_update_fn, params_actor, params_crit, actor_lr, critic_lr = \
        build_model(**model_params)
    actor = Agent(actor_fn, params_actor, params_crit)
    actor.set_actor_weights(weights)

    env = RunEnv2(model=args.modeldim, prosthetic=args.prosthetic,
                  difficulty=args.difficulty, skip_frame=config.skip_frames)
    env.spec.timestep_limit = 3000  # ndrw

    # random_process = OrnsteinUhlenbeckProcess(theta=.1, mu=0., sigma=.3, size=env.noutput, sigma_min=0.05, n_steps_annealing=1e6)
    sigma_rand = random.uniform(0.05, 0.5)
    dt_rand = random.uniform(0.002, 0.02)
    param_noise_prob = random.uniform(param_noise_prob * 0.25,
                                      min(param_noise_prob * 1.5, 1.))
    random_process = OrnsteinUhlenbeckProcess(theta=.1, mu=0., sigma=sigma_rand, dt=dt_rand,
                                              size=env.noutput, sigma_min=0.05,
                                              n_steps_annealing=1e6)
    print('OUProcess_sigma = ' + str(sigma_rand) + ' OUProcess_dt = ' + str(dt_rand) +
          ' param_noise_prob = ' + str(param_noise_prob))

    # prepare buffers for data
    states = []
    actions = []
    rewards = []
    terminals = []

    total_episodes = 0
    start = time()
    action_noise = True
    while global_step.value < max_steps:
        seed = random.randrange(2**32 - 2)
        state = env.reset(seed=seed, difficulty=args.difficulty)
        random_process.reset_states()

        total_reward = 0.
        total_reward_original = 0.
        terminal = False
        steps = 0

        while not terminal:
            state = np.asarray(state, dtype='float32')
            action = actor.act(state)
            if action_noise:
                action += random_process.sample()

            next_state, reward, next_terminal, info = env._step(action)
            total_reward += reward
            total_reward_original += info['original_reward']
            steps += 1
            global_step.value += 1

            # add data to buffers
            states.append(state)
            actions.append(action)
            rewards.append(reward)
            terminals.append(terminal)

            state = next_state
            terminal = next_terminal

            if terminal:
                break

        total_episodes += 1

        # add data to buffers after episode end
        states.append(state)
        actions.append(np.zeros(env.noutput))
        rewards.append(0)
        terminals.append(terminal)

        states_np = np.asarray(states).astype(np.float32)
        data = (
            states_np,
            np.asarray(actions).astype(np.float32),
            np.asarray(rewards).astype(np.float32),
            np.asarray(terminals),
        )

        weight_send = None
        if total_reward > best_reward.value:
            weight_send = actor.get_actor_weights()

        # send data for training
        data_queue.put((process, data, weight_send, total_reward))

        # receive weights and set params to weights
        weights = weights_queue.get()

        # report_str = 'Global step: {}, steps/sec: {:.2f}, updates: {}, episode len: {}, pelvis_X: {:.2f}, reward: {:.2f}, original_reward {:.4f}, best reward: {:.2f}, noise: {}'. \
        #     format(global_step.value, 1. * global_step.value / (time() - start), updates.value, steps, info['pelvis_X'], total_reward, total_reward_original, best_reward.value, 'actions' if action_noise else 'params')
        # report_str = 'Global step: {}, steps/sec: {:.2f}, updates: {}, episode len: {}, pelvis_X: {:.2f}, reward: {:.2f}, best reward: {:.2f}, noise: {}'. \
        #     format(global_step.value, 1. * global_step.value / (time() - start), updates.value, steps, info['pelvis_X'], total_reward, best_reward.value, 'actions' if action_noise else 'params')
        report_str = 'Global step: {}, steps/sec: {:.2f}, updates: {}, episode len: {}, pelvis_X: {:.2f}, pelvis_Z: {:.2f}, reward: {:.2f}, best reward: {:.2f}, noise: {}'. \
            format(global_step.value, 1. * global_step.value / (time() - start),
                   updates.value, steps, info['pelvis'][0], info['pelvis'][2],
                   total_reward, best_reward.value,
                   'actions' if action_noise else 'params')
        print(report_str)

        try:
            with open(os.path.join(save_dir, 'train_report.log'), 'a') as f:
                f.write(report_str + '\n')
        except:
            print('#############################################')
            print("except: failed to append report to os.path.join(save_dir, 'train_report.log')")
            print('#############################################')

        actor.set_actor_weights(weights)
        action_noise = np.random.rand() < 1 - param_noise_prob
        if not action_noise:
            set_params_noise(actor, states_np, random_process.current_sigma)

        # clear buffers
        del states[:]
        del actions[:]
        del rewards[:]
        del terminals[:]

        if total_episodes % 100 == 0:
            env = RunEnv2(model=args.modeldim, prosthetic=args.prosthetic,
                          difficulty=args.difficulty, skip_frame=config.skip_frames)
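# The workers above draw exploration noise from
# OrnsteinUhlenbeckProcess(theta, mu, sigma, dt, size, sigma_min, n_steps_annealing)
# and use its sample(), reset_states() and current_sigma members, but the class itself
# is not defined in this section. Below is a minimal sketch of a compatible class,
# assuming a keras-rl-style interface with linear sigma annealing; it is an
# illustration of the technique, not the implementation used by the authors.
import numpy as np

class OrnsteinUhlenbeckNoise:
    """Temporally correlated noise: dx = theta * (mu - x) * dt + sigma * sqrt(dt) * N(0, 1)."""

    def __init__(self, size, theta=0.15, mu=0.0, sigma=0.2, dt=1e-2,
                 sigma_min=None, n_steps_annealing=1000):
        self.size = size
        self.theta = theta
        self.mu = mu
        self.dt = dt
        self.sigma_init = sigma
        self.sigma_min = sigma if sigma_min is None else sigma_min
        # slope of the linear annealing schedule (negative or zero)
        self.m = -float(self.sigma_init - self.sigma_min) / float(n_steps_annealing)
        self.n_steps = 0
        self.reset_states()

    @property
    def current_sigma(self):
        # sigma decays linearly from sigma_init to sigma_min over n_steps_annealing samples
        return max(self.sigma_min, self.sigma_init + self.m * float(self.n_steps))

    def reset_states(self):
        # restart the process at the origin at the beginning of each episode
        self.x_prev = np.zeros(self.size)

    def sample(self):
        x = (self.x_prev
             + self.theta * (self.mu - self.x_prev) * self.dt
             + self.current_sigma * np.sqrt(self.dt) * np.random.normal(size=self.size))
        self.x_prev = x
        self.n_steps += 1
        return x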
class DDPG_trainer(object):

    def __init__(self, nb_state, nb_action):
        self.nb_state = nb_state
        self.nb_action = nb_action

        self.actor = Actor(self.nb_state, self.nb_action)
        self.actor_target = Actor(self.nb_state, self.nb_action)
        self.actor_optim = Adam(self.actor.parameters(), lr=LEARNING_RATE)

        self.critic = Critic(self.nb_state, self.nb_action)
        self.critic_target = Critic(self.nb_state, self.nb_action)
        self.critic_optim = Adam(self.critic.parameters(), lr=LEARNING_RATE)

        hard_update(self.actor_target, self.actor)  # make sure the targets start with the same weights
        hard_update(self.critic_target, self.critic)

        # Create replay buffer
        self.memory = SequentialMemory(limit=MEMORY_SIZE, window_length=1)
        self.random_process = OrnsteinUhlenbeckProcess(size=nb_action,
                                                       theta=OU_THETA,
                                                       mu=OU_MU,
                                                       sigma=OU_SIGMA)

        self.is_training = True
        self.epsilon = 1.0
        self.a_t = None
        self.s_t = None

        if USE_CUDA:
            self.cuda()

    def cuda(self):
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic.cuda()
        self.critic_target.cuda()

    def select_action(self, s_t, decay_epsilon=True):
        action = to_numpy(self.actor(to_tensor(np.array([s_t])))).squeeze(0)
        action += self.is_training * max(self.epsilon, 0) * self.random_process.sample()
        action = np.clip(action, -1., 1.)

        if decay_epsilon:
            self.epsilon -= DELTA_EPSILON

        self.a_t = action
        return action

    def reset(self, observation):
        self.s_t = observation  # store the initial state so observe() records complete transitions
        self.random_process.reset_states()

    def observe(self, r_t, s_t1, done):
        if self.is_training:
            self.memory.append(self.s_t, self.a_t, r_t, done)
            self.s_t = s_t1

    def update_all(self):
        # Warm up: wait until the replay buffer holds enough transitions
        if self.memory.nb_entries < BATCH_SIZE * 2:
            return

        # Sample batch
        state_batch, action_batch, reward_batch, \
            next_state_batch, terminal_batch = self.memory.sample_and_split(BATCH_SIZE)

        # Prepare the target Q batch
        with torch.no_grad():
            next_q_values = self.critic_target([
                to_tensor(next_state_batch),
                self.actor_target(to_tensor(next_state_batch)),
            ])
        target_q_batch = to_tensor(reward_batch) + \
            DISCOUNT * to_tensor(terminal_batch.astype(np.float32)) * next_q_values

        # Critic update
        self.critic.zero_grad()

        for state in state_batch:
            if state.shape[0] <= 2:
                # print("Error sampled memory!")
                return

        q_batch = self.critic([to_tensor(state_batch), to_tensor(action_batch)])
        value_loss = CRITERION(q_batch, target_q_batch)
        value_loss.backward()
        self.critic_optim.step()

        # Actor update
        self.actor.zero_grad()
        policy_loss = -self.critic([to_tensor(state_batch),
                                    self.actor(to_tensor(state_batch))])
        policy_loss = policy_loss.mean()
        policy_loss.backward()
        self.actor_optim.step()

        # Target update
        soft_update(self.actor_target, self.actor, TAU)
        soft_update(self.critic_target, self.critic, TAU)
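# DDPG_trainer relies on helpers (hard_update, soft_update, to_tensor, to_numpy)
# that are not defined in this section. The definitions below are one plausible
# sketch matching the (target, source) call order used above; the real project
# likely also moves tensors to the GPU inside to_tensor when USE_CUDA is set.
import numpy as np
import torch

def to_tensor(ndarray, dtype=torch.float32):
    # Assumed helper: wrap a NumPy array as a float tensor (CPU-only in this sketch).
    return torch.as_tensor(ndarray, dtype=dtype)

def to_numpy(tensor):
    # Assumed helper: bring a tensor back to a NumPy array.
    return tensor.detach().cpu().numpy()

def hard_update(target, source):
    # Copy the source parameters into the target verbatim.
    for t, s in zip(target.parameters(), source.parameters()):
        t.data.copy_(s.data)

def soft_update(target, source, tau):
    # Polyak averaging: target <- (1 - tau) * target + tau * source.
    for t, s in zip(target.parameters(), source.parameters()):
        t.data.copy_(t.data * (1.0 - tau) + s.data * tau)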
    def __init__(self, gamma, tau, actor_hidden_size, critic_hidden_size,
                 observation_space, action_space, args):
        self.num_inputs = observation_space.shape[0]
        self.action_space = action_space
        self.actor_hidden_size = actor_hidden_size
        self.critic_hidden_size = critic_hidden_size
        self.comm_hidden_size = actor_hidden_size // 2
        self.gamma = gamma
        self.tau = tau
        self.args = args

        # replay used for the update of the attention unit
        self.queue = queue.Queue()

        # Define actor part 1
        self.actor_p1 = ActorPart1(self.num_inputs, actor_hidden_size).to(device)
        self.actor_target_p1 = ActorPart1(self.num_inputs, actor_hidden_size).to(device)

        # the attention unit is not trained end-to-end
        self.atten = AttentionUnit(actor_hidden_size, actor_hidden_size).to(device)
        self.atten_optim = Adam(self.atten.parameters(), lr=self.args.actor_lr)

        # Define the communication channel
        self.comm = CommunicationChannel(actor_hidden_size, self.comm_hidden_size).to(device)
        self.comm_target = CommunicationChannel(actor_hidden_size, self.comm_hidden_size).to(device)
        self.comm_optim = Adam(self.comm.parameters(), lr=self.args.actor_lr)

        # Define actor part 2
        # input -- [thoughts, integrated thoughts]
        self.actor_p2 = ActorPart2(actor_hidden_size + self.comm_hidden_size * 2,
                                   self.action_space, actor_hidden_size).to(device)
        self.actor_target_p2 = ActorPart2(actor_hidden_size + self.comm_hidden_size * 2,
                                          self.action_space, actor_hidden_size).to(device)
        self.actor_optim = Adam([
            {'params': self.actor_p1.parameters(), 'lr': self.args.actor_lr},
            {'params': self.actor_p2.parameters(), 'lr': self.args.actor_lr},
        ])

        self.critic = Critic(self.num_inputs, self.action_space, critic_hidden_size).to(device)
        self.critic_target = Critic(self.num_inputs, self.action_space, critic_hidden_size).to(device)
        self.critic_optim = Adam(self.critic.parameters(), lr=self.args.critic_lr)

        # Make sure the targets start with the same weights
        hard_update(self.actor_target_p1, self.actor_p1)
        hard_update(self.comm_target, self.comm)
        hard_update(self.actor_target_p2, self.actor_p2)
        hard_update(self.critic_target, self.critic)

        # Create replay buffer
        self.memory = ReplayMemory(args.memory_size)
        self.random_process = OrnsteinUhlenbeckProcess(size=action_space.n,
                                                       theta=args.ou_theta,
                                                       mu=args.ou_mu,
                                                       sigma=args.ou_sigma)
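# ActorPart2 above takes an input of size actor_hidden_size + 2 * comm_hidden_size,
# i.e. each agent's own "thought" concatenated with the output of a bidirectional
# communication channel. The sketch below shows one way these pieces could compose;
# the function name, the GRU-like (output, hidden) return value assumed for
# CommunicationChannel, and the batching over agents are guesses based on the
# declared layer sizes, not the author's actual forward pass.
import torch

def select_thought_action(agent, obs_batch):
    # obs_batch: (num_agents, num_inputs) observations for one group of agents.
    thoughts = agent.actor_p1(obs_batch)                  # (num_agents, actor_hidden_size)
    integrated, _ = agent.comm(thoughts.unsqueeze(0))     # assumed RNN: (1, num_agents, 2 * comm_hidden_size)
    actor_input = torch.cat([thoughts, integrated.squeeze(0)], dim=-1)
    return agent.actor_p2(actor_input)                    # matches ActorPart2's declared input size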
critic = Critic(state_dim, action_dim, max_action, args)
critic_t = Critic(state_dim, action_dim, max_action, args)
critic_t.load_state_dict(critic.state_dict())
print("OK 3")

# actor
actor = Actor(state_dim, action_dim, max_action, args)
actor_t = Actor(state_dim, action_dim, max_action, args)
actor_t.load_state_dict(actor.state_dict())

# action noise
if not args.ou_noise:
    a_noise = GaussianNoise(action_dim, sigma=args.gauss_sigma)
else:
    a_noise = OrnsteinUhlenbeckProcess(action_dim,
                                       mu=args.ou_mu,
                                       theta=args.ou_theta,
                                       sigma=args.ou_sigma)

if USE_CUDA:
    critic.cuda()
    critic_t.cuda()
    actor.cuda()
    actor_t.cuda()
print("OK 4")

# CEM
es = sepCEM(actor.get_size(),
            mu_init=actor.get_params(),
            sigma_init=args.sigma_init,
            damp=args.damp,
            damp_limit=args.damp_limit,
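# When args.ou_noise is disabled, the snippet above falls back to
# GaussianNoise(action_dim, sigma=args.gauss_sigma), which is not defined in this
# section. A minimal stand-in is sketched below, assuming it only needs to expose
# sample() like the OU process; this is an illustration, not the project's class.
import numpy as np

class GaussianNoise:
    """Uncorrelated zero-mean Gaussian action noise with a fixed standard deviation."""

    def __init__(self, action_dim, sigma=0.1):
        self.action_dim = action_dim
        self.sigma = sigma

    def sample(self):
        return np.random.normal(0.0, self.sigma, size=self.action_dim)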
class DDPG(object):
    # Note: this implementation uses the pre-0.4 PyTorch Variable/volatile API.

    def __init__(self, state_size, action_size, memory_size, batch_size=128,
                 tan=0.001, actor_lr=0.001, critic_lr=0.001, epsilon=1.):
        self.state_size = state_size
        self.action_size = action_size
        self.batch_size = batch_size
        self.tan = tan  # soft-update coefficient (usually called tau)
        self.warmup = WARM_UP
        self.epsilon = epsilon
        self.epsilon_decay = hyperparameters['D_EPSILON']

        self.actor = Actor(state_size, action_size)
        self.actor_target = Actor(state_size, action_size)
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=actor_lr)

        self.critic = Critic(state_size, action_size)
        self.critic_target = Critic(state_size, action_size)
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=critic_lr)

        self.memory = Memory(memory_size)
        self.criterion = nn.MSELoss()
        self.random_process = OrnsteinUhlenbeckProcess(size=action_size,
                                                       theta=0.15,
                                                       mu=0.,
                                                       sigma=0.2)

        copy_parameter(self.actor, self.actor_target)
        copy_parameter(self.critic, self.critic_target)

    def train(self):
        # do nothing until the warm-up phase has filled the replay memory
        if self.memory.counter < self.warmup:
            return

        # get batch
        state_batch, action_batch, next_state_batch, reward_batch, done_batch = \
            self.memory.sample(self.batch_size)
        action_batch = action_batch.reshape((-1, self.action_size))
        reward_batch = reward_batch.reshape((-1, 1))
        done_batch = done_batch.reshape((-1, 1))

        # update critic
        nsb = Variable(torch.from_numpy(next_state_batch).float(), volatile=True)  # next_state_batch
        nab = self.actor_target(nsb)  # next_action_batch
        next_q = self.critic_target(nsb, nab)
        next_q = Variable(next_q.data)  # detach the target from the graph (volatile cannot be unset on a non-leaf Variable)
        rb = Variable(torch.from_numpy(reward_batch).float())  # reward_batch
        db = Variable(torch.from_numpy(done_batch).float())  # 0 when the next state is terminal, so next_q is zeroed and q = r
        q_target = rb + hyperparameters['GAMMA'] * db * next_q

        sb_grad = Variable(torch.from_numpy(state_batch).float())  # state_batch with grad, since the critic output needs gradients
        ab = Variable(torch.from_numpy(action_batch).float())  # action_batch
        q_eval = self.critic(sb_grad, ab)

        value_loss = self.criterion(q_eval, q_target)
        self.critic.zero_grad()
        value_loss.backward()
        # nn.utils.clip_grad_norm(self.critic.parameters(), 0.8)
        self.critic_optimizer.step()

        # update actor
        sb_grad = Variable(torch.from_numpy(state_batch).float())  # state_batch
        aab = self.actor(sb_grad)  # actor_action_batch
        q = self.critic(sb_grad, aab)
        policy_loss = torch.mean(-q)
        self.actor.zero_grad()
        policy_loss.backward()
        # nn.utils.clip_grad_norm(self.actor.parameters(), 0.8)
        self.actor_optimizer.step()

        # soft-update the target networks towards the online networks
        update_parameter(self.critic_target, self.critic, self.tan)
        update_parameter(self.actor_target, self.actor, self.tan)

    def select_action(self, s, is_train=True, decay_e=True):
        if self.memory.counter < self.warmup:
            action = env.action_space.sample()[0]
            # action = random.uniform(-2., 2.)
            return action

        state = Variable(torch.FloatTensor([s]).float())
        action = self.actor(state).squeeze(1).data.numpy()
        action += is_train * max(self.epsilon, 0) * self.random_process.sample()
        action = float(np.clip(action, -1., 1.)[0])

        if decay_e:
            if self.memory.counter > self.warmup:
                self.epsilon -= self.epsilon_decay
        return action
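# A minimal driver loop for the DDPG class above, shown only as a usage sketch.
# Assumptions: the classic (pre-0.26) gym step/reset API, a Pendulum-style 1-D
# action space, and that Memory exposes a store(state, action, next_state, reward,
# done) method alongside the counter attribute used above -- store() is a
# hypothetical name, adapt it to the actual replay class. The done value is stored
# as 0./1. so that q = r at terminal states, matching the comment in train().
import gym

env = gym.make('Pendulum-v1')
agent = DDPG(state_size=env.observation_space.shape[0],
             action_size=env.action_space.shape[0],
             memory_size=100000)

for episode in range(200):
    state = env.reset()
    episode_reward, done = 0., False
    while not done:
        action = agent.select_action(state)                       # in [-1, 1]
        next_state, reward, done, _ = env.step([action * env.action_space.high[0]])
        agent.memory.store(state, action, next_state, reward, 0. if done else 1.)
        agent.train()
        state = next_state
        episode_reward += reward
    print('episode {}: return {:.1f}'.format(episode, episode_reward))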