def test(L, mouse_initial_indices, rewardlist, actions_list):
    online_net = QNet(3, 4).to(device)
    online_net.load_state_dict(
        torch.load("./qlearning_model", map_location=device))
    env = deepcopy(L)
    done = False
    eaubue = 0.  # "eaubue" (French: "water drunk") flags whether the water tile was consumed
    steps = 0
    score = 0
    if mouse_initial_indices is None:
        all_possible_starting_positions = np.array([*np.where(L == 1)]).T
        mouse_initial_indices = all_possible_starting_positions[
            np.random.choice(range(len(all_possible_starting_positions)))]
    state = np.array(mouse_initial_indices)
    state = torch.Tensor(state).to(device)
    state = state.unsqueeze(0)

    def progress_loop(done, steps, state, score, eaubue):
        steps += 1
        action = get_action(state, online_net, 1, env, True, eaubue)
        displacement = np.array(actions_list[action])
        newstate = state + torch.Tensor(displacement).to(device)
        if env[int(newstate[0][0].tolist()), int(newstate[0][1].tolist())] != 0:
            next_state = newstate
            displayer.main_canva.move(displayer.mouse,
                                      *(displacement * displayer.square_size))
            reward = rewardlist[env[int(newstate[0][0].tolist()),
                                    int(newstate[0][1].tolist())]]
            if env[int(newstate[0][0].tolist()), int(newstate[0][1].tolist())] == 2:
                done = True
            if env[int(newstate[0][0].tolist()),
                   int(newstate[0][1].tolist())] == 4:  # the mouse is in the water
                env[int(newstate[0][0].tolist()),
                    int(newstate[0][1].tolist())] = 5  # there is no more water
                eaubue = 1.
        else:
            next_state = state
            reward = rewardlist[0]
        score += reward
        state = next_state
        print('position : ', state.tolist()[0], score)
        if done is False:
            # Schedule the next move on the Tk event loop (800 ms per step).
            displayer.window.after(
                800, lambda: progress_loop(done, steps, state, score, eaubue))

    displayer = Displayer()
    displayer.create_labyrinth(L, mouse_initial_indices)
    progress_loop(done, steps, state, score, 0.)
    displayer.window.mainloop()
class Agent():
    def __init__(self, state_size, action_size, seed):
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.qnetwork_local = QNet(state_size, action_size, seed).to(device)
        self.qnetwork_target = QNet(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)
        # Replay Memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        self.memory.add(state, action, reward, next_state, done)
        # Learn every UPDATE_EVERY steps, once enough samples are buffered.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.1):
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()
        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        states, actions, rewards, next_states, dones = experiences
        # For vanilla DQN:
        # Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)
        # For double DQN: select the next action with the online net,
        # then evaluate it with the target net.
        next_actions = self.qnetwork_local(next_states).detach().argmax(
            dim=1, keepdim=True)
        Q_targets_next = self.qnetwork_target(next_states).gather(1, next_actions)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        Q_expected = self.qnetwork_local(states).gather(1, actions)
        loss = F.mse_loss(Q_expected, Q_targets)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        # θ_target = τ*θ_local + (1 - τ)*θ_target
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
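# A minimal, self-contained sketch contrasting the vanilla-DQN and
# double-DQN targets computed in Agent.learn above. The two toy linear
# networks here are hypothetical stand-ins, not the QNet from this repo.
import torch
import torch.nn as nn

local, target = nn.Linear(4, 2), nn.Linear(4, 2)
next_states = torch.randn(3, 4)

# Vanilla DQN: the target net both selects and evaluates the next action.
vanilla = target(next_states).detach().max(1)[0].unsqueeze(1)

# Double DQN: the online net selects, the target net evaluates, which
# reduces the over-estimation bias of max-based targets.
a_star = local(next_states).detach().argmax(dim=1, keepdim=True)
double = target(next_states).detach().gather(1, a_star)
print(vanilla.shape, double.shape)  # both (3, 1)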
def main():
    env = gym.make(env_name)
    env.seed(500)
    torch.manual_seed(500)

    num_inputs = env.observation_space.shape[0]
    num_actions = env.action_space.n
    print('state size:', num_inputs)
    print('action size:', num_actions)

    net = QNet(num_inputs, num_actions)
    optimizer = optim.Adam(net.parameters(), lr=lr)
    writer = SummaryWriter('logs')

    net.to(device)
    net.train()
    running_score = 0
    steps = 0
    loss = 0

    for e in range(3000):
        done = False
        memory = Memory()

        score = 0
        state = env.reset()
        state = torch.Tensor(state).to(device)
        state = state.unsqueeze(0)

        while not done:
            steps += 1
            action = net.get_action(state)
            next_state, reward, done, _ = env.step(action)

            next_state = torch.Tensor(next_state).to(device)
            next_state = next_state.unsqueeze(0)

            mask = 0 if done else 1
            # Penalize early termination; CartPole episodes cap at 500 steps.
            reward = reward if not done or score == 499 else -1
            action_one_hot = torch.zeros(2)
            action_one_hot[action] = 1
            memory.push(state, next_state, action_one_hot, reward, mask)

            score += reward
            state = next_state

        loss = QNet.train_model(net, memory.sample(), optimizer)

        score = score if score == 500.0 else score + 1
        running_score = 0.99 * running_score + 0.01 * score
        if e % log_interval == 0:
            print('{} episode | score: {:.2f}'.format(e, running_score))
            writer.add_scalar('log/score', float(running_score), e)
            writer.add_scalar('log/loss', float(loss), e)

        if running_score > goal_score:
            break
def __init__(self):
    # Create environment
    self.env = gym.make(settings.env_name)
    self.env.seed(settings.seed)
    self.env.action_space.seed(settings.seed)
    self.state_space = self.env.observation_space.shape[0]
    self.action_space = self.env.action_space.shape[0]
    self.obs_normalizer = Normalizer(self.state_space)
    self.device = torch.device(settings.device)
    self.writer = SummaryWriter(
        'runs/' + settings.env_name + "_" + settings.algo +
        '_{}_{}_{}'.format(p.alpha, p.ex_alpha, settings.seed))

    # Initialize common networks and their optimizers
    self.exploitory_policy = GaussianPolicy(
        self.state_space, self.action_space).to(self.device)
    self.exploitory_Q = QNet(self.state_space,
                             self.action_space).to(self.device)
    self.exploitory_Q_target = QNet(self.state_space,
                                    self.action_space).to(self.device)
    self.exploitory_policy_optim = Adam(
        self.exploitory_policy.parameters(), lr=p.lr)
    self.exploitory_Q_optim = Adam(self.exploitory_Q.parameters(), lr=p.lr)
    self.target_update(self.exploitory_Q_target, self.exploitory_Q, 1.0)

    p.alpha = torch.Tensor([p.alpha]).to(self.device)
    if settings.automatic_entropy_tuning:
        # Target entropy heuristic: -dim(action space)
        self.target_entropy = -torch.prod(
            torch.Tensor(self.env.action_space.shape).to(self.device)).item()
        self.log_alpha = torch.zeros(1, requires_grad=True,
                                     device=self.device)
        self.alpha_optim = Adam([self.log_alpha], lr=p.lr)

    if settings.automatic_ex_entropy_tuning:
        self.ex_target_entropy = -torch.prod(
            torch.Tensor(self.env.action_space.shape).to(self.device)).item()
        self.ex_log_alpha = torch.zeros(1, requires_grad=True,
                                        device=self.device)
        # Optimize the exploratory temperature, not the exploitory one.
        self.ex_alpha_optim = Adam([self.ex_log_alpha], lr=p.lr)

    if settings.reward_model == 'novelty':
        self.ex_reward_model = Novelty(self.state_space, self.device)
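# A minimal sketch of the automatic entropy-temperature update that the
# alpha optimizers above typically drive, assuming the standard SAC
# formulation. `log_prob` stands in for the policy's log-probability of a
# sampled action; the values here are hypothetical placeholders.
import torch
from torch.optim import Adam

target_entropy = -2.0  # e.g. -dim(action space)
log_alpha = torch.zeros(1, requires_grad=True)
alpha_optim = Adam([log_alpha], lr=3e-4)

log_prob = torch.tensor([-1.3])  # placeholder for log pi(a|s)
alpha_loss = -(log_alpha * (log_prob + target_entropy).detach()).mean()
alpha_optim.zero_grad()
alpha_loss.backward()
alpha_optim.step()
alpha = log_alpha.exp()  # temperature used in the actor/critic losses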
def main(): if not (os.path.isdir("logs")): os.makedirs("logs") working_dir = "logs/" + args.dir if not (os.path.isdir(working_dir)): raise NameError(args.dir + " does not exist in dir logs") print(args) env = QubeSwingupEnv(use_simulator=args.sim, batch_size= 2048*4) num_inputs = env.observation_space.shape[0] num_actions = NUMBER_OF_ACTIONS print('state size:', num_inputs) print('action size:', num_actions) net = QNet(num_inputs, num_actions) if not args.new_net else QNet_more_layers(num_inputs, num_actions) net.load_state_dict(torch.load(working_dir + "/best_model.pth", map_location=torch.device(device))) net.to(device) net.eval() running_score = 0 epsilon = 1.0 steps = 0 beta = beta_start loss = 0 best_running_score = -1000 for e in range(1): done = False score = 0 state = env.reset() state = torch.Tensor(state).to(device) state = state.unsqueeze(0) while not done: steps += 1 action = get_continuous_action(get_action(state, net)) if np.abs(state[0][1].item()) < deg2rad(25): action = pd_control_policy(state.cpu().numpy()[0])[0] next_state, reward, done, info = env.step(action) reward = give_me_reward(info["alpha"], info["theta"]) if args.sim: env.render() reward = give_me_reward(info["alpha"], info["theta"]) if done: print(info) print("theta:" , info["theta"] * 180/np.pi) next_state = torch.Tensor(next_state).to(device) next_state = next_state.unsqueeze(0) score += reward state = next_state running_score = 0.99 * running_score + 0.01 * score print('{} episode | running_score: {:.2f} | score: {:.2f} | steps: {} '.format(e, running_score, score, steps)) env.close()
def sample(self, batch_size, net, target_net, beta):
    # Sample transitions with probability proportional to their priority.
    probability_sum = sum(self.memory_probabiliy)
    p = [probability / probability_sum
         for probability in self.memory_probabiliy]

    indexes = np.random.choice(np.arange(len(self.memory)), batch_size, p=p)
    transitions = [self.memory[idx] for idx in indexes]
    transitions_p = [p[idx] for idx in indexes]
    batch = Transition(*zip(*transitions))

    # Importance-sampling weights: (N * P(j))^-beta, normalized by the max.
    weights = [pow(self.capacity * p_j, -beta) for p_j in transitions_p]
    weights = torch.Tensor(weights).to(device)
    weights = weights / weights.max()

    # Refresh the priorities of the sampled transitions with their new TD errors.
    td_error = QNet.get_td_error(net, target_net, batch.state,
                                 batch.next_state, batch.action,
                                 batch.reward, batch.mask)
    td_error = td_error.detach()

    for td_error_idx, idx in enumerate(indexes):
        self.memory_probabiliy[idx] = pow(
            abs(td_error[td_error_idx]) + small_epsilon, alpha).item()

    return batch, weights
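# A small numeric sketch of the importance-sampling correction used in
# sample() above: rarely-sampled (low-priority) transitions get larger
# weights, and max-normalization keeps every weight <= 1. The numbers
# here are made up for illustration.
import numpy as np

capacity, beta = 8, 0.4
probs = np.array([0.4, 0.1, 0.05])      # P(j) of three sampled transitions
weights = (capacity * probs) ** -beta   # (N * P(j))^-beta
weights /= weights.max()                # normalize for stability
print(weights)  # the 0.05-probability sample gets the largest weight (1.0)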
def main():
    env = gym.make(args.env_name)
    env.seed(500)
    torch.manual_seed(500)

    img_shape = env.observation_space.shape
    num_actions = 3
    print('image size:', img_shape)
    print('action size:', num_actions)

    net = QNet(num_actions)
    net.load_state_dict(torch.load(args.save_path + 'model.pth'))

    net.to(device)
    net.eval()

    epsilon = 0
    steps = 0

    for e in range(5):
        done = False

        score = 0
        state = env.reset()

        state = pre_process(state)
        state = torch.Tensor(state).to(device)
        history = torch.stack((state, state, state, state))

        # Warm up the frame history with a few random actions.
        for i in range(3):
            action = env.action_space.sample()
            state, reward, done, info = env.step(action)
            state = pre_process(state)
            state = torch.Tensor(state).to(device)
            state = state.unsqueeze(0)
            history = torch.cat((state, history[:-1]), dim=0)

        while not done:
            if args.render:
                env.render()

            steps += 1
            qvalue = net(history.unsqueeze(0))
            action = get_action(epsilon, qvalue, num_actions)
            next_state, reward, done, info = env.step(action + 1)

            next_state = pre_process(next_state)
            next_state = torch.Tensor(next_state).to(device)
            next_state = next_state.unsqueeze(0)
            next_history = torch.cat((next_state, history[:-1]), dim=0)

            score += reward
            history = next_history

        print('{} episode | score: {:.2f}'.format(e, score))
def __init__(self, input_size, action_size, gamma, tau, alpha, hidden_size,
             lr, device):
    self.gamma, self.tau, self.alpha = gamma, tau, alpha
    self.lr, self.device = lr, device

    self.policy = Actor(input_size, hidden_size, action_size).to(self.device)
    self.critic = QNet(input_size, hidden_size, action_size).to(self.device)
    self.policy_optim = torch.optim.Adam(self.policy.parameters(), lr=self.lr)
    self.critic_optim = torch.optim.Adam(self.critic.parameters(), lr=self.lr)

    # Frozen target critic: updated only through explicit (soft) copies.
    self.critic_target = copy.deepcopy(self.critic)
    self.critic_target.requires_grad_(False)
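# A minimal sketch of the soft (Polyak) target update that usually
# accompanies a frozen target critic like the one above; a tau close to 0
# makes the target trail the online critic slowly. Illustrative only.
import copy
import torch
import torch.nn as nn

critic = nn.Linear(4, 1)
critic_target = copy.deepcopy(critic)
critic_target.requires_grad_(False)

tau = 0.005
with torch.no_grad():
    for tp, p in zip(critic_target.parameters(), critic.parameters()):
        tp.mul_(1.0 - tau).add_(tau * p)  # tp <- (1 - tau) * tp + tau * p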
def __init__(self, state_size, action_size, seed):
    """
    Params
    ======
        state_size (int): state dimension
        action_size (int): action dimension
        seed (int): random seed for replicating the experiment
    """
    self.state_size = state_size
    self.action_size = action_size
    self.seed = random.seed(seed)

    self.QNet_local = QNet(state_size, action_size, seed).to(device)
    self.QNet_target = QNet(state_size, action_size, seed).to(device)
    self.optimizer = optim.Adam(self.QNet_local.parameters(), lr=LR)

    self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
    self.t_step = 0
def __init__(self):
    super(Off_policy, self).__init__()
    self.memory = Replay_buffer(capacity=p.exploitory_policy_memory_size)

    self.exploratory_policy = GaussianPolicy(
        self.state_space, self.action_space).to(self.device)
    self.exploratory_Q = QNet(self.state_space,
                              self.action_space).to(self.device)
    self.exploratory_Q_target = QNet(self.state_space,
                                     self.action_space).to(self.device)
    self.exploratory_policy_optim = Adam(
        self.exploratory_policy.parameters(), lr=p.lr)
    self.exploratory_Q_optim = Adam(self.exploratory_Q.parameters(), lr=p.lr)

    # Initialize the exploratory policy from the exploitory policy's weights.
    self.target_update(self.exploratory_policy, self.exploitory_policy, 1.0)
    self.kl_normalizer = Normalizer(1)
    self.ex_rewards_normalizer = Normalizer(1)
def __init__(self, *largs, **kwargs):
    super(PPO, self).__init__(*largs, **kwargs)

    self.pi_net = PiNet(self.ns, self.na, distribution='Normal',
                        bounded=False, agent='ppo').to(self.device)
    self.v_net = QNet(self.ns, 0, agent='ppo').to(self.device)

    self.optimizer_v = torch.optim.Adam(self.v_net.parameters(),
                                        lr=self.lr_q, betas=(0.9, 0.999),
                                        weight_decay=self.weight_decay_q)
    self.optimizer_p = torch.optim.Adam(self.pi_net.parameters(),
                                        lr=self.lr_p, betas=(0.9, 0.999),
                                        weight_decay=self.weight_decay_p)
def __init__(self, env):
    super(DDPG, self).__init__()

    pi_net = PiNet(self.ns, self.na)
    self.pi_net = pi_net.to(self.device)
    pi_target = PiNet(self.ns, self.na)
    self.pi_target = pi_target.to(self.device)
    self.load_state_dict(self.pi_target, self.pi_net.state_dict())

    q_net = QNet(self.ns, self.na)
    self.q_net = q_net.to(self.device)
    q_target = QNet(self.ns, self.na)
    self.q_target = q_target.to(self.device)
    self.load_state_dict(self.q_target, self.q_net.state_dict())

    self.optimizer_q = torch.optim.Adam(self.q_net.parameters(), lr=self.lr_q,
                                        betas=(0.9, 0.999), weight_decay=1e-2)
    self.optimizer_p = torch.optim.Adam(self.pi_net.parameters(), lr=self.lr_p,
                                        betas=(0.9, 0.999), weight_decay=0)

    self.noise = OrnsteinUhlenbeckActionNoise(
        torch.zeros(1, self.na).to(self.device),
        self.epsilon * torch.ones(1, self.na).to(self.device))
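# A small sketch of the Ornstein-Uhlenbeck process that the DDPG agent
# above uses for temporally correlated exploration noise; this standalone
# version is illustrative, not the OrnsteinUhlenbeckActionNoise class
# from this repo.
import numpy as np

theta, sigma, dt = 0.15, 0.2, 1e-2
mu = np.zeros(2)  # noise mean per action dimension
x = np.zeros(2)
for _ in range(5):
    # Mean-reverting drift plus Gaussian diffusion.
    x = x + theta * (mu - x) * dt + sigma * np.sqrt(dt) * np.random.randn(2)
    print(x)  # successive samples are correlated, unlike i.i.d. Gaussian noise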
def __init__(self, n_actors, device='cuda:0'):
    # params
    self.gamma = 0.99
    self.alpha = 0.6
    self.bootstrap_steps = 3
    self.initial_exploration = 50000
    self.priority_epsilon = 1e-6
    self.device = device
    self.n_epochs = 0
    self.n_actors = n_actors

    # paths
    self.memory_path = os.path.join('./', 'logs', 'memory')
    self.net_path = os.path.join('./', 'logs', 'model', 'net.pt')
    self.target_net_path = os.path.join('./', 'logs', 'model',
                                        'target_net.pt')

    # memory
    self.memory_size = 500000
    self.batch_size = 128
    self.memory_load_interval = 10
    self.replay_memory = ReplayMemory(self.memory_size, self.batch_size,
                                      self.bootstrap_steps)

    # net
    self.net_save_interval = 50
    self.target_update_interval = 1000
    self.net = QNet(self.net_path, self.device).to(self.device)
    self.target_net = QNet(self.target_net_path,
                           self.device).to(self.device)
    self.target_net.load_state_dict(self.net.state_dict())
    self.net.save()
    self.target_net.save()
    self.optim = optim.RMSprop(self.net.parameters(), lr=0.00025 / 4.0,
                               alpha=0.95, eps=1.5e-7, centered=True)
def run(self):
    epsilon = 1.0
    steps = 0
    while self.global_ep.value < max_episode:
        if self.global_ep_r.value > goal_score:
            break
        done = False

        score = 0
        state = self.env.reset()
        state = torch.Tensor(state).to(device)
        state = state.unsqueeze(0)
        memory = Memory(async_update_step)

        while not done:
            steps += 1

            action = self.get_action(state, epsilon)
            next_state, reward, done, _ = self.env.step(action)

            next_state = torch.Tensor(next_state).to(device)
            next_state = next_state.unsqueeze(0)

            mask = 0 if done else 1
            reward = reward if not done or score == 499 else -1
            action_one_hot = np.zeros(2)
            action_one_hot[action] = 1
            memory.push(state, next_state, action_one_hot, reward, mask)

            score += reward
            state = next_state

            # Anneal epsilon linearly down to 0.1.
            epsilon -= 0.00001
            epsilon = max(epsilon, 0.1)

            if len(memory) == async_update_step or done:
                batch = memory.sample()
                loss = QNet.train_model(self.online_net, self.target_net,
                                        self.optimizer, batch)
                memory = Memory(async_update_step)
                if done:
                    self.record(score, epsilon, loss)
                    break

            if steps % update_target == 0:
                self.update_target_model()

        score = score if score == 500.0 else score + 1
    self.res_queue.put(None)
def __init__(self, state_dim, action_dim, action_lim, critic='TD',
             learning_rate=0.001, reward_decay=0.99, e_greedy=0.9):
    self.use_cuda = torch.cuda.is_available()
    # self.FloatTensor = torch.FloatTensor.cuda() if self.use_cuda else torch.FloatTensor

    self.state_dim = state_dim
    self.action_dim = action_dim
    self.lr = 0.01     # note: overrides the learning_rate argument
    self.gamma = 0.95  # note: overrides the reward_decay argument
    self.action_lim = action_lim
    self.FloatTensor = torch.cuda.FloatTensor if self.use_cuda else torch.FloatTensor

    # critic model
    self.critic = VNet.VNet(state_dim=state_dim)
    self.optim_critic = torch.optim.Adam(self.critic.parameters(), self.lr)
    # self.optim_critic = torch.optim.LBFGS(self.critic.parameters(), lr=learning_rate)
    # self.critic.apply(util.weights_init)

    # actor model
    self.actor = QNet.Policy_trpo(state_dim=state_dim, action_dim=action_dim,
                                  action_lim=action_lim)
    # self.optim_actor = torch.optim.Adam(self.actor.parameters(), self.lr)
    # self.actor.apply(util.weights_init)

    # cuda
    self.critic = self.critic.cuda() if self.use_cuda else self.critic
    # self.target_critic = self.target_critic.cuda() if self.use_cuda else self.target_critic
    self.actor = self.actor.cuda() if self.use_cuda else self.actor
    # self.target_actor = self.target_actor.cuda() if self.use_cuda else self.target_actor

    # the agent's replay buffer
    MAX_BUFFER_SIZE = 1000000
    self.buffer = buffer.MemoryBuffer(size=MAX_BUFFER_SIZE)
    self.UD_BATCH_SIZE = 100  # update batch size

    # Hessian-vector product parameters
    self.use_finite_differences = True  # must be True to use Hessian-vector products here
    self.cg_damping = 0.1
    self.cg_iters = 10
    self.residual_tol = 1e-8
    self.delta = 1e-2  # upper bound on the KL divergence
    self.accept_radio = 1e-1  # backtracking line search acceptance ratio
    self.max_backtracks = 10

    # parameter reconstruction from a flat vector
    self.actor_properties = OrderedDict()
    for k, v in self.actor.state_dict().items():
        self.actor_properties[k] = v.size()

    self.line_search = True  # if False, the algorithm reduces to the Natural Gradient Method
def setup(self, obs_shape, nb_action):
    self.lr_coef = 1
    self.epsilon = 1
    self.nb_action = nb_action
    model_args = Singleton_arger()['model']

    qnet = QNet(obs_shape, nb_action)
    self.qnet = copy.deepcopy(qnet)
    self.target_qnet = copy.deepcopy(qnet)
    self.memory = Memory(self.buffer_size, nb_action, self.with_cuda)

    if self.with_cuda:
        self.qnet.cuda()
        self.target_qnet.cuda()

    self.qnet_optim = Adam(self.qnet.parameters(), lr=self.critic_lr)
def main():
    env = gym.make(args.env_name)
    env.seed(500)
    torch.manual_seed(500)

    num_inputs = env.observation_space.shape[0]
    num_actions = env.action_space.n
    print('state size:', num_inputs)
    print('action size:', num_actions)

    net = QNet(num_inputs, num_actions)
    net.load_state_dict(torch.load(args.save_path + 'model.pth'))

    net.to(device)
    net.eval()
    running_score = 0
    steps = 0

    for e in range(5):
        done = False

        score = 0
        state = env.reset()
        state = torch.Tensor(state).to(device)
        state = state.unsqueeze(0)

        while not done:
            env.render()

            steps += 1
            qvalue = net(state)
            action = get_action(qvalue)
            next_state, reward, done, _ = env.step(action)

            next_state = torch.Tensor(next_state).to(device)
            next_state = next_state.unsqueeze(0)

            score += reward
            state = next_state

        print('{} episode | score: {:.2f}'.format(e, score))
def __init__(self, run):
    self.run = run
    ckpt_dir = os.path.join(run, 'ckpt')
    ckpts = glob2.glob(os.path.join(ckpt_dir, '*.pth'))
    assert ckpts, "No checkpoints to resume from!"

    def get_epoch(ckpt_url):
        s = re.findall(r"ckpt_e(\d+).pth", ckpt_url)
        epoch = int(s[0]) if s else -1
        return epoch, ckpt_url

    # Pick the checkpoint with the highest epoch number.
    start_epoch, ckpt = max(get_epoch(c) for c in ckpts)
    print('Checkpoint:', ckpt)

    if torch.cuda.is_available():
        model = QNet().cuda()
    else:
        model = QNet()
    ckpt = torch.load(ckpt)
    model.load_state_dict(ckpt['model'])
    model.eval()
    self.model = model
def main():
    net = QNet().cuda().train()

    # Biases get twice the base learning rate and no weight decay.
    optimizer = optim.SGD([{
        'params': [
            param for name, param in net.named_parameters()
            if name[-4:] == 'bias'
        ],
        'lr': 2 * args['lr']
    }, {
        'params': [
            param for name, param in net.named_parameters()
            if name[-4:] != 'bias'
        ],
        'lr': args['lr'],
        'weight_decay': args['weight_decay']
    }], momentum=args['momentum'])

    if len(args['snapshot']) > 0:
        print('training resumes from ' + args['snapshot'])
        net.load_state_dict(
            torch.load(
                os.path.join(ckpt_path, exp_name, args['snapshot'] + '.pth')))
        optimizer.load_state_dict(
            torch.load(
                os.path.join(ckpt_path, exp_name,
                             args['snapshot'] + '_optim.pth')))
        optimizer.param_groups[0]['lr'] = 2 * args['lr']
        optimizer.param_groups[1]['lr'] = args['lr']

    check_mkdir(ckpt_path)
    check_mkdir(os.path.join(ckpt_path, exp_name))
    open(log_path, 'w').write(str(args) + '\n\n')
    train(net, optimizer)
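# A tiny sketch of the per-parameter-group trick used above: biases get
# twice the learning rate and are excluded from weight decay. The toy
# module here is a hypothetical stand-in for QNet.
import torch.nn as nn
import torch.optim as optim

net = nn.Linear(3, 2)
groups = [
    {'params': [p for n, p in net.named_parameters() if n.endswith('bias')],
     'lr': 0.02},
    {'params': [p for n, p in net.named_parameters() if not n.endswith('bias')],
     'lr': 0.01, 'weight_decay': 5e-4},
]
opt = optim.SGD(groups, momentum=0.9)
for g in opt.param_groups:
    print(g['lr'], g.get('weight_decay', 0.0))  # 0.02/0.0 then 0.01/5e-4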
class Actor:
    def __init__(self, actor_id, n_actors, shared_dict, device='cpu'):
        # params
        self.gamma = 0.99
        # Per-actor epsilon, spread across actors.
        self.epsilon = 0.4 ** (1 + actor_id * 7 / (n_actors - 1))
        self.bootstrap_steps = 3
        self.alpha = 0.6
        self.priority_epsilon = 1e-6
        self.device = device
        self.actor_id = actor_id

        # path
        self.memory_path = os.path.join('./', 'logs', 'memory')

        # memory
        self.memory_size = 50000
        self.batch_size = 32
        self.action_repeat = 4
        self.n_stacks = 4
        self.burn_in_length = 10
        self.learning_length = 10
        self.overlap_length = 10
        self.eta = 0.9
        self.sequence_length = self.burn_in_length + self.learning_length
        self.stack_count = self.n_stacks // self.action_repeat
        self.memory_save_interval = 5
        self.episode_start_index = 0
        self.n_steps_memory = NStepMemory(self.bootstrap_steps, self.gamma)
        self.replay_memory = ReplayMemory(self.memory_size, self.batch_size,
                                          self.bootstrap_steps)

        # net
        self.shared_dict = shared_dict
        self.net_load_interval = 5
        self.net = QNet(self.device).to(self.device)
        self.target_net = QNet(self.device).to(self.device)
        self.target_net.load_state_dict(self.net.state_dict())

        # env
        self.env = PongEnv(self.action_repeat, self.n_stacks)
        self.episode_reward = 0
        self.n_episodes = 0
        self.n_steps = 0
        self.memory_count = 0
        self.state = self.env.reset()

    def run(self):
        while True:
            self.step()

    def step(self):
        state = self.state
        action, q_value, h, c, target_q_value, target_h, target_c = \
            self.select_action(state)
        q_value = q_value.detach().cpu().numpy()
        target_q_value = target_q_value.detach().cpu().numpy()
        next_state, reward, done, _ = self.env.step(action)
        self.episode_reward += reward
        self.n_steps += 1

        self.n_steps_memory.add(q_value, state[-self.action_repeat:], h, c,
                                target_h, target_c, action, reward,
                                self.stack_count)
        if self.stack_count > 1:
            self.stack_count -= 1

        if self.n_steps > self.bootstrap_steps:
            pre_q_value, state, h, c, target_h, target_c, action, reward, \
                stack_count = self.n_steps_memory.get()
            priority = self.calc_priority(pre_q_value, action, reward,
                                          q_value, target_q_value, done)
            self.replay_memory.add(state, h, c, target_h, target_c, action,
                                   reward, done, stack_count, priority)
            self.memory_count += 1

        self.state = next_state.copy()
        if done:
            # Flush the remaining n-step transitions at episode end.
            while self.n_steps_memory.size > 0:
                pre_q_value, state, h, c, target_h, target_c, action, \
                    reward, stack_count = self.n_steps_memory.get()
                priority = self.calc_priority(pre_q_value, action, reward,
                                              q_value, target_q_value, done)
                self.replay_memory.add(state, h, c, target_h, target_c,
                                       action, reward, done, stack_count,
                                       priority)
                self.memory_count += 1
            self.reset()

    def select_action(self, state):
        state = torch.FloatTensor(state).unsqueeze(0).to(self.device)
        with torch.no_grad():
            q_value, h, c = self.net(state, True)
            target_q_value, target_h, target_c = self.target_net(state, True)
        if np.random.random() < self.epsilon:
            action = np.random.randint(6)
        else:
            action = q_value.argmax().item()
        return action, q_value, h, c, target_q_value, target_h, target_c

    def reset(self):
        if self.n_episodes % 1 == 0:
            print('episodes:', self.n_episodes, 'actor_id:', self.actor_id,
                  'return:', self.episode_reward)
        self.net.reset()
        self.target_net.reset()
        self.set_seq_start_index()
        self.state = self.env.reset()
        self.episode_start_index = self.replay_memory.index
        self.episode_reward = 0
        self.n_episodes += 1
        self.n_steps = 0
        self.memory_count = 0
        self.stack_count = self.n_stacks // self.action_repeat
        # reset n-step memory
        self.n_steps_memory = NStepMemory(self.bootstrap_steps, self.gamma)
        # save replay memory
        if self.n_episodes % self.memory_save_interval == 0:
            self.replay_memory.save(self.memory_path, self.actor_id)
            self.replay_memory = ReplayMemory(self.memory_size,
                                              self.batch_size,
                                              self.bootstrap_steps)
            self.episode_start_index = 0
            gc.collect()
        # load net
        if self.n_episodes % self.net_load_interval == 0:
            self.load_model()

    def load_model(self):
        try:
            self.net.load_state_dict(self.shared_dict['net_state'])
            self.target_net.load_state_dict(
                self.shared_dict['target_net_state'])
        except Exception:
            print('load error')

    def calc_priority(self, q_value, action, reward, next_q_value,
                      target_next_q_value, done):
        # Priority = |Q(s, a) - n-step double-DQN target|^alpha.
        q_value = q_value.reshape(-1)[action]
        target_next_q_value = target_next_q_value.reshape(-1)
        if done:
            target_q_value = reward
        else:
            next_action = next_q_value.argmax(-1)
            target_next_q_value = target_next_q_value[next_action]
            target_q_value = reward + \
                (self.gamma ** self.bootstrap_steps) * target_next_q_value
        priority = np.abs(q_value - target_q_value) + self.priority_epsilon
        priority = priority ** self.alpha
        return priority

    def set_seq_start_index(self):
        last_index = self.replay_memory.index
        start_index = self.episode_start_index
        seq_start_index = [
            i for i in range(start_index,
                             last_index - self.sequence_length,
                             self.overlap_length)
        ]
        seq_start_index.append(last_index - self.sequence_length)
        seq_start_index = np.array(seq_start_index)
        self.replay_memory.update_sequence_priority(seq_start_index)
        self.replay_memory.memory['is_seq_start'][seq_start_index] = 1
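# A numeric sketch of the priority computed in calc_priority above: the
# absolute n-step TD error, offset by a small epsilon and raised to
# alpha. The values are made up for illustration.
import numpy as np

gamma, n, alpha, eps = 0.99, 3, 0.6, 1e-6
q_sa, reward, target_next_q = 1.2, 0.5, 1.0
target = reward + gamma**n * target_next_q          # n-step bootstrap target
priority = (np.abs(q_sa - target) + eps) ** alpha   # sublinear in the TD error
print(priority)  # ~0.46 for these numbers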
def main():
    env = gym.make(env_name)
    env.seed(500)
    torch.manual_seed(500)

    num_inputs = env.observation_space.shape[0]
    num_actions = env.action_space.n
    print('state size:', num_inputs)
    print('action size:', num_actions)

    online_net = QNet(num_inputs, num_actions)
    target_net = QNet(num_inputs, num_actions)
    target_net.load_state_dict(online_net.state_dict())
    online_net.share_memory()
    target_net.share_memory()

    optimizer = SharedAdam(online_net.parameters(), lr=lr)
    global_ep, global_ep_r, res_queue = \
        mp.Value('i', 0), mp.Value('d', 0.), mp.Queue()

    writer = SummaryWriter('logs')

    online_net.to(device)
    target_net.to(device)
    online_net.train()
    target_net.train()

    workers = [
        Worker(online_net, target_net, optimizer, global_ep, global_ep_r,
               res_queue, i) for i in range(mp.cpu_count())
    ]
    [w.start() for w in workers]

    res = []
    while True:
        r = res_queue.get()
        if r is not None:
            res.append(r)
            [ep, ep_r, loss] = r
            writer.add_scalar('log/score', float(ep_r), ep)
            writer.add_scalar('log/loss', float(loss), ep)
        else:
            break
    [w.join() for w in workers]
class Agent():
    def __init__(self, args, state_size, action_size, seed):
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.per = args.per
        self.dueling = args.dueling
        self.buffer_size = args.buffer_size
        self.batch_size = args.batch_size
        self.gamma = args.gamma
        self.tau = args.tau
        self.lr = args.learning_rate
        self.update_freq = args.update_every

        # Q-Network
        if self.dueling:
            self.local_qnet = DuelingQNet(state_size, action_size,
                                          seed).to(device)
            self.target_qnet = DuelingQNet(state_size, action_size,
                                           seed).to(device)
        else:
            self.local_qnet = QNet(state_size, action_size, seed).to(device)
            self.target_qnet = QNet(state_size, action_size, seed).to(device)

        self.optimizer = optim.Adam(self.local_qnet.parameters(), lr=self.lr)

        # Replay Memory
        if self.per:
            self.memory = PrioritizedReplayMemory(args, self.buffer_size)
        else:
            self.memory = ReplayMemory(action_size, self.buffer_size,
                                       self.batch_size, seed)
        self.t_step = 0  # time step counter for learning every update_freq steps

    def step(self, state, action, reward, next_state, done):
        # Save the experience to replay memory.
        if self.per:
            self.memory.append(state, action, reward, next_state, done)
        else:
            self.memory.add(state, action, reward, next_state, done)

        # Learn every update_freq time steps.
        self.t_step = (self.t_step + 1) % self.update_freq
        if self.t_step == 0:
            # If enough samples are available in memory, get a random
            # subset and learn. Note that the double-DQN update is tied
            # to the dueling flag here.
            if len(self.memory) > self.batch_size:
                if self.dueling:
                    self.learn_DDQN(self.gamma)
                else:
                    self.learn(self.gamma)

    def act(self, state, eps=0.):
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.local_qnet.eval()
        with torch.no_grad():
            action_values = self.local_qnet(state)
        self.local_qnet.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, gamma):
        if self.per:
            idxs, states, actions, rewards, next_states, dones, weights = \
                self.memory.sample(self.batch_size)
        else:
            states, actions, rewards, next_states, dones = self.memory.sample()

        # Get max predicted Q values for next states from the target model
        Q_targets_next = self.target_qnet(next_states).detach().max(
            1)[0].unsqueeze(1)
        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        Q_expected = self.local_qnet(states).gather(1, actions)

        # Compute loss. With PER, keep the element-wise squared errors so
        # the importance-sampling weights can be applied per sample.
        if self.per:
            loss = F.mse_loss(Q_expected, Q_targets, reduction='none')
        else:
            loss = F.mse_loss(Q_expected, Q_targets)

        # Minimize loss
        self.optimizer.zero_grad()
        if self.per:
            # Backpropagate the importance-weighted minibatch loss
            (weights * loss).mean().backward()
        else:
            loss.backward()
        self.optimizer.step()

        if self.per:
            errors = np.abs((Q_expected - Q_targets).detach().cpu().numpy())
            self.memory.update_priorities(idxs, errors)

        # Update target network
        self.soft_update(self.local_qnet, self.target_qnet, self.tau)

    def learn_DDQN(self, gamma):
        if self.per:
            idxs, states, actions, rewards, next_states, dones, weights = \
                self.memory.sample(self.batch_size)
        else:
            states, actions, rewards, next_states, dones = self.memory.sample()

        # Double DQN: select the best next action with the local model...
        Q_argmax = self.local_qnet(next_states).detach()
        _, a_prime = Q_argmax.max(1)
        # ...and evaluate it with the target model.
        Q_targets_next = self.target_qnet(next_states).detach().gather(
            1, a_prime.unsqueeze(1))
        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Get expected Q values from the local model
        Q_expected = self.local_qnet(states).gather(1, actions)

        # Compute loss (element-wise for PER, as in learn above).
        if self.per:
            loss = F.mse_loss(Q_expected, Q_targets, reduction='none')
        else:
            loss = F.mse_loss(Q_expected, Q_targets)

        # Minimize loss
        self.optimizer.zero_grad()
        if self.per:
            (weights * loss).mean().backward()
        else:
            loss.backward()
        self.optimizer.step()

        if self.per:
            errors = np.abs((Q_expected - Q_targets).detach().cpu().numpy())
            self.memory.update_priorities(idxs, errors)

        # Update target network
        self.soft_update(self.local_qnet, self.target_qnet, self.tau)

    def soft_update(self, local_model, target_model, tau):
        # θ_target = τ*θ_local + (1 - τ)*θ_target
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
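# A minimal, hypothetical training loop for the Agent above, assuming a
# Gym-style environment with a flat observation vector. Names such as
# gym.make('CartPole-v1') and n_episodes are illustrative, and `args` is
# assumed to come from this snippet's argument parser.
import gym

env = gym.make('CartPole-v1')
agent = Agent(args, env.observation_space.shape[0], env.action_space.n, seed=0)

n_episodes, eps = 500, 1.0
for episode in range(n_episodes):
    state, score, done = env.reset(), 0.0, False
    while not done:
        action = agent.act(state, eps)
        next_state, reward, done, _ = env.step(action)
        agent.step(state, action, reward, next_state, done)
        state, score = next_state, score + reward
    eps = max(0.01, eps * 0.995)  # decay exploration each episode
    print(episode, score)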
def main():
    env = gym.make(args.env_name)
    env.seed(500)
    torch.manual_seed(500)

    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n
    print('state size:', state_size)
    print('action size:', action_size)

    q_net = QNet(state_size, action_size, args)
    target_q_net = QNet(state_size, action_size, args)
    optimizer = optim.Adam(q_net.parameters(), lr=0.001)

    update_target_model(q_net, target_q_net)

    writer = SummaryWriter(args.logdir)

    replay_buffer = deque(maxlen=10000)
    running_score = 0
    steps = 0

    for episode in range(args.max_iter_num):
        done = False
        score = 0

        state = env.reset()
        state = np.reshape(state, [1, state_size])

        while not done:
            if args.render:
                env.render()

            steps += 1
            q_values = q_net(torch.Tensor(state))
            action = get_action(q_values, action_size, args.epsilon)

            next_state, reward, done, _ = env.step(action)
            next_state = np.reshape(next_state, [1, state_size])
            reward = reward if not done or score == 499 else -1
            mask = 0 if done else 1

            replay_buffer.append((state, action, reward, next_state, mask))

            state = next_state
            score += reward

            if steps > args.initial_exploration:
                args.epsilon -= args.epsilon_decay
                args.epsilon = max(args.epsilon, 0.1)

                mini_batch = random.sample(replay_buffer, args.batch_size)

                q_net.train(), target_q_net.train()
                train_model(q_net, target_q_net, optimizer, mini_batch)

                if steps % args.update_target == 0:
                    update_target_model(q_net, target_q_net)

        score = score if score == 500.0 else score + 1
        running_score = 0.99 * running_score + 0.01 * score
        if episode % args.log_interval == 0:
            print('{} episode | running_score: {:.2f} | epsilon: {:.2f}'.format(
                episode, running_score, args.epsilon))
            writer.add_scalar('log/score', float(score), episode)

        if running_score > args.goal_score:
            if not os.path.isdir(args.save_path):
                os.makedirs(args.save_path)
            ckpt_path = args.save_path + 'model.pth.tar'
            torch.save(q_net.state_dict(), ckpt_path)
            print('Running score exceeds the goal score, stopping training')
            break
if __name__ == "__main__": env = env.MinecraftEnv() env.init(allowContinuousMovement=["move", "turn"], videoResolution=[800, 600]) env.seed(500) torch.manual_seed(500) render_map = False num_inputs = env.observation_space.shape num_actions = len(env.action_names[0]) print('state size:', num_inputs) print('action size:', num_actions) model = QNet(num_actions) model.apply(weights_init) target_model = QNet(num_actions) update_target_model(model, target_model) model.train() target_model.train() optimizer = optim.Adam(model.parameters(), lr=hp.lr, weight_decay=hp.l2_rate) memory = Memory(100000) if render_map: root, canvas = init_map() steps = 0
def main():
    env = gym.make(env_name)
    env.seed(500)
    torch.manual_seed(500)

    num_inputs = env.observation_space.shape[0]
    num_actions = env.action_space.n
    print('state size:', num_inputs)
    print('action size:', num_actions)

    online_net = QNet(num_inputs, num_actions)
    target_net = QNet(num_inputs, num_actions)
    update_target_model(online_net, target_net)

    optimizer = optim.Adam(online_net.parameters(), lr=lr)
    writer = SummaryWriter('logs')

    online_net.to(device)
    target_net.to(device)
    online_net.train()
    target_net.train()
    memory = Memory_With_TDError(replay_memory_capacity)
    running_score = 0
    epsilon = 1.0
    steps = 0
    beta = beta_start
    loss = 0

    for e in range(3000):
        done = False

        score = 0
        state = env.reset()
        state = torch.Tensor(state).to(device)
        state = state.unsqueeze(0)

        while not done:
            steps += 1

            action = get_action(state, target_net, epsilon, env)
            next_state, reward, done, _ = env.step(action)

            next_state = torch.Tensor(next_state).to(device)
            next_state = next_state.unsqueeze(0)

            mask = 0 if done else 1
            reward = reward if not done or score == 499 else -1
            action_one_hot = np.zeros(2)
            action_one_hot[action] = 1
            memory.push(state, next_state, action_one_hot, reward, mask)

            score += reward
            state = next_state

            if steps > initial_exploration:
                # Anneal epsilon down and the PER beta up as training proceeds.
                epsilon -= 0.00005
                epsilon = max(epsilon, 0.1)
                beta += 0.00005
                beta = min(1, beta)

                batch, weights = memory.sample(batch_size, online_net,
                                               target_net, beta)
                loss = QNet.train_model(online_net, target_net, optimizer,
                                        batch, weights)

                if steps % update_target == 0:
                    update_target_model(online_net, target_net)

        score = score if score == 500.0 else score + 1
        running_score = 0.99 * running_score + 0.01 * score
        if e % log_interval == 0:
            print('{} episode | score: {:.2f} | epsilon: {:.2f} | beta: {:.2f}'.
                  format(e, running_score, epsilon, beta))
            writer.add_scalar('log/score', float(running_score), e)
            writer.add_scalar('log/loss', float(loss), e)

        if running_score > goal_score:
            break
class QTDAgent(object):
    def __init__(self, state_dim, action_dim, learning_rate=0.001,
                 reward_decay=0.99, e_greedy=0.9):
        self.action_dim = action_dim
        self.state_dim = state_dim
        self.lr = learning_rate
        self.gamma = reward_decay  # discount factor, matching the formulation
        self.epsilon = e_greedy
        self.EPS_START = 0.9
        self.EPS_END = 0.05
        self.EPS_DECAY = 30000
        # This decay is too slow.
        # TODO: figure out the relationship between the decay and the total
        # number of steps, and pick a better schedule.

        use_cuda = torch.cuda.is_available()
        self.LongTensor = torch.cuda.LongTensor if use_cuda else torch.LongTensor
        self.FloatTensor = torch.cuda.FloatTensor if use_cuda else torch.FloatTensor

        self.model = QNet(self.state_dim, self.action_dim).cuda() if use_cuda \
            else QNet(self.state_dim, self.action_dim)
        self.optimizer = optim.Adam(self.model.parameters(), lr=self.lr)
        # self.scheduler = optim.lr_scheduler.StepLR(self.optimizer, step_size=10000, gamma=0.5)
        # the learning rate decreases by a factor gamma every step_size (10000) steps
        util.weights_init(self.model)

    def sbc(self, v):
        # Single-batch conversion: wrap a state in a batch dimension.
        return self.FloatTensor(np.expand_dims(v, 0).tolist())

    def get_actions(self, state):
        with torch.no_grad():
            return self.model(self.sbc(state))

    def select_action(self, state, steps_done):
        util.adjust_learning_rate(self.optimizer, self.lr, steps_done, 10000,
                                  lr_decay=0.2)
        sample = random.random()
        eps_threshold = self.EPS_END + (self.EPS_START - self.EPS_END) * \
            np.exp(-1. * steps_done / self.EPS_DECAY)
        if sample > eps_threshold:
            actions = self.get_actions(state)
            action = actions.data.max(1)[1].view(1, 1)
            return action
        else:
            return self.LongTensor([[random.randrange(self.action_dim)]])

    def update(self, pending):
        loss = 0
        # Accumulate the TD loss over all pending transitions, newest first.
        for s, a, r, s_, a_, done in reversed(pending):
            if done:
                expect_state_action_value = r
            else:
                with torch.no_grad():
                    next_values = self.model(self.sbc(s_))
                expect_state_action_value = r + self.gamma * next_values.max(1)[0]
            state_action_value = self.model(self.sbc(s))[0, a]
            loss += 0.5 * (state_action_value - expect_state_action_value).pow(2)

        self.optimizer.zero_grad()
        loss.backward()
        # for param in self.model.parameters():
        #     param.grad.data.clamp_(-1, 1)
        self.optimizer.step()

    def save_model(self, path):
        torch.save(self.model.state_dict(), '{}QTDAgent.pt'.format(path))
        print('Models saved successfully')

    def load_model(self, name):
        self.model.load_state_dict(name)
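# A quick numeric sketch of the exponential epsilon schedule used in
# select_action above, to make the TODO about decay vs. total steps
# concrete: with EPS_DECAY = 30000, epsilon only approaches EPS_END after
# a few multiples of EPS_DECAY steps. The values below are illustrative.
import numpy as np

EPS_START, EPS_END, EPS_DECAY = 0.9, 0.05, 30000
for steps_done in (0, 30000, 90000, 150000):
    eps = EPS_END + (EPS_START - EPS_END) * np.exp(-steps_done / EPS_DECAY)
    print(steps_done, round(eps, 3))
# 0 -> 0.9, 30000 -> 0.363, 90000 -> 0.092, 150000 -> 0.056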