def main():
    env = gym.make(env_name)
    env.seed(500)
    torch.manual_seed(500)

    num_inputs = env.observation_space.shape[0]
    num_actions = env.action_space.n
    print('state size:', num_inputs)
    print('action size:', num_actions)

    online_net = QNet(num_inputs, num_actions)
    target_net = QNet(num_inputs, num_actions)
    target_net.load_state_dict(online_net.state_dict())
    online_net.share_memory()
    target_net.share_memory()

    optimizer = SharedAdam(online_net.parameters(), lr=lr)
    global_ep, global_ep_r, res_queue = mp.Value('i', 0), mp.Value('d', 0.), mp.Queue()

    writer = SummaryWriter('logs')

    online_net.to(device)
    target_net.to(device)
    online_net.train()
    target_net.train()

    workers = [
        Worker(online_net, target_net, optimizer, global_ep, global_ep_r, res_queue, i)
        for i in range(mp.cpu_count())
    ]
    [w.start() for w in workers]
    res = []

    while True:
        r = res_queue.get()
        if r is not None:
            res.append(r)
            [ep, ep_r, loss] = r
            writer.add_scalar('log/score', float(ep_r), ep)
            writer.add_scalar('log/loss', float(loss), ep)
        else:
            break
    [w.join() for w in workers]
class Actor:
    def __init__(self, actor_id, n_actors, shared_dict, device='cpu'):
        # params
        self.gamma = 0.99
        self.epsilon = 0.4 ** (1 + actor_id * 7 / (n_actors - 1))
        self.bootstrap_steps = 3
        self.alpha = 0.6
        self.priority_epsilon = 1e-6
        self.device = device
        self.actor_id = actor_id

        # path
        self.memory_path = os.path.join('./', 'logs', 'memory')

        # memory
        self.memory_size = 50000
        self.batch_size = 32
        self.action_repeat = 4
        self.n_stacks = 4
        self.burn_in_length = 10
        self.learning_length = 10
        self.overlap_length = 10
        self.eta = 0.9
        self.sequence_length = self.burn_in_length + self.learning_length
        self.stack_count = self.n_stacks // self.action_repeat
        self.memory_save_interval = 5
        self.episode_start_index = 0
        self.n_steps_memory = NStepMemory(self.bootstrap_steps, self.gamma)
        self.replay_memory = ReplayMemory(self.memory_size, self.batch_size,
                                          self.bootstrap_steps)

        # net
        self.shared_dict = shared_dict
        self.net_load_interval = 5
        self.net = QNet(self.device).to(self.device)
        self.target_net = QNet(self.device).to(self.device)
        self.target_net.load_state_dict(self.net.state_dict())

        # env
        self.env = PongEnv(self.action_repeat, self.n_stacks)
        self.episode_reward = 0
        self.n_episodes = 0
        self.n_steps = 0
        self.memory_count = 0
        self.state = self.env.reset()

    def run(self):
        while True:
            self.step()

    def step(self):
        state = self.state
        action, q_value, h, c, target_q_value, target_h, target_c = self.select_action(state)
        q_value = q_value.detach().cpu().numpy()
        target_q_value = target_q_value.detach().cpu().numpy()
        next_state, reward, done, _ = self.env.step(action)
        self.episode_reward += reward
        self.n_steps += 1

        self.n_steps_memory.add(q_value, state[-self.action_repeat:], h, c,
                                target_h, target_c, action, reward,
                                self.stack_count)
        if self.stack_count > 1:
            self.stack_count -= 1

        if self.n_steps > self.bootstrap_steps:
            pre_q_value, state, h, c, target_h, target_c, action, reward, stack_count = \
                self.n_steps_memory.get()
            priority = self.calc_priority(pre_q_value, action, reward,
                                          q_value, target_q_value, done)
            self.replay_memory.add(state, h, c, target_h, target_c, action,
                                   reward, done, stack_count, priority)
            self.memory_count += 1

        self.state = next_state.copy()

        if done:
            while self.n_steps_memory.size > 0:
                pre_q_value, state, h, c, target_h, target_c, action, reward, stack_count = \
                    self.n_steps_memory.get()
                priority = self.calc_priority(pre_q_value, action, reward,
                                              q_value, target_q_value, done)
                self.replay_memory.add(state, h, c, target_h, target_c, action,
                                       reward, done, stack_count, priority)
                self.memory_count += 1
            self.reset()

    def select_action(self, state):
        state = torch.FloatTensor(state).unsqueeze(0).to(self.device)
        with torch.no_grad():
            q_value, h, c = self.net(state, True)
            target_q_value, target_h, target_c = self.target_net(state, True)
        if np.random.random() < self.epsilon:
            action = np.random.randint(6)
        else:
            action = q_value.argmax().item()
        return action, q_value, h, c, target_q_value, target_h, target_c

    def reset(self):
        if self.n_episodes % 1 == 0:
            print('episodes:', self.n_episodes, 'actor_id:', self.actor_id,
                  'return:', self.episode_reward)
        self.net.reset()
        self.target_net.reset()
        self.set_seq_start_index()
        self.state = self.env.reset()
        self.episode_start_index = self.replay_memory.index
        self.episode_reward = 0
        self.n_episodes += 1
        self.n_steps = 0
        self.memory_count = 0
        self.stack_count = self.n_stacks // self.action_repeat

        # reset n_step memory
        self.n_steps_memory = NStepMemory(self.bootstrap_steps, self.gamma)

        # save replay memory
        if self.n_episodes % self.memory_save_interval == 0:
            self.replay_memory.save(self.memory_path, self.actor_id)
            self.replay_memory = ReplayMemory(self.memory_size, self.batch_size,
                                              self.bootstrap_steps)
            self.episode_start_index = 0
            gc.collect()

        # load net
        if self.n_episodes % self.net_load_interval == 0:
            self.load_model()

    def load_model(self):
        try:
            self.net.load_state_dict(self.shared_dict['net_state'])
            self.target_net.load_state_dict(self.shared_dict['target_net_state'])
        except:
            print('load error')

    def calc_priority(self, q_value, action, reward, next_q_value,
                      target_next_q_value, done):
        q_value = q_value.reshape(-1)[action]
        target_next_q_value = target_next_q_value.reshape(-1)
        if done:
            target_q_value = reward
        else:
            next_action = next_q_value.argmax(-1)
            target_next_q_value = target_next_q_value[next_action]
            target_q_value = reward + (self.gamma**self.bootstrap_steps) * target_next_q_value
        priority = np.abs(q_value - target_q_value) + self.priority_epsilon
        priority = priority ** self.alpha
        return priority

    def set_seq_start_index(self):
        last_index = self.replay_memory.index
        start_index = self.episode_start_index
        seq_start_index = [i for i in range(start_index,
                                            last_index - self.sequence_length,
                                            self.overlap_length)]
        seq_start_index.append(last_index - self.sequence_length)
        seq_start_index = np.array(seq_start_index)
        self.replay_memory.update_sequence_priority(seq_start_index)
        self.replay_memory.memory['is_seq_start'][seq_start_index] = 1
        else:
            mask = 1

        memory.push(history, next_history, action, reward, mask)
        score += reward
        history = deepcopy(next_history)

        if steps > hp.initial_exploration:
            if epsilon > 0.1:
                episode_len += 1
                epsilon -= 0.0001
            if steps % hp.update_target == 0:
                update_target_model(model, target_model)

        if done:
            print('episode: ', episode, 'steps: ', steps,
                  'epsilon: ', round(epsilon, 4), ' score: ', score)
            batch = memory.sample()
            for _ in range(episode_len):
                train_model(model, target_model, batch, optimizer)
            break

    if episode % hp.save_freq == 0:
        score = int(score)
        directory = 'save_model/'
        if not os.path.exists(directory):
            os.makedirs(directory)
        torch.save(model.state_dict(), 'save_model/' + str(score) + 'model.pt')
def main():
    env = gym.make(args.env_name)
    env.seed(500)
    torch.manual_seed(500)

    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n
    print('state size:', state_size)
    print('action size:', action_size)

    q_net = QNet(state_size, action_size, args)
    target_q_net = QNet(state_size, action_size, args)
    optimizer = optim.Adam(q_net.parameters(), lr=0.001)
    update_target_model(q_net, target_q_net)

    writer = SummaryWriter(args.logdir)

    replay_buffer = deque(maxlen=10000)
    running_score = 0
    steps = 0

    for episode in range(args.max_iter_num):
        done = False
        score = 0

        state = env.reset()
        state = np.reshape(state, [1, state_size])

        while not done:
            if args.render:
                env.render()

            steps += 1

            q_values = q_net(torch.Tensor(state))
            action = get_action(q_values, action_size, args.epsilon)

            next_state, reward, done, _ = env.step(action)
            next_state = np.reshape(next_state, [1, state_size])
            reward = reward if not done or score == 499 else -1
            mask = 0 if done else 1

            replay_buffer.append((state, action, reward, next_state, mask))

            state = next_state
            score += reward

            if steps > args.initial_exploration:
                args.epsilon -= args.epsilon_decay
                args.epsilon = max(args.epsilon, 0.1)

                mini_batch = random.sample(replay_buffer, args.batch_size)

                q_net.train()
                target_q_net.train()
                train_model(q_net, target_q_net, optimizer, mini_batch)

                if steps % args.update_target == 0:
                    update_target_model(q_net, target_q_net)

        score = score if score == 500.0 else score + 1
        running_score = 0.99 * running_score + 0.01 * score

        if episode % args.log_interval == 0:
            print('{} episode | running_score: {:.2f} | epsilon: {:.2f}'.format(
                episode, running_score, args.epsilon))
            writer.add_scalar('log/score', float(score), episode)

        if running_score > args.goal_score:
            if not os.path.isdir(args.save_path):
                os.makedirs(args.save_path)

            ckpt_path = args.save_path + 'model.pth.tar'
            torch.save(q_net.state_dict(), ckpt_path)
            print('Running score exceeds the goal score, so training ends.')
            break
def main():
    if not os.path.isdir("logs"):
        os.makedirs("logs")
    if args.entropy and args.boltzmann:
        raise ValueError("Entropy as well as Boltzmann set.")
    print(args)

    working_dir = "logs/" + args.dir
    if not os.path.isdir(working_dir):
        os.mkdir(working_dir)

    env = QubeSwingupEnv(use_simulator=True)
    num_inputs = env.observation_space.shape[0]
    num_actions = NUMBER_OF_ACTIONS
    print('state size:', num_inputs)
    print('action size:', num_actions)

    online_net = QNet(num_inputs, num_actions)
    target_net = QNet(num_inputs, num_actions)
    update_target_model(online_net, target_net)

    optimizer = optim.Adam(online_net.parameters(), lr=lr)
    writer = SummaryWriter(working_dir)

    online_net.to(device)
    target_net.to(device)
    online_net.train()
    target_net.train()

    memory = Memory_With_TDError(replay_memory_capacity)
    running_score = 0
    epsilon = 1.0
    steps = 0
    beta = beta_start
    loss = 0
    training_started = False
    best_running_score = -1000

    for e in range(args.e):
        done = False
        score = 0

        state = env.reset()
        state = torch.Tensor(state).to(device)
        state = state.unsqueeze(0)

        start_time = time.time()

        while not done:
            steps += 1

            action = get_action(state, target_net, epsilon,
                                use_entropy=args.entropy,
                                use_boltzmann=args.boltzmann)
            next_state, reward, done, info = env.step(get_continuous_action(action))
            reward = give_me_reward(info["alpha"], info["theta"])

            next_state = torch.Tensor(next_state).to(device)
            next_state = next_state.unsqueeze(0)

            mask = 0 if done else 1

            action_one_hot = np.zeros(NUMBER_OF_ACTIONS)
            action_one_hot[action] = 1
            memory.push(state, next_state, action_one_hot, reward, mask)

            score += reward
            state = next_state

            if steps > initial_exploration:
                if not training_started:
                    print("---------------- training started ---------------")
                    training_started = True

                epsilon -= 0.000005
                epsilon = max(epsilon, 0.1)
                beta += 0.000005
                beta = min(1, beta)

                batch, weights = memory.sample(batch_size, online_net,
                                               target_net, beta)
                loss = QNet.train_model(online_net, target_net, optimizer,
                                        batch, weights, device)

                if steps % update_target == 0:
                    update_target_model(online_net, target_net)

        end_time = time.time()

        running_score = 0.99 * running_score + 0.01 * score
        if e % log_interval == 0:
            print('{} episode | score: {:.2f} | epsilon: {:.2f} | beta: {:.2f}'.format(
                e, running_score, epsilon, beta))
            writer.add_scalar('log/score', float(running_score), e)
            writer.add_scalar('log/loss', float(loss), e)

        if running_score > best_running_score and args.save:
            torch.save(online_net.state_dict(), working_dir + "/best_model.pth")
            best_running_score = running_score
class QTDAgent(object):
    def __init__(self, state_dim, action_dim, learning_rate=0.001,
                 reward_decay=0.99, e_greedy=0.9):
        self.action_dim = action_dim
        self.state_dim = state_dim
        self.lr = learning_rate
        self.gamma = reward_decay  # according to the parameters in the formulation
        self.epsilon = e_greedy
        self.EPS_START = 0.9
        self.EPS_END = 0.05
        self.EPS_DECAY = 30000
        # This decay is too slow.
        # TODO: figure out the relationship between the decay and the total steps.
        # Try to use a good strategy to solve this problem.

        use_cuda = torch.cuda.is_available()
        self.LongTensor = torch.cuda.LongTensor if use_cuda else torch.LongTensor
        self.FloatTensor = torch.cuda.FloatTensor if use_cuda else torch.FloatTensor

        self.model = QNet(self.state_dim, self.action_dim).cuda() if use_cuda else QNet(
            self.state_dim, self.action_dim)
        self.optimizer = optim.Adam(self.model.parameters(), lr=self.lr)
        # self.scheduler = optim.StepLR(self.optimizer, step_size=10000, gamma=0.5)
        # The learning rate decreases by a factor of gamma every 10000 steps.
        util.weights_init(self.model)

    def sbc(self, v, volatile=False):
        return Variable(self.FloatTensor((np.expand_dims(v, 0).tolist())), volatile=volatile)

    def get_actions(self, state):
        action = self.model(self.sbc(state, volatile=True))
        return action

    def select_action(self, state, steps_done):
        util.adjust_learning_rate(self.optimizer, self.lr, steps_done, 10000, lr_decay=0.2)
        # global steps_done
        sample = random.random()
        esp_threshold = self.EPS_END + (self.EPS_START - self.EPS_END) * \
            np.exp(-1. * steps_done / self.EPS_DECAY)
        if sample > esp_threshold:
            actions = self.get_actions(state)
            action = actions.data.max(1)[1].view(1, 1)
            return action
        else:
            return self.LongTensor([[random.randrange(self.action_dim)]])

    def update(self, pending):
        # def update(self, s, a, r, s_, a_, done=False):
        pending_len = len(pending)
        loss = 0
        while pending_len:
            pending_len = pending_len - 1
            [s, a, r, s_, a_, done] = pending[pending_len]
            if done:
                expect_state_action_value = r
            else:
                non_final_next_states = self.model(self.sbc(s_, volatile=True))
                expect_state_action_value = r + self.gamma * non_final_next_states.max(1)[0]
                expect_state_action_value.volatile = False
                # expect_state_action_value = r + self.gamma * self.model(Variable(
                #     torch.from_numpy(np.expand_dims(s_, 0).astype('float32')))).max(1)[0]
            state_action_value = self.model(self.sbc(s))[0, a]
            loss += 0.5 * (state_action_value - expect_state_action_value).pow(2)

        self.optimizer.zero_grad()
        loss.backward()
        # for param in self.model.parameters():
        #     param.grad.data.clamp_(-1, 1)
        self.optimizer.step()

    def save_model(self, path):
        torch.save(self.model.state_dict(), '{}QTDAgent.pt'.format(path))
        # torch.save(self.target_critic.state_dict(), '{}/critic.pt'.format(path))
        print('Models saved successfully')

    def load_model(self, name):
        self.model.load_state_dict(name)
def main():
    env = gym.make(args.env_name)
    env.seed(500)
    torch.manual_seed(500)

    num_inputs = env.observation_space.shape[0]
    num_actions = env.action_space.n
    print('state size:', num_inputs)
    print('action size:', num_actions)

    net = QNet(num_inputs, num_actions)
    target_net = QNet(num_inputs, num_actions)
    update_target_model(net, target_net)

    optimizer = optim.Adam(net.parameters(), lr=0.001)
    writer = SummaryWriter('logs')

    if not os.path.isdir(args.save_path):
        os.makedirs(args.save_path)

    net.to(device)
    target_net.to(device)
    net.train()
    target_net.train()
    memory = Memory(10000)
    running_score = 0
    epsilon = 1.0
    steps = 0

    for e in range(3000):
        done = False
        score = 0

        state = env.reset()
        state = torch.Tensor(state).to(device)
        state = state.unsqueeze(0)

        while not done:
            if args.render:
                env.render()

            steps += 1

            qvalue = net(state)
            action = get_action(epsilon, qvalue, num_actions)
            next_state, reward, done, _ = env.step(action)

            next_state = torch.Tensor(next_state).to(device)
            next_state = next_state.unsqueeze(0)

            mask = 0 if done else 1
            reward = reward if not done or score == 499 else -1

            memory.push(state, next_state, action, reward, mask)
            score += reward
            state = next_state

            if steps > args.initial_exploration:
                epsilon -= 0.00005
                epsilon = max(epsilon, 0.1)

                batch = memory.sample(args.batch_size)
                train_model(net, target_net, optimizer, batch, args.batch_size)

                if steps % args.update_target == 0:
                    update_target_model(net, target_net)

        score = score if score == 500.0 else score + 1
        running_score = 0.99 * running_score + 0.01 * score
        if e % args.log_interval == 0:
            print('{} episode | score: {:.2f} | epsilon: {:.2f}'.format(
                e, running_score, epsilon))
            writer.add_scalar('log/score', float(score), e)

        if running_score > args.goal_score:
            ckpt_path = args.save_path + 'model.pth'
            torch.save(net.state_dict(), ckpt_path)
            print('Running score exceeds the goal score, so training ends.')
            break
class Actor:
    def __init__(self, actor_id, n_actors, device='cpu'):
        # params
        self.gamma = 0.99
        self.epsilon = 0.4**(1 + actor_id * 7 / (n_actors - 1))
        self.bootstrap_steps = 3
        self.alpha = 0.6
        self.priority_epsilon = 1e-6
        self.device = device
        self.actor_id = actor_id

        # path
        self.memory_path = os.path.join('./', 'logs', 'memory')
        self.net_path = os.path.join('./', 'logs', 'model', 'net.pt')
        self.target_net_path = os.path.join('./', 'logs', 'model', 'target_net.pt')

        # memory
        self.memory_size = 50000
        self.batch_size = 32
        self.action_repeat = 4
        self.n_stacks = 4
        self.stack_count = self.n_stacks // self.action_repeat
        self.memory_save_interval = 1
        self.n_steps_memory = NStepMemory(self.bootstrap_steps, self.gamma)
        self.replay_memory = ReplayMemory(self.memory_size, self.batch_size,
                                          self.bootstrap_steps)

        # net
        self.net_load_interval = 5
        self.net = QNet(self.net_path).to(self.device)
        self.target_net = QNet(self.target_net_path).to(self.device)
        self.target_net.load_state_dict(self.net.state_dict())

        # env
        self.env = PongEnv(self.action_repeat, self.n_stacks)
        self.episode_reward = 0
        self.n_episodes = 0
        self.n_steps = 0
        self.memory_count = 0
        self.state = self.env.reset()

    def run(self):
        while True:
            self.step()

    def step(self):
        state = self.state
        action = self.select_action(state)
        next_state, reward, done, _ = self.env.step(action)
        self.episode_reward += reward
        self.n_steps += 1

        self.n_steps_memory.add(state[-self.action_repeat:], action, reward,
                                self.stack_count)
        if self.stack_count > 1:
            self.stack_count -= 1

        if self.n_steps > self.bootstrap_steps:
            state, action, reward, stack_count = self.n_steps_memory.get()
            self.replay_memory.add(state, action, reward, done, stack_count)
            self.memory_count += 1

        self.state = next_state.copy()

        if done:
            while self.n_steps_memory.size > 0:
                state, action, reward, stack_count = self.n_steps_memory.get()
                self.replay_memory.add(state, action, reward, done, stack_count)
                self.memory_count += 1
            self.reset()

    def select_action(self, state):
        if np.random.random() < self.epsilon:
            action = np.random.randint(6)
        else:
            state = torch.FloatTensor(state).unsqueeze(0).to(self.device)
            with torch.no_grad():
                q_val = self.net(state)
            action = q_val.argmax().item()
        return action

    def reset(self):
        if self.n_episodes % 1 == 0:
            print('episodes:', self.n_episodes, 'actor_id:', self.actor_id,
                  'return:', self.episode_reward)
        self.calc_priority()
        self.state = self.env.reset()
        self.episode_reward = 0
        self.n_episodes += 1
        self.n_steps = 0
        self.memory_count = 0
        self.stack_count = self.n_stacks // self.action_repeat

        # reset n_step memory
        self.n_steps_memory = NStepMemory(self.bootstrap_steps, self.gamma)

        # save replay memory
        if self.n_episodes % self.memory_save_interval == 0:
            self.replay_memory.save(self.memory_path, self.actor_id)
            self.replay_memory = ReplayMemory(self.memory_size, self.batch_size,
                                              self.bootstrap_steps)

        # load net
        if self.n_episodes % self.net_load_interval == 0:
            self.net.load()
            self.target_net.load()

    def calc_priority(self):
        last_index = self.replay_memory.size
        start_index = last_index - self.memory_count
        batch, index = self.replay_memory.indexing_sample(start_index, last_index,
                                                          self.device)
        batch_size = batch['state'].shape[0]
        priority = np.zeros(batch_size, dtype=np.float32)

        mini_batch_size = 500
        for start_index in range(0, batch_size, mini_batch_size):
            last_index = min(start_index + mini_batch_size, batch_size)
            mini_batch = dict()
            for key in batch.keys():
                if key in ['reward', 'done']:
                    mini_batch[key] = batch[key][start_index:last_index]
                else:
                    mini_batch[key] = torch.tensor(
                        batch[key][start_index:last_index]).to(self.device)
            mini_batch['action'] = mini_batch['action'].view(-1, 1).long()

            with torch.no_grad():
                # q_value
                q_value = self.net(mini_batch['state']).gather(
                    1, mini_batch['action']).view(-1, 1).cpu().numpy()
                # target q_value
                next_action = torch.argmax(self.net(mini_batch['next_state']),
                                           1).view(-1, 1)
                next_q_value = self.target_net(mini_batch['next_state']).gather(
                    1, next_action).cpu().numpy()

            target_q_value = mini_batch['reward'] + (
                self.gamma**self.bootstrap_steps) * next_q_value * (1 - mini_batch['done'])

            delta = np.abs(q_value - target_q_value).reshape(-1) + self.priority_epsilon
            delta = delta**self.alpha
            priority[start_index:last_index] = delta

        self.replay_memory.update_priority(index, priority)
def main():
    env = gym.make(args.env_name)
    env.seed(500)
    torch.manual_seed(500)

    img_shape = env.observation_space.shape
    num_actions = 3
    print('image size:', img_shape)
    print('action size:', num_actions)

    net = QNet(num_actions)
    target_net = QNet(num_actions)
    update_target_model(net, target_net)

    optimizer = optim.RMSprop(net.parameters(), lr=0.00025, eps=0.01)
    writer = SummaryWriter('logs')

    if not os.path.isdir(args.save_path):
        os.makedirs(args.save_path)

    net.to(device)
    target_net.to(device)
    net.train()
    target_net.train()
    memory = Memory(100000)
    running_score = 0
    epsilon = 1.0
    steps = 0

    for e in range(10000):
        done = False
        dead = False

        score = 0
        avg_loss = []
        start_life = 5
        state = env.reset()

        state = pre_process(state)
        state = torch.Tensor(state).to(device)
        history = torch.stack((state, state, state, state))

        for i in range(3):
            action = env.action_space.sample()
            state, reward, done, info = env.step(action)
            state = pre_process(state)
            state = torch.Tensor(state).to(device)
            state = state.unsqueeze(0)
            history = torch.cat((state, history[:-1]), dim=0)

        while not done:
            if args.render:
                env.render()

            steps += 1

            qvalue = net(history.unsqueeze(0))
            action = get_action(epsilon, qvalue, num_actions)

            next_state, reward, done, info = env.step(action + 1)

            next_state = pre_process(next_state)
            next_state = torch.Tensor(next_state).to(device)
            next_state = next_state.unsqueeze(0)
            next_history = torch.cat((next_state, history[:-1]), dim=0)

            if start_life > info['ale.lives']:
                dead = True
                start_life = info['ale.lives']

            score += reward
            reward = np.clip(reward, -1, 1)

            mask = 0 if dead else 1
            memory.push(history.cpu(), next_history.cpu(), action, reward, mask)

            if dead:
                dead = False

            if steps > args.initial_exploration:
                epsilon -= 1e-6
                epsilon = max(epsilon, 0.1)

                batch = memory.sample(args.batch_size)
                loss = train_model(net, target_net, optimizer, batch)

                if steps % args.update_target == 0:
                    update_target_model(net, target_net)
            else:
                loss = 0

            avg_loss.append(loss)
            history = next_history

        if e % args.log_interval == 0:
            print('{} episode | score: {:.2f} | epsilon: {:.4f} | steps: {} | loss: {:.4f}'.format(
                e, score, epsilon, steps, np.mean(avg_loss)))
            writer.add_scalar('log/score', float(score), steps)
            writer.add_scalar('log/loss', np.mean(avg_loss), steps)

        if score > args.goal_score:
            ckpt_path = args.save_path + 'model.pth'
            torch.save(net.state_dict(), ckpt_path)
            print('Score exceeds the goal score, so training ends.')
            break
def train(render):
    online_net = QNet(h=84, w=84, outputs=36)
    online_net.load_state_dict(torch.load('saved/online_net.pt'))
    target_net = QNet(h=84, w=84, outputs=36)
    update_target_model(online_net, target_net)

    optimizer = optim.Adam(online_net.parameters(), lr=lr)

    online_net.to(device)
    target_net.to(device)
    online_net.train()
    target_net.train()

    memory = Memory(replay_memory_capacity)
    memory = torch.load('saved/model_memory.pt')

    epsilon = 0.1
    steps = 0
    beta = beta_start
    loss = 0

    for e in range(100000):
        # level = random.choice(LEVEL_SET)
        level = 'Level01'
        env = make_retro(game=env_name, state=level,
                         use_restricted_actions=retro.Actions.DISCRETE)

        done = False
        total_reward = 0.0

        state = env.reset()
        state = torch.Tensor(state).to(device).permute(2, 0, 1)
        # state = state.view(state.size()[0], -1)
        state = state.unsqueeze(0)

        while not done:
            steps += 1

            action = get_action(state.to(device), target_net, epsilon, env)
            if render:
                env.render()

            next_state, reward, done, info = env.step(action)

            next_state = torch.Tensor(next_state).permute(2, 0, 1)
            # next_state = next_state.view(next_state.size()[0], -1)
            next_state = next_state.unsqueeze(0)

            total_reward += reward

            mask = 0 if done else 1

            action_one_hot = torch.zeros(36)
            action_one_hot[action] = 1

            reward = torch.tensor([info['score']]).to(device)
            memory.push(state, next_state, action_one_hot, reward, mask)

            state = next_state

            if len(memory) > initial_exploration:
                epsilon -= 0.00005
                epsilon = max(epsilon, 0.02)
                beta += 0.00005
                beta = min(1, beta)

                batch, weights = memory.sample(batch_size, online_net,
                                               target_net, beta)
                loss = QNet.train_model(online_net, target_net, optimizer,
                                        batch, weights)

                if steps % update_target == 0:
                    update_target_model(online_net, target_net)

        if e % 1 == 0:
            print('{} episode | Total Reward: {}'.format(e, total_reward))
            torch.save(online_net.state_dict(), 'saved/online_net.pt')
            torch.save(memory, 'saved/model_memory.pt')

        env.close()
class Learner:
    def __init__(self, n_actors, shared_dict, device='cuda:0'):
        # params
        self.gamma = 0.99
        self.alpha = 0.6
        self.bootstrap_steps = 3
        self.initial_exploration = 50000
        self.priority_epsilon = 1e-6
        self.device = device
        self.n_epochs = 0
        self.n_actors = n_actors

        # path
        self.memory_path = os.path.join('./', 'logs', 'memory')

        # memory
        self.burn_in_length = 10
        self.learning_length = 10
        self.sequence_length = self.burn_in_length + self.learning_length
        self.memory_size = 500000
        self.batch_size = 8
        self.memory_load_interval = 20
        self.replay_memory = ReplayMemory(self.memory_size, self.batch_size,
                                          self.bootstrap_steps)

        # net
        self.shared_dict = shared_dict
        self.net_save_interval = 100
        self.target_update_interval = 1000
        self.net = QNet(self.device).to(self.device)
        self.target_net = QNet(self.device).to(self.device)
        self.target_net.load_state_dict(self.net.state_dict())
        self.save_model()
        self.optim = optim.RMSprop(self.net.parameters(),
                                   lr=0.00025 / 4.0,
                                   alpha=0.95,
                                   eps=1.5e-7,
                                   centered=True)

    def run(self):
        while True:
            if self.replay_memory.size > self.initial_exploration:
                self.train()
                if self.n_epochs % 100 == 0:
                    print('trained', self.n_epochs, 'epochs')
            self.interval()

    def train(self):
        batch, seq_index, index = self.replay_memory.sample(self.device)
        self.net.set_state(batch['hs'], batch['cs'])
        self.target_net.set_state(batch['target_hs'], batch['target_cs'])

        ### burn-in step ###
        state = batch['state'][:self.burn_in_length]
        next_state = batch['next_state'][:self.burn_in_length]
        with torch.no_grad():
            _ = self.net(state)
            _ = self.target_net(next_state)

        ### learning step ###
        state = batch['state'][self.burn_in_length:]
        next_state = batch['next_state'][self.burn_in_length:]

        # q_value
        q_value = self.net(state).gather(1, batch['action'].view(-1, 1))

        # target q_value
        with torch.no_grad():
            next_action = torch.argmax(self.net(next_state), 1).view(-1, 1)
            next_q_value = self.target_net(next_state).gather(1, next_action)
        target_q_value = batch["reward"].view(-1, 1) + (
            self.gamma**self.bootstrap_steps) * next_q_value * (1 - batch['done'].view(-1, 1))

        # update
        self.optim.zero_grad()
        loss = torch.mean(0.5 * (q_value - target_q_value)**2)
        loss.backward()
        self.optim.step()

        priority = (np.abs((q_value - target_q_value).detach().cpu().numpy()).reshape(-1)
                    + self.priority_epsilon)**self.alpha
        self.replay_memory.update_priority(index[self.burn_in_length:].reshape(-1), priority)
        self.replay_memory.update_sequence_priority(seq_index, True)

    def interval(self):
        self.n_epochs += 1
        if self.n_epochs % self.target_update_interval == 0:
            self.target_net.load_state_dict(self.net.state_dict())
        if self.n_epochs % self.net_save_interval == 0:
            self.save_model()
        if self.n_epochs % self.memory_load_interval == 0:
            for i in range(self.n_actors):
                self.replay_memory.load(self.memory_path, i)

    def save_model(self):
        self.shared_dict['net_state'] = deepcopy(self.net).cpu().state_dict()
        self.shared_dict['target_net_state'] = deepcopy(self.target_net).cpu().state_dict()
    cur_loss = trainEval(train_loader, model, optimizer, args, True)
    val_loss = trainEval(val_loader, model, optimizer, args, False)
    test_loss = trainEval(test_loader, model, optimizer, args, False)

    metrics = {'epoch': epoch}
    metrics['mse_train'] = cur_loss
    metrics['mse_val'] = val_loss
    metrics['mse_test'] = test_loss
    log = log.append(metrics, ignore_index=True)
    log.to_csv(log_file, index=False)

    a_t.append(cur_loss)
    a_v.append(val_loss)
    a_te.append(test_loss)

    if best_mse is None or (val_loss < best_mse):
        plotGraph(a_t, a_v, a_te, '.', run_name)
        plotGraph(a_t, a_v, a_te, run_dir, run_name)
        best_mse = val_loss
        ckpt = os.path.join(ckpt_dir, 'ckpt_e{}.pth'.format(epoch))
        torch.save({
            'epoch': epoch,
            'mse_train': cur_loss,
            'mse_val': val_loss,
            'mse_test': test_loss,
            'model': model.state_dict(),
            'optimizer': optimizer.state_dict()
        }, ckpt)

    scheduler.step(val_loss)
    optimizer.step()
    return loss


# Build environment
env = make_atari('PongNoFrameskip-v4', stack=2)
env = wrap_pytorch(env)
number_actions = env.action_space.n
replay_buffer = ReplayBuffer(replay_memory_size)

# Separate target net & policy net
input_shape = env.reset().shape
current_net = QNet(input_shape, number_actions).to(device)
target_net = QNet(input_shape, number_actions).to(device)  # with older weights
target_net.load_state_dict(current_net.state_dict())
target_net.eval()

optimizer = opt_algorithm(current_net.parameters(), lr=learning_rate)

n_episode = 1
episode_return = 0
best_return = 0
returns = []

state = env.reset()
for i in count():
    # env.render()
    eps = get_epsilon(i)
    action = select_action(state, current_net, eps, number_action=number_actions)
def main(L, mouse_initial_indices, rewardlist, actions_list):
    if mouse_initial_indices is None:
        all_possible_starting_positions = np.array([*np.where(L == 1)]).T

    scores = [0]
    best_scores = [0]

    env = deepcopy(L)
    torch.manual_seed(2020)

    num_inputs = 2 + 1
    num_actions = 4
    print('state size:', num_inputs)
    print('action size:', num_actions)

    online_net = QNet(num_inputs, num_actions)
    target_net = QNet(num_inputs, num_actions)
    update_target_model(online_net, target_net)

    optimizer = optim.Adam(online_net.parameters(), lr=lr)
    # writer = SummaryWriter('logs')

    online_net.to(device)
    target_net.to(device)
    online_net.train()
    target_net.train()
    memory = Memory(replay_memory_capacity)
    running_score = 0
    epsilon = 1.0
    steps = 0
    loss = 0

    inint = mouse_initial_indices
    best_score = 0
    number_episode = 1000

    for e in range(number_episode):
        if inint is None:
            mouse_initial_indices = all_possible_starting_positions[
                np.random.choice(range(len(all_possible_starting_positions)))]

        done = False
        env = deepcopy(L)
        eaubue = 0.

        score = 0
        state = np.array(mouse_initial_indices)
        state = torch.Tensor(state).to(device)
        state = state.unsqueeze(0)

        while not done:
            steps += 1

            action = get_action(state, target_net, epsilon, env, eaubue=eaubue)
            newstate = state + torch.Tensor(np.array(actions_list[action])).to(device)

            if env[int(newstate[0][0].tolist()), int(newstate[0][1].tolist())] != 0:
                next_state = newstate
                new_eaubue = eaubue
                reward = rewardlist[env[int(newstate[0][0].tolist()),
                                        int(newstate[0][1].tolist())]]
                if env[int(newstate[0][0].tolist()), int(newstate[0][1].tolist())] == 2:
                    done = True
                if env[int(newstate[0][0].tolist()),
                       int(newstate[0][1].tolist())] == 4:  # if the mouse is in the water
                    env[int(newstate[0][0].tolist()),
                        int(newstate[0][1].tolist())] = 5  # there is no more water
                    new_eaubue = 1.
            else:
                next_state = state
                reward = rewardlist[0]
                new_eaubue = eaubue

            mask = 0 if done else 1
            action_one_hot = np.zeros(4)
            action_one_hot[action] = 1
            memory.push(
                torch.cat((state,
                           torch.tensor(eaubue).unsqueeze(0).unsqueeze(0).to(device)), 1),
                torch.cat((next_state,
                           torch.tensor(new_eaubue).unsqueeze(0).unsqueeze(0).to(device)), 1),
                action_one_hot, reward, mask)

            score += reward
            state = next_state
            eaubue = new_eaubue

            if steps > initial_exploration:
                epsilon -= 0.00005
                epsilon = max(epsilon, 0.1)

                batch = memory.sample(batch_size)
                loss = QNet.train_model(online_net, target_net, optimizer, batch)

                if steps % update_target == 0:
                    update_target_model(online_net, target_net)
                    # print("OK")

        if score > 35:
            print(score)

        running_score = 0.99 * running_score + 0.01 * score
        # running_score = score
        scores.append(running_score)
        best_scores.append(score if score > best_scores[-1] else best_scores[-1])

        if e % log_interval == 0:
            print('{} episode | score: {:.2f} | best score: {:.2f} | epsilon: {:.2f}'.format(
                e, running_score, best_score, epsilon))
            # writer.add_scalar('log/score', float(running_score), e)
            # writer.add_scalar('log/loss', float(loss), e)

        if score > best_score:
            best_score = score
            torch.save(online_net.state_dict(), "./qlearning_model")

        if running_score > goal_score:
            break

    return number_episode, scores, best_scores
class Learner:
    def __init__(self, n_actors, device='cuda:0'):
        # params
        self.gamma = 0.99
        self.alpha = 0.6
        self.bootstrap_steps = 3
        self.initial_exploration = 50000
        self.priority_epsilon = 1e-6
        self.device = device
        self.n_epochs = 0
        self.n_actors = n_actors

        # path
        self.memory_path = os.path.join('./', 'logs', 'memory')
        self.net_path = os.path.join('./', 'logs', 'model', 'net.pt')
        self.target_net_path = os.path.join('./', 'logs', 'model', 'target_net.pt')

        # memory
        self.memory_size = 500000
        self.batch_size = 128
        self.memory_load_interval = 10
        self.replay_memory = ReplayMemory(self.memory_size, self.batch_size,
                                          self.bootstrap_steps)

        # net
        self.net_save_interval = 50
        self.target_update_interval = 1000
        self.net = QNet(self.net_path, self.device).to(self.device)
        self.target_net = QNet(self.target_net_path, self.device).to(self.device)
        self.target_net.load_state_dict(self.net.state_dict())
        self.net.save()
        self.target_net.save()
        self.optim = optim.RMSprop(self.net.parameters(),
                                   lr=0.00025 / 4.0,
                                   alpha=0.95,
                                   eps=1.5e-7,
                                   centered=True)

    def run(self):
        while True:
            if self.replay_memory.size > self.initial_exploration:
                self.train()
            self.interval()

    def train(self):
        batch, index, weights = self.replay_memory.sample(self.device)

        # q_value
        q_value = self.net(batch['state'])
        q_value = q_value.gather(1, batch['action'])

        # target q_value
        with torch.no_grad():
            next_action = torch.argmax(self.net(batch["next_state"]), 1).view(-1, 1)
            next_q_value = self.target_net(batch["next_state"]).gather(1, next_action)
        target_q_value = batch["reward"] + (
            self.gamma**self.bootstrap_steps) * next_q_value * (1 - batch['done'])

        # update
        self.optim.zero_grad()
        loss = torch.mean(0.5 * (q_value - target_q_value)**2)
        loss.backward()
        self.optim.step()

        priority = (np.abs((q_value - target_q_value).detach().cpu().numpy()).reshape(-1)
                    + self.priority_epsilon)**self.alpha
        self.replay_memory.update_priority(index, priority)

    def interval(self):
        self.n_epochs += 1
        if self.n_epochs % self.target_update_interval == 0:
            self.target_net.load_state_dict(self.net.state_dict())
        if self.n_epochs % self.net_save_interval == 0:
            self.net.save()
            self.target_net.save()
        if self.n_epochs % self.memory_load_interval == 0:
            for i in range(self.n_actors):
                self.replay_memory.load(self.memory_path, i)