def __init__(self, env):
    """Build the DDPG actor/critic pair, target copies, optimizers and
    exploration noise.

    NOTE(review): the `env` parameter is unused here; state/action sizes
    come from `self.ns` / `self.na` set by the base class — confirm.
    """
    super(DDPG, self).__init__()
    # Actor (policy) network and its target copy.
    pi_net = PiNet(self.ns, self.na)
    self.pi_net = pi_net.to(self.device)
    pi_target = PiNet(self.ns, self.na)
    self.pi_target = pi_target.to(self.device)
    # Initialize the target with the online weights so both start identical.
    self.load_state_dict(self.pi_target, self.pi_net.state_dict())
    # Critic (Q) network and its target copy.
    q_net = QNet(self.ns, self.na)
    self.q_net = q_net.to(self.device)
    q_target = QNet(self.ns, self.na)
    self.q_target = q_target.to(self.device)
    self.load_state_dict(self.q_target, self.q_net.state_dict())
    # Critic uses weight decay; the actor does not.
    self.optimizer_q = torch.optim.Adam(self.q_net.parameters(), lr=self.lr_q,
                                        betas=(0.9, 0.999), weight_decay=1e-2)
    self.optimizer_p = torch.optim.Adam(self.pi_net.parameters(), lr=self.lr_p,
                                        betas=(0.9, 0.999), weight_decay=0)
    # Temporally correlated exploration noise, scaled by epsilon.
    self.noise = OrnsteinUhlenbeckActionNoise(
        torch.zeros(1, self.na).to(self.device),
        self.epsilon * torch.ones(1, self.na).to(self.device))
def main():
    """Train an on-policy QNet on CartPole.

    One fresh Memory per episode; the whole episode is trained on at episode
    end. Logs the running score and loss to TensorBoard and stops once the
    running score exceeds ``goal_score``.
    """
    env = gym.make(env_name)
    env.seed(500)
    torch.manual_seed(500)

    num_inputs = env.observation_space.shape[0]
    num_actions = env.action_space.n
    print('state size:', num_inputs)
    print('action size:', num_actions)

    net = QNet(num_inputs, num_actions)
    optimizer = optim.Adam(net.parameters(), lr=lr)
    writer = SummaryWriter('logs')

    net.to(device)
    net.train()
    running_score = 0
    steps = 0
    loss = 0
    for e in range(3000):
        done = False
        # Fresh memory each episode: no cross-episode experience replay.
        memory = Memory()
        score = 0
        state = env.reset()
        state = torch.Tensor(state).to(device)
        state = state.unsqueeze(0)
        while not done:
            steps += 1
            action = net.get_action(state)
            next_state, reward, done, _ = env.step(action)
            # Fix: keep next_state on the same device as `state`, since it
            # becomes the next network input.
            next_state = torch.Tensor(next_state).to(device)
            next_state = next_state.unsqueeze(0)
            mask = 0 if done else 1
            # Penalize early termination (unless a full 500-step episode).
            reward = reward if not done or score == 499 else -1
            # Fix: size the one-hot by num_actions instead of hard-coding 2.
            action_one_hot = torch.zeros(num_actions)
            action_one_hot[action] = 1
            memory.push(state, next_state, action_one_hot, reward, mask)
            score += reward
            state = next_state
        # Train on the whole episode at once.
        loss = QNet.train_model(net, memory.sample(), optimizer)
        score = score if score == 500.0 else score + 1
        running_score = 0.99 * running_score + 0.01 * score
        if e % log_interval == 0:
            print('{} episode | score: {:.2f}'.format(e, running_score))
            writer.add_scalar('log/score', float(running_score), e)
            writer.add_scalar('log/loss', float(loss), e)
            if running_score > goal_score:
                break
def main():
    """Evaluate a trained QNet on the Qube swing-up environment.

    Loads logs/<args.dir>/best_model.pth, runs one episode, and hands
    control to a PD policy when the pendulum is near upright.
    """
    if not (os.path.isdir("logs")):
        os.makedirs("logs")
    working_dir = "logs/" + args.dir
    if not (os.path.isdir(working_dir)):
        raise NameError(args.dir + " does not exist in dir logs")
    print(args)
    env = QubeSwingupEnv(use_simulator=args.sim, batch_size=2048 * 4)
    num_inputs = env.observation_space.shape[0]
    num_actions = NUMBER_OF_ACTIONS
    print('state size:', num_inputs)
    print('action size:', num_actions)
    net = QNet(num_inputs, num_actions) if not args.new_net else QNet_more_layers(num_inputs, num_actions)
    net.load_state_dict(torch.load(working_dir + "/best_model.pth",
                                   map_location=torch.device(device)))
    net.to(device)
    net.eval()
    # Unused locals (epsilon, beta, loss, best_running_score) removed.
    running_score = 0
    steps = 0
    for e in range(1):
        done = False
        score = 0
        state = env.reset()
        state = torch.Tensor(state).to(device)
        state = state.unsqueeze(0)
        while not done:
            steps += 1
            action = get_continuous_action(get_action(state, net))
            # Near upright (|alpha| < 25 deg), switch to the PD controller.
            if np.abs(state[0][1].item()) < deg2rad(25):
                action = pd_control_policy(state.cpu().numpy()[0])[0]
            next_state, reward, done, info = env.step(action)
            # Shaped reward from the pendulum/arm angles.
            # Fix: the reward was computed a second time after render();
            # the redundant duplicate call is removed.
            reward = give_me_reward(info["alpha"], info["theta"])
            if args.sim:
                env.render()
            if done:
                print(info)
                print("theta:", info["theta"] * 180 / np.pi)
            next_state = torch.Tensor(next_state).to(device)
            next_state = next_state.unsqueeze(0)
            score += reward
            state = next_state
        running_score = 0.99 * running_score + 0.01 * score
        print('{} episode | running_score: {:.2f} | score: {:.2f} | steps: {} '.format(
            e, running_score, score, steps))
    env.close()
def main():
    """Evaluate a trained frame-stack QNet for 5 episodes with greedy actions."""
    env = gym.make(args.env_name)
    env.seed(500)
    torch.manual_seed(500)
    img_shape = env.observation_space.shape
    num_actions = 3
    print('image size:', img_shape)
    print('action size:', num_actions)
    net = QNet(num_actions)
    net.load_state_dict(torch.load(args.save_path + 'model.pth'))
    net.to(device)
    net.eval()
    epsilon = 0
    # Fix: `steps` was incremented inside the loop without ever being
    # initialized, raising UnboundLocalError on the first step.
    steps = 0
    for e in range(5):
        done = False
        score = 0
        state = env.reset()
        state = pre_process(state)
        state = torch.Tensor(state).to(device)
        # Initial history: the first frame repeated 4 times.
        history = torch.stack((state, state, state, state))
        # Warm up the frame stack with 3 random actions.
        for i in range(3):
            action = env.action_space.sample()
            state, reward, done, info = env.step(action)
            state = pre_process(state)
            state = torch.Tensor(state).to(device)
            state = state.unsqueeze(0)
            history = torch.cat((state, history[:-1]), dim=0)
        while not done:
            if args.render:
                env.render()
            steps += 1
            qvalue = net(history.unsqueeze(0))
            # epsilon=0: fully greedy evaluation.
            action = get_action(0, qvalue, num_actions)
            # Actions are offset by 1 into the env's action space.
            next_state, reward, done, info = env.step(action + 1)
            next_state = pre_process(next_state)
            next_state = torch.Tensor(next_state).to(device)
            next_state = next_state.unsqueeze(0)
            # Newest frame first, drop the oldest.
            next_history = torch.cat((next_state, history[:-1]), dim=0)
            score += reward
            history = next_history
        print('{} episode | score: {:.2f}'.format(e, score))
def __init__(self, *largs, **kwargs):
    """Build the SSPG actor (Gaussian policy), twin Q critics, their target
    copies, optimizers, and optionally the learned entropy temperature."""
    super(SSPG, self).__init__(*largs, **kwargs)
    # Gaussian policy network and a target copy initialized to its weights.
    pi_net = PiNet(self.ns, self.na, distribution='Normal')
    self.pi_net = pi_net.to(self.device)
    pi_target = PiNet(self.ns, self.na, distribution='Normal')
    self.pi_target = pi_target.to(self.device)
    self.load_state_dict(self.pi_target, self.pi_net.state_dict())
    # Twin Q networks (clipped double-Q) with matching target copies.
    q_net_1 = QNet(self.ns, self.na)
    self.q_net_1 = q_net_1.to(self.device)
    q_target_1 = QNet(self.ns, self.na)
    self.q_target_1 = q_target_1.to(self.device)
    self.load_state_dict(self.q_target_1, self.q_net_1.state_dict())
    q_net_2 = QNet(self.ns, self.na)
    self.q_net_2 = q_net_2.to(self.device)
    q_target_2 = QNet(self.ns, self.na)
    self.q_target_2 = q_target_2.to(self.device)
    self.load_state_dict(self.q_target_2, self.q_net_2.state_dict())
    self.optimizer_q_1 = torch.optim.Adam(self.q_net_1.parameters(), lr=self.lr_q,
                                          betas=(0.9, 0.999),
                                          weight_decay=self.weight_decay_q)
    self.optimizer_q_2 = torch.optim.Adam(self.q_net_2.parameters(), lr=self.lr_q,
                                          betas=(0.9, 0.999),
                                          weight_decay=self.weight_decay_q)
    self.optimizer_p = torch.optim.Adam(self.pi_net.parameters(), lr=self.lr_p,
                                        betas=(0.9, 0.999),
                                        weight_decay=self.weight_decay_p)
    if self.entropy_tunning:
        # Automatic temperature tuning (SAC-style).
        # NOTE(review): torch.Tensor(self.na) creates an *uninitialized*
        # tensor of size na; if the intent is target entropy = -|A|,
        # this should likely be -float(self.na) — confirm.
        self.target_entropy = -torch.prod(
            torch.Tensor(self.na).to(self.device)).item()
        # Optimize log(alpha) so alpha = exp(log_alpha) stays positive.
        self.log_alpha = torch.tensor([0.], requires_grad=True, device=self.device)
        self.optimizer_alpha = torch.optim.Adam([self.log_alpha], lr=self.lr_q)
        self.alpha = float(self.log_alpha.exp())
def __init__(self, env):
    """Build SAC-with-value-function networks: Gaussian actor, twin Q
    critics, a state-value net with a target copy, and Adam optimizers."""
    super(SACV, self).__init__()
    self.env = env
    n_a = env.action_space.shape[0]
    n_s = env.observation_space.shape[0]
    # Gaussian policy.
    pi_net = PiNet(n_s, n_a, distribution='Normal')
    self.pi_net = pi_net.to(self.device)
    # Twin Q networks (clipped double-Q); no target copies for Q here.
    q_net_1 = QNet(n_s, n_a)
    self.q_net_1 = q_net_1.to(self.device)
    q_net_2 = QNet(n_s, n_a)
    self.q_net_2 = q_net_2.to(self.device)
    # State-value net: a QNet with zero action inputs; only V has a target
    # in this variant, initialized to the online V weights.
    v_net = QNet(n_s, 0)
    self.v_net = v_net.to(self.device)
    v_target = QNet(n_s, 0)
    self.v_target = v_target.to(self.device)
    self.load_state_dict(self.v_target, self.v_net.state_dict())
    self.optimizer_q_1 = torch.optim.Adam(self.q_net_1.parameters(), lr=self.lr_q,
                                          betas=(0.9, 0.999), weight_decay=1e-2)
    self.optimizer_q_2 = torch.optim.Adam(self.q_net_2.parameters(), lr=self.lr_q,
                                          betas=(0.9, 0.999), weight_decay=1e-2)
    self.optimizer_v = torch.optim.Adam(self.v_net.parameters(), lr=self.lr_q,
                                        betas=(0.9, 0.999), weight_decay=1e-2)
    # eps = 1e-04,
    self.optimizer_p = torch.optim.Adam(self.pi_net.parameters(), lr=self.lr_p,
                                        betas=(0.9, 0.999), weight_decay=0)
    # Sampling strategy: draw from the actor replay buffer.
    self.sample = self.actor_rb
def main():
    """A3C-style training: spawn one Worker per CPU core sharing the
    networks and optimizer, then drain (episode, reward, loss) results
    from a queue and log them to TensorBoard."""
    env = gym.make(env_name)
    env.seed(500)
    torch.manual_seed(500)
    num_inputs = env.observation_space.shape[0]
    num_actions = env.action_space.n
    print('state size:', num_inputs)
    print('action size:', num_actions)
    online_net = QNet(num_inputs, num_actions)
    target_net = QNet(num_inputs, num_actions)
    target_net.load_state_dict(online_net.state_dict())
    # share_memory() places parameters in shared memory so every worker
    # process updates the same tensors.
    online_net.share_memory()
    target_net.share_memory()
    optimizer = SharedAdam(online_net.parameters(), lr=lr)
    # Shared episode counter, shared running reward, and a result queue.
    global_ep, global_ep_r, res_queue = mp.Value('i', 0), mp.Value('d', 0.), mp.Queue()
    writer = SummaryWriter('logs')
    online_net.to(device)
    target_net.to(device)
    online_net.train()
    target_net.train()
    workers = [
        Worker(online_net, target_net, optimizer, global_ep, global_ep_r,
               res_queue, i) for i in range(mp.cpu_count())
    ]
    [w.start() for w in workers]
    res = []
    # Consume results until a worker posts None (termination sentinel).
    while True:
        r = res_queue.get()
        if r is not None:
            res.append(r)
            [ep, ep_r, loss] = r
            writer.add_scalar('log/score', float(ep_r), ep)
            writer.add_scalar('log/loss', float(loss), ep)
        else:
            break
    [w.join() for w in workers]
def main():
    """Roll out a saved QNet greedily for five episodes, rendering every step."""
    env = gym.make(args.env_name)
    env.seed(500)
    torch.manual_seed(500)

    n_states = env.observation_space.shape[0]
    n_actions = env.action_space.n
    print('state size:', n_states)
    print('action size:', n_actions)

    # Restore the trained network and switch to evaluation mode.
    net = QNet(n_states, n_actions)
    net.load_state_dict(torch.load(args.save_path + 'model.pth'))
    net.to(device)
    net.eval()

    running_score = 0
    steps = 0
    for episode in range(5):
        obs = torch.Tensor(env.reset()).to(device).unsqueeze(0)
        episode_return = 0
        finished = False
        while not finished:
            env.render()
            steps += 1
            # Greedy action from the network's Q-values.
            action = get_action(net(obs))
            successor, reward, finished, _ = env.step(action)
            obs = torch.Tensor(successor).to(device).unsqueeze(0)
            episode_return += reward
        print('{} episode | score: {:.2f}'.format(episode, episode_return))
def __init__(self, *largs, **kwargs):
    """Build RBI networks: Gaussian actor with target, twin Q critics with
    targets, optimizers, and the entropy target when tuning is enabled."""
    super(RBI, self).__init__(*largs, **kwargs)
    pi_net = PiNet(self.ns, self.na, distribution='Normal')
    self.pi_net = pi_net.to(self.device)
    pi_target = PiNet(self.ns, self.na, distribution='Normal')
    self.pi_target = pi_target.to(self.device)
    # Start each target identical to its online network.
    self.load_state_dict(self.pi_target, self.pi_net.state_dict())
    # Twin Q networks (clipped double-Q) with matching target copies.
    q_net_1 = QNet(self.ns, self.na)
    self.q_net_1 = q_net_1.to(self.device)
    q_target_1 = QNet(self.ns, self.na)
    self.q_target_1 = q_target_1.to(self.device)
    self.load_state_dict(self.q_target_1, self.q_net_1.state_dict())
    q_net_2 = QNet(self.ns, self.na)
    self.q_net_2 = q_net_2.to(self.device)
    q_target_2 = QNet(self.ns, self.na)
    self.q_target_2 = q_target_2.to(self.device)
    self.load_state_dict(self.q_target_2, self.q_net_2.state_dict())
    self.optimizer_q_1 = torch.optim.Adam(self.q_net_1.parameters(), lr=self.lr_q,
                                          betas=(0.9, 0.999),
                                          weight_decay=self.weight_decay_q)
    self.optimizer_q_2 = torch.optim.Adam(self.q_net_2.parameters(), lr=self.lr_q,
                                          betas=(0.9, 0.999),
                                          weight_decay=self.weight_decay_q)
    self.optimizer_p = torch.optim.Adam(self.pi_net.parameters(), lr=self.lr_p,
                                        betas=(0.9, 0.999),
                                        weight_decay=self.weight_decay_p)
    self.alpha = self.rbi_alpha
    if self.entropy_tunning:
        # self.target_entropy = -float(self.na)
        # Target entropy of a diagonal Gaussian with per-dimension std
        # 0.3/sqrt(na): na * 0.5 * log(2*pi*e*std^2).
        std_target = 0.3 / math.sqrt(self.na)
        self.target_entropy = self.na * 0.5 * math.log(2 * math.pi * math.e * (std_target ** 2))
        print(f'target entropy: {self.target_entropy}')
        self.lr_alpha = 0.01
def __init__(self, *largs, **kwargs):
    """Build TD3 networks: deterministic actor + twin Q critics, each with a
    target copy, their Adam optimizers, and exploration noise."""
    super(TD3, self).__init__(*largs, **kwargs)
    pi_net = PiNet(self.ns, self.na)
    self.pi_net = pi_net.to(self.device)
    pi_target = PiNet(self.ns, self.na)
    self.pi_target = pi_target.to(self.device)
    # Start each target identical to its online network.
    self.load_state_dict(self.pi_target, self.pi_net.state_dict())
    q_net_1 = QNet(self.ns, self.na)
    self.q_net_1 = q_net_1.to(self.device)
    q_target_1 = QNet(self.ns, self.na)
    self.q_target_1 = q_target_1.to(self.device)
    self.load_state_dict(self.q_target_1, self.q_net_1.state_dict())
    q_net_2 = QNet(self.ns, self.na)
    self.q_net_2 = q_net_2.to(self.device)
    q_target_2 = QNet(self.ns, self.na)
    self.q_target_2 = q_target_2.to(self.device)
    self.load_state_dict(self.q_target_2, self.q_net_2.state_dict())
    # No weight decay for TD3 (unlike the DDPG variant in this file).
    self.optimizer_q_1 = torch.optim.Adam(self.q_net_1.parameters(), lr=self.lr_q,
                                          betas=(0.9, 0.999))
    self.optimizer_q_2 = torch.optim.Adam(self.q_net_2.parameters(), lr=self.lr_q,
                                          betas=(0.9, 0.999))
    self.optimizer_p = torch.optim.Adam(self.pi_net.parameters(), lr=self.lr_p,
                                        betas=(0.9, 0.999))
    # Exploration noise with scale epsilon.
    self.noise = RandomNoise(
        torch.zeros(1, self.na).to(self.device), self.epsilon)
def test(level_list, render=True):
    """Run a saved QNet over the given retro levels (1-based indices into
    LEVEL_SET), accumulating total reward; aborts after 3 deaths overall."""
    online_net = QNet(h=84, w=84, outputs=36)
    online_net.load_state_dict(torch.load('saved/online_net.pt'))
    online_net.to(device)
    cnt = 0
    death = 0
    total_reward = 0.0
    # Map 1-based level indices to level names.
    str_level_list = [LEVEL_SET[idx - 1] for idx in level_list]
    for level in str_level_list:
        env = make_retro(game=env_name, state=level,
                         use_restricted_actions=retro.Actions.DISCRETE)
        obs = env.reset()
        # HWC frame -> CHW tensor with a leading batch dimension.
        state = torch.Tensor(obs).to(device).permute(2, 0, 1)
        #state = state.view(state.size()[0], -1)
        state = state.unsqueeze(0)
        previous_lives = 3
        previous_level = level_list[cnt]
        cnt += 1
        # Stop starting new levels once 3 deaths have accumulated.
        if death >= 3:
            break
        for t in count():
            action = online_net.get_action(state.to(device))
            if render:
                env.render()
                time.sleep(0.02)
            next_state, reward, done, info = env.step(action)
            next_state = torch.Tensor(next_state).permute(2, 0, 1)
            #next_state = next_state.view(next_state.size()[0], -1)
            next_state = next_state.unsqueeze(0)
            total_reward += reward
            current_lives = info['lives']
            current_level = info['level']
            # A drop in lives counts as a death.
            if current_lives != previous_lives:
                print('Dead')
                previous_lives = info['lives']
                death += 1
                #if death >= 3:
                #    print("Finished ", level, " Total reward: {}".format(total_reward))
                #    break
            # Level change means this level was completed.
            if current_level != previous_level:
                print('Stage changed')
                print("Finished ", level, " Total reward: {}".format(total_reward))
                break
            state = next_state
            if done:
                print('All lives gone')
                print("Finished ", level, " Total reward: {}".format(total_reward))
                break
        env.close()
    return
def main():
    """Train a PER-DQN on the Qube swing-up simulator, logging to
    logs/<args.dir> and checkpointing the best running score."""
    if not (os.path.isdir("logs")):
        os.makedirs("logs")
    # Entropy-based and Boltzmann exploration are mutually exclusive.
    if (args.entropy and args.boltzmann):
        raise ValueError("Entropy as well as Boltzmann set.")
    print(args)
    working_dir = "logs/" + args.dir
    if not (os.path.isdir(working_dir)):
        os.mkdir(working_dir)
    env = QubeSwingupEnv(use_simulator=True)
    num_inputs = env.observation_space.shape[0]
    num_actions = NUMBER_OF_ACTIONS
    print('state size:', num_inputs)
    print('action size:', num_actions)
    online_net = QNet(num_inputs, num_actions)
    target_net = QNet(num_inputs, num_actions)
    update_target_model(online_net, target_net)
    optimizer = optim.Adam(online_net.parameters(), lr=lr)
    writer = SummaryWriter(working_dir)
    online_net.to(device)
    target_net.to(device)
    online_net.train()
    target_net.train()
    # Prioritized replay keyed by TD error.
    memory = Memory_With_TDError(replay_memory_capacity)
    running_score = 0
    epsilon = 1.0
    steps = 0
    beta = beta_start  # importance-sampling exponent, annealed toward 1
    loss = 0
    training_started = False
    best_running_score = -1000
    for e in range(args.e):
        done = False
        score = 0
        state = env.reset()
        state = torch.Tensor(state).to(device)
        state = state.unsqueeze(0)
        start_time = time.time()
        while not done:
            steps += 1
            # Act with the *target* net using the configured exploration.
            action = get_action(state, target_net, epsilon,
                                use_entropy=args.entropy,
                                use_boltzmann=args.boltzmann)
            next_state, reward, done, info = env.step(
                get_continuous_action(action))
            # Replace the env reward with one shaped from the angles.
            reward = give_me_reward(info["alpha"], info["theta"])
            next_state = torch.Tensor(next_state).to(device)
            next_state = next_state.unsqueeze(0)
            mask = 0 if done else 1
            action_one_hot = np.zeros(NUMBER_OF_ACTIONS)
            action_one_hot[action] = 1
            memory.push(state, next_state, action_one_hot, reward, mask)
            score += reward
            state = next_state
            if steps > initial_exploration:
                if not training_started:
                    print("---------------- training started ---------------")
                    training_started = True
                # Anneal epsilon down to 0.1 and beta up to 1.
                epsilon -= 0.000005
                epsilon = max(epsilon, 0.1)
                beta += 0.000005
                beta = min(1, beta)
                batch, weights = memory.sample(batch_size, online_net,
                                               target_net, beta)
                loss = QNet.train_model(online_net, target_net, optimizer,
                                        batch, weights, device)
                if steps % update_target == 0:
                    update_target_model(online_net, target_net)
        end_time = time.time()  # NOTE(review): start/end times are unused
        running_score = 0.99 * running_score + 0.01 * score
        if e % log_interval == 0:
            print(
                '{} episode | score: {:.2f} | epsilon: {:.2f} | beta: {:.2f}'.
                format(e, running_score, epsilon, beta))
            writer.add_scalar('log/score', float(running_score), e)
            writer.add_scalar('log/loss', float(loss), e)
        # Checkpoint whenever the smoothed score improves.
        if running_score > best_running_score and args.save:
            torch.save(online_net.state_dict(),
                       working_dir + "/best_model.pth")
            best_running_score = running_score
def main():
    """DQN training over (possibly vectorized) ARC puzzle environments, with
    a CartPole fallback for sanity testing; Ctrl-C saves the model."""
    # cartpole test
    if (cartpole_test):
        envs_fun = [lambda: gym.make('CartPole-v0')]
        envs_fun = np.tile(envs_fun, 3)
        envs = ShmemVecEnv(envs_fun)
        dummy_env = envs_fun[0]()
    else:
        INPUT_FILE = '../data/05f2a901.json'
        with open(INPUT_FILE, 'r') as f:
            puzzle = json.load(f)
        # NOTE(review): the lambda closes over the loop variable `task`
        # late-bindingly; all factories would use the last task. Only safe
        # because the list is sliced to a single env below — confirm.
        envs_fun = [
            lambda: gym.make('arc-v0', input=task['input'],
                             output=task['output'], need_ui=need_ui)
            for task in puzzle['train']
        ]
        #pdb.set_trace()
        envs_fun = envs_fun[0:1]
        envs = ShmemVecEnv(envs_fun)
        dummy_env = envs_fun[0]()
    env_num = len(envs_fun)
    torch.manual_seed(500)
    num_inputs = dummy_env.observation_space.shape[0]
    num_actions = dummy_env.action_space.n
    print('state size:', num_inputs)
    print('action size:', num_actions)
    online_net = QNet(num_inputs, num_actions, cartpole_test, evalution_mode)
    target_net = QNet(num_inputs, num_actions, cartpole_test, evalution_mode)
    if (evalution_mode):
        # In evaluation mode, replace both nets with the saved model.
        online_net = torch.load('../result/arc0.model')
        target_net = torch.load('../result/arc0.model')
    update_target_model(online_net, target_net)
    optimizer = optim.Adam(online_net.parameters(), lr=lr)
    writer = SummaryWriter('logs')
    online_net.to(device)
    target_net.to(device)
    online_net.train()
    target_net.train()
    memory = Memory(replay_memory_capacity)
    score = 0
    epsilon = 1.0
    steps = 0
    loss = 0
    states = envs.reset()
    try:
        while True:
            if (need_ui):
                envs.render()
            steps += 1
            global initial_exploration
            if (initial_exploration > 0):
                initial_exploration -= 1
            # One action per vectorized sub-environment.
            actions = []
            for state in states:
                state = torch.Tensor(state).to(device)
                state = state.unsqueeze(0)
                action = get_action(state, target_net,
                                    0 if evalution_mode else epsilon,
                                    dummy_env)
                if (evalution_mode):
                    print(action)
                actions.append(action)
            next_states, rewards, dones, info = envs.step(actions)
            #print(rewards)
            masks = np.zeros(envs.num_envs)
            for i in range(envs.num_envs):
                masks[i] = 0 if dones[i] else 1
            # Store every sub-environment transition.
            for i in range(envs.num_envs):
                #print(rewards[i])
                action_one_hot = np.zeros(dummy_env.action_space.n)
                action_one_hot[actions[i]] = 1
                memory.push(states[i], next_states[i], action_one_hot,
                            rewards[i], masks[i])
            #score += reward
            states = next_states
            if not evalution_mode and steps > initial_exploration:
                epsilon -= 0.00003
                epsilon = max(epsilon, 0.1)
                batch = memory.sample(batch_size)
                loss = QNet.train_model(online_net, target_net, optimizer,
                                        batch, device)
                if steps % update_target == 0:
                    update_target_model(online_net, target_net)
            # Hard episode cap: reset all envs after 1028 steps.
            if (steps > 1028):
                states = envs.reset()
                steps = 0
                print(
                    'new epsisode ------------------------------------------')
    except KeyboardInterrupt:
        # Ctrl-C: persist the target net before exiting.
        print('save model')
        torch.save(target_net, '../result/arc.model')
        sys.exit(0)
def main():
    """Policy-gradient training on CartPole: collect one episode into a
    fresh Memory, then train on the whole episode at once."""
    ### Initialize the environment
    env = gym.make(env_name)
    env.seed(500)
    torch.manual_seed(500)
    num_inputs = env.observation_space.shape[0]
    num_actions = env.action_space.n
    print('state size:', num_inputs)
    print('action size:', num_actions)
    ### Build the policy network
    net = QNet(num_inputs, num_actions)
    optimizer = optim.Adam(net.parameters(), lr=lr)
    net.to(device)
    net.train()
    ### Miscellaneous initialization
    running_score = 0
    steps = 0
    loss = 0
    steps_before = 0
    for e in range(10000):
        done = False
        ### Memory is cleared every episode (effectively no experience replay)
        memory = Memory()
        ### Reset the environment to its initial state
        score = 0
        state = env.reset()
        state = torch.Tensor(state).to(device)
        state = state.unsqueeze(0)
        while not done:
            steps += 1
            ### No epsilon: action values are converted directly into a
            ### probability distribution to pick the action
            action = net.get_action(state)
            next_state, reward, done, _ = env.step(action)
            next_state = torch.Tensor(next_state)
            next_state = next_state.unsqueeze(0)
            mask = 0 if done else 1
            reward = reward if not done or score == 499 else -1
            action_one_hot = torch.zeros(num_actions)
            action_one_hot[action] = 1
            memory.push(state, next_state, action_one_hot, reward, mask)
            score += reward
            state = next_state
        ### Train on one whole episode at a time
        ### memory.sample returns the entire episode, not a random batch
        loss = QNet.train_model(net, optimizer, memory.sample())
        print("Ep {0:04d}: {1} step".format(e, steps - steps_before))
        steps_before = steps
        score = score if score == 500.0 else score + 1
        running_score = 0.99 * running_score + 0.01 * score
        if e % log_interval == 0:
            print('{} episode | score: {:.2f}'.format(e, running_score))
            if running_score > goal_score:
                break
def main():
    """DQN training: online/target QNets, replay memory, epsilon-greedy
    exploration with periodic target-network synchronization."""
    env = gym.make(env_name)
    env.seed(500)
    torch.manual_seed(500)
    ### The NN in/out sizes depend on the environment
    num_inputs = env.observation_space.shape[0]
    num_actions = env.action_space.n
    print('state size:', num_inputs)
    print('action size:', num_actions)
    ### Create and initialize the two networks
    online_net = QNet(num_inputs, num_actions)
    target_net = QNet(num_inputs, num_actions)
    update_target_model(online_net, target_net)
    optimizer = optim.Adam(online_net.parameters(), lr=lr)
    ### Network device placement: CPU / GPU
    online_net.to(device)
    target_net.to(device)
    ### Both networks start in training mode
    online_net.train()
    target_net.train()
    ### Pre-training setup
    memory = Memory(replay_memory_capacity)
    running_score = 0
    epsilon = 1.0
    steps = 0
    loss = 0
    steps_before = 0
    for e in range(3000):
        done = False
        score = 0
        state = env.reset()
        state = torch.Tensor(state).to(device)
        state = state.unsqueeze(0)
        while not done:
            steps += 1
            ### Actions are chosen with the target net
            action = get_action(state, target_net, epsilon, env)
            ### Observe the next state and collect the reward
            next_state, reward, done, _ = env.step(action)
            next_state = torch.Tensor(next_state)
            next_state = next_state.unsqueeze(0)
            if e % 10 == 0:
                print(next_state, action, reward)
            ### Rewritten for clarity (expanded from the ternary form)
            if done:
                mask = 0
            else:
                mask = 1
            ### Record the transition in memory
            action_one_hot = np.zeros(num_actions)
            action_one_hot[action] = 1
            memory.push(state, next_state, action_one_hot, reward, mask)
            ### reward is basically -1
            score += reward  ### only tracks how many steps this episode ran
            state = next_state
            if steps > initial_exploration:
                epsilon -= 0.00005
                epsilon = max(epsilon, 0.1)
                ### Train the online net
                batch = memory.sample(batch_size)
                loss = QNet.train_model(online_net, target_net, optimizer, batch)
                ### Periodically overwrite the target net with the online net
                if steps % update_target == 0:
                    update_target_model(online_net, target_net)
        print("Ep {0:04d}: {1} step".format(e, steps - steps_before))
        steps_before = steps
        score = score if score == 200.0 else score + 1
        running_score = 0.99 * running_score + 0.01 * score
        if e % log_interval == 0:
            print('{} episode | score: {:.2f} | epsilon: {:.2f}'.format(
                e, running_score,
                epsilon))
            if running_score > goal_score:
                break
def main():
    """Actor-critic on CartPole trained one transition at a time (no replay
    memory); per-episode policy/value losses are written to loss.csv."""
    env = gym.make(env_name)
    env.seed(500)
    torch.manual_seed(500)
    num_inputs = env.observation_space.shape[0]
    num_actions = env.action_space.n
    print('state size:', num_inputs)
    print('action size:', num_actions)
    ### Build the policy network
    ### The input yields both pi(a|s) and Q(s, a)
    ### Both heads share the same dimensions and unit counts
    net = QNet(num_inputs, num_actions)
    optimizer = optim.Adam(net.parameters(), lr=lr)
    net.to(device)
    net.train()
    ### Miscellaneous initialization
    running_score = 0
    steps = 0
    loss = 0
    steps_before = 0
    df = pd.DataFrame(index=range(10000),
                      columns=["steps", "loss_policy", "loss_value"])
    for e in range(10000):
        done = False
        ### No per-episode memory at all: learn from each step immediately
        score = 0
        state = env.reset()
        state = torch.Tensor(state).to(device)
        state = state.unsqueeze(0)
        lp = []
        lv = []
        while not done:
            steps += 1
            ### No epsilon: sample the action directly from the policy
            action = net.get_action(state)
            next_state, reward, done, _ = env.step(action)
            next_state = torch.Tensor(next_state)
            next_state = next_state.unsqueeze(0)
            mask = 0 if done else 1
            reward = reward if not done or score == 499 else -1
            transition = [state, next_state, action, reward, mask]
            score += reward
            state = next_state
            ### Train on just this single step's transition
            loss, loss_policy, loss_value = QNet.train_model(net, optimizer, transition)
            # loss = QNet.train_model(net, optimizer, transition)
            lp.append(loss_policy.item())
            lv.append(loss_value.item())
        # Mean loss over the episode excluding the terminal step.
        # NOTE(review): divides by len-1 — a 1-step episode divides by zero.
        lp = np.asarray(lp[:-1]).sum() / (len(lp) - 1)
        lv = np.asarray(lv[:-1]).sum() / (len(lv) - 1)
        print("Ep {0:04d}: {1} step, loss_policy: {2}, loss_value: {3}".format(
            e, steps - steps_before, lp, lv))
        # print("Ep {0:04d}: {1} step".format(e, steps - steps_before))
        df.loc[e, "steps"] = steps - steps_before
        df.loc[e, "loss_policy"] = lp
        df.loc[e, "loss_value"] = lv
        steps_before = steps
        score = score if score == 500.0 else score + 1
        running_score = 0.99 * running_score + 0.01 * score
        if e % log_interval == 0:
            print('{} episode | score: {:.2f}'.format(e, running_score))
            if running_score > goal_score:
                break
    df.to_csv("loss.csv")
def main():
    """Train DQN on CartPole with a target network and replay memory; saves
    the model once the running score exceeds ``args.goal_score``."""
    env = gym.make(args.env_name)
    env.seed(500)
    torch.manual_seed(500)

    num_inputs = env.observation_space.shape[0]
    num_actions = env.action_space.n
    print('state size:', num_inputs)
    print('action size:', num_actions)

    net = QNet(num_inputs, num_actions)
    target_net = QNet(num_inputs, num_actions)
    update_target_model(net, target_net)

    optimizer = optim.Adam(net.parameters(), lr=0.001)
    writer = SummaryWriter('logs')

    if not os.path.isdir(args.save_path):
        os.makedirs(args.save_path)

    net.to(device)
    target_net.to(device)
    net.train()
    target_net.train()
    memory = Memory(10000)
    running_score = 0
    epsilon = 1.0
    steps = 0
    for e in range(3000):
        done = False
        score = 0
        state = env.reset()
        state = torch.Tensor(state).to(device)
        state = state.unsqueeze(0)
        while not done:
            if args.render:
                env.render()
            steps += 1
            qvalue = net(state)
            action = get_action(epsilon, qvalue, num_actions)
            next_state, reward, done, _ = env.step(action)
            next_state = torch.Tensor(next_state)
            next_state = next_state.unsqueeze(0)
            mask = 0 if done else 1
            # Penalize early termination (unless a full 500-step episode).
            reward = reward if not done or score == 499 else -1
            memory.push(state, next_state, action, reward, mask)
            score += reward
            state = next_state

            if steps > args.initial_exploration:
                epsilon -= 0.00005
                epsilon = max(epsilon, 0.1)
                batch = memory.sample(args.batch_size)
                train_model(net, target_net, optimizer, batch, args.batch_size)
                # Fix: the original condition `if steps % args.update_target:`
                # was inverted — it synced the target on every step EXCEPT
                # the intended multiples of update_target.
                if steps % args.update_target == 0:
                    update_target_model(net, target_net)

        score = score if score == 500.0 else score + 1
        running_score = 0.99 * running_score + 0.01 * score
        if e % args.log_interval == 0:
            print('{} episode | score: {:.2f} | epsilon: {:.2f}'.format(
                e, running_score, epsilon))
            # Fix: log the running score against the episode index; the
            # original passed (score, running_score), putting the smoothed
            # score on the x-axis (inconsistent with the other scripts).
            writer.add_scalar('log/score', float(running_score), e)
        if running_score > args.goal_score:
            ckpt_path = args.save_path + 'model.pth'
            torch.save(net.state_dict(), ckpt_path)
            print('running score exceeds 400 so end')
            break
def main():
    """Advantage actor-critic on CartPole: transitions accumulate across
    episodes and the net is trained on the pooled memory every 16 episodes."""
    env = gym.make(env_name)
    env.seed(500)
    torch.manual_seed(500)
    num_inputs = env.observation_space.shape[0]
    num_actions = env.action_space.n
    print('state size:', num_inputs)
    print('action size:', num_actions)
    ### Build the policy network
    ### The input yields both pi(a|s) and V(s)
    ### V has a single output; the advantage is computed at training time
    net = QNet(num_inputs, num_actions)
    optimizer = optim.Adam(net.parameters(), lr=lr)
    net.to(device)
    net.train()
    ### Miscellaneous initialization
    running_score = 0
    steps = 0
    loss = 0
    steps_before = 0
    df = pd.DataFrame(index=range(10000),
                      columns=["steps", "loss_policy", "loss_value"])
    memory = Memory()
    for e in range(10000):
        done = False
        ### Step through the episode, only storing transitions
        ### Reset the environment to its initial state
        score = 0
        state = env.reset()
        state = torch.Tensor(state).to(device)
        state = state.unsqueeze(0)
        while not done:
            steps += 1
            ### No epsilon: sample the action directly from the policy
            action = net.get_action(state)
            next_state, reward, done, _ = env.step(action)
            next_state = torch.Tensor(next_state)
            next_state = next_state.unsqueeze(0)
            mask = 0 if done else 1
            reward = reward if not done or score == 499 else -1
            action_one_hot = torch.zeros(num_actions)
            action_one_hot[action] = 1
            transition = [state, next_state, action, reward, mask]  # NOTE(review): unused
            memory.push(state, next_state, action_one_hot, reward, mask)
            score += reward
            state = next_state
        steps_before = steps
        score = score if score == 500.0 else score + 1
        running_score = 0.99 * running_score + 0.01 * score
        if e % 16 == 0:
            ### Every 16 episodes, train on everything accumulated
            loss, loss_policy, loss_value = QNet.train_model(
                net, optimizer, memory.sample())
            ### Reset the memory
            memory = Memory()
            df.loc[e, "steps"] = running_score
            df.loc[e, "loss_policy"] = loss_policy
            df.loc[e, "loss_value"] = loss_value
            print(
                "Ep {0:04d}: score: {1:02d}, loss_policy: {2}, loss_value: {3}"
                .format(e, int(running_score), loss_policy, loss_value))
        if running_score > goal_score:
            break
    df.to_csv("loss.csv")
def train(render):
    """Resume PER-DQN training on a retro game from a saved network and
    saved replay memory, checkpointing both every episode."""
    online_net = QNet(h=84, w=84, outputs=36)
    online_net.load_state_dict(torch.load('saved/online_net.pt'))
    target_net = QNet(h=84, w=84, outputs=36)
    update_target_model(online_net, target_net)
    optimizer = optim.Adam(online_net.parameters(), lr=lr)
    online_net.to(device)
    target_net.to(device)
    online_net.train()
    target_net.train()
    memory = Memory(replay_memory_capacity)
    # Immediately replaced with the persisted replay buffer.
    memory = torch.load('saved/model_memory.pt')
    epsilon = 0.1
    steps = 0
    beta = beta_start  # importance-sampling exponent, annealed toward 1
    loss = 0
    for e in range(100000):
        #level = random.choice(LEVEL_SET)
        level = 'Level01'
        env = make_retro(game=env_name, state=level,
                         use_restricted_actions=retro.Actions.DISCRETE)
        done = False
        total_reward = 0.0
        state = env.reset()
        # HWC frame -> CHW tensor with a leading batch dimension.
        state = torch.Tensor(state).to(device).permute(2, 0, 1)
        #state = state.view(state.size()[0], -1)
        state = state.unsqueeze(0)
        while not done:
            steps += 1
            action = get_action(state.to(device), target_net, epsilon, env)
            if render:
                env.render()
            next_state, reward, done, info = env.step(action)
            next_state = torch.Tensor(next_state).permute(2, 0, 1)
            #next_state = next_state.view(next_state.size()[0], -1)
            next_state = next_state.unsqueeze(0)
            total_reward += reward
            mask = 0 if done else 1
            action_one_hot = torch.zeros(36)
            action_one_hot[action] = 1
            # The stored reward is the in-game score, not the env reward.
            reward = torch.tensor([info['score']]).to(device)
            memory.push(state, next_state, action_one_hot, reward, mask)
            state = next_state
            if len(memory) > initial_exploration:
                # Anneal epsilon down to 0.02 and beta up to 1.
                epsilon -= 0.00005
                epsilon = max(epsilon, 0.02)
                beta += 0.00005
                beta = min(1, beta)
                batch, weights = memory.sample(batch_size, online_net,
                                               target_net, beta)
                loss = QNet.train_model(online_net, target_net, optimizer,
                                        batch, weights)
                if steps % update_target == 0:
                    update_target_model(online_net, target_net)
        # Checkpoint the net and the replay memory every episode.
        if e % 1 == 0:
            print('{} episode | Total Reward: {}'.format(e, total_reward))
            torch.save(online_net.state_dict(), 'saved/online_net.pt')
            torch.save(memory, 'saved/model_memory.pt')
        env.close()
def main():
    """DRQN-style training on partially observable CartPole: sliding
    sequences of length `sequence_length` are stored and fed to the net."""
    env = gym.make(env_name)
    env.seed(500)
    torch.manual_seed(500)
    # The observation is reduced to 2 dims by state_to_partial_observability.
    num_inputs = 2
    num_actions = env.action_space.n
    print('state size:', num_inputs)
    print('action size:', num_actions)
    online_net = QNet(num_inputs, num_actions)
    target_net = QNet(num_inputs, num_actions)
    update_target_model(online_net, target_net)
    optimizer = optim.Adam(online_net.parameters(), lr=lr)
    writer = SummaryWriter('logs')
    online_net.to(device)
    target_net.to(device)
    online_net.train()
    target_net.train()
    memory = Memory(replay_memory_capacity)
    running_score = 0
    epsilon = 1.0
    steps = 0
    loss = 0
    for e in range(30000):
        done = False
        # Sliding windows of the most recent `sequence_length` observations.
        state_series = deque(maxlen=sequence_length)
        next_state_series = deque(maxlen=sequence_length)
        score = 0
        state = env.reset()
        state = state_to_partial_observability(state)
        state = torch.Tensor(state).to(device)
        # NOTE(review): next_state_series is only appended here, never inside
        # the loop, so the stored "next" sequence never advances — confirm.
        next_state_series.append(state)
        while not done:
            steps += 1
            state_series.append(state)
            action = get_action(state_series, target_net, epsilon, env)
            next_state, reward, done, _ = env.step(action)
            next_state = state_to_partial_observability(next_state)
            next_state = torch.Tensor(next_state)
            mask = 0 if done else 1
            reward = reward if not done or score == 499 else -1
            action_one_hot = np.zeros(2)
            action_one_hot[action] = 1
            # Only push once a full-length sequence is available.
            if len(state_series) >= sequence_length:
                memory.push(state_series, next_state_series, action_one_hot,
                            reward, mask)
            score += reward
            state = next_state
            if steps > initial_exploration:
                epsilon -= 0.000005
                epsilon = max(epsilon, 0.1)
                batch = memory.sample(batch_size)
                loss = QNet.train_model(online_net, target_net, optimizer, batch)
                if steps % update_target == 0:
                    update_target_model(online_net, target_net)
        score = score if score == 500.0 else score + 1
        # Seed the running average with the first episode's score.
        if running_score == 0:
            running_score = score
        else:
            running_score = 0.99 * running_score + 0.01 * score
        if e % log_interval == 0:
            print('{} episode | score: {:.2f} | epsilon: {:.2f}'.format(
                e, running_score, epsilon))
            writer.add_scalar('log/score', float(running_score), e)
            writer.add_scalar('log/loss', float(loss), e)
            if running_score > goal_score:
                break
def main():
    """Train a DQN on CartPole using image features from a pretrained CNN.

    The raw CartPole state is rendered to an image and pushed through a frozen
    SqueezeNet to get a feature vector; consecutive feature vectors are
    concatenated so the network sees (previous, current) pairs. Relies on
    module-level config and helpers (env_name, lr, device, QNet, Memory,
    get_action, update_target_model, render_cv2img, ...).
    """
    env = gym.make(env_name)
    env.seed(500)
    torch.manual_seed(500)

    # The network's in/out sizes depend on the environment.
    # num_inputs = env.observation_space.shape[0]
    num_inputs = 1024
    num_actions = env.action_space.n
    print('state size:', num_inputs)
    print('action size:', num_actions)

    # Create and initialise the two networks (online / target).
    online_net = QNet(num_inputs, num_actions)
    target_net = QNet(num_inputs, num_actions)
    update_target_model(online_net, target_net)

    optimizer = optim.Adam(online_net.parameters(), lr=lr)

    # Put both networks on CPU / GPU.
    online_net.to(device)
    target_net.to(device)
    # Both networks start in training mode.
    online_net.train()
    target_net.train()

    # Pretrained model used as a fixed feature extractor.
    # pre_model = models.resnet50(pretrained=True)
    # pre_model.fc = nn.Identity()
    pre_model = models.squeezenet1_0(pretrained=True)
    pre_model.classifier = nn.AdaptiveAvgPool2d((1, 1))
    pre_model.to(device)

    def state_to_feature(state):
        # Render the state (uses components 0 and 2 — presumably position and
        # angle; confirm against render_cv2img) and run the image through the
        # frozen CNN to obtain a feature vector.
        state_img = render_cv2img(state[0], state[2])
        state_img = cv2.resize(state_img, (224, 224))[:, :, 0]
        state_img = state_img.reshape((1, 224, 224))
        # Replicate the grayscale image across the 3 RGB channels.
        state_img_rgb = np.zeros((1, 3, 224, 224))
        state_img_rgb[:, 0] = state_img
        state_img_rgb[:, 1] = state_img
        state_img_rgb[:, 2] = state_img
        state_img_rgb_tensor = torch.Tensor(state_img_rgb).to(device)
        state_feature = pre_model(state_img_rgb_tensor)
        return state_feature

    # Where replay memory is persisted (work in progress).
    memory_dir = "memory/"
    memory = Memory(replay_memory_capacity, memory_dir)

    # Initial settings before training.
    running_score = 0
    epsilon = 1.0
    steps = 0
    loss = 0
    steps_before = 0

    for e in range(3000):
        done = False
        score = 0
        # state = [position, velocity, angle, angular velocity],
        # e.g. [-0.01517264  0.02423424  0.02480018 -0.04009749]
        state = env.reset()
        # state becomes a [[feature vector]] of size num_inputs.
        state = state_to_feature(state)
        # Without the previous step's information the task is hard, so keep
        # the previous state around; initially it equals the current state.
        previous_state = state

        while not done:
            steps += 1
            # Action selection is performed with target_net.
            previous_present_state = torch.cat((previous_state, state), 1)
            action = get_action(previous_present_state, target_net, epsilon, env)

            # Observe the next state and obtain the reward.
            next_state, reward, done, _ = env.step(action)
            next_state = state_to_feature(next_state)
            present_next_state = torch.cat((state, next_state), 1)

            # Rewritten for clarity.
            if done:
                mask = 0
            else:
                mask = 1
            if (done and (score != 499)):
                # The episode ended before reaching step 499.
                reward = -1
            else:
                pass  # reward is normally 1

            # Record the transition in memory.
            action_one_hot = np.zeros(2)
            action_one_hot[action] = 1
            memory.push(previous_present_state, present_next_state,
                        action_one_hot, reward, mask)

            # reward is normally 1; score just counts the episode's steps.
            score += reward

            if steps > initial_exploration:
                epsilon -= 0.00005
                epsilon = max(epsilon, 0.1)

                # Train online_net.
                batch = memory.sample(batch_size)
                loss = QNet.train_model(online_net, target_net, optimizer, batch)

                # Occasionally overwrite target_net with online_net.
                if steps % update_target == 0:
                    update_target_model(online_net, target_net)

            # Advance to the next step.
            previous_state = state
            state = next_state

        print("Ep {0:04d}: {1} step".format(e, steps - steps_before))
        steps_before = steps

        score = score if score == 500.0 else score + 1
        running_score = 0.99 * running_score + 0.01 * score
        if e % log_interval == 0:
            print('{} episode | score: {:.2f} | epsilon: {:.2f}'.format(
                e, running_score, epsilon))

        if running_score > goal_score:
            break
def main():
    """Train a DQN agent on an Atari environment with a 4-frame history.

    Uses module-level config via ``args`` (env_name, save_path, render,
    initial_exploration, batch_size, update_target, log_interval, goal_score)
    and helpers (QNet, Memory, pre_process, get_action, train_model,
    update_target_model, device).
    """
    env = gym.make(args.env_name)
    env.seed(500)
    torch.manual_seed(500)

    img_shape = env.observation_space.shape
    # Action space is restricted to 3 actions; env.step receives action + 1.
    num_actions = 3
    print('image size:', img_shape)
    print('action size:', num_actions)

    net = QNet(num_actions)
    target_net = QNet(num_actions)
    update_target_model(net, target_net)

    optimizer = optim.RMSprop(net.parameters(), lr=0.00025, eps=0.01)
    writer = SummaryWriter('logs')

    if not os.path.isdir(args.save_path):
        os.makedirs(args.save_path)

    net.to(device)
    target_net.to(device)
    net.train()
    target_net.train()
    memory = Memory(100000)
    running_score = 0
    epsilon = 1.0
    steps = 0

    for e in range(10000):
        done = False
        dead = False

        score = 0
        avg_loss = []
        start_life = 5
        state = env.reset()

        state = pre_process(state)
        state = torch.Tensor(state).to(device)
        history = torch.stack((state, state, state, state))

        # Warm up the 4-frame history with a few random actions.
        for i in range(3):
            action = env.action_space.sample()
            state, reward, done, info = env.step(action)
            state = pre_process(state)
            state = torch.Tensor(state).to(device)
            state = state.unsqueeze(0)
            history = torch.cat((state, history[:-1]), dim=0)

        while not done:
            if args.render:
                env.render()

            steps += 1
            qvalue = net(history.unsqueeze(0))
            action = get_action(epsilon, qvalue, num_actions)

            next_state, reward, done, info = env.step(action + 1)

            next_state = pre_process(next_state)
            next_state = torch.Tensor(next_state).to(device)
            next_state = next_state.unsqueeze(0)
            next_history = torch.cat((next_state, history[:-1]), dim=0)

            # Losing a life counts as a terminal transition for the replay.
            if start_life > info['ale.lives']:
                dead = True
                start_life = info['ale.lives']

            score += reward
            reward = np.clip(reward, -1, 1)

            mask = 0 if dead else 1
            memory.push(history.cpu(), next_history.cpu(), action, reward, mask)

            if dead:
                dead = False

            if steps > args.initial_exploration:
                epsilon -= 1e-6
                epsilon = max(epsilon, 0.1)

                batch = memory.sample(args.batch_size)
                loss = train_model(net, target_net, optimizer, batch)

                # BUG FIX: the original condition was `steps % args.update_target`
                # (truthy on every step EXCEPT multiples of update_target), i.e.
                # inverted — it synced the target net almost every step. Sync
                # only once per update_target steps, as the other trainers do.
                if steps % args.update_target == 0:
                    update_target_model(net, target_net)
            else:
                loss = 0

            avg_loss.append(loss)
            history = next_history

        if e % args.log_interval == 0:
            print('{} episode | score: {:.2f} | epsilon: {:.4f} | steps: {} | loss: {:.4f}'
                  .format(e, score, epsilon, steps, np.mean(avg_loss)))
            writer.add_scalar('log/score', float(score), steps)
            # BUG FIX: the loss was logged under 'log/score' again, clobbering
            # the score curve; give it its own tag.
            writer.add_scalar('log/loss', np.mean(avg_loss), steps)

        if score > args.goal_score:
            ckpt_path = args.save_path + 'model.pth'
            torch.save(net.state_dict(), ckpt_path)
            print('running score exceeds 400 so end')
            break
def main():
    """Train a DQN with prioritized experience replay (PER) on CartPole.

    Uses module-level config (env_name, lr, device, replay_memory_capacity,
    beta_start, initial_exploration, batch_size, update_target, log_interval,
    goal_score) and helpers (QNet, Memory_With_TDError, get_action,
    update_target_model).
    """
    env = gym.make(env_name)
    env.seed(500)
    torch.manual_seed(500)

    num_inputs = env.observation_space.shape[0]
    num_actions = env.action_space.n
    print('state size:', num_inputs)
    print('action size:', num_actions)

    online_net = QNet(num_inputs, num_actions)
    target_net = QNet(num_inputs, num_actions)
    update_target_model(online_net, target_net)

    optimizer = optim.Adam(online_net.parameters(), lr=lr)
    writer = SummaryWriter('logs')

    online_net.to(device)
    target_net.to(device)
    online_net.train()
    target_net.train()
    # Replay memory that tracks TD errors for prioritized sampling.
    memory = Memory_With_TDError(replay_memory_capacity)
    running_score = 0
    epsilon = 1.0
    steps = 0
    # beta anneals the importance-sampling correction from beta_start up to 1.
    beta = beta_start
    loss = 0

    for e in range(3000):
        done = False

        score = 0
        state = env.reset()
        state = torch.Tensor(state).to(device)
        state = state.unsqueeze(0)

        while not done:
            steps += 1

            action = get_action(state, target_net, epsilon, env)
            next_state, reward, done, _ = env.step(action)

            # NOTE(review): next_state is not moved to `device` here, unlike
            # the reset state above — looks CPU-only; confirm before GPU use.
            next_state = torch.Tensor(next_state)
            next_state = next_state.unsqueeze(0)

            mask = 0 if done else 1
            # Penalise early termination; an episode reaching 499 keeps +1.
            reward = reward if not done or score == 499 else -1
            action_one_hot = np.zeros(2)
            action_one_hot[action] = 1
            memory.push(state, next_state, action_one_hot, reward, mask)

            score += reward
            state = next_state

            if steps > initial_exploration:
                epsilon -= 0.00005
                epsilon = max(epsilon, 0.1)
                beta += 0.00005
                beta = min(1, beta)

                # Sampling returns importance-sampling weights alongside the batch.
                batch, weights = memory.sample(batch_size, online_net,
                                               target_net, beta)
                loss = QNet.train_model(online_net, target_net, optimizer,
                                        batch, weights)

                # Periodically sync the target network with the online one.
                if steps % update_target == 0:
                    update_target_model(online_net, target_net)

        score = score if score == 500.0 else score + 1
        running_score = 0.99 * running_score + 0.01 * score
        if e % log_interval == 0:
            print(
                '{} episode | score: {:.2f} | epsilon: {:.2f} | beta: {:.2f}'.
                format(e, running_score, epsilon, beta))
            writer.add_scalar('log/score', float(running_score), e)
            writer.add_scalar('log/loss', float(loss), e)

        if running_score > goal_score:
            break
def main(L, mouse_initial_indices, rewardlist, actions_list):
    """Train a DQN agent to navigate the grid maze ``L``.

    Args (as used by the code below; confirm against callers):
        L: 2-D numpy array encoding the maze. Cell codes inferred from the
           branches: 0 = wall, 1 = free cell, 2 = goal (terminal),
           4 = water, 5 = water already drunk — TODO confirm.
        mouse_initial_indices: (row, col) starting cell, or None to sample a
            random free cell at each episode.
        rewardlist: reward value indexed by cell code.
        actions_list: per-action (drow, dcol) displacement vectors.

    Returns:
        (number_episode, scores, best_scores) — the episode budget, the EMA
        score per episode, and the running best raw score per episode.
    """
    if mouse_initial_indices is None:
        # Every free cell (value 1) is a candidate starting position.
        all_possible_starting_positions = np.array([*np.where(L == 1)]).T
    scores = [0]
    best_scores = [0]
    env = deepcopy(L)
    torch.manual_seed(2020)

    # State = (row, col) plus a flag telling whether the water was drunk.
    num_inputs = 2 + 1
    num_actions = 4
    print('state size:', num_inputs)
    print('action size:', num_actions)

    online_net = QNet(num_inputs, num_actions)
    target_net = QNet(num_inputs, num_actions)
    update_target_model(online_net, target_net)

    optimizer = optim.Adam(online_net.parameters(), lr=lr)
    # writer = SummaryWriter('logs')

    online_net.to(device)
    target_net.to(device)
    online_net.train()
    target_net.train()
    memory = Memory(replay_memory_capacity)
    running_score = 0
    epsilon = 1.0
    steps = 0
    loss = 0
    inint = mouse_initial_indices
    best_score = 0
    number_episode = 1000

    for e in range(number_episode):
        if inint is None:
            # Sample a fresh random start cell for this episode.
            mouse_initial_indices = all_possible_starting_positions[
                np.random.choice(range(len(all_possible_starting_positions)))]
        done = False
        env = deepcopy(L)  # reset the maze (restores the water cell)
        eaubue = 0.  # "eau bue" (French): 1.0 once the water has been drunk
        score = 0
        state = np.array(mouse_initial_indices)
        state = torch.Tensor(state).to(device)
        state = state.unsqueeze(0)

        while not done:
            steps += 1

            action = get_action(state, target_net, epsilon, env, eaubue=eaubue)
            newstate = state + torch.Tensor(np.array(
                actions_list[action])).to(device)
            if env[int(newstate[0][0].tolist()),
                   int(newstate[0][1].tolist())] != 0:
                # Target cell is not a wall: the move is accepted.
                next_state = newstate
                new_eaubue = eaubue
                reward = rewardlist[env[int(newstate[0][0].tolist()),
                                        int(newstate[0][1].tolist())]]
                if env[int(newstate[0][0].tolist()),
                       int(newstate[0][1].tolist())] == 2:
                    # Reached the goal cell: episode ends.
                    done = True
                if env[int(newstate[0][0].tolist()),
                       int(newstate[0][1].tolist()
                           )] == 4:  # if the mouse is in the water
                    env[int(newstate[0][0].tolist()),
                        int(newstate[0][1].tolist()
                            )] = 5  # there is no more water
                    new_eaubue = 1.
            else:
                # Hit a wall: stay in place and take the wall penalty.
                next_state = state
                reward = rewardlist[0]
                new_eaubue = eaubue
            mask = 0 if done else 1
            action_one_hot = np.zeros(4)
            action_one_hot[action] = 1
            # Store (position, water-flag) for both current and next state.
            memory.push(
                torch.cat((
                    state,
                    torch.tensor(eaubue).unsqueeze(0).unsqueeze(0).to(device)),
                          1),
                torch.cat((next_state, torch.tensor(new_eaubue).unsqueeze(
                    0).unsqueeze(0).to(device)), 1), action_one_hot, reward,
                mask)

            score += reward
            state = next_state
            eaubue = new_eaubue

            if steps > initial_exploration:
                epsilon -= 0.00005
                epsilon = max(epsilon, 0.1)

                batch = memory.sample(batch_size)
                loss = QNet.train_model(online_net, target_net, optimizer,
                                        batch)

                if steps % update_target == 0:
                    update_target_model(online_net, target_net)
                    # print("OK")

        if score > 35:
            print(score)
        running_score = 0.99 * running_score + 0.01 * score
        # running_score=score
        scores.append(running_score)
        best_scores.append(
            score if score > best_scores[-1] else best_scores[-1])
        if e % log_interval == 0:
            # NOTE(review): prints best_score as it stood BEFORE this
            # episode's update below — confirm that is intended.
            print(
                '{} episode | score: {:.2f} | best score: {:.2f} | epsilon: {:.2f}'
                .format(e, running_score, best_score, epsilon))
            # writer.add_scalar('log/score', float(running_score), e)
            # writer.add_scalar('log/loss', float(loss), e)
        if score > best_score:
            best_score = score
            torch.save(online_net.state_dict(), "./qlearning_model")
        if running_score > goal_score:
            break
    return number_episode, scores, best_scores