def main(): """메인.""" # 환경 생성 env = make_env(ENV_NAME) net = DQN(env.observation_space.shape, env.action_space.n) net.apply(weights_init) tgt_net = DQN(env.observation_space.shape, env.action_space.n) tgt_net.load_state_dict(net.state_dict()) if PRIORITIZED: memory = PrioReplayBuffer(PRIO_BUF_SIZE) else: memory = ReplayBuffer(SEND_SIZE) # 고정 eps로 에이전트 생성 epsilon = EPS_BASE**(1 + actor_id / (num_actor - 1) * EPS_ALPHA) agent = Agent(env, memory, epsilon, PRIORITIZED) log("Actor {} - epsilon {:.5f}".format(actor_id, epsilon)) # zmq 초기화 context, lrn_sock, buf_sock = init_zmq() # 러너에게서 기본 가중치 받고 시작 net, tgt_net = receive_model(lrn_sock, net, tgt_net, True) # # 시뮬레이션 # episode = frame_idx = 0 p_time = p_frame = None p_reward = -50.0 while True: frame_idx += 1 # 스텝 진행 (에피소드 종료면 reset까지) reward = agent.play_step(net, tgt_net, epsilon, frame_idx) # 리워드가 있는 경우 (에피소드 종료) if reward is not None: episode += 1 p_reward = reward # 보내기 if frame_idx % SEND_FREQ == 0: # 학습관련 정보 if p_time is None: speed = 0.0 else: speed = (frame_idx - p_frame) / (time.time() - p_time) info = ActorInfo(episode, frame_idx, p_reward, speed) # 리플레이 정보와 정보 전송 agent.send_replay(buf_sock, info) # 동작 선택 횟수 agent.show_action_rate() p_time = time.time() p_frame = frame_idx # 새로운 모델 받기 net, tgt_net = receive_model(lrn_sock, net, tgt_net, False)
def train(agent, env_name):
    writer = SummaryWriter()
    if env_name in ['MountainCar-v0', 'CartPole-v0']:
        env_type = 1
        env = gym.make(env_name)
    elif env_name in ["PongNoFrameskip-v4"]:
        env_type = 2
        env = make_env(env_name)

    step = 0
    for i in range(99999):
        agent.epsilon = 1 / (i * 0.1 + 1)
        done = False
        state = env.reset()
        score = 0
        step_per_q_value = 0
        step_per_loss = 0
        sum_of_q_value = 0
        total_loss = 0
        agent.n_step_buffer.reset()
        agent.save_model('model/model')

        while not done:
            step += 1
            if i % 10 == 0:
                env.render()
            action, q_value = agent.get_action(state, agent.epsilon)
            if q_value is not None:
                sum_of_q_value += q_value
                step_per_q_value += 1

            if env_type == 1:
                next_state, reward, done, info = env.step(action)
            elif env_type == 2:
                # offset the action index by one for this environment
                next_state, reward, done, info = env.step(action + 1)
            score += reward
            agent.append_to_memory(state, next_state, reward, done, action)
            state = next_state

            if step > agent.batch_size:
                if step % agent.train_size == 0:
                    step_per_loss += 1
                    loss = agent.update()
                    total_loss += loss
                if step % agent.update_size == 0:
                    agent.update_parameter()

        writer.add_scalar('data/step', step, i)
        writer.add_scalar('data/score', score, i)
        writer.add_scalar('data/epsilon', agent.epsilon, i)
        if step_per_q_value != 0:
            writer.add_scalar('data/average_of_q_value', sum_of_q_value / step_per_q_value, i)
        if step_per_loss != 0:
            writer.add_scalar('data/loss', total_loss / step_per_loss, i)
        print(score, i)
def main():
    env = make_env('PongNoFrameskip-v4')
    device = torch.device('cuda')
    policy_net = PolicyNet(env.observation_space.shape, env.action_space.n).to(device)
    base_net = Baseline(env.observation_space.shape).to(device)
    policy_net.load_state_dict(torch.load('./policynet'))
    base_net.load_state_dict(torch.load('./basenet'))
    agent = Agent(policy_net, base_net)
    agent.train(env, 16, 20000, 0.98, 5)
def main(): """메인.""" # 환경 생성 env = make_env(ENV_NAME) set_random_seed(env, actor_id) net = A2C(env.observation_space.shape, env.action_space.n) net.apply(weights_init) memory = ReplayBuffer(SEND_SIZE) agent = Agent(env, memory, NUM_UNROLL) log("Actor {}".format(actor_id)) # zmq 초기화 context, lrn_sock, buf_sock = init_zmq() # 러너에게서 기본 가중치 받고 시작 net = receive_model(lrn_sock, net, True) # # 시뮬레이션 # episode = frame_idx = 0 p_time = p_frame = None p_reward = -50.0 while True: frame_idx += 1 # 스텝 진행 (에피소드 종료면 reset까지) ep_reward = agent.play_step(net, frame_idx) # 에피소드 리워드가 있는 경우 (에피소드 종료) if ep_reward is not None: episode += 1 p_reward = ep_reward log("Episode finished! reward {}".format(ep_reward)) # 보내기 if frame_idx % SEND_FREQ == 0: # 학습관련 정보 if p_time is None: speed = 0.0 else: speed = (frame_idx - p_frame) / (time.time() - p_time) info = ActorInfo(episode, frame_idx, p_reward, speed) # 리플레이 정보와 정보 전송 agent.send_replay(buf_sock, info) # 동작 선택 횟수 agent.show_action_rate() p_time = time.time() p_frame = frame_idx # 새로운 모델 받기 net = receive_model(lrn_sock, net, False)
def run_experiment(params, log_dir, local_log_path, random_seed=None):
    # create env and add Malmo-specific configuration
    env = make_env(params["DEFAULT_ENV_NAME"])
    env.configure(client_pool=[('127.0.0.1', 10000), ('127.0.0.1', 10001)])
    env.configure(allowDiscreteMovement=["move", "turn"])  # , log_level="INFO")
    env.configure(videoResolution=[420, 420])
    env.configure(stack_frames=4)
    env = wrap_env_malmo(env)

    # random rollout: sample actions until the episode ends, then reset
    while True:
        action = env.action_space.sample()
        new_state, reward, is_done, _ = env.step(action)
        env.render('human')
        if is_done:
            env.reset()
def train_agent(device: Any) -> None: """ Train agent using embedder and embedded checkpoints. TODO Fix docstrings once finished. """ # Load embedded network tdc = TDC().to(device) load_tdc(tdc) # Create checkpoints loader = get_checkpoint_loader() checkpoints: List[torch.Tensor] = get_checkpoints(tdc, loader) # Create environment env = make_env(tdc, checkpoints) # TODO Temporarily added to disable flake8 error print(env)
def test(agent, env_name):
    if env_name in ['MountainCar-v0', 'CartPole-v0']:
        env_type = 1
        env = gym.make(env_name)
    elif env_name in ["PongNoFrameskip-v4"]:
        env_type = 2
        env = make_env(env_name)

    agent.load_model('model/model')
    while True:
        state = env.reset()
        done = False
        while not done:
            env.render()
            time.sleep(0.01)
            action, q_value = agent.get_action(state, 0)
            if env_type == 1:
                next_state, reward, done, _ = env.step(action)
            elif env_type == 2:
                # offset the action index by one for this environment
                next_state, reward, done, _ = env.step(action + 1)
            state = next_state
def prep_agent(params, log_dir, local_log, random_seed, trial, agent_id, port): # define device on which to run device = torch.device(params["DEVICE"]) # create env and add specific conifigurations to Malmo env = make_env(params["DEFAULT_ENV_NAME"]) env.configure(client_pool=[('127.0.0.1', int(port))]) # fix port env.configure(allowDiscreteMovement=["move", "turn"]) # , log_level="INFO") env.configure(videoResolution=[84,84]) env.configure(stack_frames=4) env = wrap_env_malmo(env) if random_seed: env.seed(random_seed) print(colored("Observation Space: ", COLORS[agent_id]), colored(env.observation_space, COLORS[agent_id])) print(colored("Action Space: ", COLORS[agent_id]), colored(env.action_space, COLORS[agent_id])) # initialize agent bufer = ExperienceBufferGridImage(params["REPLAY_SIZE"]) # buffer = ExperienceBuffer(params["REPLAY_SIZE"]) net = DQN(env.observation_space.shape, env.action_space.n, params["DEVICE"]).to(device) tgt_net = DQN(env.observation_space.shape, env.action_space.n, params["DEVICE"]).to(device) epsilon = params["EPSILON_START"] gamma = params["GAMMA"] tau = params["SOFT_UPDATE_TAU"] agent = Agent('agent' + str(agent_id), env, bufer, net, tgt_net, gamma, epsilon, tau, trial, log_dir, params) # other variables agent.optimizer = optim.Adam(agent.net.parameters(), lr=params["LEARNING_RATE"]) agent.print_color = COLORS[agent_id] # fill buffer with initial size - don't count these episodes agent.fill_buffer() return agent
def make_gif(extension):
    agent = Agent(alpha=1e-4, gamma=0.99, n_actions=3,
                  action_map={0: 0, 1: 4, 2: 5},
                  mem_size=25000, batch_size=32, replace=0,
                  input_dims=(4, 80, 80),
                  epsilon=0.02, epsilon_dec=0, epsilon_min=0,
                  load_from_checkpoint=False)
    agent.load_models(extension=extension)

    frames = []
    done = False
    env = make_env("PongNoFrameskip-v4")
    observation = env.reset()
    i = 0
    while not done:
        # keep every third frame to limit GIF size
        if i % 3 == 0:
            frames.append(Image.fromarray(env.render(mode='rgb_array')))
        action = agent.get_action(observation)
        move = agent.action_map[action]
        new_observation, reward, done, info = env.step(move)
        observation = new_observation
        i += 1

    with open(f'{extension}.gif', 'wb') as f:  # change the path if necessary
        # write the captured frames (the first frame is the real first image,
        # not a blank canvas)
        frames[0].save(f, format='GIF', save_all=True, append_images=frames[1:])
    'NB_FRAMES': 10000,
    'BATCH_SIZE': 32,
    'DISCOUNT': 0.99,
    'TARGET_UPDATE_STEPS': 100,
    'LEARNING_RATE': 1e-3,
    'REPLAY_BUFFER_SIZE': 1000,
    'MIN_REPLAY_BUFFER_SIZE': 100,
    'EPSILON_START': 1,
    'EPSILON_END': 0.1,
    'EPSILON_DECAY_DURATION': 5000,
}

# Allow changing hyperparameters from command-line arguments
args = get_args(default_args=args_dict)

# Create wrapped environment
env = make_env(args.ENV_ID)

# Set Seed
set_seed(env, args.SEED)

# GPU or CPU
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Create agent
agent = Agent(env, device, args)

# Train agent for args.NB_FRAMES
agent.train()

# Save agent
agent.save()
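# Hedged sketch, not part of the script above: the Agent is assumed to anneal
# epsilon linearly from EPSILON_START to EPSILON_END over EPSILON_DECAY_DURATION
# frames; a schedule with that shape would look like this (illustrative only).
def epsilon_by_frame(frame_idx, args):
    fraction = min(frame_idx / args.EPSILON_DECAY_DURATION, 1.0)
    return args.EPSILON_START + fraction * (args.EPSILON_END - args.EPSILON_START)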
"--env", default=DEFAULT_ENV_NAME, help="Environment name to use" "default=" + DEFAULT_ENV_NAME) parser.add_argument("-r", "--record", help="Directory to store video " "recording") parser.add_argument("--no-visualize", default=True, action='store_false', dest='visualize', help="Disable visualization of the game play") args = parser.parse_args() env = make_env(args.env) if args.record: env = gym.wrappers.Monitor(env, args.record) net = torch.load(args.model, map_location={'cuda:0': 'cpu'}) state = env.reset() total_reward = 0.0 dead = False start_life = 5 c = collections.Counter() while True: start_ts = time.time() if args.visualize: env.render() state_v = torch.tensor(np.array([state], copy=False))
def main():
    # Get Atari games.
    env = make_env()

    # Run training
    atari_learn(env, num_timesteps=2e8)
    # d = local_log._getvalue()
    # with open(local_log_path, "w") as f:
    #     json.dump(d, f)

    # Inform that the experiment is done
    print("Experiment complete. Results found at: " + local_log_path)


def prep_agent(params, log_dir, local_log, random_seed, trial, agent_id, port,
               give, receive, req_give, req_receive):
    # define device on which to run
    device = torch.device(params["DEVICE"])

    # create env and add Malmo-specific configuration
    env = make_env(params["DEFAULT_ENV_NAME"])
    env.configure(client_pool=[('127.0.0.1', int(port))])  # fix port
    env.configure(allowDiscreteMovement=["move", "turn"])  # , log_level="INFO")
    env.configure(videoResolution=[84, 84])
    env.configure(stack_frames=4)
    env = wrap_env_malmo(env)
    if random_seed:
        env.seed(random_seed)

    print(colored("Observation Space: ", COLORS[agent_id]),
          colored(env.observation_space, COLORS[agent_id]))
    print(colored("Action Space: ", COLORS[agent_id]),
          colored(env.action_space, COLORS[agent_id]))

    # initialize agent
    bufer = ExperienceBufferGridImage(params["REPLAY_SIZE"])
    # buffer = ExperienceBuffer(params["REPLAY_SIZE"])
print(f"episode score: {total_return}") if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('--task', type=str, default='pong') parser.add_argument('--render', action='store_true') parser.add_argument('--cpu', action='store_true') parser.add_argument('--evaluate', type=str, default=None) parser.add_argument('--resume', type=str, default=None, nargs=2) args = parser.parse_args() params = HPS[args.task] device = torch.device('cpu') if args.cpu else torch.device('cuda') env = make_env(params.env_name) obs_shape = env.observation_space.shape nb_actions = env.action_space.n if params.net_type == 'conv': net = CategoricalDQN((params.frame_stack, *obs_shape), nb_actions) agent = CategoricalDQNAgent( net=net, nb_actions=nb_actions, gamma=params.gamma, device=device, ) if args.evaluate: agent.net.load_state_dict(torch.load(args.evaluate)) env = make_env(params.env_name, episodic=False)
def DQN_experiment(params, log_dir, random_seed=None): # define device on which to run device = torch.device(params["DEVICE"]) # fix replay start sie to be equal to replay size params["REPLAY_START_SIZE"] = params["REPLAY_SIZE"] ## initialize global variables # initialize local log trackers log_episodes_count = [] log_ma_steps = [] log_md_steps = [] log_ma_rewards = [] log_md_rewards = [] colors=['green','red','blue','yellow','cyan','magenta','grey','white'] # try several times and average results, needs to compensate for stochasticity for trial in range(params["NUM_TRIALS"]): # initialize environment agents = [] # need to be one env per agent env = make_env(params["DEFAULT_ENV_NAME"]) if random_seed: env.seed(random_seed) # initialize agents for idx in range(params["NUM_AGENTS"]): # initialize agent buffer = ExperienceBuffer(params["REPLAY_SIZE"], env) net = DQN(env.observation_space.shape[0], env.action_space.n, params["DEVICE"]).to(device) tgt_net = DQN(env.observation_space.shape[0], env.action_space.n, params["DEVICE"]).to(device) epsilon = params["EPSILON_START"] gamma = params["GAMMA"] tau = params["SOFT_UPDATE_TAU"] agent = Agent('agent' + str(idx+1), env, buffer, net, tgt_net, gamma, epsilon, tau, trial, log_dir) # other variables agent.optimizer = optim.Adam(agent.net.parameters(), lr=params["LEARNING_RATE"]) agent.print_color = colors[idx] agents.append(agent) ######### training loop ################################ ts = time.time() # track start time ######### 1. Filling replay bugg ################################ # both agents fill their buffer prior to experience for agent in agents: while True: # add frame count agent.frame_idx+= 1 # play step episode_over, done_reward = agent.play_step(device=device) if params["DEBUG"]: agent.record() # check if minimum buffer size has been achieved. if not, move on, do not do learning if len(agent.exp_buffer) >= params["REPLAY_START_SIZE"]: agent.reset() break ######### 1. 
They start alternating ################################ episode_start = time.time() ep_count = 0 # while all agents have not completed: while sum(map(lambda agent:agent.completed, agents)) != len(agents): ep_count += 1 # agents alternate for agent in agents: ## Before 2 agents perform, act, do one round of experience share # given a sharing interval and it is not the first episode if params["SHARING"] and ep_count % params["SHARING_INTERVAL"] == 0 and ep_count > 0: # agent 1 requests student, teacher = agents[0], agents[1] transfer_mask = student.request_share(threshold=0) transfer_batch = teacher.exp_buffer.sample_with_mask(student.steps[-1], transfer_mask) student.exp_buffer.extend(transfer_batch) # agent 2 requests student, teacher = agents[1], agents[0] transfer_mask = student.request_share(threshold=0) transfer_batch = teacher.exp_buffer.sample_with_mask(student.steps[1], transfer_mask) student.exp_buffer.extend(transfer_batch) # check if agent has not completed the task already # if it does, go to the next agent if not agent.completed: # play until episode is over episode_over = False while not episode_over: # add frame count agent.frame_idx+= 1 # play step episode_over, done_reward = agent.play_step(device=device) if done_reward is not None: # calculate speed agent.speed = (agent.frame_idx - agent.ts_frame) / (time.time() - ts) agent.ts_frame = agent.frame_idx ts = time.time() # get time between episodes ## verify completion and report metrics if params["INDEPENDENT_EVALUATION"]: if len(agent.total_rewards) % params["TRACKING_INTERVAL"] == 0: agent.test_rewards = [] evaluation_start = time.time() for _ in range(100): done_reward = False while not done_reward: _, done_reward = agent.play_step(device=device, test=True) agent.test_rewards.append(done_reward) evaluation_time = time.time() - evaluation_start # only report after one episode ends agent.mean_reward = np.mean(agent.test_rewards) agent.std_reward = np.std(agent.test_rewards) # calculate elapsed time episode_end = time.time() episode_speed = params["TRACKING_INTERVAL"] / (episode_end - episode_start) episode_start = time.time() # report print(colored("%s, %d: done %d episodes, mean reward %.2f, std reward %.2f, eps %.2f, speed %d f/s, ep_speed %.2f e/s, eval_time %.2f s" % ( agent.alias, agent.frame_idx, len(agent.total_rewards), agent.mean_reward, agent.std_reward, agent.epsilon, agent.speed, episode_speed, evaluation_time ), agent.print_color)) ## check if reward has improved from last iteration if agent.mean_reward is not None: if agent.mean_reward > params["MEAN_REWARD_BOUND"]: print(colored("%s solved in %d episodes!" 
% (agent.alias, len(agent.total_rewards)), agent.print_color)) # save final version # save final version # torch.save(agent.net.state_dict(), "weights/" + params["DEFAULT_ENV_NAME"] + "-" + agent.alias + "-best.dat") # mark as completed agent.completed = True # save local log log_episodes_count[agent.alias].append(len(agent.total_rewards)) log_steps[agent.alias].append(len(agent.total_rewards)) ## approach to track evaluation using moving averages: else: # only report after one episode ends agent.mean_reward = np.mean(agent.total_rewards[-params["NUMBER_EPISODES_MEAN"]:]) agent.std_reward = np.std(agent.total_rewards[-params["NUMBER_EPISODES_MEAN"]:]) # calculate elapsed time episode_end = time.time() episode_speed = 1 / (episode_end - episode_start) episode_start = time.time() # report if len(agent.total_rewards) % params["TRACKING_INTERVAL"] == 0: print(colored("%s, %d: done %d episodes, mean reward %.2f, std reward %.2f, eps %.2f, speed %d f/s, ep_speed %.2f e/s" % ( agent.alias, agent.frame_idx, len(agent.total_rewards), agent.mean_reward, agent.std_reward, agent.epsilon, agent.speed, episode_speed ), agent.print_color)) ## check if reward has improved from last iteration if agent.mean_reward is not None: if agent.mean_reward > params["MEAN_REWARD_BOUND"]: print(colored("%s solved in %d episodes!" % (agent.alias, len(agent.total_rewards)), agent.print_color)) # save final version # torch.save(agent.net.state_dict(), "weights/" + params["DEFAULT_ENV_NAME"] + "-" + agent.alias + "-best.dat") # mark as completed agent.completed = True # save local log log_episodes_count.append(len(agent.total_rewards)) log_ma_rewards.append(np.mean(agent.total_rewards[-params["REPORTING_INTERVAL"]:])) log_md_rewards.append(np.std(agent.total_rewards[-params["REPORTING_INTERVAL"]:])) log_ma_steps.append(np.mean(agent.total_steps[-params["REPORTING_INTERVAL"]:])) log_md_steps.append(np.std(agent.total_steps[-params["REPORTING_INTERVAL"]:])) # if no sign of converging, also break # but don't store the result if len(agent.total_rewards) > params["MAX_GAMES_PLAYED"]: agent.completed = True # decay epsilon after the first episodes that fill the buffer # decay epsilon linearly on frames agent.epsilon = max(params["EPSILON_FINAL"], params["EPSILON_START"] - (agent.frame_idx-params["REPLAY_START_SIZE"]) / params["EPSILON_DECAY_LAST_FRAME"]) # update at every frame using soft updates if params["SOFT"]: agent.soft_update_target_network() else: if agent.frame_idx % params["SYNC_TARGET_FRAMES"] == 0: agent.tgt_net.load_state_dict(agent.net.state_dict()) ## learn # zero gradients agent.optimizer.zero_grad() # sample from buffer batch = agent.exp_buffer.sample(params["BATCH_SIZE"]) # calculate loss # decide to leave it on the agent as a static method, instead of floating around loss_t = agent.calc_loss(batch, device=device) # calculate gradients loss_t.backward() # gradient clipping if params["GRADIENT_CLIPPING"]: nn.utils.clip_grad_norm_(net.parameters(), params["GRAD_L2_CLIP"]) # optimize agent.optimizer.step() # track agent parameters, including loss function # detach loss before extracting value - not sure if needed, but better safe than sorry if params["DEBUG"]: agent.record(loss_t.detach().item()) for agent in agents: agent.writer.close() # return local log with results local_log = { "episodes_count": log_episodes_count, "ma_steps": log_ma_steps, "md_steps": log_md_steps, "ma_rewards": log_ma_rewards, "md_rewards": log_md_rewards } return local_log
from networks import * from wrappers import make_env import random import sys import pyglet path = "checkpoints/%s.pt" % sys.argv[1] q = OCNN() #q.load_state_dict(torch.load(path,map_location=torch.device('cpu'))) q.eval() env = gym.make('BreakoutNoFrameskip-v0') env = make_env(env, noop_max=10) # sometimes it no-ops until the ball is unsaveable obs = env.reset() action = 0 kold = None def update(dt): global obs, action, kold env.render() k = q(torch.cat(obs).unsqueeze(0)) kk = k.tolist() if kold != kk: print(*('- '[i > 0] + str(round(abs(i), 3)).ljust(5, '0') for i in kk[0]))
def main(): """메인 함수.""" # 환경 생성 env = make_env(ENV_NAME) device = get_device() net = DQN(env.observation_space.shape, env.action_space.n).to(device) net.apply(weights_init) tgt_net = DQN(env.observation_space.shape, env.action_space.n).to(device) tgt_net.load_state_dict(net.state_dict()) writer = SummaryWriter(comment="-" + ENV_NAME) log(net) # ZMQ 초기화 context, act_sock, buf_sock = init_zmq() # 입력을 기다린 후 시작 log("Press Enter when the actors are ready: ") input() # 기본 모델을 발행해 액터 시작 log("sending parameters to actors…") publish_model(net, tgt_net, act_sock) optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE) # optimizer = torch.optim.SGD(model.parameters(), lr=LEARNING_RATE, # momentum=0.9) # scheduler = ReduceLROnPlateau(optimizer, 'min') fps = q_max = 0.0 p_time = idxs = errors = None train_cnt = 1 max_reward = -1000 while True: # 버퍼에게 학습을 위한 배치를 요청 log("request new batch {}.".format(train_cnt)) st = time.time() if PRIORITIZED: # 배치의 에러를 보냄 payload = pickle.dumps((idxs, errors)) if errors is not None: priority = np.mean(errors) else: payload = b'' buf_sock.send(payload) payload = buf_sock.recv() log("receive batch elapse {:.2f}".format(time.time() - st)) if payload == b'not enough': # 아직 배치가 부족 log("not enough data to batch.") time.sleep(1) else: # 배치 학습 st = time.time() train_cnt += 1 if PRIORITIZED: exps, idxs, ainfos, binfo = pickle.loads(payload) batch = Experience(*map(np.concatenate, zip(*exps))) else: batch, ainfos, binfo = pickle.loads(payload) loss_t, errors, q_maxs = calc_loss(batch, net, tgt_net, device=device) optimizer.zero_grad() loss_t.backward() # scheduler.step(float(loss_t)) q_max = q_maxs.mean() optimizer.step() # gradient clipping for param in net.parameters(): param.grad.data.clamp_(-GRADIENT_CLIP, GRADIENT_CLIP) # 타겟 네트워크 갱신 if train_cnt % SYNC_TARGET_FREQ == 0: log("sync target network") log(net.state_dict()['conv.0.weight'][0][0]) tgt_net.load_state_dict(net.state_dict()) if train_cnt % SHOW_FREQ == 0: # 보드 게시 # for name, param in net.named_parameters(): # writer.add_histogram("learner/" + name, # param.clone().cpu().data.numpy(), # train_cnt) writer.add_scalar("learner/loss", float(loss_t), train_cnt) writer.add_scalar("learner/Qmax", q_max, train_cnt) if PRIORITIZED: writer.add_scalar("learner/priority", priority, train_cnt) writer.add_scalar("buffer/replay", binfo.replay, train_cnt) for ano, ainfo in ainfos.items(): writer.add_scalar("actor/{}-reward".format(ano), ainfo.reward, ainfo.frame) # 최고 리워드 모델 저장 _max_reward = np.max([ainfo.reward for ainfo in ainfos.values()]) if _max_reward > max_reward and train_cnt % SAVE_FREQ == 0: log("save best model - reward {:.2f}".format(_max_reward)) torch.save(net, ENV_NAME + "-best.dat") max_reward = _max_reward # 모델 발행 if train_cnt % PUBLISH_FREQ == 0: publish_model(net, tgt_net, act_sock) if p_time is not None: elapsed = time.time() - p_time fps = 1.0 / elapsed log("train elapsed {:.2f} speed {:.2f} f/s".format(elapsed, fps)) p_time = time.time() writer.close()
layer.register_forward_hook(hook_fn) return visualization def modify(t, i): t[i] += torch.randn_like(t[i]) * 0.05 if __name__ == "__main__": path = "checkpoints/%s.pt" % sys.argv[1] q = OCNN() q.load_state_dict(torch.load(path, map_location=torch.device('cpu'))) q.eval() env = gym.make('BreakoutNoFrameskip-v0') env = make_env(env) d = hook_layers(q) obs = env.reset() done = False R = 0 c = 0 c1 = 0 c2 = 0 c3 = 0 for i in range(1000): state = torch.cat(obs).unsqueeze(0)
"target_update": 1000, "num_episodes": 1500, "batch_size": 32, "replay_initial": 10000, "capacity": 100000, "max_nb_elements": 4, }, } params = HYPERPARAMS["breakout"] scores, eps_history = [], [] device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') env = wrappers.make_env(params["env_name"]) policy_network = model.NoisyDQN(env.observation_space.shape, env.action_space.n).to(device) target_network = ptan.agent.TargetNet(policy_network) optimizer = optim.Adam(policy_network.parameters(), lr=params["learning_rate"]) action_selector = ptan.actions.ArgmaxActionSelector() agent = ptan.agent.DQNAgent(policy_network, action_selector, device) exp_source = ptan.experience.ExperienceSourceFirstLast( env, agent, gamma=params["gamma"], steps_count=1) buffer = ptan.experience.ExperienceReplayBuffer( exp_source, buffer_size=params["capacity"])
def main(): logger.configure('{}{}_logs'.format(filePath, envName)) for k, v in C.items(): logger.record_tabular(k, v) logger.dump_tabular() logger.log('MsPacman') #Start the session sess = tf.InteractiveSession() train_env = make_env(C['env_id'], C['noop_max']) eval_env = make_env(C['env_id'], C['noop_max']) #Intitialize variables to record outputs train_track = [0.0] eval_track = [] best_reward = 0 train_reward = tf.placeholder(tf.float32) eval_reward = tf.placeholder(tf.float32) train_env = make_env(C['env_id'], C['noop_max']) eval_env = make_env(C['env_id'], C['noop_max']) agent = Agent(train_env, C) train_fs = reset_fs() train_s = train_env.reset() best_reward = 0 train_mean = [] eval_mean = [] train_summary = tf.summary.scalar('train_reward', train_reward) eval_summary = tf.summary.scalar('eval_reward', eval_reward) writer = tf.summary.FileWriter('{}{}_summary'.format(filePath, envName), sess.graph) sess.run(tf.global_variables_initializer()) agent.net.update_target_network() for it in range(C['iterations']): train_fs.append(train_s) train_a = agent.act(np.transpose(train_fs, (1, 2, 0))) ns, train_r, train_d, _ = train_env.step(train_a) #print('Iteration ',it, ' Reward ', train_r) train_track[-1] += train_r agent.record(train_s, train_a, train_r, float(train_d), it) train_s = ns if train_d: if train_env.env.env.was_real_done: # one env for MsPacman, Freeway (No Fire action) if len(train_track) % 100 == 0: mean = np.mean(train_track[-100:]) train_mean.append(mean) summary = sess.run(train_summary, feed_dict={train_reward: mean}) writer.add_summary(summary, it) logger.record_tabular('steps', it) logger.record_tabular('episode', len(train_track)) logger.record_tabular('epsilon', 100 * agent.epsilon) logger.record_tabular('learning rate', agent.lr) logger.record_tabular('Mean Reward 100 episdoes', mean) logger.dump_tabular() with open(resultPath + 'reward_atari_base.pk1', 'wb') as f: pickle.dump(train_track, f, protocol=pickle.HIGHEST_PROTOCOL) train_track.append(0.0) train_fs = reset_fs() train_s = train_env.reset() if (it + 1) % C['eval_freq'] == 0: for i in range(C['eval_episodes']): temp_video = [] eval_track.append(0.0) eval_fs = reset_fs() eval_s = eval_env.reset() while True: temp_video.append(eval_s) eval_fs.append(eval_s) eval_a = agent.greedy_act(np.transpose(eval_fs, (1, 2, 0))) eval_s, eval_r, eval_d, _ = eval_env.step(eval_a) eval_track[-1] += eval_r if eval_env.env.env.was_real_done: break if eval_d: eval_fs = reset_fs() eval_s = eval_env.reset() if eval_track[-1] > best_reward: best_reward = eval_track[-1] best_video = temp_video with open(resultPath + 'video_atari_base.pk1', 'wb') as f: pickle.dump(best_video, f, protocol=pickle.HIGHEST_PROTOCOL) eval_mean.append(np.mean(eval_track[-C['eval_episodes']:])) summary = sess.run(eval_summary, feed_dict={ eval_reward: np.mean(eval_track[-C['eval_episodes']:]) }) writer.add_summary(summary, it) if it == 1000000: outputs = agent.net.get_outputs(np.transpose(train_fs, (1, 2, 0))) with open(resultPath + 'outputs.pk1', 'wb') as f: pickle.dump(outputs, f, protocol=pickle.HIGHEST_PROTOCOL) with open(resultPath + 'outputs_screen.pk1', 'wb') as f: pickle.dump(train_fs, f, protocol=pickle.HIGHEST_PROTOCOL) with open(resultPath + 'reward_atari_base.pk1', 'wb') as f: pickle.dump(train_track, f, protocol=pickle.HIGHEST_PROTOCOL) with open(resultPath + 'trainMean_atari_base.pk1', 'wb') as f: pickle.dump(train_mean, f, protocol=pickle.HIGHEST_PROTOCOL) with open(resultPath + 'evalMean_atari_base.pk1', 'wb') as f: pickle.dump(eval_mean, f, 
protocol=pickle.HIGHEST_PROTOCOL) agent.net.save(filePath + '{}_model2'.format(C['env_id'])) sess.close()
def main(): #Adding configuraion file details into logger logger.configure('{}{}_logs'.format(filePath, envName)) for k, v in C.items(): logger.record_tabular(k, v) logger.dump_tabular() logger.log('Practice DQN with Dense 512') sess = tf.InteractiveSession() train_env = make_env(C['env_id'], C['noop_max']) eval_env = make_env(C['env_id'], C['noop_max']) train_s = train_env.reset() agent = Agent(train_env, C) train_reward = tf.placeholder(tf.float32) eval_reward = tf.placeholder(tf.float32) train_summary = tf.summary.scalar('train_reward', train_reward) eval_summary = tf.summary.scalar('eval_reward', eval_reward) writer = tf.summary.FileWriter('{}{}_summary'.format(filePath, envName), sess.graph) sess.run(tf.global_variables_initializer()) #Practice for it in range(C['pre_iterations']): train_a = agent.act_pre() ns, train_r, train_d, _ = train_env.step(train_a) agent.record(train_s, train_a, train_r, float(train_d), it, True) train_s = ns if train_d: train_s = train_env.reset() logger.log('Pre-training completed') #Initializing Online RL training network agent.net.initialize_online_network() train_track = [0.0] eval_track = [] best_reward = 0 train_fs = reset_fs() train_s = train_env.reset() best_reward = 0 train_mean = [] eval_mean = [] agent.net.update_target_network() #RL training for it in range(C['iterations']): train_fs.append(train_s) train_a = agent.act(np.transpose(train_fs, (1,2,0))) ns, train_r, train_d, _ = train_env.step(train_a) train_track[-1]+= train_r agent.record(train_s, train_a, train_r, float(train_d), it, False) train_s = ns if train_d: if train_env.env.env.was_real_done: if len(train_track) % 100 == 0: #records statistics to logger and tensorboard train_mean.append(np.mean(train_track[-100:])) summary = sess.run(train_summary, feed_dict={train_reward:np.mean(train_track[-100:])}) writer.add_summary(summary, it) logger.record_tabular('steps', it) logger.record_tabular('episode', len(train_track)) logger.record_tabular('epsilon', 100*agent.epsilon) logger.record_tabular('learning rate', agent.lr) logger.record_tabular('Mean Reward 100 episdoes', np.mean(train_track[-100:])) logger.dump_tabular() with open(resultPath + 'reward_atari_practice.pk1', 'wb') as f: pickle.dump(train_track, f, protocol=pickle.HIGHEST_PROTOCOL) train_track.append(0.0) train_fs = reset_fs() train_s = train_env.reset() #Evaluation if (it+1)%C['eval_freq'] == 0: for i in range(C['eval_episodes']): temp_video = [] eval_track.append(0.0) eval_fs = reset_fs() eval_s = eval_env.reset() while True: temp_video.append(eval_s) eval_fs.append(eval_s) eval_a = agent.greedy_act(np.transpose(eval_fs, (1,2,0))) eval_s, eval_r, eval_d, _ = eval_env.step(eval_a) eval_track[-1] += eval_r if eval_env.env.env.was_real_done: break if eval_d: eval_fs = reset_fs() eval_s = eval_env.reset() if eval_track[-1] > best_reward: best_reward = eval_track[-1] best_video = temp_video with open(resultPath + 'video_atari_practice.pk1', 'wb') as f: pickle.dump(best_video, f, protocol=pickle.HIGHEST_PROTOCOL) eval_mean.append(np.mean(eval_track[-C['eval_episodes']:])) logger.log('Evaluate mean reward: {:.2f}, max reward: {:.2f}, std: {:.2f}'.format(np.mean(eval_track[-C['eval_episodes']:]), np.max(eval_track[-C['eval_episodes']:]), np.std(eval_track[-C['eval_episodes']:]))) summary = sess.run(eval_summary, feed_dict={eval_reward:np.mean(eval_track[-C['eval_episodes']:])}) writer.add_summary(summary, it) with open(resultPath + 'eval_reward_atari_practice.pk1', 'wb') as f: pickle.dump(eval_track, f, protocol=pickle.HIGHEST_PROTOCOL) 
#Storing current state and outputs from Convolution layers if it%1000000 == 0: outputs = agent.net.get_outputs(np.transpose(train_fs, (1,2,0))) with open(resultPath+str(it)+'outputs.pk1', 'wb') as f: pickle.dump(outputs, f, protocol=pickle.HIGHEST_PROTOCOL) with open(resultPath+str(it)+'outputs_screen.pk1', 'wb') as f: pickle.dump(train_fs, f, protocol=pickle.HIGHEST_PROTOCOL) #Storing required outputs as pickle files with open(resultPath + 'reward_atari_practice.pk1', 'wb') as f: pickle.dump(train_track, f, protocol=pickle.HIGHEST_PROTOCOL) with open(resultPath + 'trainMean_atari_practice.pk1', 'wb') as f: pickle.dump(train_mean, f, protocol=pickle.HIGHEST_PROTOCOL) with open(resultPath+ 'evalMean_atari_practice.pk1', 'wb') as f: pickle.dump(eval_mean, f, protocol=pickle.HIGHEST_PROTOCOL) agent.net.save(filePath + '{}_model2'.format(C['env_id'])) sess.close()
def train(params, log_dir, local_log, random_seed, trial): # define device on which to run device = torch.device(params["DEVICE"]) # create env and add specific conifigurations to Malmo env = make_env(params["DEFAULT_ENV_NAME"]) # port = int('1000'+str(aid)) # env.configure(client_pool=[('127.0.0.1', port)]) # env.configure(client_pool=[('127.0.0.1', 10000), ('127.0.0.1', 10001), ('127.0.0.1', 10002)]) env.configure(client_pool=[('127.0.0.1', 10000), ('127.0.0.1', 10001)]) env.configure(allowDiscreteMovement=["move", "turn"]) # , log_level="INFO") env.configure(videoResolution=[84,84]) env.configure(stack_frames=4) env = wrap_env_malmo(env) if random_seed: env.seed(random_seed) print("Observation Space: ", env.observation_space) print("Action Space: ", env.action_space) agents = [] for aid in range(params["NUM_AGENTS"]): # initialize bufer if params["SHARING"] and params["PRIORITIZED_SHARING"]: bufer = ExperienceBufferGridImage(params["REPLAY_SIZE"]) else: bufer = ExperienceBuffer(params["REPLAY_SIZE"]) # initialize agent net = DQN(env.observation_space.shape, env.action_space.n, params["DEVICE"]).to(device) tgt_net = DQN(env.observation_space.shape, env.action_space.n, params["DEVICE"]).to(device) epsilon = params["EPSILON_START"] gamma = params["GAMMA"] tau = params["SOFT_UPDATE_TAU"] agent = Agent('agent' + str(aid), env, bufer, net, tgt_net, gamma, epsilon, tau, trial, log_dir, params) # other variables agent.optimizer = optim.Adam(agent.net.parameters(), lr=params["LEARNING_RATE"]) agent.print_color = COLORS[aid] local_log[agent.alias+"-"+str(trial)] = {"rewards": [],"steps": []} # fill buffer with initial size - don't count these episodes agent.fill_buffer() agents.append(agent) # training loop ep_count = 0 while sum(map(lambda agent:agent.completed, agents)) != len(agents): # overall count of episodes ep_count += 1 # sharing if params["SHARING"] and ep_count % params["SHARING_INTERVAL"] == 0 and ep_count > 0: if params["PRIORITIZED_SHARING"]: share(agents, params["BATCH_SIZE_TRANSFER"], params["REPLAY_START_SIZE"], params["SHARING_THRESHOLD"]) else: share_no_mask(agents, params["BATCH_SIZE_TRANSFER"], params["REPLAY_START_SIZE"]) # each agent does one episode for agent in agents: ## Before 2 agents perform, act, do one round of experience share # given a sharing interval and it is not the first episode if not agent.completed: episode_over = False episode_start = time.time() while not episode_over: # play step frame_start = time.time() episode_over, done_reward = agent.play_step(device=device) agent.frame_idx+= 1 #### Folllowing methods on episode basis if done_reward is not None: # calculate episode speed agent.ep_speed = 1 / (time.time() - episode_start) # reset trackers episode_start = time.time() # save to local log as well local_log[agent.alias+"-"+str(trial)]["rewards"].append(agent.total_rewards[-1]) local_log[agent.alias+"-"+str(trial)]["steps"].append(agent.total_steps[-1]) if params["INDEPENDENT_EVALUATION"]: offline_evaluation(params, agent) else: online_evaluation(params, agent) ## check if problem has been solved # need a minimum number of episodes to evaluate if len(agent.total_rewards) >= params["NUMBER_EPISODES_MEAN"]: # and mean reward has to go above boundary if agent.mean_reward >= params["MEAN_REWARD_BOUND"]: print(colored("%s solved in %d episodes!" 
% (agent.alias, len(agent.total_rewards)), agent.print_color)) agent.completed = True # if no sign of converging, also break if len(agent.total_rewards) >= params["MAX_GAMES_PLAYED"]: agent.completed = True #### Folllowing methods on frame basis # decay epsilon linearly on frames agent.epsilon = max(params["EPSILON_FINAL"], params["EPSILON_START"] - \ agent.frame_idx / params["EPSILON_DECAY_LAST_FRAME"]) # update at every frame using soft updates if params["SOFT"]: agent.soft_update_target_network() # or hard updates else: if agent.frame_idx % params["SYNC_TARGET_FRAMES"] == 0: agent.hard_update_target_network() ## learn loss_t = agent.learn(device) # record agent.frame_speed = 1 / (time.time() - frame_start) if params["DEBUG"]: agent.record_frame(loss_t.detach().item()) # detach required? # del bufer to force gc later, occupies too much memory del bufer for agent in agents: del agent.exp_buffer # closes tensorboard writer agent.writer.close()
lr = 0.001 gamma = 0.99 eps_start = 1 eps_end = 0.01 eps_decay = 0.001 target_update = 10 num_episodes = 1000 batch_size = 256 capacity = 1000000 max_nb_elements = 4 scores, eps_history = [], [] device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') env = wrappers.make_env("Breakout-v0") strategy = EpsilonGreedyStrategy(eps_start, eps_end, eps_decay) agent = Agent(env.action_space.n, strategy, device) memory = ReplayMemory(capacity) policy_network = DQN(env.action_space.n, lr).to(device) target_network = DQN(env.action_space.n, lr).to(device) target_network.load_state_dict(policy_network.state_dict()) target_network.eval() optimizer = optim.Adam(params=policy_network.parameters(), lr=lr) for episode in range(num_episodes):
update_freq = args.update_freq epsilon_start = args.epsilon_start epsilon_final = args.epsilon_final epsilon_decay = args.epsilon_decay lr = args.lr epsilon = lambda frame_num: max( epsilon_start - (epsilon_start - epsilon_final) * (frame_num / epsilon_decay), epsilon_final) load_model = args.load_model checkpoint_save = args.checkpoint_save log = args.log # make the environment env = wrappers.make_env(game, clip_rewards=clip_rewards, frame_stack=frame_stack, frame_skip=frame_skip) # Model settings output_dim = env.action_space.n input_dims = (frame_stack, 84, 84) # make the agent agent = agent.Agent(memory_capacity, gamma, input_dims, output_dim, lr) if load_model: print(load_model) agent.load_variables(direc=load_model, copy_model_to_target=True, load_mem=True)
env = gym.make("Pong-ram-v4") env = env.unwrapped dqn = DQN(env, inputlen, cnn, fc, gamma = 0.9, learning_rate = 0.0001, epoch = 100000, replay = 10000, update_round = 1000, render = 0) ''' #Pong CNN inputlen = 4 cnn = [ (32, 8, 0, 4, 1, 0), (64, 4, 0, 2, 1, 0), (64, 3, 0, 1, 1, 0), ] n_atoms = 51 fc = [7 * 7 * 64, 1000, 6 * n_atoms] env = wrappers.make_env('PongNoFrameskip-v4') dqn = DQN(env, inputlen, cnn, fc, gamma=0.99, learning_rate=0.0001, eps=[1, 0.00001, 0.02], epoch=100000, replay=10000, update_round=1000, render=-1, batch_size=32, n_atoms=n_atoms, value_min=-21,
    state_action_values = net(states_v).gather(
        1, actions_v.unsqueeze(-1)).squeeze(-1)
    # Double DQN: select next actions with the online net, evaluate with the target net
    next_state_actions = net(next_states_v).max(1)[1]
    next_state_values = tgt_net(next_states_v).gather(
        1, next_state_actions.unsqueeze(-1)).squeeze(-1)
    # next_state_values = tgt_net(next_states_v).max(1)[0]
    next_state_values[done_mask] = 0.0
    next_state_values = next_state_values.detach()

    expected_state_action_values = next_state_values * GAMMA + rewards_v
    return nn.MSELoss()(state_action_values, expected_state_action_values)


if __name__ == "__main__":
    env = wrappers.make_env(ENV_NAME)
    device = torch.device("cpu")

    net = dqn_model.DuelingDQN(env.observation_space.shape, env.action_space.n).to(device)
    tgt_net = dqn_model.DuelingDQN(env.observation_space.shape, env.action_space.n).to(device)
    writer = SummaryWriter(comment="-" + ENV_NAME)
    print(net)

    buffer = ExperienceBuffer(REPLAY_SIZE)
    agent = Agent(env, buffer)
    epsilon = EPSILON_START

    optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE)
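# Hedged sketch: the tensors consumed by the double-DQN loss above are assumed
# to come from a batch sampled out of ExperienceBuffer as plain numpy arrays;
# the exact return format of the buffer is an assumption, and unpack_batch is
# an illustrative helper, not part of this project.
import numpy as np
import torch

def unpack_batch(batch, device="cpu"):
    states, actions, rewards, dones, next_states = batch
    states_v = torch.tensor(np.array(states, copy=False)).to(device)
    next_states_v = torch.tensor(np.array(next_states, copy=False)).to(device)
    actions_v = torch.tensor(actions).to(device)
    rewards_v = torch.tensor(rewards).to(device)
    done_mask = torch.tensor(dones, dtype=torch.bool).to(device)
    return states_v, actions_v, rewards_v, done_mask, next_states_v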
def train(params, log_dir, local_log, random_seed, trial, agent_id): # define device on which to run device = torch.device(params["DEVICE"]) # create env and add specific conifigurations to Malmo env = make_env(params["DEFAULT_ENV_NAME"]) env.configure(client_pool=[('127.0.0.1', 10000), ('127.0.0.1', 10001)]) env.configure(allowDiscreteMovement=["move", "turn"]) # , log_level="INFO") env.configure(videoResolution=[84,84]) env.configure(stack_frames=4) env = wrap_env_malmo(env) if random_seed: env.seed(random_seed) print("Observation Space: ", env.observation_space) print("Action Space: ", env.action_space) # initialize agent bufer = ExperienceBuffer(params["REPLAY_SIZE"]) # buffer = ExperienceBuffer(params["REPLAY_SIZE"]) net = DQN(env.observation_space.shape, env.action_space.n, params["DEVICE"]).to(device) tgt_net = DQN(env.observation_space.shape, env.action_space.n, params["DEVICE"]).to(device) epsilon = params["EPSILON_START"] gamma = params["GAMMA"] tau = params["SOFT_UPDATE_TAU"] agent = Agent('agent' + str(agent_id), env, bufer, net, tgt_net, gamma, epsilon, tau, trial, log_dir, params) # other variables agent.optimizer = optim.Adam(agent.net.parameters(), lr=params["LEARNING_RATE"]) agent.print_color = COLORS[agent_id] local_log[agent.alias+"-"+str(trial)] = {"rewards": [],"steps": []} # fill buffer with initial size - don't count these episodes agent.fill_buffer() # training loop ep_count = 0 while not agent.completed: ep_count += 1 episode_over = False episode_start = time.time() while not episode_over: # play step frame_start = time.time() episode_over, done_reward = agent.play_step(device=device) agent.frame_idx+= 1 #### Folllowing methods on episode basis if done_reward is not None: # calculate episode speed agent.ep_speed = time.time() - episode_start # reset trackers episode_start = time.time() # save to local log as well local_log[agent.alias+"-"+str(trial)]["rewards"].append(agent.total_rewards[-1]) local_log[agent.alias+"-"+str(trial)]["steps"].append(agent.total_steps[-1]) if params["INDEPENDENT_EVALUATION"]: offline_evaluation(params, agent) else: online_evaluation(params, agent) ## check if problem has been solved if agent.mean_reward is not None: if agent.mean_reward > params["MEAN_REWARD_BOUND"]: print(colored("%s solved in %d episodes!" % (agent.alias, len(agent.total_rewards)), agent.print_color)) agent.completed = True # if no sign of converging, also break if len(agent.total_rewards) >= params["MAX_GAMES_PLAYED"]: agent.completed = True #### Folllowing methods on frame basis # decay epsilon linearly on frames agent.epsilon = max(params["EPSILON_FINAL"], params["EPSILON_START"] - \ (agent.frame_idx-params["REPLAY_START_SIZE"]) / params["EPSILON_DECAY_LAST_FRAME"]) # update at every frame using soft updates if params["SOFT"]: agent.soft_update_target_network() # or hard updates else: if agent.frame_idx % params["SYNC_TARGET_FRAMES"] == 0: agent.hard_update_target_network() ## learn loss_t = agent.learn(device) # record agent.frame_speed = 1000 / (time.time() - frame_start) if params["DEBUG"]: agent.record_frame(loss_t.detach().item()) # detach required? # del bufer to force gc later, occupies too much memory del bufer # closes tensorboard writer agent.writer.close()
def train(env_name='PongNoFrameskip-v4', gamma=0.99, batch_size=32, replay_size=1000000, replay_start_size=50000, learning_rate=0.00025, adam_epsilon=0.0000001, sync_target_frames=10000, epsilon_decay_last_frame=1000000, epsilon_start=1.0, epsilon_final=0.1, train_frames=50000000, train_rewards=495, n_steps=3, save_checkpoints=True, run_name=None, use_double=True, use_dense=None, dueling=False, priority_replay=None, categorical=None, record=False, random_seed=None, index=0): n_atoms = v_min = v_max = None use_categorical = False if categorical is not None: use_categorical = True n_atoms = categorical['n_atoms'] v_min = categorical['v'][0] v_max = categorical['v'][1] alpha = beta = None use_priority_replay = False if priority_replay is not None: use_priority_replay = True alpha = priority_replay['alpha'] beta = priority_replay['beta'] print(f'Training DQN on {env_name} environment') print(f'Params: gamma:{gamma}, batch_size:{batch_size}, replay_size:{replay_size}') print(f' replay_start_size: {replay_start_size}, learning_rate:{learning_rate}') print(f' sync_target_frames: {sync_target_frames}, epsilon_decay_last_frame:{epsilon_decay_last_frame}') print(f' epsilon_start: {epsilon_start}, epsilon_final: {epsilon_final}, train_frames: {train_frames}') print(f' train_rewards: {train_rewards}, n_steps: {n_steps}, save_checkpoints: {save_checkpoints}') print(f' run_name: {run_name}, use_double: {use_double}, use_dense: {use_dense}, dueling: {dueling}') if use_categorical: print(f' categorical - n_atoms: {n_atoms}, v_min: {v_min}, v_max: {v_max}') if use_priority_replay: print(f' priority buffer - alpha: {alpha} beta: {beta}') print(f' random_seed: {random_seed}, index: {index}') f_name = env_name + "_" + run_name if run_name is not None else env_name env = wrappers.make_env(env_name, record, f_name) if random_seed is not None: tf.random.set_seed(random_seed) env.seed(random_seed) optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, epsilon=adam_epsilon) agent = Agent(env, replay_size, optimizer, batch_size, n_steps, gamma, use_double, use_dense, dueling, use_categorical, n_atoms, v_min, v_max, train_frames if train_frames is not None else 5000000) if save_checkpoints: agent.load_checkpoint(f'checkpoints/{f_name}/checkpoint') total_rewards = [] rewards_mean_std = [] frame_idx = 0 count = 0 update_count = 0 ts_frame = 0 ts = time.time() best_mean_reward = None while True: frame_idx += 1 epsilon = max(epsilon_final, epsilon_start - frame_idx / epsilon_decay_last_frame) reward = agent.play_step(epsilon) if reward is not None: count += 1 total_rewards.append(reward) speed = (frame_idx - ts_frame) / (time.time() - ts) ts_frame = frame_idx ts = time.time() mean_reward = np.mean(total_rewards[-100:]) print(f'{index}:{frame_idx}: done {count} games, mean reward: {mean_reward}, eps {epsilon}, speed: {speed}') if best_mean_reward is None or best_mean_reward < mean_reward: # Save network if best_mean_reward is not None: if save_checkpoints: agent.save_checkpoint(f'./checkpoints/{f_name}/checkpoint') print(f'Best mean reward updated {best_mean_reward} -> {mean_reward}, model saved') best_mean_reward = mean_reward if train_frames is not None: if frame_idx >= train_frames: print(f'Trained for {frame_idx} frames. Done.') break if train_rewards is not None: if mean_reward >= train_rewards: print(f'Reached reward: {mean_reward}. 
Done.') break if agent.buffer_size() < replay_start_size: continue if frame_idx % sync_target_frames == 0: agent.sync_weights() agent.step(gamma, True if update_count % 1000 == 0 else False) update_count += 1 rewards_mean_std.append({'reward': total_rewards[-1:][0], 'step': update_count}) env.close() plot.directory_check('./plots') plot.plot(rewards_mean_std, f'./plots/{f_name}.png', f_name) plot.directory_check('./data') plot.save(rewards_mean_std, f'./data/{f_name}.csv')
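# Hedged sketch, not taken from the training code above: how an n-step
# transition (the n_steps parameter of train()) is typically folded into a
# single replay entry. All names here are illustrative.
def fold_n_step(steps, gamma):
    """steps: list of (state, action, reward, done, next_state), length <= n."""
    state, action = steps[0][0], steps[0][1]
    total_reward, done, last_state = 0.0, False, steps[-1][4]
    for i, (_, _, reward, step_done, next_state) in enumerate(steps):
        total_reward += (gamma ** i) * reward  # discounted sum of rewards along the way
        if step_done:
            done, last_state = True, next_state
            break
    return state, action, total_reward, done, last_state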
return nn.MSELoss()(Qs, Qtarget) if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument('--cuda', default=False, action='store_true', help='Enable CUDA') parser.add_argument('--env', default=DEFAULT_ENV_NAME) parser.add_argument('--reward', type=float, default=MEAN_REWARD_BOUND, \ help='Mean reward bound for stop of training') args = parser.parse_args() device = torch.device("cuda" if args.cuda else "cpu") env = wrappers.make_env(args.env) net = dqn_model.DQN(env.observation_space.shape, env.action_space.n).to(device) target_net = dqn_model.DQN(env.observation_space.shape, env.action_space.n).to(device) writer = SummaryWriter(comment='-' + args.env) print(net) buffer = ReplayBuffer(REPLAY_SIZE) agent = Agent(env, buffer) epsilon = EPSILON_START optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE) total_rewards = [] frame_idx = 0
def train(args):
    print(args)
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)

    if args.return_function == "GAE":
        return_function = GAE
    elif args.return_function == "Q":
        return_function = Q
    elif args.return_function == "A":
        return_function = A

    MONTE_CARLO = args.num_steps == 200

    envs = SubprocVecEnv(
        [make_env(args.env, i + args.num_envs) for i in range(args.num_envs)],
        MONTE_CARLO)
    test_env = gym.make(args.env)
    test_env.seed(args.seed + args.num_envs)

    policy = ActorCriticMLP(input_dim=envs.observation_space.shape[0],
                            n_acts=envs.action_space.n)
    optim = torch.optim.Adam(params=policy.parameters(), lr=args.lr,
                             weight_decay=args.weight_decay)

    test_rewards = []
    steps = 1
    obs = torch.from_numpy(envs.reset())

    while steps < args.max_steps:
        logp_actions = []
        state_values = []
        rewards = []
        masks = []

        for _ in range(args.num_steps):
            probs, state_value = policy.forward(obs)
            dist = Categorical(probs)
            action = dist.sample()

            obs, reward, done, _ = envs.step(action.numpy())

            logp_actions.append(dist.log_prob(action).unsqueeze(1))
            state_values.append(state_value)
            rewards.append(torch.FloatTensor(reward).unsqueeze(1))
            masks.append(torch.FloatTensor(1 - done).unsqueeze(1))

            obs = torch.from_numpy(obs)
            steps += 1

            if steps % args.test_every == 0:
                test_reward = np.mean([test(test_env, policy) for _ in range(10)])
                test_rewards.append(test_reward)
                print(f"Running reward at timestep {steps}: {test_reward}")

            if (1 - done).sum() == 0:
                break

        next_value = 0
        if not (1 - done).sum() == 0:
            _, next_value = policy(obs)

        returns = return_function(next_value, rewards, masks, state_values, args)
        loss = policy_gradient(logp_actions, returns)

        optim.zero_grad()
        loss.backward()
        optim.step()

        # if monte carlo, we need to reset the environment by hand
        if MONTE_CARLO:
            obs = torch.from_numpy(envs.reset())

    return test_rewards
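# Hedged sketch, not the repository's implementation: one plausible GAE
# return_function matching the call signature used in train() above.
# `args.gamma` and `args.tau` (the GAE lambda) are assumed hyperparameter names.
def GAE(next_value, rewards, masks, state_values, args):
    values = state_values + [next_value]
    gae = 0
    returns = []
    for t in reversed(range(len(rewards))):
        # TD residual: r_t + gamma * V(s_{t+1}) * mask_t - V(s_t)
        delta = rewards[t] + args.gamma * values[t + 1] * masks[t] - values[t]
        # exponentially weighted sum of the residuals
        gae = delta + args.gamma * args.tau * masks[t] * gae
        returns.insert(0, gae + values[t])
    return returns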