import gym

# RandomAgent is assumed to be provided elsewhere in the project.


def main(env_name, render=False):
    env = gym.make(env_name)
    # Initialize your agent here
    agent = RandomAgent(env)
    for episode_i in range(100000):
        state = env.reset()
        done = False
        while not done:
            if render and episode_i % 10 == 0:
                env.render()
            # Your agent's action here
            action = agent.act(state)
            state, reward, done, info = env.step(action)
import time

import gym

from agent import RandomAgent

env = gym.make("CartPole-v1")
agent = RandomAgent(env.action_space)

episode_count = 10
reward = 0
done = False

for i in range(episode_count):
    ob = env.reset()
    while True:
        action = agent.act(ob, reward, done)
        ob, reward, done, info = env.step(action)
        if done:
            print("Game Finished!")
            break
        env.render()
        time.sleep(1 / 30)

env.close()
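The `agent` module imported above is not shown. A minimal sketch of what `agent.py` might contain, assuming `RandomAgent` only needs the action space and ignores the observation, reward, and done flag it receives; this is an illustrative assumption, not the original implementation:

# agent.py -- minimal sketch under the assumptions stated above
class RandomAgent(object):
    """Picks a uniformly random action, ignoring its inputs."""

    def __init__(self, action_space):
        self.action_space = action_space

    def act(self, observation, reward, done):
        # Arguments are accepted only for interface compatibility.
        return self.action_space.sample()

With this definition, the loop above runs as a pure random baseline on CartPole-v1.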
import random

import numpy as np

# MazeEnv, RandomAgent, and the parsed `args` are assumed to come from the
# surrounding project.

# set seed
random.seed(args.seed)
np.random.seed(args.seed)

# environment
env = MazeEnv(args, args.game_name, args.graph_param,
              args.game_len, args.gamma)

# agent
if args.agent == 'random':
    agent = RandomAgent(args, env)

# Roll out the random agent on each graph and report the running average return.
NUM_GRAPH = 100
NUM_ITER = 32
ep_rews = []
for graph_id in range(NUM_GRAPH):
    for _ in range(NUM_ITER):
        ep_rew = 0
        state, info = env.reset(graph_index=graph_id)
        done = False
        while not done:
            action = agent.act(state)
            state, rew, done, info = env.step(action)
            ep_rew += rew
        ep_rews.append(ep_rew)
    string = 'Graph={:02d}/{:02d}, Return={:.4f}'
    print(string.format(graph_id, NUM_GRAPH, sum(ep_rews) / len(ep_rews)))

print('Avg. Ep Return={:.4f}'.format(sum(ep_rews) / len(ep_rews)))
print('This should be around 0.0455')
import collections
import os

import torch
from tqdm import tqdm

# NOTE: DQNAgent, EnsembleDQNAgent, RandomAgent, NoOpAgent, LinearSchedule,
# ReplayBuffer, Experience, try_gpu, evaluate, env, and the ALL_CAPS
# hyperparameters are assumed to be defined elsewhere in the project.
# This snippet is written in Python 2 (print statements, xrange).


def challenger_round():
    challengers = []
    leaders = []
    leader_checkpoints = os.listdir(LEADER_DIR)
    # Need to share the same schedule with all challengers, so they all anneal
    # at the same rate.
    epsilon_schedule = LinearSchedule(EPS_START, EPS_END, TRAIN_FRAMES)
    for i in xrange(NUM_LEADERS):
        challenger = try_gpu(
            DQNAgent(6, epsilon_schedule, OBSERVATION_MODE,
                     lr=LR, max_grad_norm=GRAD_CLIP_NORM))
        if i < len(leader_checkpoints):
            # Warm-start both the challenger and the leader from a saved leader.
            leader = try_gpu(
                DQNAgent(6, LinearSchedule(0.1, 0.1, 500000), OBSERVATION_MODE))
            leader_path = os.path.join(LEADER_DIR, leader_checkpoints[i])
            print "LOADING CHECKPOINT: {}".format(leader_path)
            challenger.load_state_dict(
                torch.load(leader_path,
                           map_location=lambda storage, loc: storage))
            leader.load_state_dict(
                torch.load(leader_path,
                           map_location=lambda storage, loc: storage))
        else:
            leader = RandomAgent(6)
            print "INITIALIZING NEW CHALLENGER AND LEADER"
        challengers.append(challenger)
        leaders.append(leader)

    if CHALLENGER_DIR is not None:
        challengers = []
        # Load in all of the leaders
        for checkpoint in os.listdir(CHALLENGER_DIR):
            path = os.path.join(CHALLENGER_DIR, checkpoint)
            print "LOADING FROM CHALLENGER_DIR: {}".format(path)
            challenger = try_gpu(
                DQNAgent(6, LinearSchedule(0.05, 0.05, 1),
                         CHALLENGER_OBSERVATION_MODE, lr=LR,
                         max_grad_norm=GRAD_CLIP_NORM, name=checkpoint))
            challenger.load_state_dict(
                torch.load(path, map_location=lambda storage, loc: storage))
            challengers.append(challenger)

    challenger = EnsembleDQNAgent(challengers)
    leader = EnsembleDQNAgent(leaders)
    if OPPONENT is not None or HUMAN:
        leader = NoOpAgent()
    replay_buffer = ReplayBuffer(1000000)
    rewards = collections.deque(maxlen=1000)
    frames = 0  # number of training frames seen
    episodes = 0  # number of training episodes that have been played
    with tqdm(total=TRAIN_FRAMES) as progress:
        # Each loop completes a single episode
        while frames < TRAIN_FRAMES:
            states = env.reset()
            challenger.reset()
            leader.reset()
            episode_reward = 0.
            episode_frames = 0
            # Each loop completes a single step, duplicates _evaluate() to
            # update at the appropriate frame #s
            for _ in xrange(MAX_EPISODE_LENGTH):
                frames += 1
                episode_frames += 1
                action1 = challenger.act(states[0])
                action2 = leader.act(states[1])
                next_states, reward, done = env.step(action1, action2)
                episode_reward += reward

                # NOTE: state and next_state are LazyFrames and must be
                # converted to np.arrays
                replay_buffer.add(
                    Experience(states[0], action1._action_index, reward,
                               next_states[0], done))
                states = next_states

                if len(replay_buffer) > 50000 and frames % 4 == 0:
                    experiences = replay_buffer.sample(32)
                    challenger.update_from_experiences(experiences)

                if frames % 10000 == 0:
                    challenger.sync_target()

                if frames % SAVE_FREQ == 0:
                    # TODO: Don't access internals
                    for agent in challenger._agents:
                        path = os.path.join(
                            LEADER_DIR, agent.name + "-{}".format(frames))
                        print "SAVING CHECKPOINT TO: {}".format(path)
                        torch.save(agent.state_dict(), path)
                    # path = os.path.join(
                    #     LEADER_DIR, challenger.name + "-{}".format(frames))
                    # torch.save(challenger.state_dict(), path)

                if frames >= TRAIN_FRAMES:
                    break

                if done:
                    break

            if episodes % 300 == 0:
                print "Evaluation: {}".format(
                    evaluate(challenger, leader, EPISODES_EVALUATE_TRAIN))
                print "Episode reward: {}".format(episode_reward)
            episodes += 1
            rewards.append(episode_reward)
            stats = challenger.stats
            stats["Avg Episode Reward"] = float(sum(rewards)) / len(rewards)
            stats["Num Episodes"] = episodes
            stats["Replay Buffer Size"] = len(replay_buffer)
            progress.set_postfix(stats, refresh=False)
            progress.update(episode_frames)
            episode_frames = 0
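The `RandomAgent(6)` leader used above is not defined in this excerpt. Below is a minimal sketch of an agent that satisfies the interface the training loop relies on (`act` and `reset`), assuming the environment accepts a plain integer action index for the leader's side; the class body is an assumption, not the project's actual implementation.

# Minimal sketch, assuming the leader only needs act()/reset() and that
# env.step() accepts a plain integer action for the leader.
import random


class RandomAgent(object):
    """Uniformly samples one of `num_actions` discrete actions."""

    def __init__(self, num_actions):
        self.num_actions = num_actions

    def reset(self):
        # A random policy keeps no per-episode state.
        pass

    def act(self, state):
        return random.randrange(self.num_actions)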
possible_actions = [0, 1]  # Cooperate or Defect
cooperator = RandomAgent(possible_actions, p=0.9)
defector = RandomAgent(possible_actions, p=0.1)

# Stateless interactions (agents do not have memory)
s = None
n_iter = 1000
for i in range(n_iter):
    # A full episode:
    done = False
    while not done:
        # Agents decide
        a0 = cooperator.act()
        a1 = defector.act()
        # World changes
        new_s, (r0, r1), done, _ = env.step(([a0], [a1]))
        # Agents learn
        cooperator.update(s, (a0, a1), (r0, r1), new_s)
        defector.update(s, (a1, a0), (r1, r0), new_s)
        s = new_s
    print(r0, r1)
    env.reset()
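Neither `RandomAgent` nor `env` is defined in this snippet. A minimal sketch of a biased random agent matching the interface used above follows, assuming `p` is the probability of choosing the first action (cooperate) and that `update()` is a no-op for a memoryless agent; both the parameter semantics and the class body are assumptions rather than the original implementation.

import random


class RandomAgent:
    """Memoryless agent: picks actions[0] with probability p, else actions[1]."""

    def __init__(self, actions, p=0.5):
        self.actions = actions
        self.p = p

    def act(self):
        return self.actions[0] if random.random() < self.p else self.actions[1]

    def update(self, state, actions, rewards, next_state):
        # A fixed random policy does not learn; kept for interface compatibility.
        pass

With p=0.9 the "cooperator" mostly plays action 0 and the "defector" (p=0.1) mostly plays action 1, which is what the variable names above suggest.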