def autoplay(method, environment, resume, render):
    """Play a single episode with a checkpointed agent and log how long it lasted.

    The environment is closed on every exit path, including when loading the
    checkpoint or stepping the game raises.

    Args:
        method: Key into ``agent_factory`` selecting the agent class.
        environment: Key into ``environments_to_names``; also forwarded to the
            agent constructor.
        resume: Checkpoint reference handed to ``agent.load``.
        render: Forwarded to ``Game`` as its ``render`` argument.
    """
    game = Game(name=environments_to_names[environment], render=render)
    try:
        # Only the state shape is needed to build the agent.
        _, state_shape = game.get_state(True)
        n_actions = game.env.action_space.n
        agent_cls = agent_factory[method]
        # eps_start=0.0 -- presumably disables exploration; confirm against
        # the agent implementation.
        agent = agent_cls(state_shape, n_actions, environment, 1, 1,
                          eps_start=0.0)
        agent.load(resume)
        log.info(f'Evaluating agent, loaded from {resume}, starting ...')

        game.reset()
        # NOTE(review): this result is never used; the call is kept in case
        # get_state() has side effects (e.g. priming a frame buffer) -- confirm.
        game.get_state()
        for t in count():
            state = game.get_state()
            action = agent.select_action(state)
            _, done = game.step(int(action.cpu().numpy()))
            # Pause 0.1 s per step so playback runs at a watchable speed.
            time.sleep(0.1)
            if done:
                log.info(f'agent survived {t} steps')
                game.reset()
                break
    finally:
        # Release the environment even if evaluation failed part-way.
        game.env.close()
def main():
    """Build the game, choose a network builder by agent type, and run the loop.

    All configuration is read from module-level names (``fps``, ``screen``,
    ``state``, the size/colour settings and ``agent_type``).
    """
    game_options = dict(
        fps=fps,
        screensize=screen,
        state=state,
        playersize=player_size,
        playercolor=player_color,
        enemysize=enemy_size,
        enemycolor=enemy_color,
        squaresize=square_size,
        squarecolor=square_color,
    )
    game = Game(**game_options)

    def _no_network(*args):
        # Fallback builder for unrecognised agent types: yields no network.
        return None

    network_builders = {
        "clever": build_ann,
        "forged": forge_ann,
        "keras": keras_ann,
    }
    network_builder = network_builders.get(agent_type, _no_network)

    # Called "actor" because the name "agent" is already used by the agent
    # module.
    actor = get_agent(env=game, get_network=network_builder)
    game.reset(actor)
    game.mainloop()
def train(method, environment, resume, episodes, lr, lr_episodes, min_lr,
          eval_only, replay_width, batch_size, gamma, update_rate,
          save_interval):
    """Train an agent for a number of episodes, logging and recording progress.

    The environment is closed on every exit path, including when training
    raises part-way through.

    Args:
        method: Key into ``agent_factory``; also the prefix of the history name.
        environment: Key into ``environments_to_names``; forwarded to the agent
            constructor and used as the suffix of the history name.
        resume: Checkpoint reference passed to ``agent.load``; ``None`` skips
            loading. ``resume is not None`` is also passed to ``History``.
        episodes: Number of episodes to run; also forwarded to the agent.
        lr: Forwarded to the agent constructor as ``lr``.
        lr_episodes: Forwarded to the agent constructor as ``step_size``.
        min_lr: Accepted but not used by this function.
        eval_only: Accepted but not used by this function.
        replay_width: Argument to ``ReplayMemory`` (presumably its capacity).
        batch_size: Sample size drawn from memory; a training step runs only
            once memory holds more than this many entries.
        gamma: Forwarded to ``agent.train`` (presumably the discount factor).
        update_rate: Forwarded to the agent constructor.
        save_interval: Forwarded to the agent constructor; additionally, a
            history row is recorded at the end of an episode whenever
            ``agent.step_cnt`` is a multiple of it.
    """
    history = History(method + '_' + environment,
                      ['steps', 'avg_reward', 'loss'],
                      resume is not None)
    history.flush()

    memory = ReplayMemory(replay_width)
    game = Game(name=environments_to_names[environment], memory=memory,
                render=False)
    try:
        # Only the state shape is needed to build the agent.
        _, state_shape = game.get_state(True)
        n_actions = game.env.action_space.n
        agent_cls = agent_factory[method]
        agent = agent_cls(state_shape, n_actions, environment, episodes,
                          update_rate, step_size=lr_episodes, lr=lr,
                          save_interval=save_interval)

        # resume from a ckpt
        if resume is not None:
            agent.load(resume)

        avg_reward = MovingAverage(100)
        avg_loss = MovingAverage(100)

        log.info(f'Training with {episodes}, starting ...')

        # main training loop
        for i in range(episodes):
            game.reset()
            done = False
            while not done:
                state = game.state
                action = agent.select_action(state)
                _, done = game.step(int(action.to('cpu').numpy()))

                # Train only once memory holds more than one batch.
                if len(memory) > batch_size:
                    batched = memory.sample(batch_size)
                    loss = agent.train(batched, batch_size, gamma, i)
                    avg_loss.add(loss)

            reward = game.rewards
            agent.save()
            agent.scheduler.step()
            avg_reward.add(reward)

            # moving averages
            text = [
                f'steps: {agent.step_cnt}',
                f'game epochs: {i}/{episodes}',
                f'train loss: {float(avg_loss):.5}',
                f'avg reward: {float(avg_reward):.5}',
                f'reward: {float(reward):.5}',
                f'epsilon: {agent.epsilon:.3}',
            ]
            log.info(', '.join(text), update=True)

            if agent.step_cnt % save_interval == 0:
                history.record({
                    'steps': agent.step_cnt,
                    'avg_reward': float(avg_reward),
                    'loss': float(avg_loss),
                })
    finally:
        # Release the environment even if training failed part-way.
        game.env.close()
# Script-level settings and bookkeeping containers.
batch_size = 32
max_score = 0
frames = []
best = []

# Countdown (4, 3, 2, 1 at one-second intervals) so the user has time to
# bring the game window into focus.
print('get game window in focus')
for i in list(range(4))[::-1]:
    print(i + 1)
    time.sleep(1)

# NOTE(review): block nesting below was reconstructed from a
# whitespace-collapsed source -- confirm against the original layout.
for e in range(EPISODES):
    frames = []
    reward = -1
    # Reset and re-render until the rendered frame reports a reward of 0.
    while reward != 0:
        state = env.reset()
        state, reward, done = env.render()
    # apparently the Conv2D wants a 4D shape like (1,64,64,1)
    state = shapeState(state)
    # NOTE(review): the loop variable `time` rebinds the name of the `time`
    # module used by the countdown above; any `time.sleep(...)` executed
    # after this loop starts would fail. Consider renaming once every use
    # of the variable is in view.
    for time in range(500):
        action = agent.act(state)
        #!# In the line below it is unclear which object would normally be
        # assigned to next_state, but in the original code the variable is
        # overwritten by `next_state = shapeState(pix)`, i.e. by the image
        # representation of the state. The goal seems to be to obtain the
        # reward and done values from the `env.action()` call, while the
        # subsequent call `pix = env.render()` actually captures the image...
        # next_state, reward, done, _ = env.step(action)
        # pix = env.render()
# Per-metric series, each starting empty.
statistics = {
    'reward': [],
    'val_loss': [],
    'policy_loss': [],
}
best_reward = 0

# NOTE(review): block nesting below was reconstructed from a
# whitespace-collapsed source -- confirm against the original layout.
for i in range(0, N_EPISODES):
    # Fresh rollout buffer and counters for every outer iteration.
    memory = Memory()
    num_steps = 0
    num_ep = 0
    reward_batch = 0
    # Runs while num_steps is below BATCH_SIZE.
    while num_steps < BATCH_SIZE:
        S = env.reset()
        # presumably normalises the observation -- verify running_state
        S = running_state(S)
        t = 0
        reward_sum = 0
        while True:
            t += 1
            A = ppo_agent.select_best_action(S)
            S_prime, R, is_done = env.take_one_step(A.item())
            reward_sum += R
            # mask is 0 on the terminal step and 1 otherwise.
            mask = 1 - int(is_done)
            # Store the current state, the action wrapped in a one-element
            # array, the mask and the reward.
            memory.push(S, np.array([A.item()]), mask, R)