import time
from itertools import count

# Project-local helpers (Game, History, ReplayMemory, MovingAverage, log,
# agent_factory, environments_to_names) are assumed to be imported elsewhere
# in this module.


def autoplay(method, environment, resume, render):
    """Run a trained agent in the environment until the episode ends."""
    game = Game(name=environments_to_names[environment], render=render)
    init_state, state_shape = game.get_state(True)
    n_actions = game.env.action_space.n

    # build the agent with a greedy policy (eps_start=0.0) and load the checkpoint
    agent_cls = agent_factory[method]
    agent = agent_cls(state_shape, n_actions, environment, 1, 1, eps_start=0.0)
    agent.load(resume)
    log.info(f'Evaluating agent, loaded from {resume}, starting ...')

    game.reset()
    for t in count():
        state = game.get_state()
        action = agent.select_action(state)
        transition, done = game.step(int(action.cpu().numpy()))
        # agent.eval(transition, 1, 0.0)
        time.sleep(0.1)  # slow down stepping so the rendered run is watchable
        if done:
            log.info(f'agent survived {t} steps')
            game.reset()
            break

    game.env.close()
def train(method, environment, resume, episodes, lr, lr_episodes, min_lr,
          eval_only, replay_width, batch_size, gamma, update_rate, save_interval):
    """Train an agent on the chosen environment, logging metrics to History."""
    history = History(method + '_' + environment,
                      ['steps', 'avg_reward', 'loss'], resume is not None)
    history.flush()

    memory = ReplayMemory(replay_width)
    game = Game(name=environments_to_names[environment], memory=memory, render=False)
    init_state, state_shape = game.get_state(True)
    n_actions = game.env.action_space.n

    agent_cls = agent_factory[method]
    agent = agent_cls(state_shape, n_actions, environment, episodes, update_rate,
                      step_size=lr_episodes, lr=lr, save_interval=save_interval)

    # resume from a checkpoint
    if resume is not None:
        agent.load(resume)

    avg_reward = MovingAverage(100)
    avg_loss = MovingAverage(100)
    log.info(f'Training for {episodes} episodes, starting ...')

    # main training loop
    for i in range(episodes):
        state = game.reset()
        done = False
        loss = None

        # play one episode, training on sampled minibatches as we go
        while not done:
            state = game.state
            action = agent.select_action(state)
            transition, done = game.step(int(action.to('cpu').numpy()))

            if len(memory) > batch_size:
                batched = memory.sample(batch_size)
                loss = agent.train(batched, batch_size, gamma, i)
                avg_loss.add(loss)

        reward = game.rewards
        # agent.save_best(reward)
        agent.save()
        agent.scheduler.step()
        avg_reward.add(reward)

        # report moving averages for the episode
        text = [
            f'steps: {agent.step_cnt}',
            f'game epochs: {i}/{episodes}',
            f'train loss: {float(avg_loss):.5}',
            f'avg reward: {float(avg_reward):.5}',
            # f'best reward: {float(agent.best_reward):.5}',
            f'reward: {float(reward):.5}',
            f'epsilon: {agent.epsilon:.3}',
        ]
        log.info(', '.join(text), update=True)

        if agent.step_cnt % save_interval == 0:
            history.record({
                'steps': agent.step_cnt,
                'avg_reward': float(avg_reward),
                'loss': float(avg_loss),
            })

    game.env.close()
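
# Hypothetical usage sketch, not part of the original module: how these entry
# points might be invoked directly. The 'dqn' method key, the 'cartpole'
# environment key, the checkpoint path, and all hyperparameter values are
# illustrative assumptions; replace them with whatever agent_factory and
# environments_to_names actually expose in this project.
if __name__ == '__main__':
    train(method='dqn', environment='cartpole', resume=None, episodes=500,
          lr=1e-3, lr_episodes=100, min_lr=1e-5, eval_only=False,
          replay_width=10000, batch_size=32, gamma=0.99, update_rate=10,
          save_interval=1000)
    # autoplay(method='dqn', environment='cartpole',
    #          resume='checkpoints/dqn_cartpole.pt', render=True)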