def replay_steps(self, states, actions, rewards, last_state, last_terminal: bool):
    # get the predicted return for the last state - we didn't take an action in that state
    R = 0 if last_terminal and rewards[-1] < 9 else self.Critic.predict(t(last_state))
    # reset gradients for the optimizers
    self.Actor.optimizer.zero_grad()
    self.Critic.optimizer.zero_grad()
    critic_loss, actor_loss = 0, 0
    # go backwards through the states, actions and rewards taken in this episode
    for i in reversed(range(len(rewards))):
        self.accum_rewards += rewards[i]
        R = rewards[i] + self.discount_rate * R
        advantage = R - self.Critic.predict(t(states[i]))
        # get the Beta distribution parameters with which the action was drawn
        alpha, beta = self.Actor.predict(t(states[i]))
        torch.distributions.Beta.set_default_validate_args(True)
        dist = torch.distributions.Beta(alpha, beta)
        # accumulate critic loss
        critic_loss = critic_loss + advantage.pow(2).mean()
        # accumulate actor loss - we maximize the rewards, thus we take the negation of the objective.
        # The Adam optimizer then minimizes it, so the weights are updated in a way which makes advantages higher.
        actor_loss = actor_loss - dist.log_prob(self.Actor.action_to_beta(t(actions[i]))) * advantage.detach()
    # compute gradients wrt. the weights
    actor_loss.backward()
    critic_loss.backward()
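
# The methods in this class rely on a module-level helper `t` that is not shown
# here. A minimal sketch, assuming it simply wraps gym observations/actions as
# float tensors for the PyTorch networks (the name `t` comes from this file,
# the body below is an assumption):
def t(x):
    # convert a numpy array (or scalar) to a float32 torch tensor
    return torch.as_tensor(x, dtype=torch.float32)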
def render(self):
    for e in range(10):
        state = self.env.reset()
        done = False
        score = 0
        while not done:
            self.env.render()
            action = self.Actor.get_best_action(t(state))
            state, reward, done, _ = self.env.step(action)
            score += reward
            if done:
                a3c_logger.info("episode: {}, score: {}".format(e, score))
                break
    self.env.close()
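
# `get_best_action` is not defined in this file. A hedged sketch of the greedy
# choice it presumably makes for the Beta policy used in replay_steps: take the
# distribution mean alpha / (alpha + beta), which lies in [0, 1], and rescale it
# to the environment's action bounds. `low` and `high` are hypothetical bounds
# introduced only for this illustration.
def beta_mean_action(alpha, beta, low, high):
    mean = alpha / (alpha + beta)  # expected value of Beta(alpha, beta)
    return (low + mean * (high - low)).detach().numpy()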
def run(self):
    if self.globalA3C is None:
        raise Exception("Global model is not set! Please call set_global_model(global_model) to set the parent model.")
    state = self.env.reset()  # reset env and get initial state
    episode = 0
    while episode < self.max_episodes:
        # reset stuff
        is_terminal = False
        states, actions, rewards = [], [], []
        step_start = self.step
        while not is_terminal and self.step - step_start < self.step_max:
            states.append(state)  # register current state
            action = self.Actor.draw_action(t(state))  # draw action
            next_state, reward, is_terminal, info = self.env.step(action)  # perform action
            actions.append(action)  # register action
            rewards.append(reward)  # register reward
            state = next_state
            self.step += 1
        # replay experience backwards and compute gradients
        self.replay_steps(states, actions, rewards, state, is_terminal)
        self.lock.acquire()
        self.update_global_models()
        self.sync_models()
        self.globalA3C.episode += 1
        episode = self.globalA3C.episode
        self.lock.release()
        if episode % self.measure_step == 0 and self.eval_repeats != 0:
            self.lock.acquire()
            mean, _ = self.evaluate(self.eval_repeats)
            self.globalA3C.performance.append([episode, mean])
            self.lock.release()
            if self.log_info:
                a3c_logger.info(f"\nEpisode: {episode}\nMean accumulated rewards: {mean}")
        if is_terminal:
            self.update_local_results()
            state = self.env.reset()  # reset env and get initial state
            self.local_episode += 1
    self.env.close()
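
# `update_global_models` and `sync_models` are called under the lock above but
# are not shown here. In the usual A3C pattern the worker pushes its freshly
# computed gradients into the shared global networks, steps the global
# optimizers, and then copies the updated global weights back into its local
# models. A sketch under that assumption (globalA3C.Actor / globalA3C.Critic
# mirroring the local attributes is itself an assumption):
def update_global_models(self):
    for local, shared in ((self.Actor, self.globalA3C.Actor),
                          (self.Critic, self.globalA3C.Critic)):
        for lp, gp in zip(local.model.parameters(), shared.model.parameters()):
            gp._grad = lp.grad  # hand the local gradients to the global network
        shared.optimizer.step()

def sync_models(self):
    # overwrite the local weights with the freshly updated global ones
    self.Actor.model.load_state_dict(self.globalA3C.Actor.model.state_dict())
    self.Critic.model.load_state_dict(self.globalA3C.Critic.model.state_dict())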
def evaluate(self, eval_repeats=20):
    self.Actor.model.eval()
    self.Critic.model.eval()
    scores = []
    for ep in range(eval_repeats):
        state = self.env.reset()
        done = False
        performance = 0
        while not done:
            with torch.no_grad():
                action = self.Actor.get_best_action(t(state))
            state, reward, done, _ = self.env.step(action)
            performance += reward
        scores.append([ep + 1, performance])
    scores = np.array(scores)
    self.Actor.model.train()
    self.Critic.model.train()
    return scores[:, 1].mean(), scores
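
# Hedged usage sketch of the asynchronous setup this worker is written for:
# several workers run in separate threads, share one lock and one global model,
# and each executes run(). `A3CWorker` and its constructor arguments are
# assumptions made for illustration; only set_global_model, run and evaluate
# appear in this file.
import threading

def train_a3c(global_model, n_workers=4):
    lock = threading.Lock()
    workers = [A3CWorker(lock=lock) for _ in range(n_workers)]  # hypothetical constructor
    for worker in workers:
        worker.set_global_model(global_model)
    threads = [threading.Thread(target=worker.run) for worker in workers]
    for thread in threads:
        thread.start()
    for thread in threads:
        thread.join()
    # mean accumulated reward over 20 greedy episodes on the first worker's env
    mean_score, per_episode = workers[0].evaluate(eval_repeats=20)
    return mean_score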