def ema_checks(stats: HourData, history: History, ema_threshold: float, reset_perc: float):
    if stats.ema_percent_diff_positive > ema_threshold:
        logging.info(
            f"Current price is outside threshold difference ({stats.formatted_info()})")
        return False

    logging.info(
        f"Current price not outside threshold ({stats.formatted_info()})")

    # If EMA hasn't been reset then check whether we should reset it
    if not history.ema_reset:
        if history.rising:
            target = history.price * (1 - reset_perc / 100)
            should_reset = stats.ema > target
        else:
            target = history.price * (1 + reset_perc / 100)
            should_reset = stats.ema < target

        if should_reset:
            logging.info("Resetting EMA")
            history.ema_reset = True
            history.save()

    return True
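# A minimal sketch showing how the reset branch above behaves, assuming ema_checks
# (and the HourData / History classes its annotations reference) can be imported from
# its module. FakeStats and FakeHistory are hypothetical stand-ins, not the real
# classes; they expose only the attributes ema_checks touches.
import logging
from dataclasses import dataclass


@dataclass
class FakeStats:  # hypothetical stand-in for HourData
    ema: float
    ema_percent_diff_positive: float

    def formatted_info(self):
        return f"ema={self.ema:.2f}, diff={self.ema_percent_diff_positive:.2f}%"


@dataclass
class FakeHistory:  # hypothetical stand-in for History
    price: float
    rising: bool
    ema_reset: bool = False

    def save(self):
        pass  # persistence is out of scope for this sketch


logging.basicConfig(level=logging.INFO)
stats = FakeStats(ema=105.0, ema_percent_diff_positive=0.5)
history = FakeHistory(price=100.0, rising=True)
# 0.5% is inside the 3% threshold, and the EMA (105.0) is above
# price * (1 - 2 / 100) = 98.0, so the EMA is reset and True is returned.
print(ema_checks(stats, history, ema_threshold=3.0, reset_perc=2.0))  # True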
class TestHistory(unittest.TestCase):

    def setUp(self):
        self.history = History()

    def test_get_last_n_commands(self):
        self.history.commands = ["foo1", "foo2", "foo3"]
        expected = ["foo2", "foo3"]
        self.assertEqual(expected, self.history.get_last_n_commands(2))

    def test_get_last_n_formatted(self):
        self.history.commands = ["foo1", "foo2", "foo3"]
        expected = "foo1\nfoo2\nfoo3"
        self.assertEqual(self.history.get_last_n_formatted(3), expected)

    def test_add_line(self):
        self.assertEqual(0, len(self.history.commands))
        self.history.add_line("foo line")
        self.assertEqual(1, len(self.history.commands))
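# A minimal sketch of a History class that would satisfy the tests above.
# This is a hypothetical implementation inferred from the assertions, not the
# project's actual History class.
class History:  # hypothetical minimal implementation
    def __init__(self):
        self.commands = []

    def add_line(self, line):
        """Append a single command line to the history."""
        self.commands.append(line)

    def get_last_n_commands(self, n):
        """Return the last n commands as a list."""
        return self.commands[-n:]

    def get_last_n_formatted(self, n):
        """Return the last n commands joined with newlines."""
        return "\n".join(self.get_last_n_commands(n))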
# history = SGD(model, optimizer, data_loader, test_loader, num_epochs=1, log_every=50, test_every=1)

# %% DUAL TRAINING
if continuation:
    history, model, ngram, optimizer_primal, optimizer_dual = load()
    print(model.ngram.n)
else:
    model = Model(ngram, output_size=4)
    model.to(DEVICE)
    model.init_weights()
    optimizer_primal = torch.optim.Adam(model.primal.parameters(), lr=primal_lr)
    optimizer_dual = torch.optim.Adam(model.dual.parameters(), lr=dual_lr)
    history = History()
    for idx in model.dual:
        history['dual ' + str(idx)] = []

epochs_done = 0
while epochs_done < num_epochs:
    history = SPDG(model, optimizer_primal, optimizer_dual, sequence_loader,
                   data_loader, test_loader, save_every, log_every, test_every,
                   sequence_test_loader=sequence_test_loader,
def SPDG(model, optimizer_primal, optimizer_dual, sequence_loader, data_loader,
         test_loader, num_epochs=5, log_every=1, test_every=1,
         eval_predictions_on_data=False, eval_predictions_on_sequences=False,
         show_dual=False, history=None, sequence_test_loader=None,
         eval_ngram_data_stats=False, eval_ngram_test_stats=False):
    if history is None:
        history = History()
        for idx in model.dual:
            history['dual ' + str(idx)] = []
    model.train()
    iter_ = 0
    epoch_ = 0
    try:
        while epoch_ < num_epochs:
            if epoch_ % test_every == 0:
                msg = "Minibatch | p-loss | loss | err rate | steps/s |"
                if show_dual:
                    for i in model.dual:
                        msg += " {:>7d} |".format(i)
                print(msg)
            epoch_ += 1
            stime = time.time()
            siter = iter_
            for x, y in sequence_loader:
                iter_ += 1
                x = x.to(DEVICE).view(-1, model.n, 28*28).float()
                y = y.to(DEVICE)
                out = model.forward_sequences(x)
                ploss = model.loss_primal(out, y)
                dloss = model.loss_dual(out, y)
                ploss_ = ploss.item()
                loss_ = -dloss.item()
                optimizer_primal.zero_grad()
                ploss.backward(retain_graph=True)
                optimizer_primal.step()
                optimizer_dual.zero_grad()
                dloss.backward()
                optimizer_dual.step()
                with torch.no_grad():
                    _, predictions = out.max(dim=2)
                    a = predictions.view(-1) != y.view(-1)
                    err_rate = 100.0 * a.sum().item() / (out.size(1) * out.size(0))
                    history.err_rate.append(err_rate)
                    history.primal_loss.append(ploss_)
                    history.loss.append(loss_)
                    for idx in model.dual:
                        history['dual ' + str(idx)].append(model.dual[idx].item())
                if iter_ % log_every == 0:
                    num_iter = iter_ - siter
                    msg = "{:>8} | {:>10.2f} | {:>10.2f} | {:>7.2f}% | {:>7.2f} |".format(
                        iter_, ploss_, loss_, err_rate,
                        num_iter / (time.time() - stime))
                    if show_dual:
                        for idx in model.dual:
                            msg += " {:>7.2f} |".format(model.dual[idx].item())
                    print(msg)
                    siter = iter_
                    stime = time.time()
            if epoch_ % test_every == 0:
                epmsg = "Epoch {:>3} | Test errors for: ".format(epoch_)
                history.predictions_test.append(get_statistics(model, data_loader=test_loader))
                for i in range(model.output_size):
                    accuracy = 100.0 - 100.0 * history.predictions_test[-1][i, i] / history.predictions_test[-1][i].sum()
                    epmsg += " {}: {:.2f}, ".format(i, accuracy)
                epmsg = epmsg[:-2]
                if eval_predictions_on_data:
                    epmsg += "\n | Data errors for: ".format(epoch_)
                    history.predictions_data.append(get_statistics(model, data_loader=data_loader))
                    for i in range(model.output_size):
                        accuracy = 100.0 - 100.0 * history.predictions_data[-1][i, i] / history.predictions_data[-1][i].sum()
                        epmsg += " {}: {:.2f}, ".format(i, accuracy)
                    epmsg = epmsg[:-2]
                if eval_predictions_on_sequences:
                    epmsg += "\n | Seqs errors for: ".format(epoch_)
                    history.predictions_sequences.append(get_statistics(model, data_loader=sequence_loader, sequences=True))
                    for i in range(model.output_size):
                        accuracy = 100.0 - 100.0 * history.predictions_sequences[-1][i, i] / history.predictions_sequences[-1][i].sum()
                        epmsg += " {}: {:.2f}, ".format(i, accuracy)
                    epmsg = epmsg[:-2]
                if eval_ngram_data_stats:
                    history.ngram_data_stats.append(get_ngram_stats(model, sequence_loader))
                if eval_ngram_test_stats:
                    history.ngram_test_stats.append(get_ngram_stats(model, sequence_test_loader))
                test_err_rate = model.compute_error_rate(test_loader)
                history.test_err_rate.append(test_err_rate)
                print('{0}+\n{1}\n{0}+'.format('-' * (len(msg) - 1), epmsg))
    except KeyboardInterrupt:
        pass
    return history
class DQNAgent(BaseAgent):

    def __init__(self, config):
        super(DQNAgent, self).__init__(config)
        self.history = History(config)
        self.replay_memory = DQNReplayMemory(config)
        self.net = DQN(self.env_wrapper.action_space.n, config)
        self.net.build()
        self.net.add_summary(["average_reward", "average_loss", "average_q",
                              "ep_max_reward", "ep_min_reward", "ep_num_game",
                              "learning_rate"],
                             ["ep_rewards", "ep_actions"])

    def observe(self):
        reward = max(self.min_reward, min(self.max_reward, self.env_wrapper.reward))
        screen = self.env_wrapper.screen
        self.history.add(screen)
        self.replay_memory.add(screen, reward, self.env_wrapper.action, self.env_wrapper.terminal)
        if self.i < self.config.epsilon_decay_episodes:
            self.epsilon -= self.config.epsilon_decay
        if self.i % self.config.train_freq == 0 and self.i > self.config.train_start:
            state, action, reward, state_, terminal = self.replay_memory.sample_batch()
            q, loss = self.net.train_on_batch_target(state, action, reward, state_, terminal, self.i)
            self.total_q += q
            self.total_loss += loss
            self.update_count += 1
        if self.i % self.config.update_freq == 0:
            self.net.update_target()

    def policy(self):
        if np.random.rand() < self.epsilon:
            return self.env_wrapper.random_step()
        else:
            state = self.history.get() / 255.0
            a = self.net.q_action.eval({self.net.state: [state]}, session=self.net.sess)
            return a[0]

    def train(self, steps):
        render = False
        self.env_wrapper.new_random_game()
        num_game, self.update_count, ep_reward = 0, 0, 0.
        total_reward, self.total_loss, self.total_q = 0., 0., 0.
        ep_rewards, actions = [], []
        t = 0
        for _ in range(self.config.history_len):
            self.history.add(self.env_wrapper.screen)
        for self.i in tqdm(range(self.i, steps)):
            action = self.policy()
            self.env_wrapper.act(action)
            self.observe()
            if self.env_wrapper.terminal:
                t = 0
                self.env_wrapper.new_random_game()
                num_game += 1
                ep_rewards.append(ep_reward)
                ep_reward = 0.
            else:
                ep_reward += self.env_wrapper.reward
                t += 1
            actions.append(action)
            total_reward += self.env_wrapper.reward
            if self.i >= self.config.train_start:
                if self.i % self.config.test_step == self.config.test_step - 1:
                    avg_reward = total_reward / self.config.test_step
                    avg_loss = self.total_loss / self.update_count
                    avg_q = self.total_q / self.update_count
                    try:
                        max_ep_reward = np.max(ep_rewards)
                        min_ep_reward = np.min(ep_rewards)
                        avg_ep_reward = np.mean(ep_rewards)
                    except ValueError:
                        max_ep_reward, min_ep_reward, avg_ep_reward = 0, 0, 0
                    sum_dict = {
                        'average_reward': avg_reward,
                        'average_loss': avg_loss,
                        'average_q': avg_q,
                        'ep_max_reward': max_ep_reward,
                        'ep_min_reward': min_ep_reward,
                        'ep_num_game': num_game,
                        'learning_rate': self.net.learning_rate,
                        'ep_rewards': ep_rewards,
                        'ep_actions': actions
                    }
                    self.net.inject_summary(sum_dict, self.i)
                    num_game = 0
                    total_reward = 0.
                    self.total_loss = 0.
                    self.total_q = 0.
                    self.update_count = 0
                    ep_reward = 0.
                    ep_rewards = []
                    actions = []
            if self.i % 500000 == 0 and self.i > 0:
                j = 0
                self.save()
            if self.i % 100000 == 0:
                j = 0
                render = True
            if render:
                self.env_wrapper.env.render()
                j += 1
                if j == 1000:
                    render = False

    def play(self, episodes, net_path):
        self.net.restore_session(path=net_path)
        self.env_wrapper.new_game()
        i = 0
        for _ in range(self.config.history_len):
            self.history.add(self.env_wrapper.screen)
        episode_steps = 0
        while i < episodes:
            a = self.net.q_action.eval({self.net.state: [self.history.get() / 255.0]},
                                       session=self.net.sess)
            action = a[0]
            self.env_wrapper.act_play(action)
            self.history.add(self.env_wrapper.screen)
            episode_steps += 1
            if episode_steps > self.config.max_steps:
                self.env_wrapper.terminal = True
            if self.env_wrapper.terminal:
                episode_steps = 0
                i += 1
                self.env_wrapper.new_play_game()
                for _ in range(self.config.history_len):
                    screen = self.env_wrapper.screen
                    self.history.add(screen)
class DQNAgent(BaseAgent):

    # Initialize the parent class with the config object, plus a History,
    # a replay memory, and the neural network with its architecture.
    def __init__(self, config):
        super(DQNAgent, self).__init__(config)
        self.history = History(config)
        self.replay_memory = DQNReplayMemory(config)
        self.net = DQN(self.env_wrapper.action_space.n, config)
        self.net.build()
        self.net.add_summary(["average_reward", "average_loss", "average_q",
                              "ep_max_reward", "ep_min_reward", "ep_num_game",
                              "learning_rate"],
                             ["ep_rewards", "ep_actions"])

    # Processes one observation made by the agent.
    def observe(self):
        reward = max(self.min_reward, min(self.max_reward, self.env_wrapper.reward))  # Clip the reward for this state to the range [-1, 1]
        screen = self.env_wrapper.screen  # Take the current screen from the wrapper
        self.history.add(screen)  # Add the screen to the history
        self.replay_memory.add(screen, reward, self.env_wrapper.action, self.env_wrapper.terminal)  # Add the transition to the replay memory
        if self.i < self.config.epsilon_decay_episodes:  # Decay epsilon according to the number of timesteps elapsed
            self.epsilon -= self.config.epsilon_decay
        if self.i % self.config.train_freq == 0 and self.i > self.config.train_start:  # Update the network weights according to the values in the config object
            state, action, reward, state_, terminal = self.replay_memory.sample_batch()  # Sample a batch from the replay memory
            q, loss = self.net.train_on_batch_target(state, action, reward, state_, terminal, self.i)  # Train the network on the sampled batch
            self.total_q += q  # Accumulate the Q value
            self.total_loss += loss  # Accumulate the loss
            self.update_count += 1  # Count this training update
        if self.i % self.config.update_freq == 0:  # Check whether the target network needs updating
            self.net.update_target()  # Update the target network

    # Defines the policy under which the agent selects an action.
    def policy(self):
        if np.random.rand() < self.epsilon:  # If a random draw falls below epsilon
            return self.env_wrapper.random_step()  # take a random action
        else:  # otherwise
            state = self.history.get() / 255.0
            a = self.net.q_action.eval({self.net.state: [state]}, session=self.net.sess)
            return a[0]  # the action with the best predicted future return according to the network

    # Runs the agent's training process.
    def train(self, steps):
        render = False  # Rendering is disabled initially
        self.env_wrapper.new_random_game()  # Start a game and take it to a random state
        num_game, self.update_count, ep_reward = 0, 0, 0.  # Initialize counters to zero
        total_reward, self.total_loss, self.total_q = 0., 0., 0.  # Initialize accumulators to zero
        ep_rewards, actions = [], []  # Initialize empty lists
        t = 0
        for _ in range(self.config.history_len):  # Iterate over the history length
            self.history.add(self.env_wrapper.screen)  # Fill the history with copies of the first screen
        for self.i in tqdm(range(self.i, steps)):  # Iterate over the training timesteps
            action = self.policy()  # Select the agent's action according to the policy
            self.env_wrapper.act(action)  # Execute the action in the environment
            self.observe()  # Process the observation
            if self.env_wrapper.terminal:  # If a terminal state is reached
                t = 0  # reset the episode timestep counter
                self.env_wrapper.new_random_game()  # start a new game in a random state
                num_game += 1  # count the finished episode
                ep_rewards.append(ep_reward)  # store the episode reward
                ep_reward = 0.  # reset the reward for the next episode
            else:  # otherwise (non-terminal state)
                ep_reward += self.env_wrapper.reward  # accumulate the episode reward
                t += 1  # count the elapsed timestep
            actions.append(action)  # store the executed action
            total_reward += self.env_wrapper.reward  # accumulate the total reward
            if self.i >= self.config.train_start:  # once more timesteps have elapsed than specified in the config object
                if self.i % self.config.test_step == self.config.test_step - 1:  # if the test-step condition is met
                    avg_reward = total_reward / self.config.test_step  # average reward over the test steps
                    avg_loss = self.total_loss / self.update_count  # average loss over the network updates
                    avg_q = self.total_q / self.update_count  # average Q value over the network updates
                    try:
                        max_ep_reward = np.max(ep_rewards)  # maximum episode reward
                        min_ep_reward = np.min(ep_rewards)  # minimum episode reward
                        avg_ep_reward = np.mean(ep_rewards)  # average episode reward
                    except ValueError:
                        max_ep_reward, min_ep_reward, avg_ep_reward = 0, 0, 0  # on error, fall back to zero
                    sum_dict = {  # build a dictionary with the collected statistics
                        'average_reward': avg_reward,
                        'average_loss': avg_loss,
                        'average_q': avg_q,
                        'ep_max_reward': max_ep_reward,
                        'ep_min_reward': min_ep_reward,
                        'ep_num_game': num_game,
                        'learning_rate': self.net.learning_rate,
                        'ep_rewards': ep_rewards,
                        'ep_actions': actions
                    }
                    self.net.inject_summary(sum_dict, self.i)  # inject the dictionary into the network to create a tf.Summary object
                    num_game = 0  # reset the accumulated statistics
                    total_reward = 0.
                    self.total_loss = 0.
                    self.total_q = 0.
                    self.update_count = 0
                    ep_reward = 0.
                    ep_rewards = []
                    actions = []
            if self.i % 500000 == 0 and self.i > 0:  # Every 500000 timesteps, save a checkpoint of the network training
                j = 0
                self.save()
            if self.i % 100000 == 0:  # Every 100000 timesteps, render 1000 steps of training
                j = 0
                render = True
            # if render:
            #     self.env_wrapper.env.render()
            #     j += 1
            #     if j == 1000:
            #         render = False

    # Plays episodes using the trained network.
    def play(self, episodes, net_path):
        self.net.restore_session(path=net_path)  # load the network weights
        self.env_wrapper.new_game()  # start a game in a random state
        i = 0
        for _ in range(self.config.history_len):  # iterate over the history length
            self.history.add(self.env_wrapper.screen)  # fill the history with the first screens
        episode_steps = 0
        while i < episodes:
            a = self.net.q_action.eval({self.net.state: [self.history.get() / 255.0]},
                                       session=self.net.sess)  # compute the action with the best predicted return according to the network
            action = a[0]  # take the first element, the best-return action
            self.env_wrapper.act_play(action)  # execute the action in the environment and check for a lost life
            self.history.add(self.env_wrapper.screen)  # store the new screen
            episode_steps += 1
            if episode_steps > self.config.max_steps:  # if the maximum number of steps per episode is reached
                self.env_wrapper.terminal = True  # mark the episode as finished
            if self.env_wrapper.terminal:  # if the episode has finished
                episode_steps = 0  # reset the step counter for the next episode
                i += 1  # count the played episode
                self.env_wrapper.new_play_game()  # create a new game in a random state
                for _ in range(self.config.history_len):  # iterate over the history length
                    screen = self.env_wrapper.screen
                    self.history.add(screen)  # store the screen in the history
def SPDG(model, optimizer_primal, optimizer_dual, sequence_loader, data_loader,
         test_loader, num_epochs=5, log_every=1, test_every=1,
         predictions_on_data=False, predictions_on_sequences=False,
         show_dual=False, history=None, sequence_test_loader=None,
         ngram_data_stats=False, ngram_test_stats=False, loss_on_test=False,
         remember_dual=False):
    if history is None:
        history = History()
    if remember_dual:
        if model.ngram.size() > 20:
            warnings.warn(
                f"Model's n-gram is large ({model.ngram.size()} entries). Remembering dual "
                f"variables will occupy a significant amount of memory. Consider passing "
                f"'remember_dual=False' to this method.")
        for idx in model.dual:
            history['dual ' + str(idx)] = []
    model.train()
    iter_ = 0
    epoch_ = 0
    try:
        while epoch_ < num_epochs:
            if epoch_ % test_every == 0:
                msg = "Minibatch | p-loss | loss | err rate | steps/s |"
                if show_dual:
                    for i in model.dual:
                        msg += " {:>7d} |".format(i)
                print(msg)
            epoch_ += 1
            stime = time.time()
            siter = iter_
            for x, y in sequence_loader:
                iter_ += 1
                x = x.to(DEVICE).view(-1, model.n, 28 * 28).float()
                y = y.to(DEVICE)
                out = model.forward_sequences(x)
                ploss = model.loss_primal(out)
                dloss = model.loss_dual(out)
                ploss_ = ploss.item()
                loss_ = -dloss.item()
                optimizer_primal.zero_grad()
                ploss.backward(retain_graph=True)
                optimizer_primal.step()
                optimizer_dual.zero_grad()
                dloss.backward()
                optimizer_dual.step()
                with torch.no_grad():
                    _, predictions = out.max(dim=2)
                    a = predictions.view(-1) != y.view(-1)
                    err_rate = 100.0 * a.sum().item() / (out.size(1) * out.size(0))
                    history.err_rate.append(err_rate)
                    history.primal_loss.append(ploss_)
                    history.loss.append(loss_)
                    if remember_dual:
                        for idx in model.dual:
                            history['dual ' + str(idx)].append(model.dual[idx].item())
                if iter_ % log_every == 0:
                    num_iter = iter_ - siter
                    msg = "{:>8} | {:>10.2f} | {:>10.2f} | {:>7.2f}% | {:>7.2f} |".format(
                        iter_, ploss_, loss_, err_rate,
                        num_iter / (time.time() - stime))
                    if show_dual:
                        for idx in model.dual:
                            msg += " {:>7.2f} |".format(model.dual[idx].item())
                    print(msg)
                    siter = iter_
                    stime = time.time()
            if epoch_ % test_every == 0:
                epmsg = "Epoch {:>3} | Test errors for: ".format(epoch_ + history.epochs_done)
                history.predictions_test.append(get_statistics(model, data_loader=test_loader))
                for i in range(model.output_size):
                    accuracy = 100.0 - 100.0 * history.predictions_test[-1][i, i] / history.predictions_test[-1][i].sum()
                    epmsg += " {}: {:.2f}, ".format(i, accuracy)
                epmsg = epmsg[:-2]
                if predictions_on_data:
                    epmsg += "\n | Data errors for: ".format(epoch_)
                    history.predictions_data.append(get_statistics(model, data_loader=data_loader))
                    for i in range(model.output_size):
                        accuracy = 100.0 - 100.0 * history.predictions_data[-1][i, i] / history.predictions_data[-1][i].sum()
                        epmsg += " {}: {:.2f}, ".format(i, accuracy)
                    epmsg = epmsg[:-2]
                if predictions_on_sequences:
                    epmsg += "\n | Seqs errors for: ".format(epoch_)
                    if ngram_data_stats:
                        predictions, ngram_data = get_statistics(model, data_loader=sequence_loader,
                                                                 sequences=True, return_ngram=True)
                        history.predictions_sequences.append(predictions)
                        history.ngram_data_stats.append(ngram_data)
                    else:
                        history.predictions_sequences.append(get_statistics(model, data_loader=sequence_loader, sequences=True))
                    for i in range(model.output_size):
                        accuracy = 100.0 - 100.0 * history.predictions_sequences[-1][i, i] / history.predictions_sequences[-1][i].sum()
                        epmsg += " {}: {:.2f}, ".format(i, accuracy)
                    epmsg = epmsg[:-2]
                if ngram_data_stats and not predictions_on_sequences:
                    history.ngram_data_stats.append(get_ngram_stats(model, sequence_loader))
                if ngram_test_stats:
                    history.ngram_test_stats.append(get_ngram_stats(model, sequence_test_loader))
                if loss_on_test:
                    for x, y in sequence_test_loader:
                        with torch.no_grad():
                            x = x.to(DEVICE).view(-1, model.n, 28 * 28).float()
                            out = model.forward_sequences(x)
                            history.test_primal_loss.append(model.loss_primal(out).item())
                            history.test_loss.append(-model.loss_dual(out).item())
                if not remember_dual:
                    for idx in model.dual:
                        history['dual ' + str(idx)].append(model.dual[idx].item())
                print('{0}+\n{1}\n{0}+'.format('-' * (len(msg) - 1), epmsg))
    except KeyboardInterrupt:
        pass
    history.epochs_done += num_epochs
    return history
BOT_NAME = args.name
SLACK_CHANNEL = args.channel

# 'Hard' Constants - may rely on other values set but shouldn't be changed
SLACK_URL = args.url
DATA_FILE = f"last_post_data_{args.script_name}.json"
# endregion

# Get data and convert accordingly
cb = Coinbase(Currencies.default(), args.interval)
prices = cb.price_list()
cur_price = prices[0]

# Get history from last runs, use it to work out what test to make
history = History(DATA_FILE)

# Get stats from coinbase data
# If change isn't large enough, then update history and exit
stats = HourData(prices, EMA_NUM_HOURS)
if Analysis.ema_checks(stats, history, EMA_THRESHOLD_PERCENT, EMA_RESET_PERCENT):
    sys.exit(1)

if not Analysis.should_post(history, stats, prices, EMA_THRESHOLD_PERCENT):
    sys.exit(1)

logging.info("Message should be posted, generating attachment")
attachments = Slack.generate_post(prices, stats, Currencies.default())
image_url = SlackImages.get_image(stats.is_diff_positive)

logging.info("Posting to slack")
class DQNAgent(BaseAgent):

    def __init__(self, config):
        super(DQNAgent, self).__init__(config)
        self.history = History(config)
        self.replay_memory = DQNReplayMemory(config)
        self.net = DQN(4, config)
        self.net.build()
        self.net.add_summary([
            "average_reward", "average_loss", "average_q", "ep_max_reward",
            "ep_avg_reward", "ep_min_reward", "ep_num_game", "learning_rate"
        ], ["ep_rewards", "ep_actions"])

    def observe(self):
        reward = max(self.min_reward, min(self.max_reward, self.env_wrapper.reward))
        color = self.env_wrapper.color
        self.history.add(color)
        self.replay_memory.add(color, reward, self.env_wrapper.action, self.env_wrapper.terminal)
        if self.i < self.config.epsilon_decay_episodes:
            self.epsilon -= self.config.epsilon_decay
        if self.i % self.config.train_freq == 0 and self.i > self.config.train_start:
            # print('----> i', self.i)
            ### training starts only after train_start=20K steps, mem_size is 800K, train_freq is 8,
            ### I guess that's why it's okay to not worry about sampling with repetitions
            state, action, reward, state_, terminal = self.replay_memory.sample_batch()
            q, loss = self.net.train_on_batch_target(state, action, reward, state_,
                                                     terminal, self.i)  ### self.i is passed to implement lr decay
            self.total_q += q
            self.total_loss += loss
            self.update_count += 1
        if self.i % self.config.update_freq == 0:
            self.net.update_target()

    def policy(self):
        if np.random.rand() < self.epsilon:
            return self.env_wrapper.random_step()
        else:
            state = self.history.get()
            a = self.net.q_action.eval({self.net.state: [state]}, session=self.net.sess)
            return a[0]

    def train(self, steps):
        f = open('dqn2.txt', 'w')
        render = False
        self.env_wrapper.new_random_game()
        num_game, self.update_count, ep_reward = 0, 0, 0.
        total_reward, self.total_loss, self.total_q = 0., 0., 0.
        ep_rewards, actions = [], []
        t = 0
        print(self.net.number_of_trainable_parameters())
        for _ in range(self.config.history_len):
            ### So, the first state is just the first color, repeated a number of times
            self.history.add(self.env_wrapper.color)
        for self.i in tqdm(range(self.i, steps)):
            # take action, observe
            action = self.policy()
            self.env_wrapper.act(action)
            self.observe()
            if self.env_wrapper.terminal:
                t = 0
                self.env_wrapper.new_random_game()
                num_game += 1
                ep_rewards.append(ep_reward)
                ep_reward = 0.
            else:
                ep_reward += self.env_wrapper.reward
                t += 1
            actions.append(action)
            total_reward += self.env_wrapper.reward
            # print(self.i, action, total_reward, self.env_wrapper.terminal)
            # total_reward, max_ep_reward, min_ep_reward, avg_ep_reward keep track of
            # reward earned every self.config.test_step=5000 steps
            if self.i >= self.config.train_start:
                if self.i % self.config.test_step == self.config.test_step - 1:
                    avg_reward = total_reward / self.config.test_step
                    avg_loss = self.total_loss / self.update_count
                    avg_q = self.total_q / self.update_count
                    try:
                        max_ep_reward = np.max(ep_rewards)
                        min_ep_reward = np.min(ep_rewards)
                        avg_ep_reward = np.mean(ep_rewards)
                    except ValueError:
                        max_ep_reward, min_ep_reward, avg_ep_reward = 0, 0, 0
                    sum_dict = {
                        'average_reward': avg_reward,
                        'average_loss': avg_loss,
                        'average_q': avg_q,
                        'ep_max_reward': max_ep_reward,
                        'ep_min_reward': min_ep_reward,
                        'ep_num_game': num_game,
                        'ep_avg_reward': avg_ep_reward,
                        'learning_rate': self.net.learning_rate,
                        'ep_rewards': ep_rewards,
                        'ep_actions': actions
                    }
                    self.net.inject_summary(sum_dict, self.i)
                    num_game = 0
                    total_reward = 0.
                    self.total_loss = 0.
                    self.total_q = 0.
                    self.update_count = 0
                    ep_reward = 0.
                    ep_rewards = []
                    actions = []
                    f.write(str(avg_ep_reward))
            if self.i % 50000 == 0 and self.i > 0:
                j = 0
                print('saving..')
                self.save()
                # play_score = self.play(episodes=self.config.num_episodes_for_play_scores_summary, net_path=self.net.dir_model)
                # self.net.inject_summary({'play_score': play_score}, self.i)
            if self.i % 100000 == 0:
                j = 0
                render = True
            if render:
                # self.env_wrapper.env.render()
                j += 1
                if j == 1000:
                    render = False
        f.close()

    def play(self, episodes, net_path, verbose=False, print_average=True):
        d = []
        self.net.restore_session(path=net_path)
        self.env_wrapper.new_game()
        i = 0
        for _ in range(self.config.history_len):
            self.history.add(self.env_wrapper.color)
        episode_steps = 0
        ### EDIT (LJ): added rewards calculation
        episode_reward = 0
        actions_list = []
        while i < episodes:
            # Choose action:
            a = self.net.q_action.eval({self.net.state: [self.history.get()]},
                                       session=self.net.sess)
            action = a[0]
            actions_list.append(action)
            # Take action
            self.env_wrapper.act_play(action)
            self.history.add(self.env_wrapper.color)
            episode_steps += 1
            episode_reward += self.env_wrapper.reward
            if episode_steps > self.config.max_steps:
                self.env_wrapper.terminal = True
            if self.env_wrapper.terminal:
                if verbose:
                    print('episode terminated in ' + str(episode_steps) +
                          ' steps with reward ' + str(episode_reward))
                    print('ACTIONS TAKEN:')
                    print(actions_list)
                actions_list = []
                d.append(episode_reward)
                episode_steps = 0
                episode_reward = 0
                i += 1
                self.env_wrapper.new_play_game()
                for _ in range(self.config.history_len):
                    color = self.env_wrapper.color
                    self.history.add(color)
        if verbose:
            print('ALL, AVERAGE:', [d, sum(d) / len(d)])
        if print_average:
            print('AVERAGE:', sum(d) / len(d))
        return sum(d) / len(d)