def _run_one_iteration(self, iteration, eval_mode):
  """Runs one iteration in separate thread, logs and checkpoints results.

  Same as parent Runner implementation except that summary statistics are
  directly logged instead of being returned.

  Args:
    iteration: int, current iteration number, used as a global_step for
      saving Tensorboard summaries.
    eval_mode: bool, whether this is an evaluation iteration.
  """
  statistics = iteration_statistics.IterationStatistics()
  iteration_name = '{}iteration {}'.format('eval ' if eval_mode else '',
                                           iteration)
  tf.logging.info('Starting %s.', iteration_name)
  run_phase = self._run_eval_phase if eval_mode else self._run_train_phase
  num_episodes, average_reward = run_phase(statistics)
  with self._output_lock:
    logging_iteration = iteration if eval_mode else self._completed_iteration
    self._log_experiment(logging_iteration, statistics,
                         suffix='_eval' if eval_mode else '')
    self._save_tensorboard_summaries(
        logging_iteration, num_episodes, average_reward,
        tag='Eval' if eval_mode else 'Train')
    if not eval_mode:
      self._checkpoint_experiment(self._completed_iteration)
      self._completed_iteration += 1
  tf.logging.info('Completed %s.', iteration_name)
def testAddManyValues(self):
  my_pi = 3.14159
  statistics = iteration_statistics.IterationStatistics()
  # Add a number of items. Each item is added to the list corresponding to
  # its given key.
  statistics.append({
      'rewards': 0,
      'nouns': 'reinforcement',
      'angles': my_pi
  })
  # Add a second item to the 'nouns' list.
  statistics.append({'nouns': 'learning'})
  # There are three lists.
  self.assertEqual(3, len(statistics.data_lists))
  self.assertEqual(1, len(statistics.data_lists['rewards']))
  self.assertEqual(2, len(statistics.data_lists['nouns']))
  self.assertEqual(1, len(statistics.data_lists['angles']))
  self.assertEqual(0, statistics.data_lists['rewards'][0])
  self.assertEqual('reinforcement', statistics.data_lists['nouns'][0])
  self.assertEqual('learning', statistics.data_lists['nouns'][1])
  self.assertEqual(my_pi, statistics.data_lists['angles'][0])
def _run_one_iteration(self, iteration):
  """Runs one iteration of agent/environment interaction.

  An iteration involves running several episodes until a certain number of
  steps are obtained. The interleaving of train/eval phases implemented here
  is to match the implementation of (Mnih et al., 2015).

  Args:
    iteration: int, current iteration number, used as a global_step for
      saving Tensorboard summaries.

  Returns:
    A dict containing summary statistics for this iteration.
  """
  statistics = iteration_statistics.IterationStatistics()
  tf.logging.info('Starting iteration %d', iteration)
  num_episodes_train, average_reward_train = self._run_train_phase(
      statistics)
  num_episodes_eval, average_reward_eval = self._run_eval_phase(statistics)
  self._save_tensorboard_summaries(iteration, num_episodes_train,
                                   average_reward_train, num_episodes_eval,
                                   average_reward_eval)
  if not self.testing:
    self.fout_test.write('%d %f %d\n' %
                         (iteration, average_reward_eval, num_episodes_eval))
    self.fout_test.flush()
  return statistics.data_lists
def _run_one_iteration(self, iteration):
  """Runs one iteration of agent/environment interaction.

  An iteration involves running several episodes until a certain number of
  steps are obtained. The interleaving of train/eval phases implemented here
  is to match the implementation of (Mnih et al., 2015).

  Args:
    iteration: int, current iteration number, used as a global_step for
      saving Tensorboard summaries.

  Returns:
    A dict containing summary statistics for this iteration.
  """
  statistics = iteration_statistics.IterationStatistics()
  logging.info('Starting iteration %d', iteration)
  num_episodes_train, average_reward_train, average_steps_per_second = (
      self._run_train_phase(statistics))
  active_num_episodes_eval, active_average_reward_eval = (
      self._run_eval_phase(statistics, 'active'))
  passive_num_episodes_eval, passive_average_reward_eval = (
      self._run_eval_phase(statistics, 'passive'))
  self._save_tensorboard_summaries(
      iteration, num_episodes_train, average_reward_train,
      active_num_episodes_eval, active_average_reward_eval,
      passive_num_episodes_eval, passive_average_reward_eval,
      average_steps_per_second)
  return statistics.data_lists
def _run_one_iteration(self, iteration):
  """Runs one iteration of agent/environment interaction.

  An iteration involves running several episodes until a certain number of
  steps are obtained. The interleaving of train/eval phases implemented here
  is to match the implementation of (Mnih et al., 2015).

  Args:
    iteration: int, current iteration number, used as a global_step for
      saving Tensorboard summaries.

  Returns:
    A dict containing summary statistics for this iteration.
  """
  statistics = iteration_statistics.IterationStatistics()
  tf.logging.info('Starting iteration %d', iteration)

  # Perform the training phase, during which the agent learns.
  self._agent.eval_mode = False
  start_time = time.time()
  number_steps, sum_returns, num_episodes = self._run_one_phase(
      self._training_steps, statistics, 'train')
  average_return = sum_returns / num_episodes if num_episodes > 0 else 0.0
  print("Average return", round(average_return, 2))
  print("Number_steps", number_steps)
  print("Num episodes", num_episodes)
  statistics.append({'train_average_return': average_return})
  time_delta = time.time() - start_time
  print("Time", int(time_delta))
  tf.logging.info('Average undiscounted return per training episode: %.2f',
                  average_return)
  tf.logging.info('Average training steps per second: %.2f',
                  number_steps / time_delta)
  num_episodes_train, average_reward_train = num_episodes, average_return

  # Perform the evaluation phase -- no learning.
  self._agent.eval_mode = True
  _, eval_sum_returns, eval_num_episodes = self._run_one_phase(
      self._evaluation_steps, statistics, 'eval')
  eval_average_return = (
      eval_sum_returns / eval_num_episodes if eval_num_episodes > 0 else 0.0)
  print("Test average return", round(eval_average_return, 2))
  print("Test num episodes", eval_num_episodes)
  print('---------------------------------------')
  tf.logging.info('Average undiscounted return per evaluation episode: %.2f',
                  eval_average_return)
  statistics.append({'eval_average_return': eval_average_return})
  num_episodes_eval, average_reward_eval = (
      eval_num_episodes, eval_average_return)

  self._save_tensorboard_summaries(iteration, num_episodes_train,
                                   average_reward_train, num_episodes_eval,
                                   average_reward_eval)
  return statistics.data_lists
def testAddOneValue(self):
  statistics = iteration_statistics.IterationStatistics()
  # The statistics data structure should be empty a-priori.
  self.assertEqual(0, len(statistics.data_lists))
  statistics.append({'key1': 0})
  # We should have exactly one list, containing one value.
  self.assertEqual(1, len(statistics.data_lists))
  self.assertEqual(1, len(statistics.data_lists['key1']))
  self.assertEqual(0, statistics.data_lists['key1'][0])
def _run_one_iteration(self, iteration):
  """Runs one iteration of agent/environment interaction."""
  statistics = iteration_statistics.IterationStatistics()
  tf.logging.info('Starting iteration %d', iteration)
  self._run_train_phase()
  num_episodes_eval, average_reward_eval = self._run_eval_phase(statistics)
  self._save_tensorboard_summaries(iteration, num_episodes_eval,
                                   average_reward_eval)
  return statistics.data_lists
def _run_one_iteration(self, iteration):
  """Runs one iteration of agent/environment interaction."""
  statistics = iteration_statistics.IterationStatistics()
  tf.logging.info("Starting iteration %d", iteration)
  # pylint: disable=protected-access
  if not self._agent._replay_suffix:
    # Reload the replay buffer.
    self._agent._replay.memory.reload_buffer(num_buffers=5)
  # pylint: enable=protected-access
  self._run_train_phase()
  self.offline_evaluation(iteration)
  return statistics.data_lists
def _run_one_iteration(self, iteration):
  """Runs one iteration of agent/environment interaction."""
  statistics = iteration_statistics.IterationStatistics()
  logging.info('Starting iteration %d', iteration)
  if not self._agent.replay_suffix:
    # Reload the replay buffer at every iteration.
    self._agent._replay.reload_data()  # pylint: disable=protected-access
  self._run_train_phase()
  num_episodes_eval, average_reward_eval = self._run_eval_phase(statistics)
  self._save_tensorboard_summaries(iteration, num_episodes_eval,
                                   average_reward_eval)
  return statistics.data_lists
def _run_one_iteration(self, iteration):
  """Runs one iteration of agent/environment interaction.

  An iteration involves running several episodes until a certain number of
  steps are obtained. The interleaving of train/eval phases implemented here
  is to match the implementation of (Mnih et al., 2015).

  Args:
    iteration: int, current iteration number, used as a global_step for
      saving Tensorboard summaries.

  Returns:
    A dict containing summary statistics for this iteration.
  """
  statistics = iteration_statistics.IterationStatistics()
  tf.logging.info('Starting iteration %d', iteration)
  num_episodes_train, average_reward_train = self._run_train_phase(
      statistics)
  num_episodes_eval, average_reward_eval = self._run_eval_phase(statistics)
  self._run_evalrandom_phase(statistics)
  print("EPSILON:")
  print(
      self._agent.epsilon_fn(self._agent.epsilon_decay_period,
                             self._agent.training_steps,
                             self._agent.min_replay_history,
                             self._agent.epsilon_train))
  print("QUANTILE VALUES:")
  self._agent._record_observation([0, 2, 1, 1, 2, 0, 0, 0, 0])
  print(
      self._agent._sess.run(
          self._agent._q_values, {
              self._agent.state_ph: self._agent.state,
              self._agent.validmoves_ph: [0, 5, 6, 7, 8]
          }))
  # self._save_tensorboard_summaries(iteration, num_episodes_train,
  #                                  average_reward_train, num_episodes_eval,
  #                                  average_reward_eval)
  if not self.testing:
    self.fout_test.write('%d %f %d\n' %
                         (iteration, average_reward_eval, num_episodes_eval))
    self.fout_test.flush()
  return statistics.data_lists
def _train_one_step(self, epoch, Phi, Psi, left_vec, key, optim,
                    opt_state):  # pylint: disable=invalid-name
  """Training function."""
  statistics = iteration_statistics.IterationStatistics()
  logging.info('Starting epoch %d', epoch)
  start_time = time.time()
  Phi, opt_state, grads = estimates.nabla_phi_analytical(  # pylint: disable=invalid-name
      Phi, Psi, key, optim, opt_state, self._estimator, self._alpha,
      self._use_l2_reg, self._reg_coeff, self._use_penalty, self._j,
      self._num_rows)
  time_delta = time.time() - start_time
  statistics.append({'Time/epoch': time_delta})
  statistics.append({'representation': Phi})
  gm_distances = utils.grassman_distance(Phi, left_vec[:, :FLAGS.d])
  statistics.append({'GM_distances': gm_distances})
  frob_norms = utils.outer_objective_mc(Phi, Psi)
  statistics.append({'Frob_norms': frob_norms})
  phi_norms = jnp.sum(jnp.square(Phi))
  statistics.append({'phi_norms': phi_norms})
  grad_norms = jnp.sum(jnp.square(grads))
  phi_ranks = jnp.linalg.matrix_rank(Phi)
  statistics.append({'phi_ranks': phi_ranks})
  statistics.append({'grad_norms': grad_norms})
  if FLAGS.d == 1:
    dot_products = (Phi.T @ left_vec[:, :FLAGS.d] /
                    (jnp.linalg.norm(Phi) *
                     jnp.linalg.norm(left_vec[:, :FLAGS.d])))[0][0]
    statistics.append({'Dot_products': dot_products})
  else:
    dot_products = 0.
  # if epoch % self._summary_writer_frequency == 0:
  self._save_tensorboard_summaries(epoch, frob_norms, gm_distances,
                                   dot_products, phi_norms, grad_norms,
                                   phi_ranks)
  return statistics.data_lists, Phi, opt_state
def _run_one_iteration(self, iteration):
  """Runs one iteration of agent/environment interaction.

  An iteration involves running several episodes until a certain number of
  steps are obtained. This method differs from the `_run_one_iteration`
  method in the base `Runner` class in that it only runs the train phase.

  Args:
    iteration: int, current iteration number, used as a global_step for
      saving Tensorboard summaries.

  Returns:
    A dict containing summary statistics for this iteration.
  """
  statistics = iteration_statistics.IterationStatistics()
  num_episodes_train, average_reward_train = self._run_train_phase(
      statistics)
  self._save_tensorboard_summaries(iteration, num_episodes_train,
                                   average_reward_train)
  return statistics.data_lists
def _run_one_iteration(self, iteration):
  """Runs one iteration of agent/environment interaction."""
  statistics = iteration_statistics.IterationStatistics()
  tf.logging.info('Starting iteration %d', iteration)
  for _ in range(self._training_maxi_steps):
    # pylint: disable=protected-access
    if not self._agent._replay_suffix:
      # Reload the replay buffer.
      self._agent._replay.memory.reload_buffer(num_buffers=4)
    # pylint: enable=protected-access
    self._run_train_phase()
  num_episodes_eval, average_reward_eval = self._run_eval_phase(statistics)
  self._save_tensorboard_summaries(iteration, num_episodes_eval,
                                   average_reward_eval)
  try:
    self._agent.iteration_end_hook()
  except AttributeError:
    pass
  return statistics.data_lists
def _run_one_iteration(self, iteration):
  """Runs one evaluation-only iteration."""
  statistics = iteration_statistics.IterationStatistics()
  logging.info('Starting iteration %d', iteration)
  _, _ = self._run_eval_phase(statistics)
  return statistics.data_lists
def _run_one_iteration(self, iteration):
  """Runs one evaluation-only iteration."""
  from dopamine.discrete_domains import iteration_statistics
  statistics = iteration_statistics.IterationStatistics()
  tf.logging.info('Starting iteration %d', iteration)
  _, _ = self._run_eval_phase(statistics)
  return statistics.data_lists
def testMissingValue(self):
  statistics = iteration_statistics.IterationStatistics()
  with self.assertRaises(KeyError):
    _ = statistics.data_lists['missing_key']
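# The tests above pin down the contract of IterationStatistics: a plain
# dict-of-lists accumulator whose data_lists lookup raises KeyError for
# unseen keys. Below is a minimal sketch consistent with those tests; the
# class actually shipped in dopamine.discrete_domains may differ in detail.
class IterationStatistics(object):
  """Accumulates per-iteration metrics as lists keyed by metric name."""

  def __init__(self):
    # A plain dict (not a defaultdict), so reading a missing key raises
    # KeyError, as testMissingValue expects.
    self.data_lists = {}

  def append(self, data_pairs):
    """Appends each value in data_pairs to the list stored under its key."""
    for key, value in data_pairs.items():
      if key not in self.data_lists:
        self.data_lists[key] = []
      self.data_lists[key].append(value)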
def _run_one_iteration(self, iteration, firstiteration):
  """Runs one iteration of agent/environment interaction.

  An iteration involves running several episodes until a certain number of
  steps are obtained. The interleaving of train/eval phases implemented here
  is to match the implementation of (Mnih et al., 2015).

  Args:
    iteration: int, current iteration number, used as a global_step for
      saving Tensorboard summaries.
    firstiteration: bool, whether this is the first iteration, in which case
      the initial checkpoints are written and restored.

  Returns:
    A dict containing summary statistics for this iteration.
  """
  # if firstiteration:
  #   experiment_data = self._agent2.bundle_and_checkpoint(
  #       self.PATH + "test", iteration)
  #   if experiment_data:
  #     experiment_data['current_iteration'] = iteration
  #     experiment_data['logs'] = self._logger.data
  #   self._checkpointertest.save_checkpoint(iteration, experiment_data)
  # if iteration == 25:
  #   experiment_data = self._checkpointertest.load_checkpoint(0)
  #   x = False
  #   x = self._agent2.unbundle(self.PATH + "test", 0, experiment_data)
  #
  # self.to_graph = tf.Graph()  # graph where everything above will be copied to
  # self._q_argmax = tf.contrib.copy_graph.copy_op_to_graph(
  #     self._agent._q_argmax, self.to_graph, [])
  # self._q_argmax = tf.contrib.copy_graph.copy_op_to_graph(
  #     self._agent.state_ph, self.to_graph, [])
  # self._q_argmax = tf.contrib.copy_graph.copy_op_to_graph(
  #     self._agent.validmoves_ph, self.to_graph, [])
  if firstiteration:
    self._my_checkpoint_experiment(iteration, 'latest', 1)
    self._my_checkpoint_experiment(iteration, 'latest', 2)
    self._my_checkpoint_experiment(iteration, 'player', 1)
    q_argmax2 = self._agent._sess.run(
        self._agent._q_values, {
            self._agent.state_ph: [[[[0], [0], [0], [0], [0], [0], [0], [0],
                                     [0], [50]]]],
            self._agent.validmoves_ph: [1, 3, 5, 7, 8]
        })
    # print("Q Before loading", q_argmax2)
    self._my_initialize_resume('crpt', 'latest', self.latest1, 11)
    self._my_initialize_resume('crpt', 'latest', self.latest2, 22)
  statistics = iteration_statistics.IterationStatistics()
  tf.logging.info('Starting iteration %d', iteration)
  print("SELF.COUNTER:", self.counter)
  if iteration > 50 and self.counter >= 1:
    self.counter = 0
    self.player1_turn_training = not self.player1_turn_training
  if iteration < 50000:
    num_episodes_train, average_reward_train = self._run_generic_phase(
        statistics, 'train1')
    num_episodes_train, average_reward_train = self._run_generic_phase(
        statistics, 'train2')
    print("TRAIN TRAIN1")
  else:
    if self.player1_turn_training:
      num_episodes_train, average_reward_train = self._run_generic_phase(
          statistics, 'train1')
      print("TRAIN TRAIN2")
    else:
      num_episodes_train, average_reward_train = self._run_generic_phase(
          statistics, 'train2')
      print("TRAIN TRAIN3")
  num_episodes_eval, average_reward_eval = self._run_eval_phase(statistics)
  print("PLAYER1 TURN TRAINING:", self.player1_turn_training)
  print("EVAL EVAL")
  # numep, evalaveragereward = self._run_generic_phase(statistics, 'preveval')
  print("LATEST1:", self.latest1)
  print("LATEST2:", self.latest2)
  numep, oldvsnew = self._run_generic_phase(statistics, 'oldvsnew')
  numep, newvsold = self._run_generic_phase(statistics, 'newvsold')
  numep, oldvsold = self._run_generic_phase(statistics, 'oldvsold')
  print("OLD VS NEW:", oldvsnew)
  print("NEW VS OLD:", newvsold)
  print("OLD VS OLD:", oldvsold)
  numep, oldvsboss = self._run_generic_phase(statistics, 'oldvsboss')
  print("OLD VS BOSS", oldvsboss)
  numep, evalrandom = self._run_generic_phase(statistics, 'evalrandom')
  print("EVALRANDOM:", evalrandom)
  print("PLAYER1 TURN TRAINING2:", self.player1_turn_training)
  if self.player1_turn_training:
    self.compare_result_against = newvsold
    self.who_to_change = 1
    self.who_to_change_latest = self.latest1
    if newvsold > 0.05:
      self.counter += 1
    else:
      self.counter = 0
  else:
    self.compare_result_against = oldvsnew
    self.who_to_change = 2
    self.who_to_change_latest = self.latest2
    if oldvsnew < 0.05:
      self.counter += 1
    else:
      self.counter = 0
  if iteration < 50000:
    self._my_checkpoint_experiment(iteration, 'latest', 1)
    self.latest1 = iteration
    self._my_checkpoint_experiment(iteration, 'latest', 2)
    self.latest2 = iteration
    self.counter = 0
  else:
    if self.player1_turn_training:
      if oldvsold + 0.05 < self.compare_result_against:
        self._my_checkpoint_experiment(iteration, 'latest',
                                       self.who_to_change)
        if self.who_to_change == 1:
          self.latest1 = iteration
        else:
          self.latest2 = iteration
        print("CROOK: Changing player " + str(self.who_to_change) +
              " version")
      else:
        self._my_initialize_resume('crpt', 'latest',
                                   self.who_to_change_latest,
                                   self.who_to_change)
        print("CROOK: Staying with old player " + str(self.who_to_change))
    else:
      if oldvsold - 0.05 > self.compare_result_against:
        self._my_checkpoint_experiment(iteration, 'latest',
                                       self.who_to_change)
        if self.who_to_change == 1:
          self.latest1 = iteration
        else:
          self.latest2 = iteration
        print("CROOK: Changing player " + str(self.who_to_change) +
              " version")
      else:
        self._my_initialize_resume('crpt', 'latest',
                                   self.who_to_change_latest,
                                   self.who_to_change)
        print("CROOK: Staying with old player " + str(self.who_to_change))
  # variables = tf.trainable_variables()
  # print("Weight matrix: {0}".format(self._agent._sess.run(variables[0])))
  print("EPSILON:")
  print(
      self._agent.epsilon_fn(self._agent.epsilon_decay_period,
                             self._agent.training_steps,
                             self._agent.min_replay_history,
                             self._agent.epsilon_train))
  # self._save_tensorboard_summaries(iteration, num_episodes_train,
  #                                  average_reward_train, num_episodes_eval,
  #                                  average_reward_eval)
  if not self.testing:
    self.fout_test.write('%d %f %d\n' %
                         (iteration, average_reward_eval, num_episodes_eval))
    self.fout_test.flush()
  return statistics.data_lists
def run_inference_test(self):
  """Runs an inference-only eval phase for self.inference_steps steps."""
  statistics = iteration_statistics.IterationStatistics()
  _ = self._run_one_phase(self.inference_steps, statistics, 'eval')
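# For context, the _run_one_iteration variants above are typically driven by
# an outer experiment loop in the style of Dopamine's Runner.run_experiment.
# A minimal sketch, assuming _start_iteration and _num_iterations are set in
# the runner's constructor (hypothetical names here) and that _log_experiment
# and _checkpoint_experiment exist as in the snippets above.
def run_experiment(self):
  """Repeatedly runs one iteration, then logs and checkpoints the results."""
  tf.logging.info('Beginning training...')
  for iteration in range(self._start_iteration, self._num_iterations):
    statistics = self._run_one_iteration(iteration)
    self._log_experiment(iteration, statistics)
    self._checkpoint_experiment(iteration)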