def test_Playground_continuous_Hard_Lunar():
    play = bloc.GymPlayground('LunarLanderContinuous-v2', harderEnvCoeficient=1.5)
    assert play.ACTION_SPACE.shape == (2,)
    assert play.ACTION_CHOICES == 2
    assert play.OBSERVATION_SPACE.shape == (8,)
    assert play.OBSERVATION_DIM == 8
def gym_discrete_setup():
    """
    :return: (exp_spec, playground)
    :rtype: (ExperimentSpec, GymPlayground)
    """
    exp_spec = bloc.ExperimentSpec(batch_size_in_ts=1000, max_epoch=2,
                                   theta_nn_hidden_layer_topology=(2, 2))
    playground = bloc.GymPlayground('LunarLander-v2')
    yield exp_spec, playground
    tf_cv1.reset_default_graph()
def gym_and_tf_discrete_setup():
    """
    :return: (obs_p, act_p, exp_spec, playground)
    :rtype: (tf.Tensor, tf.Tensor, ExperimentSpec, GymPlayground)
    """
    exp_spec = bloc.ExperimentSpec(batch_size_in_ts=1000, max_epoch=2,
                                   theta_nn_hidden_layer_topology=(2, 2))
    playground = bloc.GymPlayground('LunarLander-v2')
    obs_p, act_p, Q_values_ph = bloc.gym_playground_to_tensorflow_graph_adapter(playground,
                                                                                action_shape_constraint=(1,))
    yield obs_p, act_p, exp_spec, playground
    tf_cv1.reset_default_graph()
def gym_and_tf_SAC_Brain_continuous_setup():
    """
    :return: obs_t_ph, act_ph, obs_t_prime_ph, reward_t_ph, trj_done_t_ph, exp_spec, playground
    """
    exp_spec = bloc.ExperimentSpec()
    exp_spec.set_experiment_spec(unit_test_hparam)
    playground = bloc.GymPlayground('LunarLanderContinuous-v2')
    obs_t_ph, act_ph, _ = bloc.gym_playground_to_tensorflow_graph_adapter(playground)
    obs_t_prime_ph = bloc.continuous_space_placeholder(space=playground.OBSERVATION_SPACE,
                                                       name=vocab.obs_tPrime_ph)
    reward_t_ph = tf_cv1.placeholder(dtype=tf.float32, shape=(None,), name=vocab.rew_ph)
    trj_done_t_ph = tf_cv1.placeholder(dtype=tf.float32, shape=(None,), name=vocab.trj_done_ph)
    yield obs_t_ph, act_ph, obs_t_prime_ph, reward_t_ph, trj_done_t_ph, exp_spec, playground
    tf_cv1.reset_default_graph()
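# A minimal sketch (not from the source) of how pytest consumes the generator-style setup
# functions above: register one as a named fixture, receive the yielded tuple in a test,
# and let everything after the `yield` (the default-graph reset) run as teardown.
# The fixture name, the test name and the rank-2 (batch, dim) placeholder-shape
# assumptions below are illustrative, not taken from the original test suite.
@pytest.fixture(name='sac_continuous_setup')
def _sac_continuous_setup():
    yield from gym_and_tf_SAC_Brain_continuous_setup()


def test_sac_placeholder_shapes_SKETCH(sac_continuous_setup):
    obs_t_ph, act_ph, obs_t_prime_ph, _, _, _, playground = sac_continuous_setup
    # Placeholders are assumed to be rank-2: (batch, dim).
    assert obs_t_ph.shape.as_list() == [None, playground.OBSERVATION_DIM]
    assert obs_t_prime_ph.shape.as_list() == [None, playground.OBSERVATION_DIM]
    assert act_ph.shape.as_list() == [None, playground.ACTION_CHOICES]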
def test_Playground_discrete():
    play = bloc.GymPlayground('LunarLander-v2')
    assert play.ACTION_CHOICES == 4
    assert play.OBSERVATION_DIM == 8
def test_Playground_continuous_Hard_no_env_FAIL():
    with pytest.raises(Exception):
        bloc.GymPlayground('Pendulum-v0', harderEnvCoeficient=1.5)
def test_Playground_continuous():
    play = bloc.GymPlayground('LunarLanderContinuous-v2')
    assert play.ACTION_SPACE.shape == (2,)
    assert play.ACTION_CHOICES == 2
    assert play.OBSERVATION_SPACE.shape == (8,)
    assert play.OBSERVATION_DIM == 8
def test_Playground_init_ENV_FAIL():
    with pytest.raises(Exception):
        bloc.GymPlayground('UnExistingEnvironment!!!')
def _build_computation_graph(self):
    """
    Build the Policy_phi, V_psi and Q_theta computation graphs as multi-layer perceptrons
    """
    self._set_random_seed()

    # (nice to have) todo:implement --> add init hook:
    # Note: Second environment for policy evaluation
    self.evaluation_playground = bloc.GymPlayground(environment_name=self.exp_spec.prefered_environment)

    """ ---- Placeholder ---- """
    self.obs_t_ph = bloc.build_observation_placeholder(self.playground, name=vocab.obs_t_ph)
    self.obs_t_prime_ph = bloc.build_observation_placeholder(self.playground, name=vocab.obs_tPrime_ph)
    self.act_ph = bloc.build_action_placeholder(self.playground, name=vocab.act_ph)
    self.reward_t_ph = tf_cv1.placeholder(dtype=tf.float32, shape=(None,), name=vocab.rew_ph)
    self.trj_done_t_ph = tf_cv1.placeholder(dtype=tf.float32, shape=(None,), name=vocab.trj_done_ph)

    # \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\
    # /// Actor computation graph //////////////////////////////////////////////////////////////////////////////////
    with tf_cv1.variable_scope(vocab.actor_network):
        pi, pi_log_p, self.policy_mu = build_gaussian_policy_graph(self.obs_t_ph, self.exp_spec, self.playground)
        self.policy_pi, self.pi_log_likelihood = apply_action_bound(pi, pi_log_p)

        """ ---- Adjust policy distribution result to action range ---- """
        if self.playground.ACTION_SPACE.bounded_above.all():
            self.policy_pi *= self.playground.ACTION_SPACE.high[0]
            self.policy_mu *= self.playground.ACTION_SPACE.high[0]

    # \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\
    # /// Critic computation graph /////////////////////////////////////////////////////////////////////////////////
    with tf_cv1.variable_scope(vocab.critic_network):
        self.V_psi, self.V_psi_frozen = build_critic_graph_v_psi(self.obs_t_ph, self.obs_t_prime_ph, self.exp_spec)

        """ ---- Q_theta {1,2} according to the sampled action & according to the reparametrized policy ---- """
        self.Q_act_1, self.Q_pi_1 = build_critic_graph_q_theta(self.obs_t_ph, self.act_ph, self.policy_pi,
                                                               self.exp_spec, name=vocab.Q_theta_1)
        self.Q_act_2, self.Q_pi_2 = build_critic_graph_q_theta(self.obs_t_ph, self.act_ph, self.policy_pi,
                                                               self.exp_spec, name=vocab.Q_theta_2)

    # \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\
    # /// Actor & Critic Training ops //////////////////////////////////////////////////////////////////////////////
    with tf_cv1.variable_scope(vocab.critic_training):
        critic_lr_schedule, critic_global_grad_step = critic_learning_rate_scheduler(self.exp_spec)

        self.V_psi_loss, self.V_psi_optimizer = critic_v_psi_train(self.V_psi, self.Q_pi_1, self.Q_pi_2,
                                                                   self.pi_log_likelihood, self.exp_spec,
                                                                   critic_lr_schedule, critic_global_grad_step)

        q_theta_train_ops = critic_q_theta_train(self.V_psi_frozen, self.Q_act_1, self.Q_act_2,
                                                 self.reward_t_ph, self.trj_done_t_ph, self.exp_spec,
                                                 critic_lr_schedule, critic_global_grad_step)
        self.q_theta_1_loss, self.q_theta_2_loss, self.q_theta_1_optimizer, self.q_theta_2_optimizer = q_theta_train_ops

    with tf_cv1.variable_scope(vocab.policy_training):
        self.actor_kl_loss, self.actor_policy_optimizer_op = actor_train(self.pi_log_likelihood,
                                                                         self.Q_pi_1, self.Q_pi_2, self.exp_spec)

    """ ---- Target network update: V_psi --> frozen_V_psi ---- """
    with tf_cv1.variable_scope(vocab.target_update):
        self.V_psi_frozen_update_ops = update_frozen_v_psi_op(self.exp_spec['target_smoothing_coefficient'])
        self.init_frozen_v_psi_op = init_frozen_v_psi()

    tr_str = list_representation(tf_cv1.get_collection_ref(tf_cv1.GraphKeys.TRAINABLE_VARIABLES),
                                 ":: TRAINABLE_VARIABLES")
    print(tr_str)

    # \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\
    # /// Summary ops //////////////////////////////////////////////////////////////////////////////////////////////
    # region :: Summary placeholders & ops ...

    """ ---- By Epoch summary: RETURNS & LENGTH ---- """
    self.summary_avg_trjs_return_ph = tf_cv1.placeholder(tf.float32,
                                                         name=vocab.summary_ph + 'stoPi_stage_avg_trjs_return_ph')
    tf_cv1.summary.scalar('Epoch_average_trj_return_stochastic_pi',
                          self.summary_avg_trjs_return_ph, family=vocab.G)

    self.summary_avg_trjs_len_ph = tf_cv1.placeholder(tf.float32,
                                                      name=vocab.summary_ph + 'stoPi_stage_avg_trjs_len_ph')
    tf_cv1.summary.scalar('Epoch_average_trj_length_stochastic_pi',
                          self.summary_avg_trjs_len_ph, family=vocab.Trajectory_lenght)

    self.summary_eval_avg_trjs_return_ph = tf_cv1.placeholder(tf.float32,
                                                              name=vocab.summary_ph + 'detPi_stage_avg_trjs_return_ph')
    tf_cv1.summary.scalar('Epoch_average_trj_return_deterministic_pi',
                          self.summary_eval_avg_trjs_return_ph, family=vocab.G)

    self.summary_eval_avg_trjs_len_ph = tf_cv1.placeholder(tf.float32,
                                                           name=vocab.summary_ph + 'detPi_stage_avg_trjs_len_ph')
    tf_cv1.summary.scalar('Epoch_average_trj_length_deterministic_pi',
                          self.summary_eval_avg_trjs_len_ph, family=vocab.Trajectory_lenght)

    """ ---- By Epoch summary: LOSS ---- """
    self.summary_avg_trjs_Vloss_ph = tf_cv1.placeholder(tf.float32, name=vocab.summary_ph + 'Critic_V_loss_ph')
    tf_cv1.summary.scalar('critic_v_loss', self.summary_avg_trjs_Vloss_ph, family=vocab.loss)

    self.summary_avg_trjs_Q1loss_ph = tf_cv1.placeholder(tf.float32, name=vocab.summary_ph + 'Critic_Q1_loss_ph')
    tf_cv1.summary.scalar('critic_q_1_loss', self.summary_avg_trjs_Q1loss_ph, family=vocab.loss)

    self.summary_avg_trjs_Q2loss_ph = tf_cv1.placeholder(tf.float32, name=vocab.summary_ph + 'Critic_Q2_loss_ph')
    tf_cv1.summary.scalar('critic_q_2_loss', self.summary_avg_trjs_Q2loss_ph, family=vocab.loss)

    self.summary_avg_trjs_pi_loss_ph = tf_cv1.placeholder(tf.float32, name=vocab.summary_ph + 'policy_loss_ph')
    tf_cv1.summary.scalar('policy_loss', self.summary_avg_trjs_pi_loss_ph, family=vocab.loss)

    """ ---- By Epoch summary: POLICY & VALUE fct ---- """
    self.summary_avg_pi_log_likelihood_ph = tf_cv1.placeholder(tf.float32, name=vocab.summary_ph + 'pi_log_p_ph')
    tf_cv1.summary.scalar('policy_log_likelihood', self.summary_avg_pi_log_likelihood_ph, family=vocab.policy)

    # self.summary_avg_policy_pi_ph = tf_cv1.placeholder(tf.float32, name=vocab.summary_ph + 'policy_pi_ph')
    # tf_cv1.summary.scalar('policy_pi', self.summary_avg_policy_pi_ph, family=vocab.policy)
    #
    # self.summary_avg_policy_mu_ph = tf_cv1.placeholder(tf.float32, name=vocab.summary_ph + 'policy_mu_ph')
    # tf_cv1.summary.scalar('policy_mu', self.summary_avg_policy_mu_ph, family=vocab.policy)

    self.summary_avg_V_value_ph = tf_cv1.placeholder(tf.float32, name=vocab.summary_ph + 'V_values_ph')
    tf_cv1.summary.scalar('V_values', self.summary_avg_V_value_ph, family=vocab.values)

    self.summary_avg_frozen_V_value_ph = tf_cv1.placeholder(tf.float32, name=vocab.summary_ph + 'frozen_V_values_ph')
    tf_cv1.summary.scalar('frozen_V_values', self.summary_avg_frozen_V_value_ph, family=vocab.values)

    self.summary_avg_Q1_value_ph = tf_cv1.placeholder(tf.float32, name=vocab.summary_ph + 'Q1_values_ph')
    tf_cv1.summary.scalar('Q1_values', self.summary_avg_Q1_value_ph, family=vocab.values)

    self.summary_avg_Q2_value_ph = tf_cv1.placeholder(tf.float32, name=vocab.summary_ph + 'Q2_values_ph')
    tf_cv1.summary.scalar('Q2_values', self.summary_avg_Q2_value_ph, family=vocab.values)

    self.summary_epoch_op = tf_cv1.summary.merge_all()

    """ ---- Distribution summary ---- """
    self.summary_hist_policy_pi = tf_cv1.summary.histogram('policy_pi_tensor', self.policy_pi, family=vocab.policy)

    """ ---- By Trajectory summary ---- """
    # self.summary_sto_pi_TRJ_return_ph = tf_cv1.placeholder(tf.float32,
    #                                                        name=vocab.summary_ph + 'summary_stoPi_trj_return_ph')
    # self.summary_sto_pi_TRJ_return_op = tf_cv1.summary.scalar('Trajectory_return_stochastic_pi',
    #                                                           self.summary_sto_pi_TRJ_return_ph, family=vocab.G)
    #
    # self.summary_sto_pi_TRJ_lenght_ph = tf_cv1.placeholder(tf.float32,
    #                                                        name=vocab.summary_ph + 'summary_stoPi_trj_lenght_ph')
    # self.summary_sto_pi_TRJ_lenght_op = tf_cv1.summary.scalar('Trajectory_length_stochastic_pi',
    #                                                           self.summary_sto_pi_TRJ_lenght_ph,
    #                                                           family=vocab.Trajectory_lenght)
    #
    # self.summary_TRJ_op = tf_cv1.summary.merge([self.summary_sto_pi_TRJ_return_op,
    #                                             self.summary_sto_pi_TRJ_lenght_op])
    # endregion

    return None
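# For reference, the targets that the critic/actor training ops built above are expected
# to implement -- a summary of standard SAC with a state-value network (Haarnoja et al.,
# 2018), stated here as an assumption rather than extracted from the called functions:
#
#   V_psi target:      y_V(s_t)      = min_{i=1,2} Q_theta_i(s_t, a~pi) - alpha * log pi(a~pi | s_t)
#   Q_theta target:    y_Q(s_t, a_t) = r_t + gamma * (1 - done_t) * V_psi_frozen(s_{t+1})
#   Policy objective:  J_pi          = E[ alpha * log pi(a~pi | s_t) - min_{i=1,2} Q_theta_i(s_t, a~pi) ]
#   Frozen V update:   psi_frozen <-- tau * psi + (1 - tau) * psi_frozen,
#                      with tau = exp_spec['target_smoothing_coefficient']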
def train(env_name='CartPole-v0', hidden_sizes=[32], lr=1e-2, epochs=50, batch_size=5000, render=False):
    # make environment, check spaces, get obs / act dims
    # env = gym.make(env_name)                                            # ////// Original bloc //////

    REINFORCE_integration_test = {                                        # \\\\\\ My bloc \\\\\\
        'prefered_environment':           env_name,
        'paramameter_set_name':           'REINFORCE integration test on CartPole-v0',
        'batch_size_in_ts':               batch_size,
        'max_epoch':                      epochs,
        'discounted_reward_to_go':        False,
        'discout_factor':                 0.999,
        'learning_rate':                  lr,
        'theta_nn_h_layer_topo':          tuple(hidden_sizes),
        'random_seed':                    42,
        'theta_hidden_layers_activation': tf.nn.tanh,  # tf.nn.relu,
        'theta_output_layers_activation': None,
        'render_env_every_What_epoch':    100,
        'print_metric_every_what_epoch':  5,
        }

    playground = BLOC.GymPlayground(env_name)                             # \\\\\\ My bloc \\\\\\
    env = playground.env                                                  # \\\\\\ My bloc \\\\\\
    exp_spec = BLOC.ExperimentSpec()                                      # \\\\\\ My bloc \\\\\\
    exp_spec.set_experiment_spec(REINFORCE_integration_test)              # \\\\\\ My bloc \\\\\\
    consol_print_learning_stats = ConsolPrintLearningStats(               # \\\\\\ My bloc \\\\\\
        exp_spec, exp_spec.print_metric_every_what_epoch)                 # \\\\\\ My bloc \\\\\\

    assert isinstance(env.observation_space, Box), \
        "This example only works for envs with continuous state spaces."
    assert isinstance(env.action_space, Discrete), \
        "This example only works for envs with discrete action spaces."

    obs_dim = env.observation_space.shape[0]
    n_acts = env.action_space.n

    # make core of policy network
    # obs_ph = tf.placeholder(shape=(None, obs_dim), dtype=tf.float32)    # ////// Original bloc //////
    obs_ph, act_ph, weights_ph = BLOC.gym_playground_to_tensorflow_graph_adapter(playground)  # \\\\\\ My bloc \\\\\\

    # logits = mlp(obs_ph, sizes=hidden_sizes+[n_acts])                   # ////// Original bloc //////
    # logits = BLOC.build_MLP_computation_graph(obs_ph, playground,       # \\\\\\ My bloc \\\\\\
    #                                           hidden_layer_topology=tuple(hidden_sizes))

    # make action selection op (outputs int actions, sampled from policy)
    # actions = tf.squeeze(tf.multinomial(logits=logits, num_samples=1), axis=1)  # ////// Original bloc //////
    # actions, log_p_all = BLOC.policy_theta_discrete_space(logits, playground)   # \\\\\\ My bloc \\\\\\

    # make loss function whose gradient, for the right data, is policy gradient
    # weights_ph = tf.placeholder(shape=(None,), dtype=tf.float32)        # ////// Original bloc //////
    # act_ph = tf.placeholder(shape=(None,), dtype=tf.int32)              # ////// Original bloc //////
    # action_masks = tf.one_hot(act_ph, n_acts)                           # ////// Original bloc //////
    # log_probs = tf.reduce_sum(action_masks * tf.nn.log_softmax(logits), axis=1)  # ////// Original bloc //////
    # loss = -tf.reduce_mean(weights_ph * log_probs)                      # ////// Original bloc //////

    # (!) First silent error, caused by uneven batch size                 # \\\\\\ My bloc \\\\\\
    # loss = BLOC.discrete_pseudo_loss(log_p_all, act_ph, weights_ph, playground)  # \\\\\\ My bloc \\\\\\
    reinforce_policy = REINFORCEbrain.REINFORCE_policy(obs_ph, act_ph, weights_ph,
                                                       exp_spec, playground)       # \\\\\\ My bloc \\\\\\
    (actions, _, loss) = reinforce_policy                                 # \\\\\\ My bloc \\\\\\

    # make train op
    # train_op = tf.train.AdamOptimizer(learning_rate=lr).minimize(loss)  # ////// Original bloc //////
    train_op = BLOC.policy_optimizer(loss, learning_rate=exp_spec.learning_rate)   # \\\\\\ My bloc \\\\\\

    # \\\\\\ My bloc \\\\\\
    date_now = datetime.now()
    run_str = "Run--{}h{}--{}-{}-{}".format(date_now.hour, date_now.minute,
                                            date_now.day, date_now.month, date_now.year)
    # writer = tf_cv1.summary.FileWriter("./graph/{}".format(run_str), tf_cv1.get_default_graph())
    writer = tf_cv1.summary.FileWriter(
        "test_Z_integration/test_integrationREINFORCE/graph/{}".format(run_str),
        tf_cv1.get_default_graph())

    the_TRAJECTORY_COLLECTOR = TrajectoryCollector(exp_spec, playground)            # \\\\\\ My bloc \\\\\\
    the_UNI_BATCH_COLLECTOR = UniformBatchCollector(exp_spec.batch_size_in_ts)      # \\\\\\ My bloc \\\\\\

    # ////// Original bloc //////
    # sess = tf.InteractiveSession()
    # sess.run(tf.global_variables_initializer())

    # \\\\\\ My bloc \\\\\\
    tf_cv1.set_random_seed(exp_spec.random_seed)
    np.random.seed(exp_spec.random_seed)
    with tf_cv1.Session() as sess:
        sess.run(tf_cv1.global_variables_initializer())  # initialize random variables in the computation graph
        consol_print_learning_stats.start_the_crazy_experiment()

        # for training policy
        def train_one_epoch():
            consol_print_learning_stats.next_glorious_epoch()             # \\\\\\ My bloc \\\\\\

            # ////// Original bloc //////
            # # make some empty lists for logging.
            # batch_obs = []          # for observations
            # batch_acts = []         # for actions
            # batch_weights = []      # for reward-to-go weighting in policy gradient
            # batch_rets = []         # for measuring episode returns
            # batch_lens = []         # for measuring episode lengths
            # ep_rews = []            # list for rewards accrued throughout ep

            # reset episode-specific variables
            obs = env.reset()  # first obs comes from starting distribution
            done = False       # signal from environment that episode is over

            # render first episode of each epoch
            finished_rendering_this_epoch = False

            consol_print_learning_stats.next_glorious_trajectory()        # \\\\\\ My bloc \\\\\\

            # collect experience by acting in the environment with current policy
            while True:
                # rendering
                if (not finished_rendering_this_epoch) and render:
                    env.render()

                # save obs
                # batch_obs.append(obs.copy())  # <-- (!) (Critical) append S_t not S_{t+1}  ////// Original bloc //////

                # # act in the environment
                # act = sess.run(actions, {obs_ph: obs.reshape(1, -1)})[0]  # ////// Original bloc //////
                # obs, rew, done, _ = env.step(act)                         # ////// Original bloc //////

                step_observation = BLOC.format_single_step_observation(obs)             # \\\\\\ My bloc \\\\\\
                action_array = sess.run(actions, feed_dict={obs_ph: step_observation})  # \\\\\\ My bloc \\\\\\
                act = blocAndTools.tensorflowbloc.to_scalar(action_array)               # \\\\\\ My bloc \\\\\\

                # obs, rew, done, _ = playground.env.step(act)  # <-- (!) mistake        \\\\\\ My bloc \\\\\\
                # (!) Solution to silent error 2: don't overwrite S_t                    \\\\\\ My bloc \\\\\\
                obs_prime, rew, done, _ = playground.env.step(act)  # <-- (!) Solution   \\\\\\ My bloc \\\\\\

                # ////// Original bloc //////
                # # save action, reward
                # batch_acts.append(act)
                # ep_rews.append(rew)

                # (Critical) Appending the observation S_t that triggered the action A_t is critical.  \\\\\\ My bloc \\\\\\
                # If the observation is the one at time S_{t+1}, the agent won't learn.                \\\\\\ My bloc \\\\\\
                the_TRAJECTORY_COLLECTOR.collect_OAR(obs, act, rew)  # <-- (!) Silent error 2          \\\\\\ My bloc \\\\\\
                obs = obs_prime  # <-- (!) Solution to silent error 2                                  \\\\\\ My bloc \\\\\\

                if done:
                    # ////// Original bloc //////
                    # # if episode is over, record info about episode
                    # ep_ret, ep_len = sum(ep_rews), len(ep_rews)
                    # batch_rets.append(ep_ret)
                    # batch_lens.append(ep_len)

                    trj_return = the_TRAJECTORY_COLLECTOR.trajectory_ended()             # \\\\\\ My bloc \\\\\\
                    the_TRAJECTORY_COLLECTOR.compute_Qvalues_as_rewardToGo()
                    trj_container = the_TRAJECTORY_COLLECTOR.pop_trajectory_and_reset()  # \\\\\\ My bloc \\\\\\
                    the_UNI_BATCH_COLLECTOR.collect(trj_container)                       # \\\\\\ My bloc \\\\\\
                    consol_print_learning_stats.trajectory_training_stat(
                        the_trajectory_return=trj_return, timestep=len(trj_container))   # \\\\\\ My bloc \\\\\\

                    # the weight for each logprob(a_t|s_t) is reward-to-go from t
                    # batch_weights += list(reward_to_go(ep_rews))  # ////// Original bloc //////
                    # batch_weights += BLOC.reward_to_go(ep_rews)   # \\\\\\ My bloc \\\\\\

                    # reset episode-specific variables
                    obs, done, ep_rews = env.reset(), False, []

                    consol_print_learning_stats.next_glorious_trajectory()  # \\\\\\ My bloc \\\\\\

                    # won't render again this epoch
                    finished_rendering_this_epoch = True

                    # ////// Original bloc //////
                    # # end experience loop if we have enough of it
                    # if len(batch_obs) > batch_size:
                    #     break

                    if not the_UNI_BATCH_COLLECTOR.is_not_full():           # \\\\\\ My bloc \\\\\\
                        break                                               # \\\\\\ My bloc \\\\\\

            # ////// Original bloc //////
            # # take a single policy gradient update step
            # batch_loss, _ = sess.run([loss, train_op],
            #                          feed_dict={
            #                              obs_ph: np.array(batch_obs),
            #                              act_ph: np.array(batch_acts),
            #                              weights_ph: np.array(batch_weights)
            #                          })

            batch_container = the_UNI_BATCH_COLLECTOR.pop_batch_and_reset()  # \\\\\\ My bloc \\\\\\
            (batch_rets, batch_lens) = batch_container.get_basic_metric()    # \\\\\\ My bloc \\\\\\
            batch_obs = batch_container.batch_observations                   # \\\\\\ My bloc \\\\\\
            batch_acts = batch_container.batch_actions                       # \\\\\\ My bloc \\\\\\
            batch_weights = batch_container.batch_Qvalues                    # \\\\\\ My bloc \\\\\\

            feed_dictionary = blocAndTools.tensorflowbloc.build_feed_dictionary(
                [obs_ph, act_ph, weights_ph],
                [batch_obs, batch_acts, batch_weights])                      # \\\\\\ My bloc \\\\\\
            batch_loss, _ = sess.run([loss, train_op], feed_dict=feed_dictionary)  # \\\\\\ My bloc \\\\\\

            return batch_loss, batch_rets, batch_lens

        # training loop
        for i in range(epochs):
            batch_loss, batch_rets, batch_lens = train_one_epoch()
            mean_return = np.mean(batch_rets)
            average_len = np.mean(batch_lens)

            # ////// Original bloc //////
            # print('epoch: %3d \t loss: %.3f \t return: %.3f \t ep_len: %.3f' %
            #       (i, batch_loss, mean_return, average_len))

            # \\\\\\ My bloc \\\\\\
            consol_print_learning_stats.epoch_training_stat(
                epoch_loss=batch_loss,
                epoch_average_trjs_return=mean_return,
                epoch_average_trjs_lenght=average_len,
                number_of_trj_collected=0,
                total_timestep_collected=0)

            yield (i, batch_loss, mean_return, average_len)

    print("\n>>> Close session\n")
    writer.close()
    playground.env.close()
    tf_cv1.reset_default_graph()
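# A minimal usage sketch: `train` is a generator that yields per-epoch statistics, so
# iterating it drives the whole experiment. The `__main__` guard and the hyperparameter
# values shown are illustrative assumptions (they match the function defaults), not
# taken from the original script.
if __name__ == '__main__':
    for epoch, batch_loss, mean_return, average_len in train(env_name='CartPole-v0',
                                                             epochs=50, batch_size=5000):
        print('epoch: {:3d} \t loss: {:.3f} \t return: {:.3f} \t ep_len: {:.3f}'.format(
            epoch, batch_loss, mean_return, average_len))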