Example #1
def tensorboard_logger(env):
    # xgboost callback: forward every evaluation metric to tensorboard_logger
    for k, v in env.evaluation_result_list:
        pos = k.index("-")
        key = k[:pos - 2]  # NB: xgb usually names the eval set validation_0; drop the trailing "_0"
        metric = k[pos + 1:]
        tb_log_value("%s/%s" % (key, metric), v, step=env.iteration)
Example #2
    def logger(self, q, finished):
        configure("{}/tb".format(self.args.log_path), flush_secs=30)
        while finished.value < 1:
            try:
                (name, value, step) = q.get(block=False)
                tb_log_value(name, value, step=step)
            except queue.Empty:
                pass  # nothing queued yet; keep polling until `finished` flips to 1
        print("Logging loop closed")
Example #3
def tensorboard_reporting(metrics, tick, phase, tick_type=None):
    """This method will write its results to tensorboard

    :param metrics: A map of metrics to scores
    :param tick: The time (resolution defined by `tick_type`)
    :param phase: The phase of training (`Train`, `Valid`, `Test`)
    :param tick_type: The resolution of tick (`STEP`, `EPOCH`)
    :return:
    """
    # To use this:
    # tensorboard --logdir runs
    # http://localhost:6006
    from tensorboard_logger import configure as tb_configure, log_value as tb_log_value
    global g_tb_run

    if g_tb_run is None:
        g_tb_run = 'runs/%d' % os.getpid()
        print('Creating Tensorboard run %s' % g_tb_run)
        tb_configure(g_tb_run, flush_secs=5)

    for metric, value in metrics.items():
        chart_id = '%s:%s' % (phase, metric)
        tb_log_value(chart_id, value, tick)
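
Usage note: hypothetical call sites, assuming `g_tb_run` is initialized to `None` at module level and `os` is imported; the metric values are made up.

tensorboard_reporting({"avg_loss": 0.42, "acc": 0.913}, 3, "Train", tick_type="EPOCH")
tensorboard_reporting({"acc": 0.887}, 1200, "Valid", tick_type="STEP")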
Example #4
def train(arglist):
    with U.single_threaded_session():
        # Create environment
        env = make_env(arglist.scenario, arglist, arglist.benchmark)
        # Create agent trainers
        obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
        num_adversaries = min(env.n, arglist.num_adversaries)
        trainers = get_trainers(env, num_adversaries, obs_shape_n, arglist)
        print(
            "Using good policy {} and adv policy {}".format(
                arglist.good_policy, arglist.adv_policy
            )
        )

        np.seterr(all="raise")  # define before your code.

        # Initialize
        U.initialize()

        # Load previous results, if necessary
        if arglist.load_dir == "":
            arglist.load_dir = arglist.save_dir
        if arglist.display or arglist.restore or arglist.benchmark:
            print("Loading previous state...")
            U.load_state(arglist.load_dir)

        episode_rewards = [0.0]  # sum of rewards for all agents
        agent_rewards = [[0.0] for _ in range(env.n)]  # individual agent reward
        final_ep_rewards = []  # sum of rewards for training curve
        final_ep_ag_rewards = []  # agent rewards for training curve
        agent_info = [[[]]]  # placeholder for benchmarking info
        saver = tf.train.Saver()
        obs_n = env.reset()
        episode_step = 0
        train_step = 0
        t_start = time.time()

        print("making logger")
        tb_configure("logs/" + str(arglist.exp_name) + "_" + str(datetime.now()))
        print("Starting iterations...")
        while True:
            # get action
            action_n = [agent.action(obs) for agent, obs in zip(trainers, obs_n)]
            # environment step
            new_obs_n, rew_n, done_n, info_n = env.step(action_n)
            episode_step += 1
            done = all(done_n)
            terminal = episode_step >= arglist.max_episode_len
            # collect experience
            for i, agent in enumerate(trainers):
                agent.experience(
                    obs_n[i], action_n[i], rew_n[i], new_obs_n[i], done_n[i], terminal
                )
            obs_n = new_obs_n

            for i, rew in enumerate(rew_n):
                # TODO: divide by env.n? Do we want this to be the average across all agents?
                episode_rewards[-1] += rew
                agent_rewards[i][-1] += rew

            if done or terminal:
                obs_n = env.reset()
                episode_step = 0
                episode_rewards.append(0)
                for a in agent_rewards:
                    a.append(0)
                agent_info.append([[]])

            # increment global step counter
            train_step += 1

            # for benchmarking learned policies
            if arglist.benchmark:
                for i, info in enumerate(info_n):
                    agent_info[-1][i].append(info_n["n"])
                if train_step > arglist.benchmark_iters and (done or terminal):
                    file_name = arglist.benchmark_dir + arglist.exp_name + ".pkl"
                    print("Finished benchmarking, now saving...")
                    with open(file_name, "wb") as fp:
                        pickle.dump(agent_info[:-1], fp)
                    break
                continue

            # for displaying learned policies
            if arglist.display:
                time.sleep(0.1)
                env.render()
                continue

            # update all trainers, if not in display or benchmark mode
            loss = None
            for agent in trainers:
                agent.preupdate()
            for agent in trainers:
                loss = agent.update(trainers, train_step)
            # log metrics

            tb_log_value("episode_reward", episode_rewards[train_step - 1], train_step)
            tb_log_value(
                "first_agent_reward", agent_rewards[0][train_step - 1], train_step
            )
            tb_log_value(
                "second_agent_reward", agent_rewards[1][train_step], train_step
            )
            if loss is not None:
                loss_to_log = loss
            else:
                loss_to_log = -100
                tb_log_value("loss", loss_to_log, train_step)

            # save model, display training output
            if terminal and (len(episode_rewards) % arglist.save_rate == 0):
                print("made it into if terminal and len(episde)")
                U.save_state(arglist.save_dir, saver=saver)
                # print statement depends on whether or not there are adversaries
                if num_adversaries == 0:
                    print(
                        "steps: {}, episodes: {}, mean episode reward: {}, time: {}".format(
                            train_step,
                            len(episode_rewards),
                            np.mean(episode_rewards[-arglist.save_rate :]),
                            round(time.time() - t_start, 3),
                        )
                    )
                else:
                    print(
                        "steps: {}, episodes: {}, mean episode reward: {}, agent episode reward: {}, time: {}".format(
                            train_step,
                            len(episode_rewards),
                            np.mean(episode_rewards[-arglist.save_rate :]),
                            [
                                np.mean(rew[-arglist.save_rate :])
                                for rew in agent_rewards
                            ],
                            round(time.time() - t_start, 3),
                        )
                    )

                t_start = time.time()
                # Keep track of final episode reward
                final_ep_rewards.append(np.mean(episode_rewards[-arglist.save_rate :]))
                for rew in agent_rewards:
                    final_ep_ag_rewards.append(np.mean(rew[-arglist.save_rate :]))

            # saves final episode reward for plotting training curve later
            if len(episode_rewards) > arglist.num_episodes:
                rew_file_name = arglist.plots_dir + arglist.exp_name + "_rewards.pkl"
                with open(rew_file_name, "wb") as fp:
                    pickle.dump(final_ep_rewards, fp)
                agrew_file_name = (
                    arglist.plots_dir + arglist.exp_name + "_agrewards.pkl"
                )
                with open(agrew_file_name, "wb") as fp:
                    pickle.dump(final_ep_ag_rewards, fp)
                print("...Finished total of {} episodes.".format(len(episode_rewards)))
                break
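
The logging pattern used above, in isolation: configure one uniquely named run per launch, then write scalars keyed by the global train step (a minimal sketch; the run name and values are placeholders).

from datetime import datetime
from tensorboard_logger import configure as tb_configure, log_value as tb_log_value

tb_configure("logs/maddpg_demo_" + str(datetime.now()))   # one run directory per launch
for train_step in range(1, 101):
    episode_reward = 0.1 * train_step                     # placeholder scalar
    tb_log_value("episode_reward", episode_reward, train_step)
# view with: tensorboard --logdir logs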
Example #5
    def learn(self, total_timesteps, callback=None,
              log_interval=4, tb_log_name="SAC",
              reset_num_timesteps=True, replay_wrapper=None,
              own_log_dir=None,
              planning_steps=0):

        new_tb_log = self._init_num_timesteps(reset_num_timesteps)
        callback = self._init_callback(callback)

        tb_configure(own_log_dir)

        steps_in_real_env = 0

        if replay_wrapper is not None:
            self.replay_buffer = replay_wrapper(self.replay_buffer)

        with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
                as writer:

            self._setup_learn()

            # Transform to callable if needed
            self.learning_rate = get_schedule_fn(self.learning_rate)
            # Initial learning rate
            current_lr = self.learning_rate(1)

            start_time = time.time()
            episode_rewards = [0.0]
            episode_successes = []
            if self.action_noise is not None:
                self.action_noise.reset()
            obs = self.env.reset()
            # Retrieve unnormalized observation for saving into the buffer
            if self._vec_normalize_env is not None:
                obs_ = self._vec_normalize_env.get_original_obs().squeeze()

            n_updates = 0
            infos_values = []

            callback.on_training_start(locals(), globals())
            callback.on_rollout_start()

            for step in range(total_timesteps):
                # Before training starts, randomly sample actions
                # from a uniform distribution for better exploration.
                # Afterwards, use the learned policy
                # if random_exploration is set to 0 (normal setting)
                if self.num_timesteps < self.learning_starts or np.random.rand() < self.random_exploration:
                    # actions sampled from action space are from range specific to the environment
                    # but algorithm operates on tanh-squashed actions therefore simple scaling is used
                    unscaled_action = self.env.action_space.sample()
                    action = scale_action(self.action_space, unscaled_action)
                else:
                    action = self.policy_tf.step(obs[None], deterministic=False).flatten()
                    # Add noise to the action (improve exploration,
                    # not needed in general)
                    if self.action_noise is not None:
                        action = np.clip(action + self.action_noise(), -1, 1)
                    # inferred actions need to be transformed to environment action_space before stepping
                    unscaled_action = unscale_action(self.action_space, action)

                assert action.shape == self.env.action_space.shape

                # if not planning: 
                #     new_obs, reward, done, info = self.env.step(unscaled_action)
                # else: 

                if not self.num_timesteps % (planning_steps + 1):
                    new_obs, reward, done, info = self.env.step(unscaled_action)  # , step_num = self.num_timesteps)
                    steps_in_real_env += 1
                else:
                    print("planning step")
                    new_obs, reward, done, info = self.non_vec_env.planning_step(unscaled_action)
                
                # Only stop training if return value is False, not when it is None. This is for backwards
                # compatibility with callbacks that have no return statement.
                callback.update_locals(locals())
                if callback.on_step() is False:
                    break

                # Store only the unnormalized version
                if self._vec_normalize_env is not None:
                    new_obs_ = self._vec_normalize_env.get_original_obs().squeeze()
                    reward_ = self._vec_normalize_env.get_original_reward().squeeze()
                else:
                    # Avoid changing the original ones
                    obs_, new_obs_, reward_ = obs, new_obs, reward

                if not self.num_timesteps % (planning_steps + 1):
                    print("writing real world step to TB")
                    tb_log_value("reward_in_environment", reward_, steps_in_real_env)

                tb_log_value("reward_planning", reward_, self.num_timesteps)
                self.num_timesteps += 1

                # Store transition in the replay buffer.
                self.replay_buffer_add(obs_, action, reward_, new_obs_, done, info)
                obs = new_obs
                # Save the unnormalized observation
                if self._vec_normalize_env is not None:
                    obs_ = new_obs_

                # Retrieve reward and episode length if using Monitor wrapper
                maybe_ep_info = info.get('episode')
                if maybe_ep_info is not None:
                    self.ep_info_buf.extend([maybe_ep_info])

                if writer is not None:
                    # Write reward per episode to tensorboard
                    ep_reward = np.array([reward_]).reshape((1, -1))
                    ep_done = np.array([done]).reshape((1, -1))
                    tf_util.total_episode_reward_logger(self.episode_reward, ep_reward,
                                                        ep_done, writer, self.num_timesteps)

                if self.num_timesteps % self.train_freq == 0:
                    callback.on_rollout_end()

                    mb_infos_vals = []
                    # Update policy, critics and target networks
                    for grad_step in range(self.gradient_steps):
                        # Break if the warmup phase is not over
                        # or if there are not enough samples in the replay buffer
                        if not self.replay_buffer.can_sample(self.batch_size) \
                           or self.num_timesteps < self.learning_starts:
                            break
                        n_updates += 1
                        # Compute current learning_rate
                        frac = 1.0 - step / total_timesteps
                        current_lr = self.learning_rate(frac)
                        # Update policy and critics (q functions)
                        mb_infos_vals.append(self._train_step(step, writer, current_lr))
                        # Update target network
                        if (step + grad_step) % self.target_update_interval == 0:
                            # Update target network
                            self.sess.run(self.target_update_op)
                    # Log losses and entropy, useful for monitor training
                    if len(mb_infos_vals) > 0:
                        infos_values = np.mean(mb_infos_vals, axis=0)

                    callback.on_rollout_start()

                episode_rewards[-1] += reward_
                if done:
                    if self.action_noise is not None:
                        self.action_noise.reset()
                    if not isinstance(self.env, VecEnv):
                        obs = self.env.reset()
                    episode_rewards.append(0.0)

                    maybe_is_success = info.get('is_success')
                    if maybe_is_success is not None:
                        episode_successes.append(float(maybe_is_success))

                if len(episode_rewards[-101:-1]) == 0:
                    mean_reward = -np.inf
                else:
                    mean_reward = round(float(np.mean(episode_rewards[-101:-1])), 1)

                # subtract 1 as we appended a new term just now
                num_episodes = len(episode_rewards) - 1 
                # Display training infos
                if self.verbose >= 1 and done and log_interval is not None and num_episodes % log_interval == 0:
                    fps = int(step / (time.time() - start_time))
                    logger.logkv("episodes", num_episodes)
                    logger.logkv("mean 100 episode reward", mean_reward)
                    if len(self.ep_info_buf) > 0 and len(self.ep_info_buf[0]) > 0:
                        logger.logkv('ep_rewmean', safe_mean([ep_info['r'] for ep_info in self.ep_info_buf]))
                        logger.logkv('eplenmean', safe_mean([ep_info['l'] for ep_info in self.ep_info_buf]))
                    logger.logkv("n_updates", n_updates)
                    logger.logkv("current_lr", current_lr)
                    logger.logkv("fps", fps)
                    logger.logkv('time_elapsed', int(time.time() - start_time))
                    if len(episode_successes) > 0:
                        logger.logkv("success rate", np.mean(episode_successes[-100:]))
                    if len(infos_values) > 0:
                        for (name, val) in zip(self.infos_names, infos_values):
                            logger.logkv(name, val)
                    logger.logkv("total timesteps", self.num_timesteps)
                    logger.dumpkvs()
                    # Reset infos:
                    infos_values = []
            callback.on_training_end()
            return self #, ep_reward #, reward_
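
The real-step/planning-step cadence selected by `not self.num_timesteps % (planning_steps + 1)`, shown in isolation: with `planning_steps = 3`, timesteps 0, 4, 8, ... hit the real environment and every other timestep goes to `planning_step` (a standalone sketch, not the class itself).

planning_steps = 3
for num_timesteps in range(10):
    if not num_timesteps % (planning_steps + 1):
        print(num_timesteps, "real environment step")
    else:
        print(num_timesteps, "planning step")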
Example #6
    def learn(self,
              total_timesteps,
              callback=None,
              log_interval=4,
              tb_log_name="SAC",
              reset_num_timesteps=True,
              replay_wrapper=None,
              planning_steps=0):

        new_tb_log = self._init_num_timesteps(reset_num_timesteps)
        callback = self._init_callback(callback)

        # TODO: use builtin log writer instead of this old lib
        tb_configure(self.tensorboard_log)

        action_log_csv = self.tensorboard_log + "_actions.csv"

        action_log_df = pd.DataFrame(columns=np.concatenate((
            ["iteration"],
            ["p" + str(i) for i in range(24)],
            ["b" + str(i) for i in range(24)],
            ["e" + str(i) for i in range(24)],
        )))

        action_log_index = 0

        steps_in_real_env = 0
        person_data_dict = {}

        if replay_wrapper is not None:
            self.replay_buffer = replay_wrapper(self.replay_buffer)

        with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
                as writer:

            self._setup_learn()

            # Transform to callable if needed
            self.learning_rate = get_schedule_fn(self.learning_rate)
            # Initial learning rate
            current_lr = self.learning_rate(1)

            start_time = time.time()
            episode_rewards = [0.0]
            episode_successes = []
            if self.action_noise is not None:
                self.action_noise.reset()
            obs = self.env.reset()
            # Retrieve unnormalized observation for saving into the buffer
            if self._vec_normalize_env is not None:
                obs_ = self._vec_normalize_env.get_original_obs().squeeze()

            n_updates = 0
            infos_values = []

            callback.on_training_start(locals(), globals())
            callback.on_rollout_start()

            for step in range(total_timesteps):
                # Before training starts, randomly sample actions
                # from a uniform distribution for better exploration.
                # Afterwards, use the learned policy
                # if random_exploration is set to 0 (normal setting)
                if self.num_timesteps < self.learning_starts or np.random.rand() < self.random_exploration:
                    # actions sampled from action space are from range specific to the environment
                    # but algorithm operates on tanh-squashed actions therefore simple scaling is used
                    unscaled_action = self.env.action_space.sample()
                    action = scale_action(self.action_space, unscaled_action)
                else:
                    action = self.policy_tf.step(obs[None], deterministic=False).flatten()
                    # Add noise to the action (improve exploration,
                    # not needed in general)
                    if self.action_noise is not None:
                        action = np.clip(action + self.action_noise(), -1, 1)
                    # inferred actions need to be transformed to environment action_space before stepping
                    unscaled_action = unscale_action(self.action_space, action)

                assert action.shape == self.env.action_space.shape

                # if not planning:
                #     new_obs, reward, done, info = self.env.step(unscaled_action)
                # else:

                if not self.num_timesteps % (planning_steps + 1):

                    ## TODO: work on this?

                    # if self.num_timesteps ==1:
                    #      # form the control
                    #     from sklearn.preprocessing import MinMaxScaler
                    #     grid_price = self.non_vec_env.prices[self.non_vec_env.day - 1]
                    #     scaler = MinMaxScaler(feature_range = (0, 10))
                    #     scaled_grid_price = scaler.fit_transform(np.array(grid_price).reshape(-1, 1))
                    #     scaled_grid_price = np.squeeze(scaled_grid_price)
                    #     energy_consumptions = self.non_vec_env._simulate_humans(scaled_grid_price)
                    #     person_data_dict["control"] = {
                    #         "x" : list(range(8, 18)),
                    #         "grid_price" : scaled_grid_price,
                    #         "energy_consumption" : energy_consumptions["avg"],
                    #         "reward" : self.non_vec_env._get_reward(price = grid_price, energy_consumptions = energy_consumptions),
                    #     }

                    # # form the data_dict
                    # if self.num_timesteps in [100, 1000, 9500]:
                    #     person_data_dict["Step " + str(self.num_timesteps)] = {
                    #         "x" : list(range(8, 18)),
                    #         "grid_price" : self.non_vec_env.prices[self.non_vec_env.day - 1],
                    #         "action" : unscaled_action,
                    #         "energy_consumption" : self.non_vec_env.prev_energy,
                    #         "reward" : reward,
                    #     }

                    # if self.num_timesteps == 9501 and self.people_reaction_log_dir and self.plotter_person_reaction:
                    #     # call the plotting statement
                    #     self.plotter_person_reaction(person_data_dict, self.people_reaction_log_dir)

                    new_obs, reward, done, info = self.env.step(unscaled_action)  # , step_num = self.num_timesteps)
                    steps_in_real_env += 1

                else:
                    print("planning step")
                    new_obs, reward, done, info = self.non_vec_env.planning_step(unscaled_action)

                # write the action to a csv

                # if ((not self.num_timesteps % 10) & (self.num_timesteps > 10000)) or self.num_timesteps>19500:

                #     ### get the battery charging
                #     battery_op = {}
                #     total_battery_consumption = np.zeros(24)
                #     total_energy_consumption = np.zeros(24)

                #     for prosumer_name in self.non_vec_env.prosumer_dict:
                #         #Get players response to agent's actions
                #         day = self.non_vec_env.day
                #         price = self.non_vec_env.price
                #         prosumer = self.non_vec_env.prosumer_dict[prosumer_name]
                #         prosumer_battery = prosumer.get_battery_operation(day, price)
                #         prosumer_demand = prosumer.get_response(day, price)

                #         total_battery_consumption += prosumer_battery
                #         total_energy_consumption += prosumer_demand

                #     action_log_df.loc[action_log_index] = np.concatenate(
                #         ([self.num_timesteps],
                #             price,
                #             total_battery_consumption,
                #             total_energy_consumption,))
                #     action_log_index += 1
                #     action_log_df.to_csv(action_log_csv)
                #     print("Iteration: " + str(self.num_timesteps))

                # Only stop training if return value is False, not when it is None. This is for backwards
                # compatibility with callbacks that have no return statement.
                callback.update_locals(locals())
                if callback.on_step() is False:
                    break

                # Store only the unnormalized version
                if self._vec_normalize_env is not None:
                    new_obs_ = self._vec_normalize_env.get_original_obs().squeeze()
                    reward_ = self._vec_normalize_env.get_original_reward().squeeze()
                else:
                    # Avoid changing the original ones
                    obs_, new_obs_, reward_ = obs, new_obs, reward

                if not self.num_timesteps % (planning_steps + 1):
                    tb_log_value("reward_in_environment", reward_,
                                 steps_in_real_env)

                # tb_log_value("reward_planning", reward_, self.num_timesteps)
                self.num_timesteps += 1

                # Store transition in the replay buffer.
                self.replay_buffer_add(obs_, action, reward_, new_obs_, done,
                                       info)
                obs = new_obs
                # Save the unnormalized observation
                if self._vec_normalize_env is not None:
                    obs_ = new_obs_

                # Retrieve reward and episode length if using Monitor wrapper
                maybe_ep_info = info.get('episode')
                if maybe_ep_info is not None:
                    self.ep_info_buf.extend([maybe_ep_info])

                if writer is not None:
                    # Write reward per episode to tensorboard
                    ep_reward = np.array([reward_]).reshape((1, -1))
                    ep_done = np.array([done]).reshape((1, -1))
                    tf_util.total_episode_reward_logger(
                        self.episode_reward, ep_reward, ep_done, writer,
                        self.num_timesteps)

                    if self.num_timesteps % 100 == 0 and not np.any(
                            unscaled_action == np.inf):
                        if self.action_to_prices_fn:
                            prices = self.action_to_prices_fn(unscaled_action)
                            # tf_util.log_histogram(writer, "action_vec_hist", unscaled_action, self.num_timesteps, bins=10, flush=False)
                            # tb_log_value("constant_load_price", np.sum(prices), self.num_timesteps)
                            # tf_util.log_vec_as_histogram(writer, "prices", prices, self.num_timesteps, flush=True)

                if self.num_timesteps % self.train_freq == 0:
                    callback.on_rollout_end()

                    mb_infos_vals = []
                    # Update policy, critics and target networks
                    for grad_step in range(self.gradient_steps):
                        # Break if the warmup phase is not over
                        # or if there are not enough samples in the replay buffer
                        if not self.replay_buffer.can_sample(self.batch_size) \
                           or self.num_timesteps < self.learning_starts:
                            break
                        n_updates += 1
                        # Compute current learning_rate
                        frac = 1.0 - step / total_timesteps
                        current_lr = self.learning_rate(frac)
                        # Update policy and critics (q functions)
                        mb_infos_vals.append(self._train_step(step, writer, current_lr))
                        # Update target network
                        if (step + grad_step) % self.target_update_interval == 0:
                            self.sess.run(self.target_update_op)
                    # Log losses and entropy, useful for monitor training
                    if len(mb_infos_vals) > 0:
                        infos_values = np.mean(mb_infos_vals, axis=0)

                    callback.on_rollout_start()

                episode_rewards[-1] += reward_
                if done:
                    if self.action_noise is not None:
                        self.action_noise.reset()
                    if not isinstance(self.env, VecEnv):
                        obs = self.env.reset()
                    episode_rewards.append(0.0)

                    maybe_is_success = info.get('is_success')
                    if maybe_is_success is not None:
                        episode_successes.append(float(maybe_is_success))

                if len(episode_rewards[-101:-1]) == 0:
                    mean_reward = -np.inf
                else:
                    mean_reward = round(float(np.mean(episode_rewards[-101:-1])), 1)

                # subtract 1 as we appended a new term just now
                num_episodes = len(episode_rewards) - 1
                # Display training infos
                if self.verbose >= 1 and done and log_interval is not None and num_episodes % log_interval == 0:
                    fps = int(step / (time.time() - start_time))
                    logger.logkv("episodes", num_episodes)
                    logger.logkv("mean 100 episode reward", mean_reward)
                    if len(self.ep_info_buf) > 0 and len(self.ep_info_buf[0]) > 0:
                        logger.logkv('ep_rewmean', safe_mean([ep_info['r'] for ep_info in self.ep_info_buf]))
                        logger.logkv('eplenmean', safe_mean([ep_info['l'] for ep_info in self.ep_info_buf]))
                    logger.logkv("n_updates", n_updates)
                    logger.logkv("current_lr", current_lr)
                    logger.logkv("fps", fps)
                    logger.logkv('time_elapsed', int(time.time() - start_time))
                    if len(episode_successes) > 0:
                        logger.logkv("success rate",
                                     np.mean(episode_successes[-100:]))
                    if len(infos_values) > 0:
                        for (name, val) in zip(self.infos_names, infos_values):
                            logger.logkv(name, val)
                    logger.logkv("total timesteps", self.num_timesteps)
                    logger.dumpkvs()
                    # Reset infos:
                    infos_values = []
            callback.on_training_end()
            return self  #, ep_reward #, reward_
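
A hedged sketch of how such a customized `learn()` might be invoked, assuming it lives on a subclass of stable-baselines' SAC; `PlanningSAC` is a placeholder name, and the subclass would also need to expose the `non_vec_env` used for `planning_step`.

import gym
from stable_baselines.common.vec_env import DummyVecEnv

env = DummyVecEnv([lambda: gym.make("Pendulum-v0")])      # placeholder environment
model = PlanningSAC("MlpPolicy", env, verbose=1,
                    tensorboard_log="./tb/planning_run")  # also read by tb_configure above
model.learn(total_timesteps=20000, log_interval=4, planning_steps=3)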