Example #1
    def mpc_step(self, env_model, obs, args, desired_goal):
        mpc_sample = 10
        mpc_step = 5

        pure_obs_batch = np.array(
            [obs['observation'] for _ in range(mpc_sample)])
        desired_goal_batch = np.array(
            [obs['desired_goal'] for _ in range(mpc_sample)])
        selected = False
        original_acts = None
        # roll out mpc_sample candidate trajectories for mpc_step steps through
        # the learned dynamics model, keeping the first action of each rollout
        for x in range(mpc_step):
            pi_input = np.concatenate([pure_obs_batch, desired_goal_batch],
                                      axis=1)
            actions = self.step_batch(pi_input, explore=True,
                                      batch_size=mpc_sample)
            if not selected:
                original_acts = actions.copy()
                selected = True
            pure_obs_batch = step_fake_batch(
                env_model=env_model,
                obs=pure_obs_batch,
                action=actions,
                dims=args.env_param['step_fake_param'],
                distance_threshold=args.distance_threshold,
                args=args,
                batch=mpc_sample)
        # return the first action of the rollout whose final achieved goal is
        # closest to the desired goal
        min_id = -1
        min_dis = float('inf')
        for x in range(mpc_sample):
            achieved = pure_obs_batch[x][
                args.env_param['start_in_obs']:args.env_param['end_in_obs']]
            dis = goal_distance(desired_goal, achieved)
            if dis < min_dis:
                min_dis = dis
                min_id = x
        return original_acts[min_id]
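
Every example in this listing relies on a goal_distance helper that is defined elsewhere in the repository. A minimal sketch of what it is assumed to compute, namely the L2 (Euclidean) distance used by the Gym robotics goal-based environments:

import numpy as np

def goal_distance(goal_a, goal_b):
    # Assumed: plain Euclidean distance between two goal vectors, as in the
    # Gym robotics environments; the real helper lives elsewhere in the repo.
    goal_a, goal_b = np.asarray(goal_a), np.asarray(goal_b)
    assert goal_a.shape == goal_b.shape
    return np.linalg.norm(goal_a - goal_b, axis=-1)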
Example #2
    def learn(self, args, env, env_test, agent, buffer):
        initial_goals = []
        desired_goals = []
        for i in range(args.episodes):
            obs = self.env_List[i].reset()
            goal_a = obs['achieved_goal'].copy()
            goal_d = obs['desired_goal'].copy()
            initial_goals.append(goal_a.copy())
            desired_goals.append(goal_d.copy())

        self.sampler.update(initial_goals, desired_goals)

        achieved_trajectories = []
        achieved_init_states = []
        for i in range(args.episodes):
            obs = self.env_List[i].get_obs()
            init_state = obs['observation'].copy()
            explore_goal = self.sampler.sample(i)
            self.env_List[i].goal = explore_goal.copy()
            obs = self.env_List[i].get_obs()
            current = Trajectory(obs)
            trajectory = [obs['achieved_goal'].copy()]
            for timestep in range(args.timesteps):
                action = agent.step(obs, explore=True)
                obs, reward, done, info = self.env_List[i].step(action)
                trajectory.append(obs['achieved_goal'].copy())
                if timestep == args.timesteps - 1: done = True
                current.store_step(action, obs, reward, done)
                if done: break
            achieved_trajectories.append(np.array(trajectory))
            achieved_init_states.append(init_state)
            buffer.store_trajectory(current)
            agent.normalizer_update(buffer.sample_batch())

            if buffer.steps_counter >= args.warmup:
                for _ in range(args.train_batches):
                    info = agent.train(buffer.sample_batch())
                    args.logger.add_dict(info)
                agent.target_update()

        selection_trajectory_idx = {}
        for i in range(self.args.episodes):
            if goal_distance(achieved_trajectories[i][0],
                             achieved_trajectories[i][-1]) > 0.01:
                selection_trajectory_idx[i] = True
        for idx in selection_trajectory_idx.keys():
            self.achieved_trajectory_pool.insert(
                achieved_trajectories[idx].copy(),
                achieved_init_states[idx].copy())
Example #3
    def __init__(self, args, achieved_trajectory_pool):
        self.args = args
        self.env = make_env(args)
        self.env_test = make_env(args)
        self.dim = np.prod(self.env.reset()['achieved_goal'].shape)
        self.delta = self.env.distance_threshold

        self.length = args.episodes
        init_goal = self.env.reset()['achieved_goal'].copy()
        self.pool = np.tile(init_goal[np.newaxis, :],
                            [self.length, 1]) + np.random.normal(
                                0, self.delta, size=(self.length, self.dim))
        self.init_state = self.env.reset()['observation'].copy()

        self.match_lib = gcc_load_lib('learner/cost_flow.c')
        self.achieved_trajectory_pool = achieved_trajectory_pool

        # estimating diameter
        self.max_dis = 0
        for i in range(1000):
            obs = self.env.reset()
            dis = goal_distance(obs['achieved_goal'], obs['desired_goal'])
            if dis > self.max_dis: self.max_dis = dis
Example #4
    def compute_reward_direct(self, achieved, goal):
        dis = goal_distance(achieved, goal)
        return -1.0 if dis > self.distance_threshold else 0.0
Example #5
    def compute_reward(self, observation_current, observation_old, goal):
        dis = goal_distance(observation_current['achieved_goal'], goal)
        return -1.0 if dis > self.distance_threshold else 0.0
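
Both reward functions implement the usual sparse, goal-conditioned convention: 0.0 when the achieved goal lies within distance_threshold of the target, -1.0 otherwise. A short standalone check with hypothetical values (threshold and positions are chosen purely for illustration):

import numpy as np

distance_threshold = 0.05                 # hypothetical threshold
achieved = np.array([0.10, 0.00, 0.00])   # hypothetical achieved goal
goal = np.array([0.12, 0.00, 0.00])       # hypothetical desired goal

dis = np.linalg.norm(achieved - goal)               # 0.02
reward = -1.0 if dis > distance_threshold else 0.0
print(reward)                                       # 0.0 -> goal counted as reached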
Example #6
    def learn(self, args, env, env_test, agent, buffer, write_goals=0):
        # Actual learning cycle takes place here!
        initial_goals = []
        desired_goals = []
        goal_list = []

        # get initial position and goal from the environment for each episode
        for i in range(args.episodes):
            obs = self.env_List[i].reset()
            goal_a = obs['achieved_goal'].copy()
            goal_d = obs['desired_goal'].copy()
            initial_goals.append(goal_a.copy())
            desired_goals.append(goal_d.copy())

        # if HGG has not been stopped yet, perform the crucial HGG update step here:
        # updating the sampler provides a set of intermediate goals, chosen by bipartite
        # matching based on the distance to the target goal distribution, the similarity
        # of initial states and the expected reward (see paper)
        if not self.stop:
            self.sampler.update(initial_goals, desired_goals)
        else:
            buffer.stop_trade_off = True

        achieved_trajectories = []
        achieved_init_states = []

        explore_goals = []
        test_goals = []
        inside = []
        left_dis_total = 0
        for i in range(args.episodes):
            obs = self.env_List[i].get_obs()
            init_state = obs['observation'].copy()

            # if HGG has not been stopped yet, sample from the goals provided by the update step
            # if it has been stopped, the goal to explore is simply the one generated by the environment
            if not self.stop:
                explore_goal = self.sampler.sample(i)
            else:
                explore_goal = desired_goals[i]

            left_dis_total += self.sampler.get_graph_goal_distance(
                explore_goal, desired_goals[i])

            # store goals in explore_goals list to check whether goals are within goal space later
            explore_goals.append(explore_goal)
            test_goal = self.env.generate_goal()
            if test_goal.shape[-1] == 7:
                test_goal = test_goal[3:]  # for some hand tasks
            test_goals.append(test_goal)

            # Perform HER training by interacting with the environment
            self.env_List[i].goal = explore_goal.copy()
            if write_goals != 0 and len(goal_list) < write_goals:
                goal_list.append(explore_goal.copy())
            obs = self.env_List[i].get_obs()
            current = Trajectory(obs)
            trajectory = [obs['achieved_goal'].copy()]
            for timestep in range(args.timesteps):
                # get action from the ddpg policy
                action = agent.step(obs, explore=True)
                # feed action to environment, get observation and reward
                obs, reward, done, info = self.env_List[i].step(action)
                trajectory.append(obs['achieved_goal'].copy())
                if timestep == args.timesteps - 1: done = True
                current.store_step(action, obs, reward, done)
                if done: break
            achieved_trajectories.append(np.array(trajectory))
            achieved_init_states.append(init_state)
            # Trajectory is stored in replay buffer, replay buffer can be normal or EBP
            buffer.store_trajectory(current)
            agent.normalizer_update(buffer.sample_batch())

            if buffer.steps_counter >= args.warmup:
                for _ in range(args.train_batches):
                    # train with Hindsight Goals (HER step)
                    info = agent.train(buffer.sample_batch())
                    args.logger.add_dict(info)
                # update target network
                agent.target_update()
        if left_dis_total == 0:
            buffer.dis_balance = 1000  # maximum
        else:
            # exponential decay in the mean remaining graph distance
            buffer.dis_balance = args.balance_eta * np.exp(
                (-left_dis_total / args.episodes) /
                (args.balance_sigma * args.balance_sigma))

        selection_trajectory_idx = {}
        for i in range(self.args.episodes):
            # only add trajectories with movement to the trajectory pool --> use default (L2) distance measure!
            if goal_distance(achieved_trajectories[i][0],
                             achieved_trajectories[i][-1]) > 0.01:
                selection_trajectory_idx[i] = True
        for idx in selection_trajectory_idx.keys():
            self.achieved_trajectory_pool.insert(
                achieved_trajectories[idx].copy(),
                achieved_init_states[idx].copy())

        # unless in the first call: check which of the explore goals are inside the target goal
        # space. The target goal space is represented by a sample of test_goals generated directly
        # by the environment; an explore goal is considered inside the target goal space if it is
        # closer than the distance_threshold to one of the test goals (i.e. it would yield a
        # non-negative reward if that test goal were to be achieved)
        if self.learn_calls > 0:
            assert len(explore_goals) == len(test_goals)
            for ex in explore_goals:
                is_inside = 0
                for te in test_goals:
                    # TODO: check: originally with self.sampler.get_graph_goal_distance, now trying with goal_distance (L2)
                    if goal_distance(
                            ex, te) <= self.env.env.env.distance_threshold:
                        is_inside = 1
                inside.append(is_inside)
            assert len(inside) == len(test_goals)
            inside_sum = sum(inside)

            # If more than stop_hgg_threshold (e.g. 0.9) of the explore goals are inside the target goal space, stop HGG
            # and continue with normal HER.
            # By default, stop_hgg_threshold is disabled (set to a value > 1)
            average_inside = inside_sum / len(inside)
            self.args.logger.info("Average inside: {}".format(average_inside))
            if average_inside > self.stop_hgg_threshold:
                self.stop = True
                self.args.logger.info("Continue with normal HER")

        self.learn_calls += 1

        return goal_list if len(goal_list) > 0 else None
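
The dis_balance term above decays exponentially with the mean remaining graph distance of the sampled goals, so that goals close to the target distribution push it towards balance_eta. A small numeric illustration with hypothetical hyperparameter values:

import numpy as np

# hypothetical values for illustration only
balance_eta, balance_sigma = 1.0, 0.5
episodes = 50
left_dis_total = 10.0   # sum of remaining graph distances over all episodes

dis_balance = balance_eta * np.exp(
    (-left_dis_total / episodes) / (balance_sigma * balance_sigma))
print(dis_balance)      # ~0.449; smaller remaining distances push this towards balance_eta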
Example #7
    def update(self, initial_goals, desired_goals):
        if self.achieved_trajectory_pool.counter == 0:
            self.pool = copy.deepcopy(desired_goals)
            return

        achieved_pool, achieved_pool_init_state = \
            self.achieved_trajectory_pool.pad()
        candidate_goals = []
        candidate_edges = []
        candidate_id = []

        agent = self.args.agent
        achieved_value = []
        for i in range(len(achieved_pool)):
            obs = [
                goal_concat(achieved_pool_init_state[i], achieved_pool[i][j])
                for j in range(achieved_pool[i].shape[0])
            ]
            feed_dict = {agent.raw_obs_ph: obs}
            value = agent.sess.run(agent.q_pi, feed_dict)[:, 0]
            value = np.clip(value, -1.0 / (1.0 - self.args.gamma), 0)
            achieved_value.append(value.copy())

        n = 0
        graph_id = {'achieved': [], 'desired': []}
        for i in range(len(achieved_pool)):
            n += 1
            graph_id['achieved'].append(n)
        for i in range(len(desired_goals)):
            n += 1
            graph_id['desired'].append(n)
        n += 1
        self.match_lib.clear(n)

        for i in range(len(achieved_pool)):
            self.match_lib.add(0, graph_id['achieved'][i], 1, 0)
        for i in range(len(achieved_pool)):
            for j in range(len(desired_goals)):

                # use graph_goal_distance here!
                if self.args.graph:
                    size = achieved_pool[i].shape[0]
                    res_1 = np.zeros(size)
                    for k in range(size):
                        res_1[k] = self.get_graph_goal_distance(
                            achieved_pool[i][k], desired_goals[j])
                    res = res_1 - achieved_value[i] / (self.args.hgg_L /
                                                       self.max_dis /
                                                       (1 - self.args.gamma))
                elif self.args.route and self.args.env == 'FetchPickObstacle-v1':
                    size = achieved_pool[i].shape[0]
                    res_1 = np.zeros(size)
                    for k in range(size):
                        res_1[k] = self.get_route_goal_distance(
                            achieved_pool[i][k], desired_goals[j])
                    res = res_1 - achieved_value[i] / (self.args.hgg_L /
                                                       self.max_dis /
                                                       (1 - self.args.gamma))
                else:
                    res = np.sqrt(
                        np.sum(np.square(achieved_pool[i] - desired_goals[j]),
                               axis=1)) - achieved_value[i] / (
                                   self.args.hgg_L / self.max_dis /
                                   (1 - self.args.gamma))  # that was original

                match_dis = np.min(res) + goal_distance(
                    achieved_pool[i][0], initial_goals[j]
                ) * self.args.hgg_c  # distance of initial positions: take the L2 norm as before
                match_idx = np.argmin(res)

                edge = self.match_lib.add(graph_id['achieved'][i],
                                          graph_id['desired'][j], 1,
                                          c_double(match_dis))
                candidate_goals.append(achieved_pool[i][match_idx])
                candidate_edges.append(edge)
                candidate_id.append(j)
        for i in range(len(desired_goals)):
            self.match_lib.add(graph_id['desired'][i], n, 1, 0)

        match_count = self.match_lib.cost_flow(0, n)
        assert match_count == self.length

        explore_goals = [0] * self.length
        for i in range(len(candidate_goals)):
            if self.match_lib.check_match(candidate_edges[i]) == 1:
                explore_goals[candidate_id[i]] = candidate_goals[i].copy()
        assert len(explore_goals) == self.length
        self.pool = np.array(explore_goals)
Example #8
    def update(self, initial_goals, desired_goals):
        if self.achieved_trajectory_pool.counter == 0:
            self.pool = copy.deepcopy(desired_goals)
            return

        achieved_pool, achieved_pool_init_state = \
            self.achieved_trajectory_pool.pad()
        candidate_goals = []
        candidate_edges = []
        candidate_id = []

        agent = self.args.agent
        achieved_value = []
        for i in range(len(achieved_pool)):
            obs = [
                goal_concat(achieved_pool_init_state[i], achieved_pool[i][j])
                for j in range(achieved_pool[i].shape[0])
            ]
            feed_dict = {agent.raw_obs_ph: obs}
            value = agent.sess.run(agent.q_pi, feed_dict)[:, 0]
            value = np.clip(value, -1.0 / (1.0 - self.args.gamma), 0)
            achieved_value.append(value.copy())

        n = 0
        graph_id = {'achieved': [], 'desired': []}
        for i in range(len(achieved_pool)):
            n += 1
            graph_id['achieved'].append(n)
        for i in range(len(desired_goals)):
            n += 1
            graph_id['desired'].append(n)
        n += 1
        self.match_lib.clear(n)

        for i in range(len(achieved_pool)):
            self.match_lib.add(0, graph_id['achieved'][i], 1, 0)
        for i in range(len(achieved_pool)):
            for j in range(len(desired_goals)):
                res = np.sqrt(
                    np.sum(np.square(achieved_pool[i] - desired_goals[j]),
                           axis=1)) - achieved_value[i] / (
                               self.args.hgg_L / self.max_dis /
                               (1 - self.args.gamma))
                match_dis = np.min(res) + goal_distance(
                    achieved_pool[i][0], initial_goals[j]) * self.args.hgg_c
                match_idx = np.argmin(res)

                edge = self.match_lib.add(graph_id['achieved'][i],
                                          graph_id['desired'][j], 1,
                                          c_double(match_dis))
                candidate_goals.append(achieved_pool[i][match_idx])
                candidate_edges.append(edge)
                candidate_id.append(j)
        for i in range(len(desired_goals)):
            self.match_lib.add(graph_id['desired'][i], n, 1, 0)

        match_count = self.match_lib.cost_flow(0, n)
        assert match_count == self.length

        explore_goals = [0] * self.length
        for i in range(len(candidate_goals)):
            if self.match_lib.check_match(candidate_edges[i]) == 1:
                explore_goals[candidate_id[i]] = candidate_goals[i].copy()
        assert len(explore_goals) == self.length
        self.pool = np.array(explore_goals)
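
For reference, the per-pair cost that Examples #7 and #8 feed into the min-cost-flow matcher can be written down in isolation. The sketch below is a transcription of the expression used in Example #8 (with goal_distance taken as the L2 norm, as assumed earlier), not a separate API; the bipartite-matching bookkeeping is stripped away:

import numpy as np

def pair_cost(achieved_traj, achieved_value, desired_goal, initial_goal,
              hgg_L, max_dis, gamma, hgg_c):
    # distance from every achieved goal along the trajectory to the desired
    # goal, reduced by the (scaled) critic value of that hindsight goal
    res = np.sqrt(np.sum(np.square(achieved_traj - desired_goal), axis=1)) \
        - achieved_value / (hgg_L / max_dis / (1 - gamma))
    # best hindsight goal on this trajectory, plus a penalty on how far the
    # trajectory's start lies from the task's initial position
    match_dis = np.min(res) + np.linalg.norm(achieved_traj[0] - initial_goal) * hgg_c
    return match_dis, achieved_traj[int(np.argmin(res))]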
Example #9
    def learn(self, args, env, env_test, agent, buffer, write_goals=0):
        # Actual learning cycle takes place here!
        initial_goals = []
        desired_goals = []
        episodes = args.episodes // 5
        # get initial position and goal from the environment for each episode
        for i in range(episodes):
            obs = self.env_List[i].reset()
            goal_a = obs['achieved_goal'].copy()
            goal_d = obs['desired_goal'].copy()
            initial_goals.append(goal_a.copy())
            desired_goals.append(goal_d.copy())
        goal_list = []
        achieved_trajectories = []
        achieved_init_states = []
        explore_goals = []
        test_goals = []
        inside = []
        for i in range(episodes):
            obs = self.env_List[i].get_obs()
            init_state = obs['observation'].copy()
            sampler = MatchSampler(args, self.env_List[i])
            loop = train_goalGAN(agent, initialize_GAN(env=self.env_List[i]), sampler, 5, True)
            next(loop)
            if not self.stop:
                explore_goal = sampler.sample()
            else:
                explore_goal = desired_goals[i]

            # store goals in explore_goals list to check whether goals are within goal space later
            explore_goals.append(explore_goal)
            test_goals.append(self.env.generate_goal())

            # Perform HER training by interacting with the environment
            self.env_List[i].goal = explore_goal.copy()
            if write_goals != 0 and len(goal_list) < write_goals:
                goal_list.append(explore_goal.copy())
            current = None
            trajectory = None
            for iters in range(NUM):
                if iters < 20:
                    obs = self.env_List[i].get_obs()
                    current = Trajectory(obs)
                    trajectory = [obs['achieved_goal'].copy()]
                has_success = False
                for timestep in range(args.timesteps // SCALE):
                    # get action from the ddpg policy
                    action = agent.step(obs, explore=True)
                    # feed action to environment, get observation and reward
                    obs, reward, done, info = self.env_List[i].step(action)
                    is_success = reward == 0
                    if iters < 20:
                        trajectory.append(obs['achieved_goal'].copy())
                        current.store_step(action, obs, reward, done)
                    if is_success and not has_success:
                        has_success = True
                        if len(sampler.successes_per_goal) > 0:
                            sampler.successes_per_goal[tuple(self.env_List[i].goal)].append(is_success)
                    if timestep == args.timesteps // SCALE - 1:
                        if len(sampler.successes_per_goal) > 0:
                            sampler.successes_per_goal[tuple(self.env_List[i].goal)].append(is_success)

                next(loop)
                sampler.reset()
                if iters < 20:
                    achieved_trajectories.append(np.array(trajectory))
                    achieved_init_states.append(init_state)
                    # Trajectory is stored in replay buffer, replay buffer can be normal or EBP
                    buffer.store_trajectory(current)
                    agent.normalizer_update(buffer.sample_batch())



            if buffer.steps_counter >= args.warmup:
                for _ in range(args.train_batches):
                    # train with Hindsight Goals (HER step)
                    info = agent.train(buffer.sample_batch())
                    args.logger.add_dict(info)
                # update target network
                agent.target_update()

        selection_trajectory_idx = {}
        for i in range(episodes):
            # only add trajectories with movement to the trajectory pool --> use default (L2) distance measure!
            if goal_distance(achieved_trajectories[i][0], achieved_trajectories[i][-1]) > 0.01:
                selection_trajectory_idx[i] = True
        for idx in selection_trajectory_idx.keys():
            self.achieved_trajectory_pool.insert(achieved_trajectories[idx].copy(), achieved_init_states[idx].copy())

        # unless in the first call: check which of the explore goals are inside the target goal
        # space. The target goal space is represented by a sample of test_goals generated directly
        # by the environment; an explore goal is considered inside the target goal space if it is
        # closer than the distance_threshold to one of the test goals (i.e. it would yield a
        # non-negative reward if that test goal were to be achieved)
        if self.learn_calls > 0:
            assert len(explore_goals) == len(test_goals)
            for ex in explore_goals:
                is_inside = 0
                for te in test_goals:
                    # TODO: check: originally with self.sampler.get_graph_goal_distance, now trying with goal_distance (L2)
                    if goal_distance(ex, te) <= self.env.env.env.distance_threshold:
                        is_inside = 1
                inside.append(is_inside)
            assert len(inside) == len(test_goals)
            inside_sum = sum(inside)

            # If more than stop_hgg_threshold (e.g. 0.9) of the explore goals are inside the target goal space, stop HGG
            # and continue with normal HER.
            # By default, stop_hgg_threshold is disabled (set to a value > 1)
            average_inside = inside_sum / len(inside)
            self.args.logger.info("Average inside: {}".format(average_inside))
            if average_inside > self.stop_hgg_threshold:
                self.stop = True
                self.args.logger.info("Continue with normal HER")

        self.learn_calls += 1

        return goal_list if len(goal_list) > 0 else None
Example #10
    def learn(self,
              args,
              env,
              env_test,
              agent,
              buffer,
              buffer_fake=None,
              env_model=None,
              fake=False,
              test=False):
        self.current_trajs = {"eps": [], "obs": [], "goal": []}
        self.hist_trajs = {"eps": [], "obs": [], "goal": []}
        initial_goals = []
        desired_goals = []
        for i in range(args.episodes):
            obs = self.env_List[i].reset()
            goal_a = obs['achieved_goal'].copy()
            goal_d = obs['desired_goal'].copy()
            initial_goals.append(goal_a.copy())
            desired_goals.append(goal_d.copy())

        if args.goal_generator:
            self.sampler.update(initial_goals, desired_goals)

        achieved_trajectories = []
        achieved_init_states = []

        for i in range(args.episodes):
            obs = self.env_List[i].get_obs()
            init_state = obs['observation'].copy()

            # decide on whether to use goal generator
            if args.goal_generator:
                # generate goal by HGG or GoalGAN
                explore_goal = self.sampler.sample(i)

                # replace goal given by the environment
                self.env_List[i].goal = explore_goal.copy()

            # initialization for interaction with the environment
            obs = self.env_List[i].get_obs()
            current = Trajectory(obs)
            trajectory = [obs['achieved_goal'].copy()]

            for timestep in range(args.timesteps):
                action = agent.step(obs, explore=True)
                self.dynamic_buffer.add(obs['observation'].copy(), 'st')
                obss = obs.copy()
                obs, reward, done, info = self.env_List[i].step(action)
                self.dynamic_buffer.add(action.copy(), 'at')
                self.dynamic_buffer.add(obs['observation'].copy(), 'stpo')

                trajectory.append(obs['achieved_goal'].copy())

                if buffer.steps_counter >= args.warmup:
                    for _ in range(self.args.training_freq):
                        batch_real = buffer.sample_batch(batch_size=12,
                                                         sample_for_mb=False)
                        batch_fake = buffer_fake.sample_batch(batch_size=244)
                        batch_new = {
                            'obs': batch_real['obs'] + batch_fake['obs'],
                            'obs_next': batch_real['obs_next'] + batch_fake['obs_next'],
                            'acts': batch_real['acts'] + batch_fake['acts'],
                            'rews': batch_real['rews'] + batch_fake['rews'],
                        }
                        info = agent.train(batch_new)
                        args.logger.add_dict(info)
                agent.target_update()

                if timestep == args.timesteps - 1:
                    done = True
                current.store_step(action, obs, reward, done)
                if done:
                    break

            # dynamic model training
            if args.fgi or args.model_based_training:
                # train on the delta state (s_{t+1} minus s_t), a common trick
                # for dynamics-model learning
                if self.dynamic_buffer.dynamic_buffer_number <= 1000000:
                    _st = np.array(
                        self.dynamic_buffer.data['st']
                        [:self.dynamic_buffer.dynamic_buffer_number].copy())
                    _at = np.array(
                        self.dynamic_buffer.data['at']
                        [:self.dynamic_buffer.dynamic_buffer_number].copy())
                    _stpo = np.array(
                        self.dynamic_buffer.data['stpo']
                        [:self.dynamic_buffer.dynamic_buffer_number].copy())
                    target = _stpo - _st
                    inputs = np.concatenate([_st, _at], axis=1)
                    outputs = np.array(target)
                else:
                    _st = []
                    _at = []
                    _stpo = []
                    target = []
                    inds = np.random.randint(
                        0,
                        self.dynamic_buffer.dynamic_buffer_number,
                        size=1000000)
                    for x in range(1000000):
                        _st.append(
                            self.dynamic_buffer.data['st'][inds[x]].copy())
                        _at.append(
                            self.dynamic_buffer.data['at'][inds[x]].copy())
                        target.append(
                            (self.dynamic_buffer.data['stpo'][inds[x]] -
                             self.dynamic_buffer.data['st'][inds[x]]).copy())
                    _st = np.array(_st)
                    _at = np.array(_at)
                    target = np.array(target)
                    inputs = np.concatenate([_st, _at], axis=1)
                    outputs = np.array(target)
                if len(self.model_loss) > 0 and self.model_loss[-1] < 0.03:
                    los = env_model.train(inputs=inputs,
                                          targets=outputs,
                                          holdout_ratio=0.2,
                                          batch_size=256,
                                          max_epochs=10)
                else:
                    los = env_model.train(inputs=inputs,
                                          targets=outputs,
                                          holdout_ratio=0.2,
                                          batch_size=256,
                                          max_epochs=None)
                del _st, _at, _stpo, inputs, outputs
                self.model_loss.append(los['val_loss'])

            # update buffer and normalizer
            achieved_trajectories.append(np.array(trajectory))
            achieved_init_states.append(init_state)
            buffer.store_trajectory(current)

            if buffer.steps_counter > args.warmup:
                agent.normalizer_update(buffer.sample_batch())

            # generate fake data
            if buffer.steps_counter >= args.warmup and args.model_based_training:
                print('extending...')
                extend_length = self.args.extend_length
                self.extend_traj(extend_length=extend_length,
                                 env_model=env_model,
                                 buffer=buffer,
                                 agent=agent,
                                 buffer_fake=buffer_fake)
                print('extend over.')

        # update achieved_trajectories for HGG sampler
        selection_trajectory_idx = {}

        for i in range(self.args.episodes):
            if goal_distance(achieved_trajectories[i][0],
                             achieved_trajectories[i][-1]) > 0.01:
                selection_trajectory_idx[i] = True
        for idx in selection_trajectory_idx.keys():
            self.achieved_trajectory_pool.insert(
                achieved_trajectories[idx].copy(),
                achieved_init_states[idx].copy())
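
The dynamics model in Example #10 is trained on state deltas: the inputs are [s_t, a_t] and the targets are s_{t+1} - s_t, so a predicted next state has to be reconstructed by adding the predicted delta back onto the current state. A minimal sketch of that reconstruction, assuming (hypothetically) that env_model.predict maps a batch of inputs to predicted deltas; the actual interface of the ensemble model is not shown in these examples:

import numpy as np

def predict_next_state(env_model, s_t, a_t):
    # hypothetical interface: env_model.predict takes a batch of [s_t, a_t]
    # vectors and returns predicted deltas s_{t+1} - s_t, mirroring how the
    # training inputs and targets are built in Example #10
    model_input = np.concatenate([s_t, a_t], axis=-1)[np.newaxis, :]
    delta = env_model.predict(model_input)[0]
    return s_t + delta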