Example #1
    def train(self, args):
        self.agent.start_interaction(self.envs, nlump=self.hps['nlumps'], dynamics=self.dynamics)
        sess = tf.get_default_session()
        self.save = functools.partial(save_variables, sess=sess)
        self.load = functools.partial(load_variables, sess=sess)
        checkdir = osp.join(logger.get_dir(), 'checkpoints')
        os.makedirs(checkdir, exist_ok=True)
        load_weights = args['load_weights']
        start_nupdates = 0
        if load_weights is not None:
            load_path = osp.join(checkdir, load_weights)
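            # checkpoint files are named by their update count ('%.5i' below), so the name doubles as the resume offset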
            start_nupdates = int(load_weights)
            print('Loading checkpoint from %s ' % load_weights)
            self.load(load_path)

        while True:
            info = self.agent.step()
            if info['update']:
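                # offset the logged counters so a resumed run continues from the checkpoint's update and timestep counts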
                info['update']['n_updates'] += start_nupdates
                info['update']['tcount'] += start_nupdates*args['nsteps_per_seg']*args['envs_per_process']
                logger.logkvs(info['update'])
                logger.dumpkvs()

                
                if info['update']['n_updates'] % 10 == 0:
                    weights_index = info['update']['n_updates']
                    savepath = osp.join(checkdir, '%.5i' % weights_index)
                    print('Saving to', savepath)
                    self.save(savepath)

            if self.agent.rollout.stats['tcount'] > self.num_timesteps:
                break
        self.agent.stop_interaction()
Example #2
def train(args, model, device, train_loader, optimizer, epoch):
    model.train()
    total_loss = 0
    correct = 0
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device).float(), target.to(device).long()
        optimizer.zero_grad()
        output = model(data)
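        # per-sample losses are summed here; total_loss is normalized by the dataset size after the loop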
        loss = F.cross_entropy(output, target, reduction='sum')
        loss.backward()
        optimizer.step()
        # get the index of the max log-probability
        pred = output.max(1, keepdim=True)[1]
        correct += pred.eq(target.view_as(pred)).sum().item()
        total_loss += loss.item()
        if batch_idx % args.log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                       100. * batch_idx / len(train_loader), loss.item()))
    total_loss /= float(len(train_loader.dataset))
    acc = correct / float(len(train_loader.dataset))
    logger.logkv('epoch', epoch)
    logger.logkv('train/loss', total_loss)
    logger.logkv('train/acc', acc)
    logger.dumpkvs()
    return total_loss, acc
Example #3
    def train(self):
        next_v = 1e6
        v = self.value_fun.get_values()
        itr = 0
        videos = []
        contours = []
        returns = []
        delay_cs = []
        fig = None

        while not self._stop_condition(itr, next_v, v) and itr < self.max_itr:
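            # each iteration optionally logs/renders, then refreshes the value function via get_next_values()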
            log = itr % self.log_itr == 0
            render = (itr % self.render_itr == 0) and self.render
            if log:
                next_pi = self.get_next_policy()
                self.policy.update(next_pi)
                average_return, avg_delay_cost, video = rollout(self.env, self.policy, render=render,
                                                num_rollouts=self.num_rollouts, max_path_length=self.max_path_length, iteration=itr)
                if render:
                    contour, fig = plot_contour(self.env, self.value_fun, fig=fig, iteration=itr)
                    contours += [contour] * len(video)
                    videos += video
                returns.append(average_return)
                delay_cs.append(avg_delay_cost)
                logger.logkv('Iteration', itr)
                logger.logkv('Average Returns', average_return)
                logger.logkv('Average Delayed Costs', avg_delay_cost)
                logger.dumpkvs()
            next_v = self.get_next_values()
            self.value_fun.update(next_v)
            itr += 1

        next_pi = self.get_next_policy()
        self.policy.update(next_pi)
        contour, fig = plot_contour(self.env, self.value_fun, save=True, fig=fig, iteration=itr)
        average_return, avg_delay_cost, video = rollout(self.env, self.policy,
                                        render=True, num_rollouts=self.num_rollouts, max_path_length=self.max_path_length, iteration=itr)
        self.env.close()
        plot_returns(returns)
        plot_returns(delay_cs, 'delayed_cost')
        videos += video
        if self.render:
            contours += [contour]
        logger.logkv('Iteration', itr)
        logger.logkv('Average Returns', average_return)
        logger.logkv('Average Delayed Costs', avg_delay_cost)

        fps = int(4/getattr(self.env, 'dt', 0.1))
        if contours and contours[0] is not None:
            clip = mpy.ImageSequenceClip(contours, fps=fps)
            clip.write_videofile('%s/contours_progress.mp4' % logger.get_dir())

        if videos:
            clip = mpy.ImageSequenceClip(videos, fps=fps)
            clip.write_videofile('%s/roll_outs.mp4' % logger.get_dir())

        plt.close()
Example #4
    def train(self):
        obs = self._env.reset()

        episode_rewards = []
        n_episodes = 0
        l_episode_return = deque([], maxlen=10)
        l_discounted_episode_return = deque([], maxlen=10)
        l_tq_squared_error = deque(maxlen=50)
        log_itr = -1
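        # main loop: one environment step per iteration, with target-network syncs, Q updates and logging on fixed schedules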
        for itr in range(self._initial_step, self._max_steps):
            act = self.eps_greedy(obs[np.newaxis, :],
                                  self.exploration.value(itr))
            next_obs, rew, done, _ = self._env.step(act)
            if self._render:
                self._env.render()
            self._replay_buffer.add(obs, act, rew, next_obs, float(done))

            episode_rewards.append(rew)

            if done:
                obs = self._env.reset()
                episode_return = np.sum(episode_rewards)
                discounted_episode_return = np.sum(
                    episode_rewards *
                    self._discount**np.arange(len(episode_rewards)))
                l_episode_return.append(episode_return)
                l_discounted_episode_return.append(discounted_episode_return)
                episode_rewards = []
                n_episodes += 1
            else:
                obs = next_obs

            if itr % self._target_q_update_freq == 0 and itr > self._learning_start_itr:
                self._update_target_q()

            if itr % self._train_q_freq == 0 and itr > self._learning_start_itr:
                # Sample from replay buffer.
                l_obs, l_act, l_rew, l_obs_prime, l_done = self._replay_buffer.sample(
                    self._opt_batch_size)
                # Train Q value function with sampled data.
                td_squared_error = self.train_q(l_obs, l_act, l_rew,
                                                l_obs_prime, l_done)
                l_tq_squared_error.append(td_squared_error)

            if (itr + 1) % self._log_freq == 0 and len(l_episode_return) > 5:
                log_itr += 1
                logger.logkv('Iteration', log_itr)
                logger.logkv('Steps', itr)
                logger.logkv('Epsilon', self.exploration.value(itr))
                logger.logkv('Episodes', n_episodes)
                logger.logkv('AverageReturn', np.mean(l_episode_return))
                logger.logkv('AverageDiscountedReturn',
                             np.mean(l_discounted_episode_return))
                logger.logkv('TDError^2', np.mean(l_tq_squared_error))
                logger.dumpkvs()
                self._q.dump(logger.get_dir() + '/weights.pkl')
Example #5
def test(episodes=20, agent=None, load_path=None, ifrender=False, log=False):
    if log:
        logger.configure(dir="./log/", format_strs="stdout")
    if agent is None:
        agent = DQN(num_state=16, num_action=4)
        if load_path:
            agent.load(load_path)
        else:
            agent.load()

    env = Game2048Env()
    score_list = []
    highest_list = []

    for i in range(episodes):
        state, _, done, info = env.reset()
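        # log2_shaping (defined elsewhere) appears to rescale the raw 2048 board before it is passed to the agent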
        state = log2_shaping(state)

        start = time.time()
        while True:
            action = agent.select_action(state, deterministic=True)
            next_state, _, done, info = env.step(action)
            next_state = log2_shaping(next_state)
            state = next_state

            if ifrender:
                env.render()

            if done:
                print(env.Matrix)
                if log:
                    logger.logkv('episode number', i + 1)
                    logger.logkv('episode reward', info['score'])
                    logger.logkv('episode steps', info['steps'])
                    logger.logkv('highest', info['highest'])
                    logger.dumpkvs()
                break

        end = time.time()
        if log:
            print('episode time:{} s\n'.format(end - start))

        score_list.append(info['score'])
        highest_list.append(info['highest'])

    print('mean score:{}, mean highest:{}'.format(np.mean(score_list),
                                                  np.mean(highest_list)))
    print('max score:{}, max highest:{}'.format(np.max(score_list),
                                                 np.max(highest_list)))
    result_info = {
        'mean': np.mean(score_list),
        'max': np.max(score_list),
        'list': score_list
    }
    print(highest_list)
    return result_info
Example #6
    def train(self):
        obs = self._env.reset()

        episode_rewards = []
        n_episodes = 0
        l_episode_return = deque([], maxlen=10)
        l_discounted_episode_return = deque([], maxlen=10)
        l_tq_squared_error = deque(maxlen=50)
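        # the bounded deques keep only recent episodes/updates, so the logged averages are sliding-window statistics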
        log_itr = -1
        for itr in range(self._initial_step, self._max_steps):
            act = self.eps_greedy(obs[np.newaxis, :],
                                  self.exploration.value(itr))
            next_obs, rew, done, _ = self._env.step(act)
            if self._render:
                self._env.render()
            self._replay_buffer.add(obs, act, rew, next_obs, float(done))

            episode_rewards.append(rew)

            if done:
                obs = self._env.reset()
                episode_return = np.sum(episode_rewards)
                discounted_episode_return = np.sum(
                    episode_rewards * self._discount ** np.arange(len(episode_rewards)))
                l_episode_return.append(episode_return)
                l_discounted_episode_return.append(discounted_episode_return)
                episode_rewards = []
                n_episodes += 1
            else:
                obs = next_obs

            if itr % self._target_q_update_freq == 0 and itr > self._learning_start_itr:
                self._update_target_q()

            if itr % self._train_q_freq == 0 and itr > self._learning_start_itr:
                # Sample from replay buffer.
                l_obs, l_act, l_rew, l_obs_prime, l_done = self._replay_buffer.sample(
                    self._opt_batch_size)
                # Train Q value function with sampled data.
                td_squared_error = self.train_q(
                    l_obs, l_act, l_rew, l_obs_prime, l_done)
                l_tq_squared_error.append(td_squared_error)

            if (itr + 1) % self._log_freq == 0 and len(l_episode_return) > 5:
                log_itr += 1
                logger.logkv('Iteration', log_itr)
                logger.logkv('Steps', itr)
                logger.logkv('Epsilon', self.exploration.value(itr))
                logger.logkv('Episodes', n_episodes)
                logger.logkv('AverageReturn', np.mean(l_episode_return))
                logger.logkv('AverageDiscountedReturn',
                             np.mean(l_discounted_episode_return))
                logger.logkv('TDError^2', np.mean(l_tq_squared_error))
                logger.dumpkvs()
                self._q.dump(logger.get_dir() + '/weights.pkl')
Example #7
    def train(self):
        self.agent.start_interaction(self.envs,
                                     nlump=self.hps['nlumps'],
                                     dynamics=self.dynamics)
        while True:
            info = self.agent.step()
            if info['update']:
                logger.logkvs(info['update'])
                logger.dumpkvs()
            if self.agent.rollout.stats['tcount'] > self.num_timesteps:
                break

        self.agent.stop_interaction()
Example #8
def instant_impulse(variant):
    env_name = variant['env_name']
    env = get_env_from_name(env_name)
    env_params = variant['env_params']

    eval_params = variant['eval_params']
    policy_params = variant['alg_params']
    policy_params.update({
        's_bound': env.observation_space,
        'a_bound': env.action_space,
    })

    build_func = get_policy(variant['algorithm_name'])
    if 'Fetch' in env_name or 'Hand' in env_name:
        s_dim = env.observation_space.spaces['observation'].shape[0] \
                + env.observation_space.spaces['achieved_goal'].shape[0] + \
                env.observation_space.spaces['desired_goal'].shape[0]
    else:
        s_dim = env.observation_space.shape[0]
    a_dim = env.action_space.shape[0]
    # d_dim = env_params['disturbance dim']
    policy = build_func(a_dim, s_dim, policy_params)
    # disturber = Disturber(d_dim, s_dim, disturber_params)

    log_path = variant['log_path'] + '/eval/safety_eval'
    variant['eval_params'].update({'magnitude': 0})
    logger.configure(dir=log_path, format_strs=['csv'])
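    # sweep the disturbance magnitudes, evaluating the policy at each and writing one CSV row per setting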
    for magnitude in eval_params['magnitude_range']:
        variant['eval_params']['magnitude'] = magnitude
        diagnostic_dict = evaluation(variant, env, policy)

        string_to_print = ['magnitude', ':', str(magnitude), '|']
        [
            string_to_print.extend(
                [key, ':', str(round(diagnostic_dict[key], 2)), '|'])
            for key in diagnostic_dict.keys()
        ]
        print(''.join(string_to_print))

        logger.logkv('magnitude', magnitude)
        [
            logger.logkv(key, diagnostic_dict[key])
            for key in diagnostic_dict.keys()
        ]
        logger.dumpkvs()
Example #9
def train(num_iter, log_schedule):
    game = KuhnPoker()
    strategy_profile = get_initial_strategy_profile(game.root,
                                                    game.num_players)
    average_strategy_profile = deepcopy(strategy_profile)
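    # each iteration updates the strategy profile and measures the exploitability of the running average strategy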
    for t in tqdm(range(num_iter)):
        update_pi(game.root, strategy_profile, average_strategy_profile,
                  [1.0 for _ in range(game.num_players + 1)],
                  [1.0 for _ in range(game.num_players + 1)],
                  [1.0 for _ in range(game.num_players + 1)])
        update_node_values(game.root, strategy_profile)
        exploitability = get_exploitability(game, average_strategy_profile)
        update_strategy(strategy_profile, average_strategy_profile,
                        game.information_sets)
        if t % log_schedule(t) == 0:
            logger.logkv("t", t)
            logger.logkv("exploitability", exploitability)
            logger.dumpkvs()
    return average_strategy_profile
Example #10
def various_disturbance(variant):
    env_name = variant['env_name']
    env = get_env_from_name(env_name)
    env_params = variant['env_params']

    eval_params = variant['eval_params']
    policy_params = variant['alg_params']
    disturber_params = variant['disturber_params']
    build_func = get_policy(variant['algorithm_name'])
    if 'Fetch' in env_name or 'Hand' in env_name:
        s_dim = env.observation_space.spaces['observation'].shape[0] \
                + env.observation_space.spaces['achieved_goal'].shape[0] + \
                env.observation_space.spaces['desired_goal'].shape[0]
    else:
        s_dim = env.observation_space.shape[0]
    a_dim = env.action_space.shape[0]
    d_dim = env_params['disturbance dim']
    policy = build_func(a_dim, s_dim, d_dim, policy_params)
    # disturber = Disturber(d_dim, s_dim, disturber_params)

    log_path = variant[
        'log_path'] + '/eval/various_disturbance-' + eval_params['form']
    variant['eval_params'].update({'period': 0})
    logger.configure(dir=log_path, format_strs=['csv'])
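    # evaluate the policy under each disturbance period; the logged 'frequency' is simply 1 / period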
    for period in eval_params['period_list']:
        variant['eval_params']['period'] = period
        diagnostic_dict = evaluation(variant, env, policy)
        frequency = 1. / period
        string_to_print = ['frequency', ':', str(frequency), '|']
        [
            string_to_print.extend(
                [key, ':', str(round(diagnostic_dict[key], 2)), '|'])
            for key in diagnostic_dict.keys()
        ]
        print(''.join(string_to_print))

        logger.logkv('frequency', frequency)
        [
            logger.logkv(key, diagnostic_dict[key])
            for key in diagnostic_dict.keys()
        ]
        logger.dumpkvs()
Example #11
    def train(self):
        params = self.value_fun._params
        videos = []
        contours = []
        returns = []
        fig = None
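        # take one gradient step on self.objective per iteration and track rollout returns periodically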
        for itr in range(self.max_itr):
            params = self.optimizer.grad_step(self.objective, params)
            self.value_fun.update(params)
            log = itr % self.log_itr == 0 or itr == self.max_itr - 1
            render = (itr % self.render_itr == 0) and self.render
            if log:
                average_return, video = rollout(self.env,
                                                self.policy,
                                                render=render,
                                                iteration=itr)
                if render:
                    contour, fig = plot_contour(self.env,
                                                self.value_fun,
                                                fig=fig,
                                                iteration=itr)
                    contours += [contour]
                    videos += video
                returns.append(average_return)
                logger.logkv('Iteration', itr)
                logger.logkv('Average Returns', average_return)
                logger.dumpkvs()

        plot_returns(returns)
        plot_contour(self.env, self.value_fun, save=True, fig=fig)

        if contours and contours[0] is not None:
            contours = list(upsample(np.array(contours), 10))
            clip = mpy.ImageSequenceClip(contours, fps=10)
            clip.write_videofile('%s/contours_progress.mp4' % logger.get_dir())

        if videos:
            fps = int(10 / getattr(self.env, 'dt', 0.1))
            clip = mpy.ImageSequenceClip(videos, fps=fps)
            clip.write_videofile('%s/learning_progress.mp4' % logger.get_dir())

        plt.close()
Example #12
def constant_impulse(variant):
    env_name = variant["env_name"]
    env = get_env_from_name(env_name)
    env_params = variant["env_params"]

    eval_params = variant["eval_params"]
    policy_params = variant["alg_params"]
    policy_params["network_structure"] = env_params["network_structure"]

    build_func = get_policy(variant["algorithm_name"])
    if "Fetch" in env_name or "Hand" in env_name:
        s_dim = (env.observation_space.spaces["observation"].shape[0] +
                 env.observation_space.spaces["achieved_goal"].shape[0] +
                 env.observation_space.spaces["desired_goal"].shape[0])
    else:
        s_dim = env.observation_space.shape[0]
    a_dim = env.action_space.shape[0]
    policy = build_func(a_dim, s_dim, policy_params)
    # disturber = Disturber(d_dim, s_dim, disturber_params)

    log_path = variant["log_path"] + "/eval/constant_impulse"
    variant["eval_params"].update({"magnitude": 0})
    logger.configure(dir=log_path, format_strs=["csv"])
    for magnitude in eval_params["magnitude_range"]:
        variant["eval_params"]["magnitude"] = magnitude
        diagnostic_dict, _ = evaluation(variant, env, policy)

        string_to_print = ["magnitude", ":", str(magnitude), "|"]
        [
            string_to_print.extend(
                [key, ":", str(round(diagnostic_dict[key], 2)), "|"])
            for key in diagnostic_dict.keys()
        ]
        print("".join(string_to_print))

        logger.logkv("magnitude", magnitude)
        [
            logger.logkv(key, diagnostic_dict[key])
            for key in diagnostic_dict.keys()
        ]
        logger.dumpkvs()
Example #13
    def train(self):
        self.agent.start_interaction(self.envs,
                                     nlump=self.hps['nlumps'],
                                     dynamics=self.dynamics)
        if self.hps['ckptpath'] is not None:
            self.agent.restore_model(logdir=self.hps['ckptpath'],
                                     exp_name=self.hps['exp_name'])
        while True:
            info = self.agent.step()
            if info['update']:
                logger.logkvs(info['update'])
                logger.dumpkvs()
                if info['update']['n_updates'] % 60 == 0:
                    self.agent.save_model(
                        logdir=logger.get_dir(),
                        exp_name=self.hps['exp_name'],
                        global_step=info['update']['n_updates'])
            if self.agent.rollout.stats['tcount'] > self.num_timesteps:
                break

        self.agent.stop_interaction()
Example #14
def trained_disturber(variant):
    env_name = variant['env_name']
    env = get_env_from_name(env_name)
    env_params = variant['env_params']

    eval_params = variant['eval_params']
    policy_params = variant['alg_params']
    disturber_params = variant['disturber_params']
    build_func = get_policy(variant['algorithm_name'])
    if 'Fetch' in env_name or 'Hand' in env_name:
        s_dim = env.observation_space.spaces['observation'].shape[0] \
                + env.observation_space.spaces['achieved_goal'].shape[0] + \
                env.observation_space.spaces['desired_goal'].shape[0]
    else:
        s_dim = env.observation_space.shape[0]
    a_dim = env.action_space.shape[0]
    d_dim = env_params['disturbance dim']
    policy = build_func(a_dim, s_dim, d_dim, policy_params)
    disturbance_chanel_list = np.nonzero(
        disturber_params['disturbance_magnitude'])[0]
    disturber_params['disturbance_chanel_list'] = disturbance_chanel_list
    disturber = Disturber(d_dim, s_dim, disturber_params)
    disturber.restore(eval_params['path'])

    log_path = variant['log_path'] + '/eval/trained_disturber'
    variant['eval_params'].update({'magnitude': 0})
    logger.configure(dir=log_path, format_strs=['csv'])

    diagnostic_dict, _ = evaluation(variant, env, policy, disturber)

    string_to_print = []
    [
        string_to_print.extend(
            [key, ':', str(round(diagnostic_dict[key], 2)), '|'])
        for key in diagnostic_dict.keys()
    ]
    print(''.join(string_to_print))

    [logger.logkv(key, diagnostic_dict[key]) for key in diagnostic_dict.keys()]
    logger.dumpkvs()
Example #15
def trained_disturber(variant):
    env_name = variant["env_name"]
    env = get_env_from_name(env_name)
    env_params = variant["env_params"]

    eval_params = variant["eval_params"]
    policy_params = variant["alg_params"]
    disturber_params = variant["disturber_params"]
    build_func = get_policy(variant["algorithm_name"])
    if "Fetch" in env_name or "Hand" in env_name:
        s_dim = (env.observation_space.spaces["observation"].shape[0] +
                 env.observation_space.spaces["achieved_goal"].shape[0] +
                 env.observation_space.spaces["desired_goal"].shape[0])
    else:
        s_dim = env.observation_space.shape[0]
    a_dim = env.action_space.shape[0]
    d_dim = env_params["disturbance dim"]
    policy = build_func(a_dim, s_dim, d_dim, policy_params)
    disturbance_chanel_list = np.nonzero(
        disturber_params["disturbance_magnitude"])[0]
    disturber_params["disturbance_chanel_list"] = disturbance_chanel_list
    disturber = Disturber(d_dim, s_dim, disturber_params)
    disturber.restore(eval_params["path"])

    log_path = variant["log_path"] + "/eval/trained_disturber"
    variant["eval_params"].update({"magnitude": 0})
    logger.configure(dir=log_path, format_strs=["csv"])

    diagnostic_dict, _ = evaluation(variant, env, policy, disturber)

    string_to_print = []
    [
        string_to_print.extend(
            [key, ":", str(round(diagnostic_dict[key], 2)), "|"])
        for key in diagnostic_dict.keys()
    ]
    print("".join(string_to_print))

    [logger.logkv(key, diagnostic_dict[key]) for key in diagnostic_dict.keys()]
    logger.dumpkvs()
Example #16
def various_disturbance(variant):
    env_name = variant["env_name"]
    env = get_env_from_name(env_name)
    env_params = variant["env_params"]

    eval_params = variant["eval_params"]
    policy_params = variant["alg_params"]
    build_func = get_policy(variant["algorithm_name"])
    if "Fetch" in env_name or "Hand" in env_name:
        s_dim = (env.observation_space.spaces["observation"].shape[0] +
                 env.observation_space.spaces["achieved_goal"].shape[0] +
                 env.observation_space.spaces["desired_goal"].shape[0])
    else:
        s_dim = env.observation_space.shape[0]
    a_dim = env.action_space.shape[0]
    policy = build_func(a_dim, s_dim, policy_params)
    # disturber = Disturber(d_dim, s_dim, disturber_params)

    log_path = variant[
        "log_path"] + "/eval/various_disturbance-" + eval_params["form"]
    variant["eval_params"].update({"period": 0})
    logger.configure(dir=log_path, format_strs=["csv"])
    for period in eval_params["period_list"]:
        variant["eval_params"]["period"] = period
        diagnostic_dict, _ = evaluation(variant, env, policy)
        frequency = 1.0 / period
        string_to_print = ["frequency", ":", str(frequency), "|"]
        [
            string_to_print.extend(
                [key, ":", str(round(diagnostic_dict[key], 2)), "|"])
            for key in diagnostic_dict.keys()
        ]
        print("".join(string_to_print))

        logger.logkv("frequency", frequency)
        [
            logger.logkv(key, diagnostic_dict[key])
            for key in diagnostic_dict.keys()
        ]
        logger.dumpkvs()
Example #17
def test(model, device, test_loader, epoch=None, val=True):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device).float(), \
                           target.to(device).long()
            output = model(data)
            test_loss += F.cross_entropy(output, target,
                                         reduction='sum').item()  # sum up batch loss
            pred = output.max(1, keepdim=True)[1]  # get the index of the max log-probability
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)
    acc = float(correct) / len(test_loader.dataset)
    if val:
        logger.logkv('epoch', epoch)
        logger.logkv('val/loss', test_loss)
        logger.logkv('val/acc', acc)
        logger.dumpkvs()
    return test_loss, acc
Example #18
    def train(self):
        self.agent.start_interaction(self.envs,
                                     nlump=self.hps['nlumps'],
                                     dynamics=self.dynamics)
        sess = tf.get_default_session()
        self.save = functools.partial(save_variables, sess=sess)
        while True:
            info = self.agent.step()
            if info['update']:
                logger.logkvs(info['update'])
                logger.dumpkvs()

                if info['update']['n_updates'] % 10 == 0:
                    checkdir = osp.join(logger.get_dir(), 'checkpoints')
                    os.makedirs(checkdir, exist_ok=True)
                    savepath = osp.join(checkdir,
                                        '%.5i' % info['update']['n_updates'])
                    print('Saving to', savepath)
                    self.save(savepath)

            if self.agent.rollout.stats['tcount'] > self.num_timesteps:
                break
        self.agent.stop_interaction()
Example #19
def test_mpi_weighted_mean():
    from mpi4py import MPI
    comm = MPI.COMM_WORLD
    with logger.scoped_configure(comm=comm):
        if comm.rank == 0:
            name2valcount = {'a': (10, 2), 'b': (20, 3)}
        elif comm.rank == 1:
            name2valcount = {'a': (19, 1), 'c': (42, 3)}
        else:
            raise NotImplementedError

        d = mpi_util.mpi_weighted_mean(comm, name2valcount)
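        # e.g. 'a' is reported with weights 2 and 1, so its weighted mean is (10*2 + 19*1)/3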
        correctval = {'a': (10 * 2 + 19) / 3.0, 'b': 20, 'c': 42}
        if comm.rank == 0:
            assert d == correctval, f'{d} != {correctval}'

        for name, (val, count) in name2valcount.items():
            for _ in range(count):
                logger.logkv_mean(name, val)
        d2 = logger.dumpkvs()
        if comm.rank == 0:
            assert d2 == correctval
Example #20
def train():
    episodes = train_episodes
    logger.configure(dir="./log/", format_strs="stdout,tensorboard,log")
    agent = DQN(num_state=16, num_action=4)
    env = Game2048Env()

    pf_saver = Perfomance_Saver()
    model_saver = Model_Saver(num=10)

    eval_max_score = 0
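    # track the best evaluation score so far; only models that improve on it are saved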
    for i in range(episodes):
        state, reward, done, info = env.reset()
        state = log2_shaping(state)

        start = time.time()
        loss = None
        while True:
            if agent.buffer.memory_counter <= agent.memory_capacity:
                action = agent.select_action(state, random=True)
            else:
                action = agent.select_action(state)

            next_state, reward, done, info = env.step(action)
            next_state = log2_shaping(next_state)
            reward = log2_shaping(reward, divide=1)

            agent.store_transition(state, action, reward, next_state)
            state = next_state

            if ifrender:
                env.render()

            if agent.buffer.memory_counter % agent.train_interval == 0 and agent.buffer.memory_counter > agent.memory_capacity:  # i.e. only start updating once the replay buffer is full
                loss = agent.update()

            if done:
                if i % log_interval == 0:
                    if loss:
                        logger.logkv('loss', loss)
                    logger.logkv('training progress', (i+1) / episodes)
                    logger.logkv('episode reward', info['score'])
                    logger.logkv('episode steps', info['steps'])
                    logger.logkv('highest', info['highest'])
                    logger.logkv('epsilon', agent.epsilon)
                    logger.dumpkvs()

                    loss = None

                if i % epsilon_decay_interval == 0:  # epsilon decay
                    agent.epsilon_decay(i, episodes)
                break
        
        end = time.time()
        print('episode time:{} s\n'.format(end - start))

        # eval 
        if i % eval_interval == 0 and i:
            eval_info = test(episodes=test_episodes, agent=agent)
            average_score, max_score, score_lis = eval_info['mean'], eval_info['max'], eval_info['list']

            pf_saver.save(score_lis, info=f'episode:{i}')

            if int(average_score) > eval_max_score:
                eval_max_score = int(average_score)
                name = 'dqn_{}.pkl'.format(int(eval_max_score))
                agent.save(name=name)
                model_saver.save("./save/" + name)

            logger.logkv('eval average score', average_score)
            logger.logkv('eval max score', max_score)
            logger.dumpkvs()
Example #21
def eval(variant):
    env_name = variant['env_name']
    traj = get_traj()
    env = get_env_from_name(env_name)
    env_params = variant['env_params']
    max_ep_steps = env_params['max_ep_steps']
    policy_params = variant['alg_params']
    s_dim = env.observation_space.shape[0]
    a_dim = env.action_space.shape[0]
    a_upperbound = env.action_space.high
    a_lowerbound = env.action_space.low
    policy = CAC(a_dim, s_dim, policy_params)

    log_path = variant['log_path'] + '/eval/' + str(0)
    logger.configure(dir=log_path, format_strs=['csv'])
    policy.restore(variant['log_path'] + '/' + str(0) + '/policy')

    # Training setting
    t1 = time.time()
    PLOT_theta_1 = []
    PLOT_ground_theta_1 = []
    mst = []
    agent_traj = []
    ground_traj = []

    for i in tqdm(range(50)):

        s = env.reset()
        cost = 0

        traj_num = 0
        # Random start point

        start_point = 0 + 1000 * i
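        # each of the 50 episodes replays a distinct 1000-step window of the reference trajectory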

        s = traj[start_point, :16]
        PLOT_state = s
        s = np.concatenate([[s], [traj[start_point, 17:]]], axis=1)[0]

        env.state = s

        for j in range(start_point + 1, start_point + 1 + 1000):

            if len(agent_traj) == 0:
                agent_traj = s[0:16]
            else:
                agent_traj = np.concatenate((agent_traj, s[0:16]), axis=0)

            if len(ground_traj) == 0:
                ground_traj = traj[j - 1, 0:16]
            else:
                ground_traj = np.concatenate((ground_traj, traj[j - 1, 0:16]),
                                             axis=0)

            delta = np.zeros(36)
            # ###### NOISE ##############

            # noise = np.random.normal(0, 0.001, 16)
            # delta[20:]= noise

            # ###### BIAS ##############

            # noise = s[0:16]*0.005
            # delta[0:16] = noise

            a = policy.choose_action(s + delta, True)
            action = a_lowerbound + (a + 1.) * (a_upperbound -
                                                a_lowerbound) / 2
            # action =  traj[j-1,16]

            X_, r, done, theta = env.step(action)
            # The new s = current state, next omega, next state
            s_ = np.concatenate([X_, [traj[j, 17:]]], axis=1)[0]

            env.state = s_

            PLOT_theta_1.append(theta[0])
            PLOT_ground_theta_1.append(traj[j, 16])
            mst.append(np.linalg.norm(traj[j, 16] - theta[0]))

            PLOT_state = np.vstack((PLOT_state, X_))

            logger.logkv('rewards', r)
            logger.logkv('timestep', j)
            logger.dumpkvs()

            cost = cost + r
            if j == 1000 - 1 + start_point:
                done = True

            s = s_

            if done:
                #print('episode:', i,'trajectory_number:',traj_num,'total_cost:',cost,'steps:',j-start_point)
                break
    x = np.linspace(0,
                    np.shape(PLOT_ground_theta_1)[0] - 1,
                    np.shape(PLOT_ground_theta_1)[0])
    # plt.plot(x, PLOT_theta_1, color='blue', label='Tracking')
    # plt.plot(x, PLOT_ground_theta_1, color='black', linestyle='--', label='Ground truth')
    # plt.show()

    fig = plt.figure()
    with h5py.File(variant['log_path'] + '/' + 'CAC_theta.h5', 'w') as hdf:
        hdf.create_dataset('Data', data=PLOT_theta_1)
    with h5py.File(variant['log_path'] + '/' + 'Normal_theta_ground.h5',
                   'w') as hdf:
        hdf.create_dataset('Data', data=PLOT_ground_theta_1)
    with h5py.File(variant['log_path'] + '/' + 'CAC_track.h5', 'w') as hdf:
        hdf.create_dataset('Data', data=agent_traj)
    with h5py.File(variant['log_path'] + '/' + 'GT_track.h5', 'w') as hdf:
        hdf.create_dataset('Data', data=ground_traj)

    plt.plot(x, PLOT_theta_1, color='blue', label='Tracking')
    plt.plot(x,
             PLOT_ground_theta_1,
             color='black',
             linestyle='--',
             label='Ground truth')
    plt.show()

    return
Example #22
def train(variant):
    Min_cost = 1000000

    traj = get_traj()  # get data
    env_name = variant['env_name']  # choose your environment
    env = get_env_from_name(env_name)

    env_params = variant['env_params']

    max_episodes = env_params[
        'max_episodes']  # maximum episodes for RL training
    max_ep_steps = env_params[
        'max_ep_steps']  # number of maximum steps in each episode
    max_global_steps = env_params['max_global_steps']
    store_last_n_paths = variant['store_last_n_paths']
    evaluation_frequency = variant['evaluation_frequency']

    policy_params = variant['alg_params']

    min_memory_size = policy_params['min_memory_size']
    steps_per_cycle = policy_params['steps_per_cycle']
    train_per_cycle = policy_params['train_per_cycle']
    batch_size = policy_params['batch_size']

    lr_a, lr_c, lr_l = policy_params['lr_a'], policy_params[
        'lr_c'], policy_params['lr_l']
    lr_a_now = lr_a  # learning rate for actor
    lr_c_now = lr_c  # learning rate for critic
    lr_l_now = lr_l  # learning rate for lyapunov critic

    s_dim = env.observation_space.shape[0]
    print("s_dim is ", s_dim)

    a_dim = env.action_space.shape[0]
    a_upperbound = env.action_space.high
    a_lowerbound = env.action_space.low

    policy = CAC(a_dim, s_dim, policy_params)
    # policy.restore("log/CMAPSS/CAC-new-reward-0.01/0/policy")

    pool_params = {
        's_dim': s_dim,
        'a_dim': a_dim,
        'd_dim': 1,
        'store_last_n_paths': store_last_n_paths,
        'memory_capacity': policy_params['memory_capacity'],
        'min_memory_size': policy_params['min_memory_size'],
        'history_horizon': policy_params['history_horizon'],
        'finite_horizon': policy_params['finite_horizon']
    }
    if 'value_horizon' in policy_params.keys():
        pool_params.update({'value_horizon': policy_params['value_horizon']})
    else:
        pool_params['value_horizon'] = None
    pool = Pool(pool_params)
    # For analyse
    Render = env_params['eval_render']

    # Training setting
    t1 = time.time()
    global_step = 0
    last_training_paths = deque(maxlen=store_last_n_paths)
    training_started = False

    log_path = variant['log_path']
    logger.configure(dir=log_path, format_strs=['csv'])
    logger.logkv('tau', policy_params['tau'])

    logger.logkv('alpha3', policy_params['alpha3'])
    logger.logkv('batch_size', policy_params['batch_size'])
    logger.logkv('target_entropy', policy.target_entropy)

    for i in range(max_episodes):

        current_path = {
            'rewards': [],
            'distance': [],
            'kl_divergence': [],
            'a_loss': [],
            'alpha': [],
            'lyapunov_error': [],
            'entropy': [],
            'beta': [],
            'action_distance': [],
        }

        if global_step > max_global_steps:
            break

        s = env.reset()

        # Random start point

        start_point = np.random.randint(0, 500000)

        s = traj[start_point, :16]

        # current state, theta,next w, desired state
        # this is for decision making
        # 16,1,4,16
        s = np.concatenate([[s], [traj[start_point, 17:]]], axis=1)[0]

        env.state = s

        for j in range(start_point + 1, start_point + 1 + max_ep_steps):
            if Render:
                env.render()
            delta = np.zeros(36)
            # ###### NOISE ##############

            noise = np.random.normal(0, 0.01, 16)
            delta[20:] = noise
            # ########IF Noise env##########
            # s= s + delta
            # a = policy.choose_action(s)

            # ###### BIAS ##############

            # noise = s[0:16]*0.01
            # delta[0:16] = noise

            a = policy.choose_action(s + delta)

            action = a_lowerbound + (a + 1.) * (a_upperbound -
                                                a_lowerbound) / 2
            # action = traj[j-1,16]

            a_upperbound = env.action_space.high
            a_lowerbound = env.action_space.low

            # Run in simulator
            X_, r, done, theta = env.step(action)
            # The new s = current state, next omega, next state
            s_ = np.concatenate([X_, [traj[j, 17:]]], axis=1)[0]
            # s_ = np.concatenate([[s_], [theta]], axis=1)[0]
            # s_ = np.concatenate([X_,[[theta]], [traj[j, 9:]]], axis=1)[0]
            env.state = s_

            # theta_pre=theta
            if training_started:
                global_step += 1

            if j == max_ep_steps - 1 + start_point:
                done = True

            terminal = 1. if done else 0.

            if j > start_point + 2:
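                # _s holds the previous state (assigned at the end of the loop body); the first couple of steps are skipped so it is defined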
                pool.store(s, a, np.zeros([1]), np.zeros([1]), r, terminal, s_,
                           _s)
            # policy.store_transition(s, a, disturbance, r,0, terminal, s_)

            if pool.memory_pointer > min_memory_size and global_step % steps_per_cycle == 0:
                training_started = True

                for _ in range(train_per_cycle):
                    batch = pool.sample(batch_size)
                    labda, alpha, l_loss, entropy, a_loss, beta, action_distance, kl, distance = policy.learn(
                        lr_a_now, lr_c_now, lr_l_now, lr_a_now / 10, batch)

            if training_started:
                current_path['rewards'].append(r)
                current_path['distance'].append(distance)
                current_path['kl_divergence'].append(kl)
                current_path['lyapunov_error'].append(l_loss)
                current_path['alpha'].append(alpha)
                current_path['entropy'].append(entropy)
                current_path['a_loss'].append(a_loss)
                current_path['beta'].append(beta)
                current_path['action_distance'].append(action_distance)

            if training_started and global_step % evaluation_frequency == 0 and global_step > 0:

                logger.logkv("total_timesteps", global_step)

                training_diagnotic = evaluate_training_rollouts(
                    last_training_paths)
                # print(training_diagnotic)
                if training_diagnotic is not None:
                    eval_diagnotic = training_evaluation(variant, env, policy)
                    [
                        logger.logkv(key, eval_diagnotic[key])
                        for key in eval_diagnotic.keys()
                    ]
                    training_diagnotic.pop('return')
                    [
                        logger.logkv(key, training_diagnotic[key])
                        for key in training_diagnotic.keys()
                    ]
                    logger.logkv('lr_a', lr_a_now)
                    logger.logkv('lr_c', lr_c_now)
                    logger.logkv('lr_l', lr_l_now)

                    string_to_print = ['time_step:', str(global_step), '|']
                    [
                        string_to_print.extend(
                            [key, ':', str(eval_diagnotic[key]), '|'])
                        for key in eval_diagnotic.keys()
                    ]
                    [
                        string_to_print.extend([
                            key, ':',
                            str(round(training_diagnotic[key], 2)), '|'
                        ]) for key in training_diagnotic.keys()
                    ]
                    print(''.join(string_to_print))

                logger.dumpkvs()
                if eval_diagnotic['test_return'] / eval_diagnotic[
                        'test_average_length'] <= Min_cost:
                    Min_cost = eval_diagnotic['test_return'] / eval_diagnotic[
                        'test_average_length']
                    print("New lowest cost:", Min_cost)
                    policy.save_result(log_path)
                if training_started and global_step % (
                        10 * evaluation_frequency) == 0 and global_step > 0:
                    policy.save_result(log_path)

            # Status Update
            _s = s
            s = s_

            # OUTPUT TRAINING INFORMATION AND LEARNING RATE DECAY
            if done:
                if training_started:
                    last_training_paths.appendleft(current_path)
                frac = 1.0 - (global_step - 1.0) / max_global_steps
                lr_a_now = lr_a * frac  # learning rate for actor
                lr_c_now = lr_c * frac  # learning rate for critic
                lr_l_now = lr_l * frac  # learning rate for lyapunov critic
                break
    policy.save_result(log_path)

    print('Running time: ', time.time() - t1)
    return
Example #23
def train(variant):
    env_name = variant['env_name']
    env = get_env_from_name(env_name)

    env_params = variant['env_params']

    max_episodes = env_params['max_episodes']
    max_ep_steps = env_params['max_ep_steps']
    max_global_steps = env_params['max_global_steps']
    store_last_n_paths = variant['num_of_training_paths']
    evaluation_frequency = variant['evaluation_frequency']

    policy_params = variant['alg_params']
    policy_params['network_structure'] = env_params['network_structure']

    min_memory_size = policy_params['min_memory_size']
    steps_per_cycle = policy_params['steps_per_cycle']
    train_per_cycle = policy_params['train_per_cycle']
    batch_size = policy_params['batch_size']

    lr_a, lr_c, lr_l = policy_params['lr_a'], policy_params[
        'lr_c'], policy_params['lr_l']
    lr_a_now = lr_a  # learning rate for actor
    lr_c_now = lr_c  # learning rate for critic
    lr_l_now = lr_l  # learning rate for lyapunov critic

    if 'Fetch' in env_name or 'Hand' in env_name:
        s_dim = env.observation_space.spaces['observation'].shape[0]\
                + env.observation_space.spaces['achieved_goal'].shape[0]+ \
                env.observation_space.spaces['desired_goal'].shape[0]
    else:
        s_dim = env.observation_space.shape[0]
    a_dim = env.action_space.shape[0]
    # if disturber_params['process_noise']:
    #     d_dim = disturber_params['noise_dim']
    # else:
    #     d_dim = env_params['disturbance dim']

    a_upperbound = env.action_space.high
    a_lowerbound = env.action_space.low
    policy = LAC(a_dim, s_dim, policy_params)

    pool_params = {
        's_dim': s_dim,
        'a_dim': a_dim,
        'd_dim': 1,
        'store_last_n_paths': store_last_n_paths,
        'memory_capacity': policy_params['memory_capacity'],
        'min_memory_size': policy_params['min_memory_size'],
        'history_horizon': policy_params['history_horizon'],
        'finite_horizon': policy_params['finite_horizon']
    }
    if 'value_horizon' in policy_params.keys():
        pool_params.update({'value_horizon': policy_params['value_horizon']})
    else:
        pool_params['value_horizon'] = None
    pool = Pool(pool_params)
    # For analyse
    Render = env_params['eval_render']

    # Training setting
    t1 = time.time()
    global_step = 0
    last_training_paths = deque(maxlen=store_last_n_paths)
    training_started = False

    log_path = variant['log_path']
    logger.configure(dir=log_path, format_strs=['csv'])
    logger.logkv('tau', policy_params['tau'])

    logger.logkv('alpha3', policy_params['alpha3'])
    logger.logkv('batch_size', policy_params['batch_size'])
    logger.logkv('target_entropy', policy.target_entropy)

    for i in range(max_episodes):

        current_path = {
            'rewards': [],
            'a_loss': [],
            'alpha': [],
            'lambda': [],
            'lyapunov_error': [],
            'entropy': [],
        }

        if global_step > max_global_steps:
            break

        s = env.reset()
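        # goal-based (Fetch/Hand) envs return dict observations; flatten them into a single vector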
        if 'Fetch' in env_name or 'Hand' in env_name:
            s = np.concatenate([s[key] for key in s.keys()])

        for j in range(max_ep_steps):
            if Render:
                env.render()
            a = policy.choose_action(s)
            # a = a*0
            action = a_lowerbound + (a + 1.) * (a_upperbound -
                                                a_lowerbound) / 2

            # Run in simulator
            disturbance_input = np.zeros([a_dim + s_dim])

            s_, r, done, info = env.step(action)

            if 'Fetch' in env_name or 'Hand' in env_name:
                s_ = np.concatenate([s_[key] for key in s_.keys()])
                if info['done'] > 0:
                    done = True

            if training_started:
                global_step += 1

            if j == max_ep_steps - 1:
                done = True

            terminal = 1. if done else 0.
            pool.store(s, a, np.zeros([1]), np.zeros([1]), r, terminal, s_)
            # policy.store_transition(s, a, disturbance, r,0, terminal, s_)

            if pool.memory_pointer > min_memory_size and global_step % steps_per_cycle == 0:
                training_started = True

                for _ in range(train_per_cycle):
                    batch = pool.sample(batch_size)
                    labda, alpha, l_loss, entropy, a_loss = policy.learn(
                        lr_a_now, lr_c_now, lr_l_now, lr_a, batch)

            if training_started:
                current_path['rewards'].append(r)
                current_path['lyapunov_error'].append(l_loss)
                current_path['alpha'].append(alpha)
                current_path['lambda'].append(labda)
                current_path['entropy'].append(entropy)
                current_path['a_loss'].append(a_loss)

            if training_started and global_step % evaluation_frequency == 0 and global_step > 0:

                logger.logkv("total_timesteps", global_step)

                training_diagnotic = evaluate_training_rollouts(
                    last_training_paths)
                if training_diagnotic is not None:
                    if variant['num_of_evaluation_paths'] > 0:
                        eval_diagnotic = training_evaluation(
                            variant, env, policy)
                        [
                            logger.logkv(key, eval_diagnotic[key])
                            for key in eval_diagnotic.keys()
                        ]
                        training_diagnotic.pop('return')
                    [
                        logger.logkv(key, training_diagnotic[key])
                        for key in training_diagnotic.keys()
                    ]
                    logger.logkv('lr_a', lr_a_now)
                    logger.logkv('lr_c', lr_c_now)
                    logger.logkv('lr_l', lr_l_now)

                    string_to_print = ['time_step:', str(global_step), '|']
                    if variant['num_of_evaluation_paths'] > 0:
                        [
                            string_to_print.extend(
                                [key, ':',
                                 str(eval_diagnotic[key]), '|'])
                            for key in eval_diagnotic.keys()
                        ]
                    [
                        string_to_print.extend([
                            key, ':',
                            str(round(training_diagnotic[key], 2)), '|'
                        ]) for key in training_diagnotic.keys()
                    ]
                    print(''.join(string_to_print))

                logger.dumpkvs()
            # Status update
            s = s_

            # OUTPUT TRAINING INFORMATION AND LEARNING RATE DECAY
            if done:
                if training_started:
                    last_training_paths.appendleft(current_path)

                frac = 1.0 - (global_step - 1.0) / max_global_steps
                lr_a_now = lr_a * frac  # learning rate for actor
                lr_c_now = lr_c * frac  # learning rate for critic
                lr_l_now = lr_l * frac  # learning rate for lyapunov critic

                break
    policy.save_result(log_path)

    print('Running time: ', time.time() - t1)
    return
Example #24
def learn(*,
          network,
          env,
          total_timesteps,
          starting_positions,
          env_name,
          win_percentage=0.5,
          eval_env=None,
          seed=None,
          nsteps=2048,
          ent_coef=0.0,
          lr=3e-4,
          vf_coef=0.5,
          max_grad_norm=0.5,
          gamma=0.99,
          lam=0.95,
          log_interval=10,
          nminibatches=4,
          noptepochs=4,
          cliprange=0.2,
          save_interval=0,
          load_path=None,
          model_fn=None,
          **network_kwargs):
    '''
    Learn policy using PPO algorithm (https://arxiv.org/abs/1707.06347)

    Parameters:
    ----------

    network:                          policy network architecture. Either string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for full list)
                                      specifying the standard network architecture, or a function that takes tensorflow tensor as input and returns
                                      tuple (output_tensor, extra_feed) where output tensor is the last network layer output, extra_feed is None for feed-forward
                                      neural nets, and extra_feed is a dictionary describing how to feed state into the network for recurrent neural nets.
                                      See common/models.py/lstm for more details on using recurrent nets in policies

    env: baselines.common.vec_env.VecEnv     environment. Needs to be vectorized for parallel environment simulation.
                                      The environments produced by gym.make can be wrapped using baselines.common.vec_env.DummyVecEnv class.


    nsteps: int                       number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where
                                      nenv is number of environment copies simulated in parallel)

    total_timesteps: int              number of timesteps (i.e. number of actions taken in the environment)

    ent_coef: float                   policy entropy coefficient in the optimization objective

    lr: float or function             learning rate, constant or a schedule function [0,1] -> R+ where 1 is beginning of the
                                      training and 0 is the end of the training.

    vf_coef: float                    value function loss coefficient in the optimization objective

    max_grad_norm: float or None      gradient norm clipping coefficient

    gamma: float                      discounting factor

    lam: float                        advantage estimation discounting factor (lambda in the paper)

    log_interval: int                 number of timesteps between logging events

    nminibatches: int                 number of training minibatches per update. For recurrent policies,
                                      should be smaller than or equal to the number of environments run in parallel.

    noptepochs: int                   number of training epochs per update

    cliprange: float or function      clipping range, constant or schedule function [0,1] -> R+ where 1 is beginning of the training
                                      and 0 is the end of the training

    save_interval: int                number of timesteps between saving events

    load_path: str                    path to load the model from

    **network_kwargs:                 keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network
                                      For instance, 'mlp' network architecture has arguments num_hidden and num_layers.



    '''

    set_global_seeds(seed)

    if isinstance(lr, float): lr = constfn(lr)
    else: assert callable(lr)
    if isinstance(cliprange, float): cliprange = constfn(cliprange)
    else: assert callable(cliprange)
    total_timesteps = int(total_timesteps)

    policy = build_policy(env, network, **network_kwargs)

    # Get the nb of env
    nenvs = env.num_envs

    # Get state_space and action_space
    ob_space = env.observation_space
    ac_space = env.action_space

    # Calculate the batch_size
    nbatch = nenvs * nsteps
    nbatch_train = nbatch // nminibatches

    # Instantiate the model object (that creates act_model and train_model)
    if model_fn is None:
        from ppo2.model import Model
        model_fn = Model

    model = model_fn(policy=policy,
                     ob_space=ob_space,
                     ac_space=ac_space,
                     nbatch_act=nenvs,
                     nbatch_train=nbatch_train,
                     nsteps=nsteps,
                     ent_coef=ent_coef,
                     vf_coef=vf_coef,
                     max_grad_norm=max_grad_norm)

    if load_path is not None:
        model.load(load_path)
    current_starting_position = starting_positions.pop()

    # Instantiate the runner object
    runner = Runner(env=env,
                    model=model,
                    nsteps=nsteps,
                    gamma=gamma,
                    lam=lam,
                    starting_position=current_starting_position)
    if eval_env is not None:
        eval_runner = Runner(env=eval_env,
                             model=model,
                             nsteps=nsteps,
                             gamma=gamma,
                             lam=lam,
                             starting_position=current_starting_position)

    epinfobuf = deque(maxlen=100)
    if eval_env is not None:
        eval_epinfobuf = deque(maxlen=100)

    # Start total timer
    tfirststart = time.time()
    start_changes = []
    reached_goal = []

    nupdates = total_timesteps // nbatch
    for update in range(1, nupdates + 1):
        assert nbatch % nminibatches == 0
        # Start timer
        tstart = time.time()
        frac = 1.0 - (update - 1.0) / nupdates
        # Calculate the learning rate
        lrnow = lr(frac)
        # Calculate the cliprange
        cliprangenow = cliprange(frac)
        # Get minibatch
        obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run(
        )  #pylint: disable=E0632
        if eval_env is not None:
            eval_obs, eval_returns, eval_masks, eval_actions, eval_values, eval_neglogpacs, eval_states, eval_epinfos = eval_runner.run(
            )  #pylint: disable=E0632

        if env_name == "MountainCar-v0":
            done_obs = obs[masks]
            # Number of episodes that finished in this batch
            n_eps = done_obs.shape[0]
            # The goal is reached if the final position is >= 0.5
            n_goal_reached = (done_obs[:, 0] >= 0.5).sum()

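            # Record an approximate global timestep for each goal-reaching
            # episode: its index within the finished-episode batch plus the
            # steps gathered over all previous updates ((update - 1) * nsteps).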
            reached_goal.extend([
                done + update * nsteps - nsteps
                for done in np.where(done_obs[:, 0] >= 0.5)[0]
            ])

            # Guard against batches in which no episode finished (n_eps == 0)
            if (n_eps > 0 and n_goal_reached / n_eps > win_percentage
                    and len(starting_positions) > 0):
                start_changes.append(update * nsteps)
                current_starting_position = starting_positions.pop()

                runner.env.starting_position = current_starting_position
                if eval_env is not None:
                    eval_runner.env.starting_position = current_starting_position

        epinfobuf.extend(epinfos)
        if eval_env is not None:
            eval_epinfobuf.extend(eval_epinfos)

        # For each minibatch, compute the losses and append them.
        mblossvals = []
        if states is None:  # nonrecurrent version
            # Create the array of indices for the whole batch
            inds = np.arange(nbatch)
            for _ in range(noptepochs):
                # Shuffle the indices
                np.random.shuffle(inds)
                # Iterate over the batch in chunks of nbatch_train
                for start in range(0, nbatch, nbatch_train):
                    end = start + nbatch_train
                    mbinds = inds[start:end]
                    slices = (arr[mbinds]
                              for arr in (obs, returns, masks, actions, values,
                                          neglogpacs))
                    mblossvals.append(model.train(lrnow, cliprangenow,
                                                  *slices))
        else:  # recurrent version
            assert nenvs % nminibatches == 0
            envsperbatch = nenvs // nminibatches
            envinds = np.arange(nenvs)
            flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps)
            envsperbatch = nbatch_train // nsteps
            for _ in range(noptepochs):
                np.random.shuffle(envinds)
                for start in range(0, nenvs, envsperbatch):
                    end = start + envsperbatch
                    mbenvinds = envinds[start:end]
                    mbflatinds = flatinds[mbenvinds].ravel()
                    slices = (arr[mbflatinds]
                              for arr in (obs, returns, masks, actions, values,
                                          neglogpacs))
                    mbstates = states[mbenvinds]
                    mblossvals.append(
                        model.train(lrnow, cliprangenow, *slices, mbstates))

        # Feedforward --> get losses --> update
        lossvals = np.mean(mblossvals, axis=0)
        # End timer
        tnow = time.time()
        # Calculate the fps (frames per second)
        fps = int(nbatch / (tnow - tstart))
        if update % log_interval == 0 or update == 1:
            # Check whether the value function is a good predictor of the returns (ev close to 1)
            # or worse than predicting nothing (ev <= 0)
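            # explained_variance(values, returns) = 1 - Var[returns - values] / Var[returns]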
            ev = explained_variance(values, returns)
            logger.logkv("serial_timesteps", update * nsteps)
            logger.logkv("nupdates", update)
            logger.logkv("total_timesteps", update * nbatch)
            logger.logkv("fps", fps)
            logger.logkv("explained_variance", float(ev))
            logger.logkv('eprewmean',
                         safemean([epinfo['r'] for epinfo in epinfobuf]))
            logger.logkv('eplenmean',
                         safemean([epinfo['l'] for epinfo in epinfobuf]))
            logger.logkv('start_changes',
                         "_".join([str(s) for s in start_changes]))
            logger.logkv('reached_goal',
                         "_".join([str(goal) for goal in reached_goal]))
            if eval_env is not None:
                logger.logkv(
                    'eval_eprewmean',
                    safemean([epinfo['r'] for epinfo in eval_epinfobuf]))
                logger.logkv(
                    'eval_eplenmean',
                    safemean([epinfo['l'] for epinfo in eval_epinfobuf]))
            logger.logkv('time_elapsed', tnow - tfirststart)
            for (lossval, lossname) in zip(lossvals, model.loss_names):
                logger.logkv(lossname, lossval)
            if MPI is None or MPI.COMM_WORLD.Get_rank() == 0:
                logger.dumpkvs()
        if save_interval and (update % save_interval == 0
                              or update == 1) and logger.get_dir() and (
                                  MPI is None
                                  or MPI.COMM_WORLD.Get_rank() == 0):
            checkdir = osp.join(logger.get_dir(), 'checkpoints')
            os.makedirs(checkdir, exist_ok=True)
            savepath = osp.join(checkdir, '%.5i' % update)
            print('Saving to', savepath)
            model.save(savepath)
    return model
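# A hedged driver sketch for the curriculum variant of `learn` above; it is
# not part of the original snippet. `make_vec_mountaincar` stands in for
# whatever helper builds a vectorized MountainCar-v0 environment exposing a
# settable `starting_position` attribute, and every hyperparameter value is
# purely illustrative.
def run_curriculum(make_vec_mountaincar):
    env = make_vec_mountaincar(num_envs=8)
    # `learn` pops one starting position immediately and another one each
    # time the rolling goal-reaching rate exceeds `win_percentage`.
    starting_positions = [-0.5, -0.4, -0.3]
    return learn(network='mlp',
                 env=env,
                 env_name='MountainCar-v0',
                 seed=0,
                 nsteps=128,
                 total_timesteps=1_000_000,
                 ent_coef=0.01,
                 lr=lambda f: 2.5e-4 * f,  # linear decay, see the docstring
                 nminibatches=4,
                 noptepochs=4,
                 cliprange=0.1,
                 win_percentage=0.5,
                 starting_positions=starting_positions)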
    def train(self):
        # params = self.value_fun._params
        videos = []
        contours = []
        returns = []
        delay_cs = []
        fig = None
        for itr in range(self.max_itr):
            itr_starttime = time.time()
            self.value_fun_update()
            itr_time = time.time() - itr_starttime
            log = itr % self.log_itr == 0 or itr == self.max_itr - 1
            render = (itr % self.render_itr == 0) and self.render
            if log:
                rollout_starttime = time.time()
                average_return, avg_delay_cost, video = rollout(
                    self.env,
                    self.policy,
                    num_rollouts=self.num_rollouts,
                    render=render,
                    iteration=itr,
                    max_path_length=self.max_path_length)
                rollout_time = time.time() - rollout_starttime
                if render:
                    # contour, fig = plot_contour(self.env, self.value_fun, fig=fig, iteration=itr)
                    # contours += [contour]
                    videos += video
                returns.append(average_return)
                delay_cs.append(avg_delay_cost)
                logger.logkv('Iteration', itr)
                logger.logkv('Average Returns', average_return)
                logger.logkv('Average Delayed Costs', avg_delay_cost)
                logger.logkv('Iteration Time', itr_time)
                logger.logkv('Policy Rollout Time', rollout_time)
                logger.dumpkvs()

        plot_returns(returns)
        plot_returns(delay_cs, 'delayed_cost')
        # plot_contour(self.env, self.value_fun, save=True, fig=fig)

        # if contours and contours[0] is not None:
        #     contours = list(upsample(np.array(contours), 10))
        #     clip = mpy.ImageSequenceClip(contours, fps=10)
        #     clip.write_videofile('%s/contours_progress.mp4' % logger.get_dir())

        if videos:
            fps = int(4 / getattr(self.env, 'dt', 0.1))
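            # One frame spans dt simulated seconds, so fps = 4 / dt plays back
            # roughly 4 simulated seconds per real second (dt = 0.1 -> 40 fps).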
            clip = mpy.ImageSequenceClip(videos, fps=fps)
            clip.write_videofile('%s/learning_progress.mp4' % logger.get_dir())

        itr = self.max_itr
        average_return, avg_delay_cost, final_itr_video = rollout(
            self.env,
            self.policy,
            num_rollouts=2,
            render=True,
            iteration=itr,
            last_max_path_length=self.last_max_path_length,
            last_iteration=True)

        final_clip = mpy.ImageSequenceClip(final_itr_video, fps=40)
        final_clip.write_videofile('%s/final_rollout.mp4' % logger.get_dir())
        plt.close()
Example #26
0
def eval(variant):
    # num_data_traj = variant['num_data_trajectories']
    num_data_traj = 50
    env_name = variant['env_name']
    data_trajectories = get_data()
    env = get_env_from_name(env_name)
    env_params = variant['env_params']
    max_ep_steps = env_params['max_ep_steps']
    policy_params = variant['alg_params']
    s_dim = env.observation_space.shape[0]
    print("observation_space = ", s_dim)
    a_dim = env.action_space.shape[0]
    print("action space = ", a_dim)
    a_upperbound = env.action_space.high
    print("upper bound =", a_upperbound)
    a_lowerbound = env.action_space.low
    print("lower bound = ", a_lowerbound)
    policy = CAC(a_dim, s_dim, policy_params)
    ref_s = env.reference_state
    log_path = variant['log_path'] + '/eval/' + str(0)
    logger.configure(dir=log_path, format_strs=['csv'])
    policy.restore(variant['log_path'] + '/' + str(0) + '/policy')

    # Training setting
    t1 = time.time()
    PLOT_theta_1 = []
    PLOT_ground_theta_1 = []
    PLOT_theta_2 = []
    PLOT_ground_theta_2 = []
    state_storage = StateStorage()
    mst = []
    agent_traj = []
    ground_traj = []

    reward_traj = []
    for i in tqdm(range(num_data_traj)):
        if (i >= 10):
            break
        j = i * len(data_trajectories) // num_data_traj
        print(j)
        traj = data_trajectories[j]

        env.reset()
        cost = 0

        # s = traj[0, 1]
        s = traj[0, -8:]
        # PLOT_state = np.array([s])
        # s = np.array([s, traj[0, 2], traj[0, 4]])
        s = np.array(list(s) + [traj[0, 2]] + list(traj[1, -8:]))
        # print("initial state : ", s)
        print("action here is : ", [traj[0, 5], traj[0, 6]])
        env.state = s
        env.model.state = traj[0, -8:]
        # env.state = env.model.state

        ep_steps = len(traj)
        for j in range(1, ep_steps):
            # if j%100 == 0:
            #     env.reset()
            #     s = np.array(list(traj[j-1, -8:]) + [traj[j,2]] + list(traj[j,-8:]))
            #     # s = traj[j-1,-8:]
            #     env.state = s
            #     env.model.state = traj[j-1, -8:]
            s = env.state
            # if agent_traj == []:
            #     agent_traj = [s[0]]
            # else:
            #     agent_traj = np.concatenate((agent_traj, [s[0]]),axis=0)

            # if ground_traj == []:
            #     ground_traj = [traj[j-1,1]]
            # else:
            #     ground_traj = np.concatenate((ground_traj, [traj[j-2,4]]),axis=0)
            # print(traj[j,1], s[2])
            delta = np.zeros(s.shape)
            # ###### NOISE ##############

            # noise = np.random.normal(0, 0.001, 16)
            # delta[20:]= noise

            # ###### BIAS ##############

            # noise = s[0:16]*0.005
            # delta[0:16] = noise
            # store_s = s.copy()
            # store_s[2] = store_s[2] - store_s[0]
            # a = policy.choose_action(store_s + delta, True)
            a = policy.choose_action(s / ref_s + delta, True)
            # a = policy.choose_action(s + delta, True)
            # print(a)
            action = a_lowerbound + (a + 1.) * (a_upperbound -
                                                a_lowerbound) / 2
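            # The policy emits `a` in [-1, 1]; this affine map rescales it to
            # the env's action box: a = -1 -> a_lowerbound, a = +1 -> a_upperbound.
            # E.g. with bounds [0, 10], a = 0.2 maps to 0 + 1.2 * 10 / 2 = 6.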
            # print(action)

            s_, r, done, X_ = env.step(action, traj[j, 2], traj[j, 1])
            # _, r, done , X_= env.step(action, True)
            # print(r)
            # The new s = current state, next omega, next state
            # s_ = np.array([X_[1,0], traj[j, 2], traj[j, 4]])
            s_ = np.array(list(s_) + [traj[j + 1, 2]] + list(traj[j + 1, -8:]))
            # s_ = np.array([traj[j,1], traj[j,2], traj[j,4]])
            r = modify_reward(r, s, s_, variant['reward_id'])
            # s_ = np.array([traj[j,1], traj[j,2], traj[j,4]])
            if (j % 51 == 0):
                # print("X predicted ", X_, " and actual: ", traj[j-1, 4])
                print("predicted action : ", action, ", reward : ", r)

            if agent_traj == []:
                # agent_traj = [s_[0]]
                agent_traj = [X_[1, 0]]
            else:
                # agent_traj = np.concatenate((agent_traj, [s_[0]]),axis=0)
                agent_traj = np.concatenate((agent_traj, [X_[1, 0]]), axis=0)

            if ground_traj == []:
                # ground_traj = [s[2]]
                ground_traj = [traj[j, 1]]
            else:
                # ground_traj = np.concatenate((ground_traj, [s[2]]),axis=0)
                ground_traj = np.concatenate((ground_traj, [traj[j, 1]]),
                                             axis=0)
            env.state = s_
            theta = action
            PLOT_theta_1.append(theta[0])
            PLOT_ground_theta_1.append(traj[j, 5])
            PLOT_theta_2.append(theta[1])
            PLOT_ground_theta_2.append(traj[j, 6])
            mst.append(np.linalg.norm(traj[j, 5] - theta[0]))
            state_storage.update(predicted_state=s_[:8], original_state=s[-8:])
            reward_traj.append(r)

            # PLOT_state = np.vstack((PLOT_state, np.array([X_[1,0]])))

            logger.logkv('rewards', r)
            logger.logkv('timestep', j)
            logger.logkv('total-length', ep_steps)
            logger.logkv('state', s)
            logger.logkv('predicted-output', X_[1, 0])
            logger.logkv('predicted-action', action)
            logger.logkv('actual-action', [traj[j, 5], traj[j, 6]])
            logger.logkv('action-error', np.linalg.norm(traj[j, 5:7] - theta))
            # logger.logkv('output-error', np.linalg.norm(s[0] - traj[j-1,1]))
            logger.dumpkvs()

            cost = cost + r
            if j == len(traj) - 2:
                done = True

            s = s_

            if done:
                #print('episode:', i,'trajectory_number:',traj_num,'total_cost:',cost,'steps:',j-start_point)
                break
    x = np.linspace(0,
                    np.shape(PLOT_ground_theta_1)[0] - 1,
                    np.shape(PLOT_ground_theta_1)[0])
    # plt.plot(x, PLOT_theta_1, color='blue', label='Tracking')
    # plt.plot(x, PLOT_ground_theta_1, color='black', linestyle='--', label='Ground truth')
    # plt.show()
    plt.style.use('seaborn')
    with h5py.File(variant['log_path'] + '/' + 'CAC_theta.h5', 'w') as hdf:
        hdf.create_dataset('Data', data=PLOT_theta_1)
    with h5py.File(variant['log_path'] + '/' + 'Normal_theta_ground.h5',
                   'w') as hdf:
        hdf.create_dataset('Data', data=PLOT_ground_theta_1)
    with h5py.File(variant['log_path'] + '/' + 'CAC_track.h5', 'w') as hdf:
        hdf.create_dataset('Data', data=agent_traj)
    with h5py.File(variant['log_path'] + '/' + 'GT_track.h5', 'w') as hdf:
        hdf.create_dataset('Data', data=ground_traj)

    fig = plt.figure()
    plt.plot(x,
             PLOT_theta_1,
             linestyle='--',
             color='blue',
             label='Tracking',
             marker='o',
             markersize=1)
    plt.plot(x,
             PLOT_ground_theta_1,
             color='orange',
             linestyle='--',
             label='Ground truth',
             marker='.',
             markersize=3)
    plt.ylim(2000, 8000)
    plt.xlabel('time')
    plt.ylabel('Qmax')
    plt.legend(loc="upper right", markerscale=3., scatterpoints=1, fontsize=10)
    plt.savefig(variant['log_path'] + '/action_tracking_1.jpg')

    fig = plt.figure()
    plt.plot(x,
             PLOT_theta_2,
             linestyle='--',
             color='blue',
             label='Tracking',
             marker='o',
             markersize=1)
    plt.plot(x,
             PLOT_ground_theta_2,
             color='orange',
             linestyle='--',
             label='Ground Truth',
             marker='.',
             markersize=3)
    plt.ylim(0.10, 0.20)
    plt.xlabel('time')
    plt.ylabel('Ro')
    plt.legend(loc="upper right", markerscale=3., scatterpoints=1, fontsize=10)
    plt.savefig(variant['log_path'] + '/action_tracking_2.jpg')

    fig = plt.figure()
    plt.plot(x,
             agent_traj,
             linestyle='--',
             color='blue',
             label='Tracking',
             marker='o',
             markersize=1)
    plt.plot(x,
             ground_traj,
             color='orange',
             linestyle='--',
             label='Ground Truth',
             marker='.',
             markersize=3)
    plt.xlabel('time')
    plt.ylabel('Voltage (V)')
    plt.legend(loc="upper right", markerscale=3., scatterpoints=1, fontsize=10)
    plt.savefig(variant['log_path'] + '/output_tracking.jpg')

    fig = plt.figure()
    plt.plot(np.array(reward_traj),
             np.square(agent_traj - ground_traj),
             linestyle='',
             marker='.',
             markersize=3)
    plt.scatter(np.array(reward_traj), np.square(agent_traj - ground_traj))
    plt.xlabel("reward")
    plt.ylabel("error")
    plt.legend(loc="upper right", markerscale=3., scatterpoints=1, fontsize=10)
    plt.savefig(variant['log_path'] + '/reward_vs_error.jpg')

    state_storage.plot_states(outpath=variant['log_path'])
    return
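# A hedged sketch of the `variant` dictionary that eval() above reads. Only
# the keys mirror the code; every value is an illustrative placeholder, not
# the authors' configuration.
variant_example = {
    'env_name': 'battery',             # assumed name; any env exposing reference_state works
    'env_params': {'max_ep_steps': 200},
    'alg_params': {},                  # CAC hyperparameters, forwarded to CAC(a_dim, s_dim, ...)
    'reward_id': 0,
    'num_data_trajectories': 50,
    'log_path': './log/cac_run',       # must contain '<log_path>/0/policy' saved by training
}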
Example #27
0
def param_variation(variant):
    env_name = variant['env_name']
    env = get_env_from_name(env_name)
    env_params = variant['env_params']

    eval_params = variant['eval_params']
    policy_params = variant['alg_params']
    policy_params.update({
        's_bound': env.observation_space,
        'a_bound': env.action_space,
    })
    disturber_params = variant['disturber_params']
    build_func = get_policy(variant['algorithm_name'])
    s_dim = env.observation_space.shape[0]
    a_dim = env.action_space.shape[0]
    d_dim = env_params['disturbance dim']

    policy = build_func(a_dim, s_dim, d_dim, policy_params)
    # disturber = Disturber(d_dim, s_dim, disturber_params)

    param_variable = eval_params['param_variables']
    grid_eval_param = eval_params['grid_eval_param']
    length_of_pole, mass_of_pole, mass_of_cart, gravity = env.get_params()

    log_path = variant['log_path'] + '/eval'

    if eval_params['grid_eval']:

        param1 = grid_eval_param[0]
        param2 = grid_eval_param[1]
        log_path = log_path + '/' + param1 + '-' + param2
        logger.configure(dir=log_path, format_strs=['csv'])
        logger.logkv('num_of_paths', variant['eval_params']['num_of_paths'])
        for var1 in param_variable[param1]:
            if param1 == 'length_of_pole':
                length_of_pole = var1
            elif param1 == 'mass_of_pole':
                mass_of_pole = var1
            elif param1 == 'mass_of_cart':
                mass_of_cart = var1
            elif param1 == 'gravity':
                gravity = var1

            for var2 in param_variable[param2]:
                if param2 == 'length_of_pole':
                    length_of_pole = var2
                elif param2 == 'mass_of_pole':
                    mass_of_pole = var2
                elif param2 == 'mass_of_cart':
                    mass_of_cart = var2
                elif param2 == 'gravity':
                    gravity = var2

                env.set_params(mass_of_pole=mass_of_pole,
                               length=length_of_pole,
                               mass_of_cart=mass_of_cart,
                               gravity=gravity)
                diagnostic_dict, _ = evaluation(variant, env, policy)

                string_to_print = [
                    param1, ':',
                    str(round(var1, 2)), '|', param2, ':',
                    str(round(var2, 2)), '|'
                ]
                [
                    string_to_print.extend(
                        [key, ':',
                         str(round(diagnostic_dict[key], 2)), '|'])
                    for key in diagnostic_dict.keys()
                ]
                print(''.join(string_to_print))

                logger.logkv(param1, var1)
                logger.logkv(param2, var2)
                [
                    logger.logkv(key, diagnostic_dict[key])
                    for key in diagnostic_dict.keys()
                ]
                logger.dumpkvs()
    else:
        for param in param_variable.keys():
            logger.configure(dir=log_path + '/' + param, format_strs=['csv'])
            logger.logkv('num_of_paths',
                         variant['eval_params']['num_of_paths'])
            env.reset_params()
            for var in param_variable[param]:
                if param == 'length_of_pole':
                    length_of_pole = var
                elif param == 'mass_of_pole':
                    mass_of_pole = var
                elif param == 'mass_of_cart':
                    mass_of_cart = var
                elif param == 'gravity':
                    gravity = var

                env.set_params(mass_of_pole=mass_of_pole,
                               length=length_of_pole,
                               mass_of_cart=mass_of_cart,
                               gravity=gravity)
                diagnostic_dict, _ = evaluation(variant, env, policy)

                string_to_print = [param, ':', str(round(var, 2)), '|']
                [
                    string_to_print.extend(
                        [key, ':',
                         str(round(diagnostic_dict[key], 2)), '|'])
                    for key in diagnostic_dict.keys()
                ]
                print(''.join(string_to_print))

                logger.logkv(param, var)
                [
                    logger.logkv(key, diagnostic_dict[key])
                    for key in diagnostic_dict.keys()
                ]
                logger.dumpkvs()
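# A hedged sketch of the `eval_params` block that param_variation above
# expects for a grid sweep; the parameter ranges are illustrative
# placeholders and only the keys mirror what the code reads.
eval_params_example = {
    'grid_eval': True,
    'grid_eval_param': ['length_of_pole', 'mass_of_cart'],
    'param_variables': {
        'length_of_pole': [0.4, 0.5, 0.6],
        'mass_of_cart': [0.8, 1.0, 1.2],
    },
    'num_of_paths': 10,
}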
def train(log_dir):
    """Performs the agent traning.

    Args:
        log_dir (str): The directory in which the final model (policy) and the
        log data is saved.
    """

    # Create environment
    env = get_env_from_name(ENV_NAME, ENV_SEED)

    # Set initial learning rates
    lr_a, lr_l = (
        ALG_PARAMS["lr_a"],
        ALG_PARAMS["lr_l"],
    )
    lr_a_now = ALG_PARAMS["lr_a"]  # learning rate for actor, lambda and alpha
    lr_l_now = ALG_PARAMS["lr_l"]  # learning rate for lyapunov critic

    # Get observation and action space dimension and limits from the environment
    s_dim = env.observation_space.shape[0]
    a_dim = env.action_space.shape[0]
    a_upperbound = env.action_space.high
    a_lowerbound = env.action_space.low

    # Create the Lyapunov Actor Critic agent
    policy = LAC(a_dim, s_dim, log_dir=log_dir)

    # Create replay memory buffer
    pool = Pool(
        s_dim=s_dim,
        a_dim=a_dim,
        store_last_n_paths=TRAIN_PARAMS["num_of_training_paths"],
        memory_capacity=ALG_PARAMS["memory_capacity"],
        min_memory_size=ALG_PARAMS["min_memory_size"],
    )

    # Training setting
    t1 = time.time()
    global_step = 0
    tb_step = 0
    last_training_paths = deque(maxlen=TRAIN_PARAMS["num_of_training_paths"])
    training_started = False

    # Create tensorboard variables
    tb_lr_a = tf.Variable(lr_a, dtype=tf.float32)
    tb_lr_l = tf.Variable(lr_l, dtype=tf.float32)
    tb_lr_lag = tf.Variable(lr_a, dtype=tf.float32)
    tb_ret = tf.Variable(0, dtype=tf.float32)
    tb_len = tf.Variable(0, dtype=tf.float32)
    tb_a_loss = tf.Variable(0, dtype=tf.float32)
    tb_lyapunov_error = tf.Variable(0, dtype=tf.float32)
    tb_entropy = tf.Variable(0, dtype=tf.float32)

    # Initialize tensorboard variables and create summaries
    if USE_TB:
        policy.sess.run(
            [
                tb_lr_a.initializer,
                tb_lr_l.initializer,
                tb_lr_lag.initializer,
                tb_ret.initializer,
                tb_len.initializer,
                tb_a_loss.initializer,
                tb_lyapunov_error.initializer,
                tb_entropy.initializer,
            ]
        )

        # Add tensorboard summaries
        main_sum = tf.compat.v1.summary.merge(
            [
                tf.compat.v1.summary.scalar("lr_a", tb_lr_a),
                tf.compat.v1.summary.scalar("lr_l", tb_lr_l),
                tf.compat.v1.summary.scalar("lr_lag", tb_lr_lag),
                tf.compat.v1.summary.scalar("alpha", policy.alpha),
                tf.compat.v1.summary.scalar("lambda", policy.labda),
            ]
        )
        other_sum = tf.compat.v1.summary.merge(
            [
                tf.compat.v1.summary.scalar("ep_ret", tb_ret),
                tf.compat.v1.summary.scalar("ep_length", tb_len),
                tf.compat.v1.summary.scalar("a_loss", tb_a_loss),
                tf.compat.v1.summary.scalar("lyapunov_error", tb_lyapunov_error),
                tf.compat.v1.summary.scalar("entropy", tb_entropy),
            ]
        )
        policy.tb_writer.add_summary(
            policy.sess.run(main_sum), policy.sess.run(policy.step)
        )
        if WRITE_W_B:
            policy.tb_writer.add_summary(
                policy.sess.run(policy.w_b_sum), policy.sess.run(policy.step),
            )
        policy.tb_writer.flush()  # Above summaries are known from the start

    # Setup logger and log hyperparameters
    logger.configure(dir=log_dir, format_strs=["csv"])
    logger.logkv("tau", ALG_PARAMS["tau"])
    logger.logkv("alpha3", ALG_PARAMS["alpha3"])
    logger.logkv("batch_size", ALG_PARAMS["batch_size"])
    logger.logkv("target_entropy", policy.target_entropy)

    # Training loop
    for i in range(ENV_PARAMS["max_episodes"]):

        # Create variable to store information about the current path
        current_path = {
            "rewards": [],
            "a_loss": [],
            "alpha": [],
            "lambda": [],
            "lyapunov_error": [],
            "entropy": [],
        }

        # Stop training if max number of steps has been reached
        if global_step > ENV_PARAMS["max_global_steps"]:
            break

        # Reset environment
        s = env.reset()

        # Training Episode loop
        for j in range(ENV_PARAMS["max_ep_steps"]):

            # Render environment if requested
            if ENV_PARAMS["eval_render"]:
                env.render()

            # Retrieve (scaled) action based on the current policy
            a = policy.choose_action(s)
            # a = np.squeeze(np.random.uniform(low=-1.0, high=1.0, size=(1, 2)))  # DEBUG
            action = a_lowerbound + (a + 1.0) * (a_upperbound - a_lowerbound) / 2

            # Perform action in env
            s_, r, done, _ = env.step(action)

            # Increment global step count
            if training_started:
                global_step += 1

            # Stop episode if max_steps has been reached
            if j == ENV_PARAMS["max_ep_steps"] - 1:
                done = True
            terminal = 1.0 if done else 0.0

            # Store experience in replay buffer
            pool.store(s, a, r, terminal, s_)

            # Increment tensorboard step counter
            # NOTE: This counter is kept separate from global_step because
            # otherwise the tensorboard log became inconsistent.
            if USE_TB:
                tb_step += 1

            # Optimize weights and parameters using STG
            if (
                pool.memory_pointer > ALG_PARAMS["min_memory_size"]
                and global_step % ALG_PARAMS["steps_per_cycle"] == 0
            ):
                training_started = True

                # Perform STG a set number of times (train per cycle)
                for _ in range(ALG_PARAMS["train_per_cycle"]):
                    batch = pool.sample(ALG_PARAMS["batch_size"])
                    labda, alpha, l_loss, entropy, a_loss = policy.learn(
                        lr_a_now, lr_l_now, lr_a, batch
                    )

            # Save path results
            if training_started:
                current_path["rewards"].append(r)
                current_path["lyapunov_error"].append(l_loss)
                current_path["alpha"].append(alpha)
                current_path["lambda"].append(labda)
                current_path["entropy"].append(entropy)
                current_path["a_loss"].append(a_loss)

            # Evaluate the current performance and log results
            if (
                training_started
                and global_step % TRAIN_PARAMS["evaluation_frequency"] == 0
                and global_step > 0
            ):
                logger.logkv("total_timesteps", global_step)
                training_diagnostics = evaluate_training_rollouts(last_training_paths)
                if training_diagnostics is not None:
                    if TRAIN_PARAMS["num_of_evaluation_paths"] > 0:
                        eval_diagnostics = training_evaluation(env, policy)
                        [
                            logger.logkv(key, eval_diagnostics[key])
                            for key in eval_diagnostics.keys()
                        ]
                        training_diagnostics.pop("return")
                    [
                        logger.logkv(key, training_diagnostics[key])
                        for key in training_diagnostics.keys()
                    ]
                    logger.logkv("lr_a", lr_a_now)
                    logger.logkv("lr_l", lr_l_now)
                    string_to_print = ["time_step:", str(global_step), "|"]
                    if TRAIN_PARAMS["num_of_evaluation_paths"] > 0:
                        [
                            string_to_print.extend(
                                [key, ":", str(eval_diagnostics[key]), "|"]
                            )
                            for key in eval_diagnostics.keys()
                        ]
                    [
                        string_to_print.extend(
                            [key, ":", str(round(training_diagnostics[key], 2)), "|"]
                        )
                        for key in training_diagnostics.keys()
                    ]
                    print("".join(string_to_print))
                logger.dumpkvs()

            # Update state
            s = s_

            # Decay learning rate
            if done:

                # Store paths
                if training_started:
                    last_training_paths.appendleft(current_path)

                    # Get current model performance for tb
                    if USE_TB:
                        training_diagnostics = evaluate_training_rollouts(
                            last_training_paths
                        )

                # Log tb variables
                if USE_TB:
                    if i % TB_FREQ == 0:

                        # Update and log learning rate tb vars
                        policy.sess.run(policy.step.assign(tb_step))
                        policy.sess.run(tb_lr_a.assign(lr_a_now))
                        policy.sess.run(tb_lr_l.assign(lr_l_now))
                        policy.sess.run(tb_lr_lag.assign(lr_a))
                        policy.tb_writer.add_summary(
                            policy.sess.run(main_sum), policy.sess.run(policy.step)
                        )

                        # Update and log other training vars to tensorboard
                        if training_started:

                            # Update and log training vars
                            policy.sess.run(
                                tb_ret.assign(training_diagnostics["return"])
                            )
                            policy.sess.run(
                                tb_len.assign(training_diagnostics["length"])
                            )
                            policy.sess.run(
                                tb_a_loss.assign(training_diagnostics["a_loss"])
                            )
                            policy.sess.run(
                                tb_lyapunov_error.assign(
                                    training_diagnostics["lyapunov_error"]
                                )
                            )
                            policy.sess.run(
                                tb_entropy.assign(training_diagnostics["entropy"])
                            )
                            policy.tb_writer.add_summary(
                                policy.sess.run(other_sum), policy.sess.run(policy.step)
                            )

                            # Log network weights
                            if WRITE_W_B:
                                policy.tb_writer.add_summary(
                                    policy.sess.run(policy.w_b_sum),
                                    policy.sess.run(policy.step),
                                )
                        policy.tb_writer.flush()

                # Decay learning rates
                frac = 1.0 - (global_step - 1.0) / ENV_PARAMS["max_global_steps"]
                lr_a_now = lr_a * frac  # learning rate for actor, lambda, alpha
                lr_l_now = lr_l * frac  # learning rate for lyapunov critic
                break

    # Save model and print Running time
    policy.save_result(log_dir)
    # policy.tb_writer.close()
    print("Running time: ", time.time() - t1)
    return
Example #29
0
def dynamic(variant):
    env_name = variant['env_name']
    env = get_env_from_name(env_name)

    eval_params = variant['eval_params']
    policy_params = variant['alg_params']
    build_func = get_policy(variant['algorithm_name'])
    if 'Fetch' in env_name or 'Hand' in env_name:
        s_dim = env.observation_space.spaces['observation'].shape[0] \
                + env.observation_space.spaces['achieved_goal'].shape[0] + \
                env.observation_space.spaces['desired_goal'].shape[0]
    else:
        s_dim = env.observation_space.shape[0]
    a_dim = env.action_space.shape[0]
    policy = build_func(a_dim, s_dim, policy_params)
    # disturber = Disturber(d_dim, s_dim, disturber_params)

    log_path = variant['log_path'] + '/eval/dynamic/' + eval_params[
        'additional_description']
    variant['eval_params'].update({'magnitude': 0})
    logger.configure(dir=log_path, format_strs=['csv'])

    _, paths = evaluation(variant, env, policy)
    max_len = 0
    for path in paths['s']:
        path_length = len(path)
        if path_length > max_len:
            max_len = path_length
    average_path = np.average(np.array(paths['s']), axis=0)
    std_path = np.std(np.array(paths['s']), axis=0)

    for i in range(max_len):
        logger.logkv('average_path', average_path[i])
        logger.logkv('std_path', std_path[i])
        logger.logkv('reference', paths['reference'][0][i])
        logger.dumpkvs()
    if eval_params['directly_show']:
        fig = plt.figure(figsize=(9, 6))
        ax = fig.add_subplot(111)

        if eval_params['plot_average']:
            t = range(max_len)
            ax.plot(t, average_path, color='red')
            # if env_name =='cartpole_cost':
            #     ax.fill_between(t, (average_path - std_path)[:, 0], (average_path + std_path)[:, 0],
            #                     color='red', alpha=.1)
            # else:
            ax.fill_between(t,
                            average_path - std_path,
                            average_path + std_path,
                            color='red',
                            alpha=.1)
        else:
            for path in paths['s']:
                path_length = len(path)
                t = range(path_length)
                path = np.array(path)

                # ax.plot(t, path)
                ax.plot(t, path, color='red')

                #MJS
                # ax.plot(t, path[:, 0], color='red')
                # ax.plot(t, path[:, 1], color='blue')

                # ax.plot(t, path[:,0],label='mRNA 1')
                # ax.plot(t, path[:, 1], label='mRNA 2')
                # ax.plot(t, path[:, 2], label='mRNA 3')
                # ax.plot(t, path[:, 3], label='Protein 1')
                # ax.plot(t, path[:, 4], label='Protein 2')
                # ax.plot(t, path[:, 5], label='Protein 3')

                # oscillator (complicated)

                # ax.plot(t, path[:, 0],label='mRNA 1')
                # ax.plot(t, path[:, 1], label='mRNA 2')
                # ax.plot(t, path[:, 2], label='mRNA 3')
                # ax.plot(t, path[:, 3], label='mRNA 4')
                # ax.plot(t, path[:, 4], label='Protein 1')
                # ax.plot(t, path[:, 5], label='Protein 2')
                # ax.plot(t, path[:, 6], label='Protein 3')
                # ax.plot(t, path[:, 7], label='Protein 4')

                if path_length > max_len:
                    max_len = path_length
            # MJS
            # plt.ylim(-1000, 1000)
            # ax.plot(t, path[:, 0], color='red', label='s 1')
            # ax.plot(t, path[:, 1], color='blue', label='s 2')

            # cartpole
            # ax.plot(t, path, color='red', label='theta')
            # oscillator
            # ax.plot(t, path, color='red', label='Protein 1')
            # ax.plot(t, paths['reference'][0], color='blue', label='Reference')
            handles, labels = ax.get_legend_handles_labels()
            ax.legend(handles,
                      labels,
                      fontsize=20,
                      loc=2,
                      fancybox=False,
                      shadow=False)
        # if 'reference' in paths.keys():
        #     for path in paths['reference']:
        #         path_length = len(path)
        #         if path_length == max_len:
        #             t = range(path_length)
        #
        #             ax.plot(t, path, color='brown',linestyle='dashed', label='refernce')
        #             break
        #         else:
        #             continue
        #
        #     handles, labels = ax.get_legend_handles_labels()
        #     ax.legend(handles, labels, fontsize=20, loc=2, fancybox=False, shadow=False)
        plt.savefig(env_name + '-' + variant['algorithm_name'] +
                    '-dynamic-state.pdf')
        plt.show()
        if 'c' in paths.keys():
            fig = plt.figure(figsize=(9, 6))
            ax = fig.add_subplot(111)
            for path in paths['c']:
                t = range(len(path))
                ax.plot(t, path)
            plt.savefig(env_name + '-' + variant['algorithm_name'] +
                        '-dynamic-cost.pdf')
            plt.show()
        if 'v' in paths.keys():
            fig = plt.figure(figsize=(9, 6))
            ax = fig.add_subplot(111)
            for path in paths['v']:
                t = range(len(path))
                ax.plot(t, path)
            plt.savefig(env_name + '-' + variant['algorithm_name'] +
                        '-dynamic-value.pdf')
            plt.show()
        return
Example #30
0
def learn(seed,
          policy,
          env,
          nsteps,
          total_timesteps,
          ent_coef,
          lr,
          vf_coef=0.5,
          max_grad_norm=0.5,
          gamma=0.99,
          lam=0.95,
          nminibatches=4,
          noptepochs=4,
          cliprange=0.1,
          next_n=10,
          nslupdates=10,
          seq_len=10,
          ext_coef=1,
          int_coef=0.1,
          K=10):

    rng = np.random.RandomState(seed)
    total_timesteps = int(total_timesteps)

    nenvs = env.num_envs
    ob_space = env.observation_space
    loc_space = 2
    ac_space = env.action_space
    nbatch = nenvs * nsteps
    nbatch_train = nbatch // nminibatches
    nbatch_sl_train = nenvs * seq_len // nminibatches

    make_model = lambda: Model(policy=policy,
                               ob_space=ob_space,
                               loc_space=loc_space,
                               ac_space=ac_space,
                               nbatch_act=nenvs,
                               nbatch_train=nbatch_train,
                               nbatch_sl_train=nbatch_sl_train,
                               nsteps=nsteps,
                               ent_coef=ent_coef,
                               vf_coef=vf_coef,
                               max_grad_norm=max_grad_norm,
                               seq_len=seq_len,
                               seed=seed)
    model = make_model()

    replay_buffer = Buffer(max_size=1000, seed=seed)
    runner = Runner(env=env,
                    model=model,
                    nsteps=nsteps,
                    gamma=gamma,
                    lam=lam,
                    next_n=next_n,
                    seq_len=seq_len,
                    int_coef=int_coef,
                    ext_coef=ext_coef,
                    replay_buffer=replay_buffer,
                    seed=seed)
    episode_raw_stats = EpisodeStats(nsteps, nenvs)
    episode_stats = EpisodeStats(nsteps, nenvs)
    tfirststart = time.time()
    nupdates = total_timesteps // nbatch
    sl_acc = 0
    p = 0
    for update in range(1, nupdates + 1):
        assert nbatch % nminibatches == 0
        p = update * nbatch / (total_timesteps * 0.875)
        nbatch_train = nbatch // nminibatches
        tstart = time.time()
        obs, locs, goals, raw_rewards, rewards, returns, masks, rnn_masks, actions, values, neglogpacs, states = runner.run(
            K, p)
        episode_raw_stats.feed(raw_rewards, masks)
        episode_stats.feed(rewards, masks)
        mblossvals = []
        assert nenvs % nminibatches == 0
        envsperbatch = nenvs // nminibatches
        envinds = np.arange(nenvs)
        flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps)
        envsperbatch = nbatch_train // nsteps
        for _ in range(noptepochs):
            rng.shuffle(envinds)
            for start in range(0, nenvs, envsperbatch):
                end = start + envsperbatch
                mbenvinds = envinds[start:end]
                mbflatinds = flatinds[mbenvinds].ravel()
                slices = (arr[mbflatinds]
                          for arr in (obs, locs, goals, returns, rnn_masks,
                                      actions, values, neglogpacs))
                mbstates = states[mbenvinds]
                mblossvals.append(model.train(lr, cliprange, *slices,
                                              mbstates))

        if nslupdates > 0 and sl_acc < 0.75:
            sl_acc, sl_loss = sl_train(model,
                                       replay_buffer,
                                       nslupdates=nslupdates,
                                       seq_len=seq_len,
                                       nenvs=nenvs,
                                       envsperbatch=envsperbatch,
                                       lr=lr)
        elif nslupdates > 0:
            sl_acc, sl_loss = sl_train(model,
                                       replay_buffer,
                                       nslupdates=1,
                                       seq_len=seq_len,
                                       nenvs=nenvs,
                                       envsperbatch=envsperbatch,
                                       lr=lr)

        lossvals = np.mean(mblossvals, axis=0)
        tnow = time.time()
        fps = int(nbatch / (tnow - tstart))
        logger.logkv("serial_timesteps", update * nsteps)
        logger.logkv("nupdates", update)
        logger.logkv("total_timesteps", update * nbatch)
        logger.logkv("fps", fps)
        logger.logkv('episode_raw_reward', episode_raw_stats.mean_reward())
        logger.logkv('imitation_episode_reward',
                     np.mean(runner.recent_imitation_rewards))
        logger.logkv('episode_reward', episode_stats.mean_reward())
        logger.logkv('episode_success_ratio',
                     np.mean(runner.recent_success_ratio))
        logger.logkv('time_elapsed', tnow - tfirststart)
        if nslupdates > 0:
            logger.logkv('sl_loss', sl_loss)
            logger.logkv('sl_acc', sl_acc)
        logger.logkv('replay_buffer_num', replay_buffer.num_episodes())
        logger.logkv('replay_buffer_best', replay_buffer.max_reward())
        if noptepochs > 0:
            for (lossval, lossname) in zip(lossvals, model.loss_names):
                logger.logkv(lossname, lossval)
        logger.dumpkvs()
        print(logger.get_dir())
    env.close()
    return model
Example #31
0
def train(variant):
    Min_cost = 1000000

    data_trajectories = get_data()  # get data (X, W, X_, theta, state)
    env_name = variant['env_name']  # choose your environment
    env = get_env_from_name(env_name)

    num_data_traj = variant['num_data_trajectories']
    reward_id = variant['reward_id']
    env_params = variant['env_params']
    max_episodes = env_params[
        'max_episodes']  # maximum episodes for RL training
    max_ep_steps = env_params[
        'max_ep_steps']  # number of maximum steps in each episode
    max_global_steps = env_params['max_global_steps']
    store_last_n_paths = variant['store_last_n_paths']
    evaluation_frequency = variant['evaluation_frequency']

    policy_params = variant['alg_params']

    min_memory_size = policy_params['min_memory_size']
    steps_per_cycle = policy_params['steps_per_cycle']
    train_per_cycle = policy_params['train_per_cycle']
    batch_size = policy_params['batch_size']

    lr_a, lr_c, lr_l = policy_params['lr_a'], policy_params[
        'lr_c'], policy_params['lr_l']
    lr_a_now = lr_a  # learning rate for actor
    lr_c_now = lr_c  # learning rate for critic
    lr_l_now = lr_l  # learning rate for lyapunov critic

    s_dim = env.observation_space.shape[
        0]  # dimension of state (3 for Battery)

    a_dim = env.action_space.shape[0]  # action space dimension (1 or 2)
    a_upperbound = env.action_space.high
    a_lowerbound = env.action_space.low

    policy = CAC(a_dim, s_dim, policy_params)
    policy.restore(variant['log_path'] + "/0/policy")

    pool_params = {
        's_dim': s_dim,
        'a_dim': a_dim,
        'd_dim': 1,
        'store_last_n_paths': store_last_n_paths,
        'memory_capacity': policy_params['memory_capacity'],
        'min_memory_size': policy_params['min_memory_size'],
        'history_horizon': policy_params['history_horizon'],
        'finite_horizon': policy_params['finite_horizon']
    }
    if 'value_horizon' in policy_params.keys():
        pool_params.update({'value_horizon': policy_params['value_horizon']})
    else:
        pool_params['value_horizon'] = None
    pool = Pool(pool_params)
    # For analyse
    Render = env_params['eval_render']
    ref_s = env.reference_state
    # Training setting
    t1 = time.time()
    global_step = 0
    last_training_paths = deque(maxlen=store_last_n_paths)
    training_started = False

    log_path = variant['log_path']
    logger.configure(dir=log_path, format_strs=['csv'])
    logger.logkv('tau', policy_params['tau'])

    logger.logkv('alpha3', policy_params['alpha3'])
    logger.logkv('batch_size', policy_params['batch_size'])
    logger.logkv('target_entropy', policy.target_entropy)

    for i in range(max_episodes):
        print("episode # ", i)
        print("global steps ", global_step)

        current_path = {
            'rewards': [],
            'distance': [],
            'kl_divergence': [],
            'a_loss': [],
            'alpha': [],
            'lyapunov_error': [],
            'entropy': [],
            'beta': [],
            'action_distance': [],
        }

        if global_step > max_global_steps:
            break

        s = env.reset()

        # Random start point

        # traj_id = np.random.randint(0, len(data_trajectories))
        traj_id = np.random.randint(0, num_data_traj)
        # traj_id = 1
        traj = data_trajectories[traj_id]

        # print(len(traj))
        if variant['traj_start'] == "random":
            start_point = np.random.randint(0, len(traj) - 2)
        else:
            start_point = int(variant['traj_start'])
        # s = traj[start_point, 1]
        s = traj[start_point, -8:]
        # current state, theta,next w, desired state
        # this is for decision making
        # 16,1,4,16
        # s = np.array([s, traj[start_point, 2], traj[start_point, 4]])
        # print(i, s)
        s = np.array(
            list(s) + [traj[start_point, 2]] +
            list(traj[start_point + 1, -8:]))
        # print(s)
        env.state = s
        env.model.state = traj[start_point, -8:]
        # env.state = env.model.state
        # ep_steps = len(traj)
        ep_steps = min(start_point + 1 + max_ep_steps, len(traj))
        # print("selected traj = ", traj_id, " and length = ", len(traj), " starting = ", start_point, " ep_steps = ", ep_steps)
        for j in range(start_point + 1, ep_steps):
            if Render:
                env.render()
            s = env.state
            delta = np.zeros(s.shape)
            # ###### NOISE ##############

            # noise = np.random.normal(0, 0.01, 0.01)
            # delta[2:]= noise
            # ########IF Noise env##########
            # s= s + delta
            # a = policy.choose_action(s)

            # ###### BIAS ##############

            # noise = s[0:16]*0.01
            # delta[0:16] = noise

            # store_s = s.copy()
            # store_s[2] = store_s[2]-store_s[0]
            # a = policy.choose_action(store_s + delta)
            # print(s, delta)
            a = policy.choose_action(s / ref_s + delta)
            # print("a: ", a)
            action = a_lowerbound + (a + 1.) * (a_upperbound -
                                                a_lowerbound) / 2
            # action = traj[j-1,16]
            # print("a normalize: " , action)

            a_upperbound = env.action_space.high
            a_lowerbound = env.action_space.low

            # Run in simulator
            s_, r, done, X_ = env.step(action, traj[j, 2], traj[j, 1])
            # The new s = current state, next omega, next state
            s_ = np.array(list(s_) + [traj[j + 1, 2]] + list(traj[j + 1, -8:]))
            # s_ = np.array([X_[1][0], traj[j, 2], traj[j,4]])
            # s_ = np.array([traj[j, 1], traj[j, 2], traj[j,4]])
            r = modify_reward(r, s, s_, reward_id)
            # print(r)
            if global_step % 100 == 1:
                print("global step: ", global_step, " true action: ",
                      [traj[j, 5], traj[j, 6]], " predicted action: ", action,
                      " and reward : ", r)

            # print("new state is : ", s_)
            # s_ = np.concatenate([[s_], [theta]], axis=1)[0]
            # s_ = np.concatenate([X_,[[theta]], [traj[j, 9:]]], axis=1)[0]
            env.state = s_
            # store_s_ = s_.copy()
            # store_s_[2] = store_s_[2] - store_s_[0]
            # theta_pre=theta
            if training_started:
                global_step += 1

            if j == ep_steps - 2:
                done = True

            terminal = 1. if done else 0.

            if j > start_point + 2:
                pool.store(s / ref_s, a, np.zeros([1]), np.zeros([1]), r,
                           terminal, s_ / ref_s, _s / ref_s)
                # pool.store(store_s, a, np.zeros([1]), np.zeros([1]), r, terminal, store_s_, store__s)
            # policy.store_transition(s, a, disturbance, r,0, terminal, s_)

            if pool.memory_pointer > min_memory_size and global_step % steps_per_cycle == 0:
                training_started = True
                # print("learning policy")

                for _ in range(train_per_cycle):
                    batch = pool.sample(batch_size)
                    labda, alpha, beta, l_loss, entropy, a_loss, beta, action_distance, kl, distance = policy.learn(
                        lr_a_now, lr_c_now, lr_l_now, lr_a_now / 10, batch)
                    if global_step % 2000 == 1:
                        print("labda = ", labda, " | alpha = ", alpha,
                              " | beta = ", beta, " | l_loss = ", l_loss,
                              " | entropy = ", entropy, " | a_loss = ", a_loss,
                              " | action_distance = ", action_distance)
            if training_started:
                current_path['rewards'].append(r)
                current_path['distance'].append(distance)
                current_path['kl_divergence'].append(kl)
                current_path['lyapunov_error'].append(l_loss)
                current_path['alpha'].append(alpha)
                current_path['entropy'].append(entropy)
                current_path['a_loss'].append(a_loss)
                current_path['beta'].append(beta)
                current_path['action_distance'].append(action_distance)

            if training_started and global_step % evaluation_frequency == 0 and global_step > 0:

                logger.logkv("total_timesteps", global_step)

                training_diagnotic = evaluate_training_rollouts(
                    last_training_paths)
                # print(training_diagnotic)
                if training_diagnotic is not None:
                    print("doing training evaluation")
                    eval_diagnotic = training_evaluation(variant, env, policy)
                    [
                        logger.logkv(key, eval_diagnotic[key])
                        for key in eval_diagnotic.keys()
                    ]
                    training_diagnotic.pop('return')
                    [
                        logger.logkv(key, training_diagnotic[key])
                        for key in training_diagnotic.keys()
                    ]
                    logger.logkv('lr_a', lr_a_now)
                    logger.logkv('lr_c', lr_c_now)
                    logger.logkv('lr_l', lr_l_now)

                    string_to_print = ['time_step:', str(global_step), '|']
                    [
                        string_to_print.extend(
                            [key, ':', str(eval_diagnotic[key]), '|'])
                        for key in eval_diagnotic.keys()
                    ]
                    [
                        string_to_print.extend([
                            key, ':',
                            str(round(training_diagnotic[key], 2)), '|'
                        ]) for key in training_diagnotic.keys()
                    ]
                    print(''.join(string_to_print))

                logger.dumpkvs()
                if eval_diagnotic['test_return'] / eval_diagnotic[
                        'test_average_length'] <= Min_cost:
                    Min_cost = eval_diagnotic['test_return'] / eval_diagnotic[
                        'test_average_length']
                    print("New lowest cost:", Min_cost)
                    policy.save_result(log_path)
                else:
                    print("cost did not improve.")
                    print(
                        "avg cost was ", eval_diagnotic['test_return'] /
                        eval_diagnotic['test_average_length'])
                    print("prev best cost is:", Min_cost)
                    # policy.save_result(log_path)
                if training_started and global_step % (
                        10 * evaluation_frequency) == 0 and global_step > 0:
                    policy.save_result(log_path)

            # State Update
            _s = s
            s = s_
            store__s = _s.copy()
            store__s[2] = store__s[2] - store__s[0]
            # OUTPUT TRAINING INFORMATION AND LEARNING RATE DECAY
            if done:
                # print("done at ", j)
                if training_started:
                    last_training_paths.appendleft(current_path)
                frac = 1.0 - (global_step - 1.0) / max_global_steps
                lr_a_now = lr_a * frac  # learning rate for actor
                lr_c_now = lr_c * frac  # learning rate for critic
                lr_l_now = lr_l * frac  # learning rate for lyapunov critic
                break
    policy.save_result(log_path)

    print('Running time: ', time.time() - t1)
    return