Example #1
def main(env_name, num_episodes, render, VideoSave, gamma, lam, kl_targ, batch_size):
    killer = GracefulKiller()
    env, obs_dim, act_dim = init_gym(env_name, render)
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    now = datetime.utcnow().strftime("%b-%d_%H-%M-%S")  # create unique directories
    logger = Logger(logname=env_name, now=now)
    #aigym_path = os.path.join('/tmp', env_name, now)
    #env = wrappers.Monitor(env, aigym_path, force=True) 
    scaler = Scaler(obs_dim, env_name)
    scaler.resume()
    val_func = NNValueFunction(obs_dim, env_name)
    policy = Policy(obs_dim, act_dim, kl_targ, env_name)
    episode = 0
    capture = False
    while episode < num_episodes:
        if VideoSave and not capture:
            env.ScreenCapture(5)
            capture = True
        trajectories = run_policy(env, policy, scaler, logger, episodes=batch_size)
        episode += len(trajectories)
        

        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False
    logger.close()
    policy.close_sess()
    val_func.close_sess()
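
GracefulKiller is referenced in every example above but never defined here. A minimal sketch of what such a helper usually is, inferred from how kill_now is checked and reset in the loop, is a small SIGINT/SIGTERM handler:

import signal

class GracefulKiller:
    """Flip kill_now when SIGINT/SIGTERM arrives so the training loop can offer a clean exit."""
    def __init__(self):
        self.kill_now = False
        signal.signal(signal.SIGINT, self.exit_gracefully)
        signal.signal(signal.SIGTERM, self.exit_gracefully)

    def exit_gracefully(self, signum, frame):
        self.kill_now = True
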
Example #2
def main(env_name, num_episodes, render, gamma, lam, kl_targ, batch_size):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        batch_size: number of episodes per policy training batch
    """
    killer = GracefulKiller()
    env, obs_dim, act_dim = init_gym(env_name, render)
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    now = datetime.utcnow().strftime("%b-%d_%H-%M-%S")  # create unique directories
    logger = Logger(logname=env_name, now=now)

    scaler = Scaler(obs_dim, env_name)
    val_func = NNValueFunction(obs_dim, env_name)
    policy = Policy(obs_dim, act_dim, kl_targ, env_name)
    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, policy, scaler, logger, episodes=5)
    episode = 0
    #capture = False
    while episode < num_episodes:
        trajectories = run_policy(env, policy, scaler, logger, episodes=batch_size)
        episode += len(trajectories)
        """if episode > 600 and not capture:
               env.ScreenCapture(5)
               capture = True"""
        add_value(trajectories, val_func)  # add estimated values to episodes
        add_disc_sum_rew(trajectories, gamma)  # calculated discounted sum of Rs
        add_gae(trajectories, gamma, lam)  # calculate advantage
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)
        # add various stats to training log:
        log_batch_stats(observes, actions, advantages, disc_sum_rew, logger, episode)
        policy.update(observes, actions, advantages, logger)  # update policy
        val_func.fit(observes, disc_sum_rew, logger)  # update value function
        
        logger.write(display=True)  # write logger results to file and stdout
        scaler.save()
        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False
    logger.close()
    policy.close_sess()
    val_func.close_sess()
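
The helpers add_value, add_disc_sum_rew, add_gae and build_train_set are imported from elsewhere in these projects and not shown. A minimal sketch of the two reward-processing helpers, assuming each trajectory dict carries 'rewards' and 'values' arrays as in Example #3's GAE code:

import numpy as np
import scipy.signal

def discount(x, gamma):
    # discounted cumulative sum computed back-to-front (same lfilter trick as Example #4)
    return scipy.signal.lfilter([1.0], [1.0, -gamma], x[::-1])[::-1]

def add_disc_sum_rew(trajectories, gamma):
    # attach the discounted return of each step to every trajectory
    for traj in trajectories:
        traj['disc_sum_rew'] = discount(traj['rewards'], gamma)

def add_gae(trajectories, gamma, lam):
    # generalized advantage estimate built from one-step TD residuals
    for traj in trajectories:
        rewards, values = traj['rewards'], traj['values']
        deltas = rewards + gamma * np.append(values[1:], 0) - values
        traj['advantages'] = discount(deltas, gamma * lam)
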
Example #3
class Experiment:

    def __init__(self, env_name, discount, num_iterations, lamb, animate, kl_target, show):
        self.env_name = env_name
        self.env = gym.make(env_name)
        if env_name == "FetchReach-v0":
            self.env = gym.wrappers.FlattenDictWrapper(self.env, ['observation', 'desired_goal', 'achieved_goal'])
        gym.spaces.seed(1234)
        self.obs_dim = self.env.observation_space.shape[0]  # + 1  # adding time step as feature
        self.act_dim = self.env.action_space.shape[0]
        self.discount = discount
        self.num_iterations = num_iterations
        self.lamb = lamb
        self.animate = animate
        self.episodes = 20
        self.killer = GracefulKiller()
        # self.policy = ProximalPolicy(self.obs_dim, self.act_dim, self.env.action_space, kl_target, discount=discount,
        #                              lamb=lamb)
        self.policy = NoTracePolicy(self.obs_dim, self.act_dim, self.env.action_space, kl_target, epochs=20)
        # using MC return would be more helpful
        self.value_func = l2TargetValueFunc(self.obs_dim, epochs=10)
        # self.value_func = ValueFunc(self.obs_dim, discount=discount, lamb=1)

        if not show:
            # save copies of file
            shutil.copy(inspect.getfile(self.policy.__class__), OUTPATH)
            shutil.copy(inspect.getfile(self.value_func.__class__), OUTPATH)
            shutil.copy(inspect.getfile(self.__class__), OUTPATH)

            self.log_file = open(OUTPATH + 'log.csv', 'w')
            self.write_header = True

        print('observation dimension:', self.obs_dim)
        print('action dimension:', self.act_dim)

        # Use of a scaler is crucial
        self.scaler = Scaler(self.obs_dim)
        self.init_scaler()

    def init_scaler(self):
        print('fitting scaler')
        observation_samples = []
        for i in range(5):
            observation = []
            obs = self.env.reset()
            observation.append(obs)
            obs = obs.astype(np.float64).reshape((1, -1))
            done = False
            step = 0
            while not done:
                # obs = np.append(obs, [[step]], axis=1)  # add time step feature
                action = self.policy.get_sample(obs).reshape((1, -1)).astype(np.float64)
                if self.env_name == "FetchReach-v0":
                    obs_new, reward, done, _ = self.env.step(action.reshape(-1))
                else:
                    obs_new, reward, done, _ = self.env.step(action)
                observation.append(obs_new)
                obs = obs_new.astype(np.float64).reshape((1, -1))
                step += 1e-3
            observation_samples.append(observation)
        observation_samples = np.concatenate(observation_samples, axis=0)
        # print(observation_samples.shape)
        self.scaler.update(observation_samples)

    def normalize_obs(self, obs):
        scale, offset = self.scaler.get()
        obs_scaled = (obs-offset)*scale
        self.scaler.update(obs.astype(np.float64).reshape((1, -1)))
        return obs_scaled

    def run_one_episode(self):
        """
        collect data only
        :param save:
        :param train_policy:
        :param train_value_func:
        :param animate:
        :return:
        """
        obs = self.env.reset()
        observes, actions, rewards = [],[],[]
        done = False
        step = 0
        while not done:
            if self.animate:
                self.env.render()
            obs = obs.astype(np.float64).reshape((1, -1))
            obs = self.normalize_obs(obs)
            # obs = np.append(obs, [[step]], axis=1)  # add time step feature
            observes.append(obs)
            action = self.policy.get_sample(obs).reshape((1, -1)).astype(np.float64)
            actions.append(action)
            if self.env_name == "FetchReach-v0":
                obs_new, reward, done, _ = self.env.step(action.reshape(-1))
            else:
                obs_new, reward, done, _ = self.env.step(action)
            if not isinstance(reward, float):
                reward = np.asscalar(reward)
            rewards.append(reward)

            obs = obs_new
            step += 0.003

        return np.concatenate(observes), np.concatenate(actions), np.array(rewards)

    def discounted_sum(self, l, factor):
        discounted = []
        running_sum = 0
        for i in reversed(l):
            running_sum = factor * running_sum + i
            discounted.append(running_sum)
        return np.array(list(reversed(discounted)))

    def run_policy(self, episodes):
        trajectories = []
        for e in range(episodes):
            observes, actions, rewards = self.run_one_episode()
            trajectory = {'observes': observes,
                          'actions': actions,
                          'rewards': rewards}
            # scale rewards
            if self.discount < 0.999:
                rewards = rewards*(1-self.discount)

            trajectory['values'] = self.value_func.predict(observes)
            trajectory['mc_return'] = self.discounted_sum(rewards, self.discount)

            trajectory['td_residual'] = rewards + self.discount*np.append(trajectory['values'][1:],0) - trajectory['values']
            trajectory['gae'] = self.discounted_sum(trajectory['td_residual'], self.discount*self.lamb)

            trajectories.append(trajectory)

        return trajectories

    def run_expr(self):
        ep_steps = []
        ep_rewards = []
        ep_entropy = []
        i = 0
        while i < self.num_iterations:
            trajectories = self.run_policy(20)
            i += len(trajectories)
            observes = np.concatenate([t['observes'] for t in trajectories])
            actions = np.concatenate([t['actions'] for t in trajectories])
            mc_returns = np.concatenate([t['mc_return'] for t in trajectories])
            # advantages = np.concatenate([t['td_residual'] for t in trajectories])
            advantages = np.concatenate([t['gae'] for t in trajectories])

            # normalize advantage estimates
            advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-6)

            value_func_loss = self.value_func.update(observes, mc_returns)
            policy_loss, kl, entropy, beta = self.policy.update(observes, actions, advantages)

            avg_rewards = np.sum(np.concatenate([t['rewards'] for t in trajectories])) / self.episodes
            avg_timesteps = np.average([len(t['rewards']) for t in trajectories])
            log = {}

            # compute statistics such as mean and std
            log['steps'] = avg_timesteps
            log['rewards'] = avg_rewards
            log['policy_loss'] = policy_loss
            log['kl'] = kl
            log['entropy'] = entropy
            log['value_func_loss'] = value_func_loss
            log['beta'] = beta

            # display
            print('episode: ', i)
            print('average steps: {0}, average rewards: {1}'.format(log['steps'], log['rewards']))
            for key in ['policy_loss', 'kl', 'entropy', 'beta', 'value_func_loss']:
                print('{:s}: {:.2g}'.format(key, log[key]))
            print('\n')
            ep_steps.append(log['steps'])
            ep_rewards.append(log['rewards'])
            ep_entropy.append(log['entropy'])


            # write to log.csv
            if self.write_header:
                fieldnames = [x for x in log.keys()]
                self.writer = csv.DictWriter(self.log_file, fieldnames=fieldnames)
                self.writer.writeheader()
                self.write_header = False
            self.writer.writerow(log)
            # we want the csv file to preserve information even if the program terminates earlier than scheduled.
            self.log_file.flush()

            # save model weights if stopped manually
            if self.killer.kill_now:
                if input('Terminate training (y/[n])? ') == 'y':
                    break
                self.killer.kill_now = False

            # if (i+1)%20 == 0:
            #     print('episode: ', i+1)
            #     print('average steps', np.average(steps))
            #     print('average rewards', np.average(rewards))

        self.policy.save(OUTPATH)
        self.value_func.save(OUTPATH)
        self.scaler.save(OUTPATH)


        plt.figure(figsize=(12,9))

        if self.env_name.startswith('Fetch'):
            ax1 = plt.subplot(121)
            plt.xlabel('episodes')
            plt.ylabel('policy entropy')
            plt.plot(ep_entropy)
            scale_x = self.episodes
            ticks_x = ticker.FuncFormatter(lambda x, pos: '{0:g}'.format(x * scale_x))
            ax1.xaxis.set_major_formatter(ticks_x)
        else:
            ax1 = plt.subplot(121)
            plt.xlabel('episodes')
            plt.ylabel('steps')
            plt.plot(ep_steps)
            scale_x = self.episodes
            ticks_x = ticker.FuncFormatter(lambda x, pos: '{0:g}'.format(x * scale_x))
            ax1.xaxis.set_major_formatter(ticks_x)

        ax2 = plt.subplot(122)
        plt.xlabel('episodes')
        plt.ylabel('episodic rewards')
        plt.plot(ep_rewards)
        scale_x = self.episodes
        ticks_x = ticker.FuncFormatter(lambda x, pos: '{0:g}'.format(x * scale_x))
        ax2.xaxis.set_major_formatter(ticks_x)

        plt.savefig(OUTPATH + 'train.png')

    def load_model(self, load_from):
        from tensorflow.python.tools import inspect_checkpoint as chkp

        # # print all tensors in checkpoint file
        # chkp.print_tensors_in_checkpoint_file(load_from+'policy/policy.pl', tensor_name='', all_tensors=True, all_tensor_names=True)
        self.policy.load(load_from + 'policy/policy.pl')
        self.value_func.load(load_from + 'value_func/value_func.pl')

    def demonstrate_agent(self, load_from):
        self.load_model(load_from)
        with open(load_from + "scaler.pkl", 'rb') as file:
            self.scaler = pickle.load(file)
        self.animate = True
        for i in range(10):
            observes, actons, rewards = self.run_one_episode()
            ep_rewards = np.sum(rewards)
            ep_steps = len(rewards)
            print("Total steps: {0}, total rewards: {1}\n".format(ep_steps, ep_rewards))
Example #4
class GeneratorAgentPure(object):
    def __init__(self,
                 env,
                 policy_function,
                 value_function,
                 discriminator,
                 gamma,
                 lam,
                 init_qpos,
                 init_qvel,
                 logger=None):
        self.env = env
        self.obs_dim = env.observation_space.shape[0]
        self.act_dim = env.action_space.shape[0]

        self.policy = policy_function
        self.value = value_function
        self.discriminator = discriminator
        self.gamma = gamma
        self.lam = lam

        self.init_qpos = init_qpos
        self.init_qvel = init_qvel

        self.scaler = Scaler(self.obs_dim)

        # logger
        self.logger = logger

        # initialize the scaler's scale and offset by collecting ~2048 timesteps of rollouts
        self.collect(timesteps=2048)

    def discount(self, x, gamma):
        return scipy.signal.lfilter([1.0], [1.0, -gamma], x[::-1])[::-1]

    def get_random(self):
        idx = np.random.randint(low=0, high=self.init_qpos.shape[1], size=1)
        return np.squeeze(self.init_qpos[:, idx]), np.squeeze(self.init_qvel[:, idx])

    def collect(self, timesteps):
        trajectories = []
        trew_stat = []

        scale, offset = self.scaler.get()

        self.logger.log('scale_offset', [scale, offset])

        buffer_time = 0
        while buffer_time < timesteps:
            unscaled_obs, scaled_obs, actions, rewards = [], [], [], []
            egocentric = []
            done = False
            obs = self.env.reset()
            qpos, qvel = self.get_random()
            # we are setting initial qpos and qvel from expert
            self.env.set_state(qpos, qvel)
            timestep = 0
            while not done and timestep < 1000:
                obs = obs.astype(np.float32).reshape(1, -1)
                unscaled_obs.append(obs)
                obs = (obs - offset) * scale
                scaled_obs.append(obs)
                acts = self.policy.sample(obs)
                actions.append(acts.astype(np.float32).reshape(1, -1))
                obs, rew, done, _ = self.env.step(acts)
                rewards.append(rew)
                timestep += 1
                buffer_time += 1

            # statistics
            trew_stat.append(np.sum(rewards))

            # episode info
            traj_obs = np.concatenate(scaled_obs)
            traj_unscaled_obs = np.concatenate(unscaled_obs)
            traj_acts = np.concatenate(actions)
            #traj_rews = np.array(rewards, dtype=np.float64)
            traj_rews = np.squeeze(
                self.discriminator.get_rewards(traj_unscaled_obs, traj_acts))

            # scale rewards using running std of the experiment
            # traj_scaled_rews = traj_rews * np.squeeze(rew_scale)
            traj_scaled_rews = traj_rews

            # calculate discount sum of rewards
            traj_disc_rews = self.discount(traj_scaled_rews, self.gamma)

            # calculate advantages
            traj_values = self.value.predict(traj_obs)

            deltas = traj_scaled_rews - traj_values + np.append(
                traj_values[1:] * self.gamma, 0)
            traj_advantages = self.discount(deltas, self.gamma * self.lam)

            trajectory = {
                'observations': traj_obs,
                'actions': traj_acts,
                'tdlam': traj_disc_rews,
                'advantages': traj_advantages,
                'unscaled_obs': traj_unscaled_obs
            }
            trajectories.append(trajectory)

        # update observation scaler
        uns_obs = np.concatenate([t['unscaled_obs'] for t in trajectories])
        self.scaler.update(uns_obs)

        # update rewards scaler
        #uns_rews = np.concatenate([t['unscaled_rews'] for t in trajectories])
        #self.rew_scaler.update(uns_rews)
        observations = np.concatenate(
            [t['observations'] for t in trajectories])
        actions = np.concatenate([t['actions'] for t in trajectories])
        tdlam = np.concatenate([t['tdlam'] for t in trajectories])
        advantages = np.concatenate([t['advantages'] for t in trajectories])
        advantages = (advantages - np.mean(advantages)) / np.std(advantages)

        # check stats
        print('mean_trew: %f' % np.mean(trew_stat))
        self.logger.log('trew_stat', np.mean(trew_stat))

        return observations, uns_obs, actions, tdlam, advantages
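
The one-liner self.discount above uses scipy.signal.lfilter on the reversed sequence to compute the backward recursion G_t = r_t + gamma * G_{t+1}. A small standalone check of that equivalence (the numbers are made up):

import numpy as np
import scipy.signal

def discount(x, gamma):
    return scipy.signal.lfilter([1.0], [1.0, -gamma], x[::-1])[::-1]

rewards = np.array([1.0, 0.0, 2.0])
gamma = 0.9
# backward recursion by hand: G_2 = 2.0, G_1 = 0.0 + 0.9*2.0 = 1.8, G_0 = 1.0 + 0.9*1.8 = 2.62
assert np.allclose(discount(rewards, gamma), [2.62, 1.8, 2.0])
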
Example #5
def train_models(env_name, num_episodes, gamma, lam, kl_targ, coef,
                 use_lr_adjust, ada_kl_penalty, seed, epochs, phi_epochs,
                 max_timesteps, reg_scale, phi_lr, phi_hs, policy_size,
                 phi_obj, load_model):

    env, obs_dim, act_dim = init_gym(env_name)
    set_global_seeds(seed)
    env.seed(seed)
    env._max_episode_steps = max_timesteps
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    now = datetime.utcnow().strftime(
        "%b-%d_%H:%M:%S")  # create unique directories
    aigym_path = os.path.join('log-files/', env_name, now)
    env = wrappers.Monitor(env, aigym_path, force=True, video_callable=False)
    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim)
    policy = Policy(obs_dim,
                    act_dim,
                    kl_targ,
                    epochs,
                    phi_epochs,
                    policy_size=policy_size,
                    phi_hidden_sizes=phi_hs,
                    reg_scale=reg_scale,
                    lr_phi=phi_lr,
                    phi_obj=phi_obj)

    run_policy(env,
               policy,
               scaler,
               num_episodes,
               max_timesteps=max_timesteps,
               mode=load_model)  # run a few to init scaler

    episode = 0
    for i in range(2000):
        print("sampling and training at %s iteration\n" % (i))
        trajectories, traj_len_list = run_policy(env,
                                                 policy,
                                                 scaler,
                                                 num_episodes,
                                                 max_timesteps=max_timesteps,
                                                 mode=load_model)

        num_traj = len(trajectories)

        episode += len(trajectories)
        add_value(trajectories, val_func)
        add_disc_sum_rew(trajectories, gamma)
        add_gae(trajectories, gamma, lam)

        observes, actions, advantages, disc_sum_rew = build_train_set(
            trajectories)

        policy.update(load_model,
                      observes,
                      actions,
                      advantages,
                      use_lr_adjust,
                      ada_kl_penalty,
                      c=0.)  # update policy
        val_func.fit(observes, disc_sum_rew)

    # Save models
    policy.save_policy()
    val_func.save_val_func()
    refine_scaler = False
    if refine_scaler:
        run_policy(env,
                   policy,
                   scaler,
                   num_episodes,
                   max_timesteps=max_timesteps,
                   mode=load_model)  # run a few to refine scaler
    with open('models/scaler/scaler.pkl', 'wb') as output:
        pickle.dump(scaler, output, pickle.HIGHEST_PROTOCOL)
    logger.log("saved model")
Example #6
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size, hid1_mult,
         policy_logvar):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        batch_size: number of episodes per policy training batch
        hid1_mult: hid1 size for policy and value_f (multiplier of obs dimension)
        policy_logvar: natural log of initial policy variance
    """
    killer = GracefulKiller()
    env, obs_dim, act_dim = init_gym(env_name)
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    now = datetime.utcnow().strftime(
        "%b-%d_%H:%M:%S")  # create unique directories
    logger = Logger(logname=env_name, now=now)
    aigym_path = os.path.join('/tmp', env_name, now)
    #env = wrappers.Monitor(env, aigym_path, force=True)
    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim, hid1_mult)
    policy = Policy(env_name, obs_dim, act_dim, kl_targ, hid1_mult,
                    policy_logvar)
    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, policy, scaler, logger, episodes=5)
    episode = 0
    while episode < num_episodes:
        trajectories = run_policy(env,
                                  policy,
                                  scaler,
                                  logger,
                                  episodes=batch_size)
        episode += len(trajectories)
        add_value(trajectories, val_func)  # add estimated values to episodes
        add_disc_sum_rew(trajectories,
                         gamma)  # calculated discounted sum of Rs
        add_gae(trajectories, gamma, lam)  # calculate advantage
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(
            trajectories)
        # add various stats to training log:
        log_batch_stats(observes, actions, advantages, disc_sum_rew, logger,
                        episode)
        policy.update(observes, actions, advantages, logger)  # update policy
        val_func.fit(observes, disc_sum_rew, logger)  # update value function
        logger.write(display=True)  # write logger results to file and stdout
        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False

    scale, offset = scaler.get()
    data = {'SCALE': scale, 'OFFSET': offset}
    directory_to_store_data = '../saved_models/' + env_name + '/'
    if not os.path.exists(directory_to_store_data):
        os.makedirs(directory_to_store_data)
    file_name = directory_to_store_data + 'scale_and_offset.pkl'
    with open(file_name, 'wb') as f:
        pickle.dump(data, f)

    logger.close()
    policy.close_sess()
    val_func.close_sess()
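
The scale/offset dump at the end of this example is only useful if an inference script reloads it; a hypothetical counterpart (the path and the appended time-step value are illustrative only):

import pickle
import numpy as np

# hypothetical inference-time counterpart to the pickle.dump above
with open('../saved_models/Hopper-v1/scale_and_offset.pkl', 'rb') as f:
    data = pickle.load(f)
scale, offset = data['SCALE'], data['OFFSET']

def normalize(obs, step):
    # same time-step feature that obs_dim += 1 accounts for during training
    obs = np.append(obs, step)
    return (obs - offset) * scale
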
Example #7
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        batch_size: number of episodes per policy training batch
    """
    killer = GracefulKiller()
    # TODO: replace init_gym with one of my own functions
    env, obs_dim, act_dim = init_gym(env_name)
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    now = datetime.utcnow().strftime("%b-%d_%H:%M:%S").replace(
        ":", "_")  # create unique directories
    logger = Logger(logname=env_name, now=now)
    pathFolder = logger.pathFolder
    # Replace wrappers.Monitor with a class of mine that controls the simulation
    # I think the wrapper is of no use for my example
    #env = wrappers.Monitor(env, aigym_path, force=True)
    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim)
    policy = Policy(obs_dim, act_dim, kl_targ)

    # This is to feed the policy with the optimum
    trajectories = initiatePolicyWithOptimum(env, policy, scaler, logger)
    add_value(trajectories, val_func)  # add estimated values to episodes
    add_disc_sum_rew(trajectories, gamma)  # calculated discounted sum of Rs
    add_gae(trajectories, gamma, lam)  # calculate advantage
    # concatenate all episodes into single NumPy arrays
    observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)
    print(actions.shape)
    policy.update(observes, actions, advantages, logger)  # update policy
    val_func.fit(observes, disc_sum_rew, logger)  # update value function

    # Not sure whether this is still necessary
    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, policy, scaler, logger, episodes=5)
    episode = 0
    while episode < num_episodes:
        trajectories = run_policy(env,
                                  policy,
                                  scaler,
                                  logger,
                                  episodes=batch_size)
        episode += len(trajectories)
        add_value(trajectories, val_func)  # add estimated values to episodes
        add_disc_sum_rew(trajectories,
                         gamma)  # calculated discounted sum of Rs
        add_gae(trajectories, gamma, lam)  # calculate advantage
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(
            trajectories)
        # add various stats to training log:
        log_batch_stats(observes, actions, advantages, disc_sum_rew, logger,
                        episode)
        policy.update(observes, actions, advantages, logger)  # update policy
        val_func.fit(observes, disc_sum_rew, logger)  # update value function
        logger.write(display=True)  # write logger results to file and stdout
        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False
    logger.close()
    policy.close_sess(pathFolder)
    val_func.close_sess(pathFolder)
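
Several of these Policy classes take a kl_targ and log a beta; that is the adaptive coefficient of PPO's KL-penalized objective. A sketch of the usual adaptation rule, assuming the factor 1.5 and the 2x / 0.5x thresholds such implementations commonly use:

def adapt_beta(beta, kl, kl_targ):
    # grow the KL penalty when the new policy drifted too far, shrink it when it barely moved
    if kl > kl_targ * 2.0:
        beta = min(35.0, beta * 1.5)
    elif kl < kl_targ / 2.0:
        beta = max(1.0 / 35.0, beta / 1.5)
    return beta
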
Example #8
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size, hid1_mult,
         policy_logvar, max_frames, load_and_run):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        batch_size: number of episodes per policy training batch
        hid1_mult: hid1 size for policy and value_f (multiplier of obs dimension)
        policy_logvar: natural log of initial policy variance
    """
    killer = GracefulKiller()
    env, obs_dim, act_dim = init_gym(env_name)
    print("Env loaded.")
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    now = datetime.utcnow().strftime(
        "%b-%d_%H:%M:%S")  # create unique directories
    logger = Logger(logname=env_name, now=now)
    aigym_path = os.path.join('/tmp', env_name, now)
    # env = wrappers.Monitor(env, aigym_path, force=True)
    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim, hid1_mult)
    policy = Policy(obs_dim, act_dim, kl_targ, hid1_mult, policy_logvar)

    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, policy, scaler, logger, episodes=5)

    if load_and_run:
        val_func.load_weights()
        while True:
            run_episode(env, policy, scaler, animate=True)
        exit()
    episode = 0
    print("Running episodes...")
    while episode < num_episodes:
        episode_startime = time.time()
        trajectories = run_policy(env,
                                  policy,
                                  scaler,
                                  logger,
                                  episodes=batch_size,
                                  max_frames=max_frames)
        episode += len(trajectories)
        add_value(trajectories, val_func)  # add estimated values to episodes
        add_disc_sum_rew(trajectories,
                         gamma)  # calculated discounted sum of Rs
        try:
            add_gae(trajectories, gamma, lam)  # calculate advantage
            # concatenate all episodes into single NumPy arrays
            observes, actions, advantages, disc_sum_rew = build_train_set(
                trajectories)
        except Exception as e:
            print(e)
            print('skipping...')
            continue

        # add various stats to training log:
        log_batch_stats(observes, actions, advantages, disc_sum_rew, logger,
                        episode)
        policy.update(observes, actions, advantages, logger)  # update policy
        val_func.fit(observes, disc_sum_rew, logger)  # update value function
        logger.write(display=True)  # write logger results to file and stdout
        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False
        print("Batch took %i seconds to run." %
              (time.time() - episode_startime))

        if not episode % 1000:
            val_func.save_weights()

    run_episode(env, policy, scaler, animate=True)

    logger.close()
    policy.close_sess()
    val_func.close_sess()
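
run_episode itself is never shown even though the obs_dim += 1 comment points at it in most examples. A sketch of a single rollout with the time-step feature appended, modeled on Example #12's run_one_episode (policy.sample and how the scaler treats the extra feature are assumptions):

import numpy as np

def run_episode(env, policy, scaler, animate=False):
    """Roll out one episode, returning observations (with time-step feature), actions and rewards."""
    obs = env.reset()
    observes, actions, rewards, unscaled_obs = [], [], [], []
    done = False
    step = 0.0
    scale, offset = scaler.get()
    while not done:
        if animate:
            env.render()
        obs = obs.astype(np.float64).reshape((1, -1))
        obs = np.append(obs, [[step]], axis=1)      # the feature obs_dim += 1 accounts for
        unscaled_obs.append(obs)
        obs = (obs - offset) * scale                # normalize with the running scaler
        observes.append(obs)
        action = policy.sample(obs).reshape((1, -1)).astype(np.float64)
        actions.append(action)
        obs, reward, done, _ = env.step(np.squeeze(action, axis=0))
        rewards.append(reward)
        step += 1e-3                                # advance the time feature in units of 1e-3
    return (np.concatenate(observes), np.concatenate(actions),
            np.array(rewards, dtype=np.float64), np.concatenate(unscaled_obs))
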
Example #9
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size,
         net_size_factor, noise_bias, weight, use_ppoclip):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        batch_size: number of episodes per policy training batch
    """
    killer = GracefulKiller()
    env, obs_dim, act_dim = init_gym(env_name)
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    # now = datetime.utcnow().strftime("%b-%d_%H:%M:%S")  # create unique directories
    now = datetime.now().strftime("%b-%d_%H:%M:%S") + "_single"
    logger = Logger(logname=env_name, now=now)
    aigym_path = os.path.join('/tmp', env_name, now)
    # env = wrappers.Monitor(env, aigym_path, force=True)
    scaler = Scaler(obs_dim)
    if weight == "None":
        val_func = NNValueFunction(obs_dim, net_size_factor=net_size_factor)
        policy = None
        if use_ppoclip == "False":
            policy = Policy(obs_dim,
                            act_dim,
                            kl_targ,
                            net_size_factor=net_size_factor,
                            noise_bias=noise_bias)
        elif use_ppoclip == "True":
            policy = PolicyClip(obs_dim,
                                act_dim,
                                kl_targ,
                                net_size_factor=net_size_factor,
                                noise_bias=noise_bias)
            #assert False, "Not tested"
        else:
            assert False, "Unreachable"
    else:
        token = weight.split(".")
        token[-3] = token[-3][:-5] + "value"
        weight_2 = ".".join(token)
        val_func = NNValueFunctionContinue(weight_2,
                                           obs_dim,
                                           net_size_factor=net_size_factor)
        policy = PolicyContinue(weight,
                                obs_dim,
                                act_dim,
                                kl_targ,
                                net_size_factor=net_size_factor,
                                noise_bias=noise_bias)
    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, policy, scaler, logger, episodes=5)
    episode = 0
    while episode < num_episodes:
        trajectories = run_policy(env,
                                  policy,
                                  scaler,
                                  logger,
                                  episodes=batch_size)
        episode += len(trajectories)
        add_value(trajectories, val_func)  # add estimated values to episodes
        add_disc_sum_rew(trajectories,
                         gamma)  # calculated discounted sum of Rs
        add_gae(trajectories, gamma, lam)  # calculate advantage
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(
            trajectories)
        # add various stats to training log:
        log_batch_stats(observes, actions, advantages, disc_sum_rew, logger,
                        episode)
        policy.update(observes, actions, advantages, logger,
                      scaler)  # update policy
        val_func.fit(observes, disc_sum_rew, logger)  # update value function
        logger.write(display=True)  # write logger results to file and stdout
        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False
    logger.close()
    # with open("test_dump", 'w') as f:
    #     pickle.dump(policy, f)
    policy.close_sess()
    val_func.close_sess()
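
PolicyClip is selected when use_ppoclip == "True" but its body is not included. The clipped surrogate such a class typically minimizes looks like the following (a NumPy sketch of the standard PPO-clip objective, not the class above):

import numpy as np

def ppo_clip_loss(logp_new, logp_old, advantages, clip_eps=0.2):
    # ratio of new to old action probabilities, clipped so one update cannot move too far
    ratio = np.exp(logp_new - logp_old)
    clipped = np.clip(ratio, 1.0 - clip_eps, 1.0 + clip_eps)
    return -np.mean(np.minimum(ratio * advantages, clipped * advantages))
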
Example #10
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size, hid1_mult,
         policy_logvar, weights_path, init_episode, experiment_name, resume, augment=False):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        batch_size: number of episodes per policy training batch
        hid1_mult: hid1 size for policy and value_f (multiplier of obs dimension)
        policy_logvar: natural log of initial policy variance
    """
    killer = GracefulKiller()
    logger = Logger(logname=env_name, sub_dir=experiment_name)
    aigym_path = os.path.join('results', env_name, experiment_name)

    if resume:
        weights_path = aigym_path
        ckpt = tf.train.get_checkpoint_state(weights_path)
        init_episode = int(os.path.basename(ckpt.model_checkpoint_path).split('-')[1])

    env, obs_dim, act_dim = init_gym(env_name)
    obs_dim = 45
    # obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())

    # env = wrappers.Monitor(env, aigym_path, force=True)
    if augment:
        obs_dim *= 2
    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim, hid1_mult)
    policy = Policy(obs_dim, act_dim, kl_targ, hid1_mult, policy_logvar, weights_path)
    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, policy, scaler, logger, 5, augment)
    episode = init_episode
    while episode <= num_episodes:
        if episode % 1000 == 0:
            # record one episode
            record(env_name, aigym_path, policy, scaler, augment)
            policy.save(aigym_path, episode)
        trajectories = run_policy(env, policy, scaler, logger, batch_size, augment)
        episode += len(trajectories)
        add_value(trajectories, val_func)  # add estimated values to episodes
        add_disc_sum_rew(trajectories, gamma)  # calculated discounted sum of Rs
        add_gae(trajectories, gamma, lam)  # calculate advantage
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)
        # add various stats to training log:
        log_batch_stats(observes, actions, advantages, disc_sum_rew, logger, episode)
        policy.update(observes, actions, advantages, logger)  # update policy
        val_func.fit(observes, disc_sum_rew, logger)  # update value function
        logger.write(display=True)  # write logger results to file and stdout
        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False
    #record one last episode
    record(env_name, aigym_path, policy, scaler, augment)
    logger.close()
    policy.close_sess()
    val_func.close_sess()
Example #11
def main(env_name, num_episodes):
    # NOTE: signature reconstructed from the free variables used below (env_name, num_episodes)
    gamma = 0.995
    lam = 0.98
    batch_size = 5

    env = gym.make(env_name)
    obs_dim = env.observation_space.shape[0]
    obs_dim += 1 # add 1 to obs dimension for time step feature (see run_episode())
    act_dim = env.action_space.shape[0]
    # sess = tf.Session()
    policy = Policy(obs_dim, act_dim)
    val_func = NNValueFunction(obs_dim)
    # sess.run(tf.compat.v1.initializers.global_variables())

    now = datetime.utcnow().strftime("%b-%d_%H:%M:%S")  # create unique directories
    logger = Logger(logname=env_name, now=now)

    scaler = Scaler(obs_dim)

    run_policy(env, policy, scaler, logger, episodes=5)
    episode = 0
    while episode < num_episodes:
        trajectories = run_policy(env, policy, scaler, logger, episodes=batch_size)
        episode += len(trajectories)
        observes, actions, advantages, disc_sum_rew = build_train_set(trajectories, val_func, gamma, lam)
        policy.update(observes, actions, advantages, logger)
        val_func.fit(observes, disc_sum_rew, logger)
        logger.log({
            '_Episode': episode,
        })
        logger.write(display=True)
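
Unlike Example #2, this variant folds value prediction, discounted returns and GAE into a single build_train_set(trajectories, val_func, gamma, lam). A sketch of what such a combined helper would do, reusing the computations spelled out in Examples #3 and #4:

import numpy as np
import scipy.signal

def build_train_set(trajectories, val_func, gamma, lam):
    def discount(x, factor):
        return scipy.signal.lfilter([1.0], [1.0, -factor], x[::-1])[::-1]

    for t in trajectories:
        values = val_func.predict(t['observes'])
        t['disc_sum_rew'] = discount(t['rewards'], gamma)
        deltas = t['rewards'] + gamma * np.append(values[1:], 0) - values  # TD residuals
        t['advantages'] = discount(deltas, gamma * lam)                    # GAE

    observes = np.concatenate([t['observes'] for t in trajectories])
    actions = np.concatenate([t['actions'] for t in trajectories])
    advantages = np.concatenate([t['advantages'] for t in trajectories])
    disc_sum_rew = np.concatenate([t['disc_sum_rew'] for t in trajectories])
    advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-6)
    return observes, actions, advantages, disc_sum_rew
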
Example #12
class Experiment:

    def __init__(self, discount, num_iterations, lamb, animate, kl_target, **kwargs):
        self.env_name = 'RoboschoolHumanoidFlagrun-v1'
        self.env = gym.make(self.env_name)
        gym.spaces.seed(1234) # for reproducibility
        self.obs_dim = self.env.observation_space.shape[0] + 1 # adding time step as feature
        self.act_dim = self.env.action_space.shape[0]
        self.discount = discount
        self.num_iterations = num_iterations
        self.lamb = lamb
        self.animate = animate

        self.buffer = Buffer(1000000, self.obs_dim, self.act_dim) # 1000000 is the size they have used in paper
        self.episodes = 20 # larger episodes can reduce variance
        self.killer = GracefulKiller()

        self.policy = QPropPolicy(self.obs_dim, self.act_dim, self.env.action_space, kl_target, epochs=20)
        self.critic = DeterministicCritic(self.obs_dim, self.act_dim, self.discount, OUTPATH)
        self.value_func = l2TargetValueFunc(self.obs_dim, epochs=10)

        if 'show' in kwargs and not kwargs['show']:
            # save copies of file
            shutil.copy(inspect.getfile(self.policy.__class__), OUTPATH)
            shutil.copy(inspect.getfile(self.value_func.__class__), OUTPATH)
            shutil.copy(inspect.getfile(self.critic.__class__), OUTPATH)
            shutil.copy(inspect.getfile(self.__class__), OUTPATH)

            self.log_file = open(OUTPATH + 'log.csv', 'w')
            self.write_header = True

        print('Observation dimension:', self.obs_dim)
        print('Action dimension:', self.act_dim)

        # The use of a scaler is crucial
        self.scaler = Scaler(self.obs_dim)
        self.init_scaler()

    def init_scaler(self):
        """
        Collection observations from 5 episodes to initialize Scaler.
        :return: a properly initialized scaler
        """
        print('Fitting scaler')
        observation_samples = []
        for i in range(5):
            observation = []
            obs = self.env.reset()
            observation.append(obs)
            obs = obs.astype(np.float64).reshape((1, -1))
            done = False
            step = 0
            while not done:
                obs = np.append(obs, [[step]], axis=1)  # add time step feature
                action = self.policy.get_sample(obs).reshape((1, -1)).astype(np.float64)
                obs_new, reward, done, _ = self.env.step(action.reshape(-1))
                observation.append(obs_new)
                obs = obs_new.astype(np.float64).reshape((1, -1))
                step += 1e-3
            observation_samples.append(observation)
        observation_samples = np.concatenate(observation_samples, axis=0)
        self.scaler.update(observation_samples)

    def normalize_obs(self, obs):
        """
        Transform and update the scaler on the fly.
        :param obs: Raw observation
        :return: normalized observation
        """
        scale, offset = self.scaler.get()
        obs_scaled = (obs-offset)*scale
        self.scaler.update(obs.astype(np.float64).reshape((1, -1)))
        return obs_scaled

    def run_one_episode(self):
        """
        collect a trajectory of (obs, act, reward, obs_next)
        """
        obs = self.env.reset()
        observes, actions, rewards = [],[],[]
        done = False
        step = 0
        while not done:
            if self.animate:
                self.env.render()

            obs = obs.astype(np.float64).reshape((1, -1))
            obs = self.normalize_obs(obs)
            obs = np.append(obs, [[step]], axis=1)  # add time step feature at normalized observation
            observes.append(obs)

            action = self.policy.get_sample(obs).reshape((1, -1)).astype(np.float64)
            actions.append(action)
            obs_new, reward, done, _ = self.env.step(action.reshape(-1))
            if not isinstance(reward, float):
                reward = np.asscalar(reward)
            rewards.append(reward)

            obs = obs_new
            step += 0.003

        return np.concatenate(observes), np.concatenate(actions), np.array(rewards)

    def discounted_sum(self, l, factor):
        """
        Discounted sum of returns or advantage estimates along a trajectory.
        :param l: list of per-step values to be discount-summed (rewards, or TD residuals for GAE)
        :param factor: discount factor in the disc_sum case, or discount*lambda for GAE
        :return: discounted sum of l with regard to factor
        """
        discounted = []
        running_sum = 0
        for i in reversed(l):
            running_sum = factor * running_sum + i
            discounted.append(running_sum)
        return np.array(list(reversed(discounted)))

    def run_policy(self, episodes):
        """
        Gather a batch of trajectory samples.
        :param episodes: size of batch.
        :return: a batch of samples
        """
        trajectories = []
        for e in range(episodes):
            observes, actions, rewards = self.run_one_episode()
            trajectory = {'observes': observes,
                          'actions': actions,
                          'rewards': rewards,
                          'scaled_rewards': rewards*(1-self.discount)}
            trajectories.append(trajectory)

        return trajectories

    def run_expr(self):
        ep_steps = []
        ep_rewards = []
        ep_entropy = []
        i = 0
        while i < self.num_iterations:
            trajectories = self.run_policy(20)
            # add to experience replay buffer
            self.buffer.append(trajectories)
            print('buffer size:', self.buffer.size())

            i += len(trajectories)

            # for E=20, T=50, the total number of samples would be 1000
            # In future needs to account for not uniform time steps per episode.
            # e.g. in Hopper-v2 environment not every episode has same time steps
            # E = len(trajectories)
            # num_samples = np.sum([len(t['rewards']) for t in trajectories])
            gradient_steps = np.sum([len(t['rewards']) for t in trajectories])

            """train critic"""
            # train all samples in the buffer, to the extreme
            # self.critic.fit(self.policy, self.buffer, epochs=20, num_samples=self.buffer.size())
            # train some samples minibatches only
            critic_loss_mean, critic_loss_std = self.critic.another_fit_func(self.policy, self.buffer, gradient_steps)

            """calculation of episodic discounted return only needs rewards"""
            mc_returns = np.concatenate([self.discounted_sum(t['scaled_rewards'], self.discount) for t in trajectories])

            """using current batch of samples to update baseline"""
            observes = np.concatenate([t['observes'] for t in trajectories])
            actions = np.concatenate([t['actions'] for t in trajectories])
            value_func_loss = self.value_func.update(observes, mc_returns)

            """compute GAE"""
            for t in trajectories:
                t['values'] = self.value_func.predict(t['observes'])
                # Is it really legitimate to insert 0 at the last obs?
                t['td_residual'] = t['scaled_rewards'] + self.discount * np.append(t['values'][1:], 0) - t['values']
                t['gae'] = self.discounted_sum(t['td_residual'], self.discount * self.lamb)
            advantages = np.concatenate([t['gae'] for t in trajectories])
            """normalize advantage estimates, Crucial step"""
            advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-6)

            """compute control variate"""""
            cv = self.critic.get_contorl_variate(self.policy, observes, actions)
            # cv must not be centered
            # cv = (cv - cv.mean()) / (cv.std() + 1e-6)

            """conservative control variate"""
            eta = [1 if i > 0 else 0 for i in advantages*cv]

            """center learning signal"""
            # check that advantages and CV should be of size E*T
            # eta controls the on-off of control variate
            learning_signal = advantages - eta*cv
            # learning_signal = (learning_signal - learning_signal.mean()) / (learning_signal.std() + 1e-6)

            """controlled taylor eval term"""
            ctrl_taylor = np.concatenate([ [eta[i]*act] for i, act in enumerate(self.critic.get_taylor_eval(self.policy, observes))])

            """policy update"""
            ppo_loss, ddpg_loss, kl, entropy, beta = self.policy.update(observes, actions, learning_signal, ctrl_taylor)

            avg_rewards = np.sum(np.concatenate([t['rewards'] for t in trajectories])) / self.episodes
            avg_timesteps = np.average([len(t['rewards']) for t in trajectories])
            log = {}

            # save training statistics
            log['steps'] = avg_timesteps
            log['rewards'] = avg_rewards
            log['critic_loss'] = critic_loss_mean
            log['policy_ppo_loss'] = ppo_loss
            log['policy_ddpg_loss'] = ddpg_loss
            log['kl'] = kl
            log['entropy'] = entropy
            log['value_func_loss'] = value_func_loss
            log['beta'] = beta

            # display
            print('episode: ', i)
            print('average steps: {0}, average rewards: {1}'.format(log['steps'], log['rewards']))
            for key in ['critic_loss', 'policy_ppo_loss', 'policy_ddpg_loss', 'value_func_loss', 'kl', 'entropy', 'beta']:
                print('{:s}: {:.2g}'.format(key, log[key]))
            print('\n')
            ep_steps.append(log['steps'])
            ep_rewards.append(log['rewards'])
            ep_entropy.append(log['entropy'])

            # write to log.csv
            if self.write_header:
                fieldnames = [x for x in log.keys()]
                self.writer = csv.DictWriter(self.log_file, fieldnames=fieldnames)
                self.writer.writeheader()
                self.write_header = False
            self.writer.writerow(log)
            # we want the csv file to preserve information even if the program terminates earlier than scheduled.
            self.log_file.flush()

            # save model weights if stopped early
            if self.killer.kill_now:
                if input('Terminate training (y/[n])? ') == 'y':
                    break
                self.killer.kill_now = False

        self.policy.save(OUTPATH)
        self.value_func.save(OUTPATH)
        self.critic.save(OUTPATH)
        self.scaler.save(OUTPATH)

        plt.figure(figsize=(12,9))

        if self.env_name.startswith('Fetch'):
            ax1 = plt.subplot(121)
            plt.xlabel('episodes')
            plt.ylabel('policy entropy')
            plt.plot(ep_entropy)
            scale_x = self.episodes
            ticks_x = ticker.FuncFormatter(lambda x, pos: '{0:g}'.format(x * scale_x))
            ax1.xaxis.set_major_formatter(ticks_x)
        else:
            ax1 = plt.subplot(121)
            plt.xlabel('episodes')
            plt.ylabel('steps')
            plt.plot(ep_steps)
            scale_x = self.episodes
            ticks_x = ticker.FuncFormatter(lambda x, pos: '{0:g}'.format(x * scale_x))
            ax1.xaxis.set_major_formatter(ticks_x)

        ax2 = plt.subplot(122)
        plt.xlabel('episodes')
        plt.ylabel('episodic rewards')
        plt.plot(ep_rewards)
        scale_x = self.episodes
        ticks_x = ticker.FuncFormatter(lambda x, pos: '{0:g}'.format(x * scale_x))
        ax2.xaxis.set_major_formatter(ticks_x)

        plt.savefig(OUTPATH + 'train.png')

    def load_model(self, load_from):
        """
        Load all function approximators plus the Scaler.
        The replay buffer is not restored, though.
        :param load_from: dir containing saved weights.
        """
        from tensorflow.python.tools import inspect_checkpoint as chkp
        # # print all tensors in checkpoint file
        # chkp.print_tensors_in_checkpoint_file(load_from+'policy/policy.pl', tensor_name='', all_tensors=True, all_tensor_names=True)
        self.policy.load(load_from + 'policy/')
        self.value_func.load(load_from + 'value_func/')
        self.critic.load(load_from+'critic/')
        with open(load_from + "scaler.pkl", 'rb') as file:
            self.scaler = pickle.load(file)

    def demonstrate_agent(self, load_from):
        """
        Simply run the policy without training.
        :param load_from:
        :return:
        """
        self.load_model(load_from)
        while True:
            observes, actons, rewards = self.run_one_episode()
            ep_rewards = np.sum(rewards)
            ep_steps = len(rewards)
            print("Total steps: {0}, total rewards: {1}\n".format(ep_steps, ep_rewards))
Example #13
File: agent.py  Project: zxgineng/deeprl
class Agent:
    def __init__(self, env, name, chief=None):
        assert name == 'chief' or 'worker' in name
        if 'worker' in name:
            assert chief is not None
            self.chief = chief
        else:
            self.scaler = Scaler(Config.data.state_dim)
        self.name = name
        self.env = env
        self.sess = None
        self.coord = None

        with tf.variable_scope(name):
            self._build_graph()

    def _build_graph(self):
        self.actor = Actor()
        self.critic = Critic()
        if 'worker' in self.name:
            self._build_update_op()

    def _build_update_op(self):
        global_step = tf.train.get_global_step()
        tf.assign_add(global_step, 1, name='global_step_add')

        with tf.variable_scope('sync'):
            with tf.variable_scope('pull'):
                pull_a_params_op = [
                    actor_param.assign(chief_param)
                    for actor_param, chief_param in zip(
                        self.actor.params, self.chief.actor.params)
                ]
                pull_c_params_op = [
                    critic_param.assign(chief_param)
                    for critic_param, chief_param in zip(
                        self.critic.params, self.chief.critic.params)
                ]
                self.pull_op = tf.group(pull_a_params_op + pull_c_params_op)

            with tf.variable_scope('push'):
                update_a_op = self.chief.actor.optimizer.apply_gradients(
                    zip(self.actor.grads, self.chief.actor.params))
                update_c_op = self.chief.critic.optimizer.apply_gradients(
                    zip(self.critic.grads, self.chief.critic.params))
                self.update_op = tf.group([update_a_op, update_c_op])

    def init_scaler(self, init_episode=5):
        for e in range(init_episode):
            observation = self.env.reset()
            states = []
            done = False
            count = 0
            while not done:
                states.append(observation)
                action = self.choose_action(observation)
                next_observation, reward, done, info = self.env.step(action)
                observation = next_observation

                if Config.train.get('max_episode_steps', None):
                    count += 1
                    if count == Config.train.max_episode_steps:
                        break
            self.scaler.update(np.array(states))

    def update_chief(self, states, actions, target_v):
        feed_dict = {self.critic.states: states}
        value = self.sess.run(self.critic.value, feed_dict)
        td_error = np.array(target_v) - value
        feed_dict = {
            self.critic.states: states,
            self.critic.target_v: target_v,
            self.actor.states: states,
            self.actor.actions: actions,
            self.actor.td_error: td_error
        }
        self.sess.run([
            self.critic.loss, self.update_op, self.name + '/global_step_add:0'
        ], feed_dict)

    def pull_params(self):
        self.sess.run(self.pull_op)

    def cal_target_v(self, done, next_observation, rewards):
        if done:
            next_value = 0
        else:
            next_value = self.sess.run(
                self.critic.value, {
                    self.critic.states:
                    [self.chief.scaler.normalize(next_observation)]
                })[0, 0]
        target_v = []
        for reward in rewards[::-1]:
            next_value = reward + Config.train.reward_decay * next_value
            target_v.append([next_value])
        target_v.reverse()
        return target_v

    def choose_action(self, observation):
        if Config.data.action_type == 'discrete':
            policy = self.sess.run(self.actor.policy,
                                   {self.actor.states: [observation]})[0]
            action = np.random.choice(range(Config.data.action_num), p=policy)
        else:
            action = self.sess.run(self.actor.sample,
                                   {self.actor.states: [observation]})
        return action

    def eval(self, animate=False):
        assert self.name == 'chief'
        observation = self.env.reset()
        ep_reward = 0
        count = 0
        done = False
        while not done:
            if animate:
                self.env.render()
            action = self.choose_action(self.scaler.normalize(observation))
            next_observation, reward, done, info = self.env.step(action)
            ep_reward += reward
            observation = next_observation

            if Config.train.get('max_episode_steps', None):
                count += 1
                if count == Config.train.max_episode_steps:
                    break
        return ep_reward

    def work(self):
        total_step = 0
        states, actions, rewards, unscaled_states = [], [], [], []
        self.pull_params()

        while not self.coord.should_stop():
            observation = self.env.reset()
            ep_reward = 0
            done = False
            count = 0
            while not done:
                unscaled_states.append(observation)
                observation = self.chief.scaler.normalize(observation)
                states.append(observation)
                action = self.choose_action(observation)
                next_observation, reward, done, info = self.env.step(action)
                total_step += 1
                ep_reward += reward

                actions.append(action)
                rewards.append(reward)
                if total_step % Config.train.update_n_iter == 0 or done:
                    target_v = self.cal_target_v(done, next_observation,
                                                 rewards)
                    self.update_chief(states, actions, target_v)
                    self.chief.scaler.update(np.array(unscaled_states))
                    states, actions, rewards, unscaled_states = [], [], [], []
                    self.pull_params()

                observation = next_observation

                if Config.train.get('max_episode_steps', None):
                    count += 1
                    if count == Config.train.max_episode_steps:
                        break
Example #14
def main2(env_name, num_episodes, gamma, lam, kl_targ, batch_size,
          net_size_factor, noise_bias, weight, use_ppoclip):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)
        batch_size: number of episodes per policy training batch
    """
    global alive_coef, progress_coef, threshold1, threshold2, change_rate
    killer = GracefulKiller()
    env, obs_dim, act_dim = init_gym(env_name)
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    # now = datetime.utcnow().strftime("%b-%d_%H:%M:%S")  # create unique directories
    now = datetime.now().strftime(
        "%b-%d_%H:%M:%S") + "_multi_hop_{},{},{}".format(
            change_rate, threshold1, threshold2)
    logger = Logger(logname=env_name, now=now)
    aigym_path = os.path.join('/tmp', env_name, now)
    # env = wrappers.Monitor(env, aigym_path, force=True)
    scaler = Scaler(obs_dim)
    if weight == "None":
        val_func = NNValueFunction(obs_dim,
                                   net_size_factor=net_size_factor,
                                   alive_coef=alive_coef,
                                   progress_coef=progress_coef,
                                   reward_dim=reward_dim)
        policy = Policy(obs_dim,
                        act_dim,
                        kl_targ,
                        net_size_factor=net_size_factor,
                        noise_bias=noise_bias)

    else:
        token = weight.split(".")
        token[-3] = token[-3][:-5] + "value"
        weight_2 = ".".join(token)
        # assert False, "unreachable"
        val_func = NNValueFunctionContinue(weight_2,
                                           obs_dim,
                                           net_size_factor=net_size_factor,
                                           alive_coef=alive_coef,
                                           progress_coef=progress_coef)
        policy = PolicyContinue(weight,
                                obs_dim,
                                act_dim,
                                kl_targ,
                                net_size_factor=net_size_factor,
                                noise_bias=noise_bias)
    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, policy, scaler, logger, episodes=5)
    episode = 0
    flag1 = False
    flag2 = False
    flag3 = False
    reward_queue = []
    queue_num = 100
    while episode < num_episodes:
        trajectories = run_policy(env,
                                  policy,
                                  scaler,
                                  logger,
                                  episodes=batch_size)
        episode += len(trajectories)
        add_value(trajectories, val_func)  # add estimated values to episodes
        add_disc_sum_rew(trajectories,
                         gamma)  # calculated discounted sum of Rs
        add_gae(trajectories, gamma, lam)  # calculate advantage
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(
            trajectories)
        # add various stats to training log:
        log_batch_stats(observes, actions, advantages, disc_sum_rew, logger,
                        episode)
        policy.update(observes, actions, advantages, logger,
                      scaler)  # update policy
        val_func.fit(observes, disc_sum_rew, logger)  # update value function
        logger.write(display=True)  # write logger results to file and stdout
        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False

        alive_sum = 0
        progr_sum = 0
        for t in trajectories:
            tmp_rewards = t['orig_rewards']
            tmp_rewards = np.sum(tmp_rewards, axis=0)
            alive_sum += tmp_rewards[0]
            progr_sum += tmp_rewards[1]
        reward_queue.append(np.mean([t['rewards'].sum()
                                     for t in trajectories]))
        reward_queue = reward_queue[-queue_num:]
        reward_std = np.std(np.array(reward_queue))

        print("Reward std by {} episode : {}".format(queue_num, reward_std))

        if alive_sum >= 5000:
            flag3 = True

        if (flag3 and alive_sum > progr_sum * threshold1) or flag1:
            flag1 = True
            alive_coef -= change_rate
            progress_coef += change_rate
            val_func.alive_coef = float(alive_coef)
            val_func.progress_coef = float(progress_coef)
            if alive_sum < progr_sum * threshold2:
                flag1 = False

        if progr_sum > alive_sum * threshold1 or flag2:
            flag2 = True
            alive_coef += change_rate
            progress_coef -= change_rate
            val_func.alive_coef = float(alive_coef)
            val_func.progress_coef = float(progress_coef)

            if progr_sum < alive_sum * threshold2:
                flag2 = False

        print(alive_sum, progr_sum)

        logger.log_model_3({
            "alive_coef": alive_coef,
            "progress_coef": progress_coef,
            "alive_sum": alive_sum,
            "progr_sum": progr_sum
        })

    logger.close()
    policy.close_sess()
    val_func.close_sess()
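
The helpers add_value, add_disc_sum_rew and add_gae are used throughout these examples but never shown. For reference, here is a minimal sketch of what add_disc_sum_rew and add_gae are assumed to compute (plain discounted returns and Generalized Advantage Estimation from TD residuals); the real implementations in each project may differ in details such as reward scaling:

import numpy as np

def discount(x, gamma):
    """Discounted cumulative sum: y[t] = x[t] + gamma * y[t + 1]."""
    y = np.zeros_like(x, dtype=np.float64)
    running = 0.0
    for t in reversed(range(len(x))):
        running = x[t] + gamma * running
        y[t] = running
    return y

def add_disc_sum_rew(trajectories, gamma):
    # assumed behaviour: attach the discounted return used as the value-function target
    for traj in trajectories:
        traj['disc_sum_rew'] = discount(traj['rewards'], gamma)

def add_gae(trajectories, gamma, lam):
    # assumed behaviour: GAE(gamma, lam) advantages built from TD residuals,
    # delta[t] = r[t] + gamma * V[t+1] - V[t], advantages = discount(delta, gamma * lam)
    for traj in trajectories:
        values = traj['values']
        rewards = traj['rewards']
        deltas = rewards + gamma * np.append(values[1:], 0.0) - values
        traj['advantages'] = discount(deltas, gamma * lam)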
Example #15
File: train.py Project: wu6u3/async_ppo
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size, hid1_mult,
         policy_logvar):
    '''
    Main training loop: trains N_WORKERS local policy/value copies in threads and
    applies their gradients to a shared (global) policy and value function.
    '''
    ##################
    #  shared policy #
    ##################

    tic = time.clock()

    manager = MPManager()
    manager.start()

    shared_env, shared_obs_dim, shared_act_dim = init_gym(env_name)
    shared_obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    now = datetime.utcnow().strftime(
        "%b-%d_%H:%M:%S")  # create unique directories
    shared_logger = Logger(logname=env_name, now=now + "-Master")
    shared_aigym_path = os.path.join('./vedio', env_name, now + "-Master")
    #env = wrappers.Monitor(env, aigym_path, force=True)
    shared_scaler = Scaler(shared_obs_dim)

    shared_val_func = NNValueFunction(shared_obs_dim, hid1_mult, -1, None)
    shared_policy = Policy(shared_obs_dim, shared_act_dim, kl_targ, hid1_mult,
                           policy_logvar, -1, None)

    learning_rate_input = tf.placeholder("float")
    grad_applier = RMSPropApplier(learning_rate=learning_rate_input,
                                  decay=RMSP_ALPHA,
                                  momentum=0.0,
                                  epsilon=RMSP_EPSILON,
                                  clip_norm=GRAD_NORM_CLIP,
                                  device=device)

    # local policy declarations
    env_a = [None] * N_WORKERS
    obs_dim_a = [None] * N_WORKERS
    act_dim_a = [None] * N_WORKERS
    logger_a = [None] * N_WORKERS
    aigym_path_a = [None] * N_WORKERS
    now = datetime.utcnow().strftime(
        "%b-%d_%H:%M:%S")  # create unique directories
    val_func_a = [None] * N_WORKERS
    policy_a = [None] * N_WORKERS
    scaler_a = [None] * N_WORKERS
    for i in range(N_WORKERS):
        env_a[i], obs_dim_a[i], act_dim_a[i] = init_gym(env_name)
        obs_dim_a[
            i] += 1  # add 1 to obs dimension for time step feature (see run_episode())
        logger_a[i] = Logger(logname=env_name, now=now + "-" + str(i))
        aigym_path_a[i] = os.path.join('./vedio', env_name, now + "-" + str(i))
        #env_a[i] = wrappers.Monitor(env, aigym_path, force=True)
        scaler_a[i] = Scaler(obs_dim_a[i])

        val_func_a[i] = NNValueFunction(obs_dim_a[i], hid1_mult, i,
                                        shared_val_func)
        val_func_a[i].apply_gradients = grad_applier.apply_gradients(
            shared_val_func.get_vars(), val_func_a[i].gradients)

        policy_a[i] = Policy(obs_dim_a[i], act_dim_a[i], kl_targ, hid1_mult,
                             policy_logvar, i, shared_policy)
        policy_a[i].apply_gradients = grad_applier.apply_gradients(
            shared_policy.get_vars(), policy_a[i].gradients)

    # init tensorflow
    sess = tf.Session(config=tf.ConfigProto(log_device_placement=False,
                                            allow_soft_placement=True))
    init = tf.global_variables_initializer()

    ## start sess
    sess.run(init)

    ## init shared scalar policy
    run_policy(sess,
               shared_env,
               shared_policy,
               shared_scaler,
               shared_logger,
               episodes=5)

    def single_work(thread_idx):
        """ training loop

        Args:
            env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
            num_episodes: maximum number of episodes to run
            gamma: reward discount factor (float)
            lam: lambda from Generalized Advantage Estimate
            kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)
            batch_size: number of episodes per policy training batch
            hid1_mult: hid1 size for policy and value_f (multiplier of obs dimension)
            policy_logvar: natural log of initial policy variance
        """
        env = env_a[thread_idx]
        policy = policy_a[thread_idx]
        #obs_dim = obs_dim_a[thread_idx]
        #act_dim = act_dim_a[thread_idx]
        logger = logger_a[thread_idx]
        aigym_path = aigym_path_a[thread_idx]
        scaler = scaler_a[thread_idx]
        val_func = val_func_a[thread_idx]

        print("=== start thread " + str(policy.get_thread_idx()) + " " +
              policy.get_scope() + " ===")
        print(shared_policy.get_vars())
        print(policy.get_vars())

        # run a few episodes of untrained policy to initialize scaler:
        #run_policy(sess, env, policy, scaler, logger, episodes=5)

        #policy.sync(shared_policy)
        #val_func.sync(shared_val_func)
        episode = 0

        while episode < num_episodes:

            ## copy global var into local
            sess.run(policy.sync)
            sess.run(val_func.sync)

            ## compute new model on local policy
            trajectories = run_policy(sess,
                                      env,
                                      policy,
                                      scaler,
                                      logger,
                                      episodes=batch_size)
            episode += len(trajectories)
            add_value(sess, trajectories,
                      val_func)  # add estimated values to episodes
            add_disc_sum_rew(trajectories,
                             gamma)  # calculated discounted sum of Rs
            add_gae(trajectories, gamma, lam)  # calculate advantage
            # concatenate all episodes into single NumPy arrays
            observes, actions, advantages, disc_sum_rew = build_train_set(
                trajectories)
            # add various stats to training log:
            log_batch_stats(observes, actions, advantages, disc_sum_rew,
                            logger, episode,
                            time.clock() - tic)

            policy.update(sess, observes, actions, advantages,
                          logger)  # update policy
            val_func.fit(sess, observes, disc_sum_rew,
                         logger)  # update value function

            #cur_learning_rate = self._anneal_learning_rate(global_t)
            feed_dict = {
                policy.old_log_vars_ph: policy.old_log_vars_np,
                policy.old_means_ph: policy.old_means_np,
                policy.obs_ph: observes,
                policy.act_ph: actions,
                policy.advantages_ph: advantages,
                policy.beta_ph: policy.beta,
                policy.lr_ph: policy.lr,
                policy.eta_ph: policy.eta,
                learning_rate_input: policy.lr
            }

            sess.run(policy.apply_gradients, feed_dict)

            shared_policy.update(sess, observes, actions, advantages,
                                 shared_logger)

            feed_dict = {
                val_func.obs_ph: observes,
                val_func.val_ph: disc_sum_rew,
                learning_rate_input: val_func.lr
            }

            sess.run(val_func.apply_gradients, feed_dict)

            shared_val_func.fit(sess, observes, disc_sum_rew, shared_logger)

            shared_logger.log({'_Time': time.clock() - tic})

            logger.write(
                display=True)  # write logger results to file and stdout

        logger.close()

    ## end def single work

    train_threads = []
    for i in range(N_WORKERS):
        train_threads.append(threading.Thread(target=single_work, args=(i, )))

    [t.start() for t in train_threads]
    [t.join() for t in train_threads]

    saver = tf.train.Saver()
    for i in range(N_WORKERS):
        logger_a[i].close()

    #path = os.path.join('log-files', env_name, now+'-Master', 'checkpoint')
    #saver.save(sess, path )

    sess.close()
Example #16
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size, hid1_mult,
         policy_logvar, task_identity):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)
        batch_size: number of episodes per policy training batch
        hid1_mult: hid1 size for policy and value_f (multiplier of obs dimension)
        policy_logvar: natural log of initial policy variance
    """
    print('Training started for ' + env_name + ' and task_identity ' +
          str(task_identity))

    killer = GracefulKiller()
    env, obs_dim, act_dim = init_gym(env_name=env_name,
                                     task_identity=task_identity)
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    now = datetime.utcnow().strftime(
        "%b-%d_%H:%M:%S")  # create unique directories
    logger = Logger(logname=env_name, now=now)
    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim, hid1_mult)
    policy = Policy(obs_dim, act_dim, kl_targ, hid1_mult, policy_logvar,
                    env_name, task_identity)
    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, policy, scaler, logger, episodes=5)
    episode = 0
    while episode < num_episodes:
        trajectories = run_policy(env,
                                  policy,
                                  scaler,
                                  logger,
                                  episodes=batch_size)
        episode += len(trajectories)
        add_value(trajectories, val_func)  # add estimated values to episodes
        add_disc_sum_rew(trajectories,
                         gamma)  # calculated discounted sum of Rs
        add_gae(trajectories, gamma, lam)  # calculate advantage
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(
            trajectories)
        # add various stats to training log:
        log_batch_stats(observes, actions, advantages, disc_sum_rew, logger,
                        episode)
        policy.update(observes, actions, advantages, logger)  # update policy
        val_func.fit(observes, disc_sum_rew, logger)  # update value function
        logger.write(display=True)  # write logger results to file and stdout
        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False

    scale, offset = scaler.get()
    #scale_and_offset_data = {'scale': scale, 'offset': offset}
    #scale_and_offset_file = 'scale_and_offset_file_' + env_name + '_' + task_identity + '.pkl'
    #with open(scale_and_offset_file, 'wb') as f:
    #    pickle.dump(scale_and_offset_data, f)
    #### Saving expert trajectories after sufficient training has been made
    ## Visualization
    #aigym_path = os.path.join(VIDEO_LOGS_DIRECTORY, env_name, now)
    #env = wrappers.Monitor(env, aigym_path, force=True)
    trajectories = run_policy(env,
                              policy,
                              scaler,
                              logger,
                              episodes=DEMONSTRATOR_EPISODES_TO_LOG)
    data_to_store = {
        DEMONSTRATOR_TRAJECTORY_KEY: trajectories,
        SCALE_KEY: scale,
        OFFSET_KEY: offset
    }
    directory_to_store_trajectories = './../' + DEMONSTRATOR_TRAJECTORIES_DIRECTORY
    if not os.path.exists(directory_to_store_trajectories):
        os.makedirs(directory_to_store_trajectories)
    file_to_store_trajectories = directory_to_store_trajectories + env_name + '_' + task_identity + '.pkl'
    with open(file_to_store_trajectories, "wb") as f:
        pickle.dump(data_to_store, f)

    logger.close()
    policy.close_sess()
    val_func.close_sess()
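
The scale and offset pulled from scaler.get() above are what consumers of these saved trajectories apply to raw observations. A minimal sketch of re-using them at evaluation time, assuming the (obs - offset) * scale convention seen in the later examples and handling the appended time-step feature the same way they do:

import numpy as np

def normalize_observation(obs, scale, offset, step=None):
    # assumed convention: shift by the running mean (offset), then multiply by the
    # running inverse standard deviation (scale); scale/offset must match the final
    # feature dimension, including the time-step feature when it is appended
    obs = np.asarray(obs, dtype=np.float64).reshape((1, -1))
    if step is not None:
        obs = np.append(obs, [[step]], axis=1)  # time-step feature (obs_dim += 1 above)
    return (obs - offset) * scale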
Example #17
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size, hid1_mult, init_logvar):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)
        batch_size: number of episodes per policy training batch
        hid1_mult: hid1 size for policy and value_f (multiplier of obs dimension)
        init_logvar: natural log of initial policy variance
    """
    print('load model (l)?')
    loading = input('')
    pybullet.connect(pybullet.DIRECT)
    killer = GracefulKiller()
    env, obs_dim, act_dim = init_gym(env_name)
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    # print('obs_dim') # 45 for HumanoidFlagrunBulletEnv-v0, HumanoidFlagrunHarderBulletEnv-v0
    # print(obs_dim)
    # print('act_dim') # 17 for HumanoidFlagrunBulletEnv-v0, HumanoidFlagrunHarderBulletEnv-v0
    # print(act_dim)
    # input('')
    now = datetime.utcnow().strftime("%b-%d_%H:%M:%S")  # create unique directories
    logger = Logger(logname=env_name, now=now)
    aigym_path = os.path.join('/tmp', env_name, now)
    env = wrappers.Monitor(env, aigym_path, force=True)
    scaler = Scaler(obs_dim)

    val_func = NNValueFunction(obs_dim, hid1_mult, loading)
    policy = Policy(obs_dim, act_dim, kl_targ, hid1_mult, init_logvar)
    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, policy, scaler, logger, episodes=5)

    policy_model = policy.get_trpo_policy_model()
    valNN_model = val_func.get_valNN_model()
    lr = val_func.get_lr()

    if loading == 'l':
        policy_model.load_weights('pol_weights.h5')
        pol_weights = policy_model.get_weights()
        print('pol_weights')
        print(pol_weights)
        input('')
        loading == 'n'  # NOTE: comparison, not assignment; the 'l' flag is never actually reset here

    save_weights_flag = 1

    episode = 0
    while episode < num_episodes:
        trajectories = run_policy(env, policy, scaler, logger, episodes=batch_size)
        episode += len(trajectories)

        if episode <= batch_size:
            if loading == 'l':
                traj = open('trajectories.obj', 'rb')
                trajectories = pickle.load(traj)
                traj.close()
                print('342')
                input('')
        elif episode == num_episodes-batch_size:
            traj = open('trajectories.obj','wb')
            pickle.dump(trajectories,traj)
            traj.close()
            print('348')
            input('')

        add_value(trajectories, val_func)  # add estimated values to episodes
        add_disc_sum_rew(trajectories, gamma)  # calculated discounted sum of Rs
        add_gae(trajectories, gamma, lam)  # calculate advantage
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)
        # add various stats to training log:
        log_batch_stats(observes, actions, advantages, disc_sum_rew, logger, episode)
        policy.update(observes, actions, advantages, logger)  # update policy
        if episode > 50:
            policy_model = policy.get_trpo_policy_model()
            print('about to save model')
            input('')
            policy_model.save('policy_model')
        val_func.fit(observes, disc_sum_rew, logger)  # update value function
        logger.write(display=True)  # write logger results to file and stdout
        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False
    logger.close()

    if save_weights_flag == 1:
        valNN_model.save('val_weights.h5')

        policy_weights = policy_model.get_weights()
        print('policy_weights')
        print(policy_weights)
        input('')
        # policy_model.save_weights('pol_weights.hdf5')
        policy_model.save_weights('pol_weights.h5')
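
This variant persists the policy twice: once as a full model ('policy_model') and once as weights only ('pol_weights.h5'). A hedged sketch of restoring those artifacts with the standard Keras API (exact behaviour depends on the Keras/TensorFlow version the project uses):

from tensorflow import keras

# full model saved above with policy_model.save('policy_model')
restored_policy_model = keras.models.load_model('policy_model')

# the weights file saved with policy_model.save_weights('pol_weights.h5') can only be
# loaded back into a model with the same architecture, e.g.
#     policy.get_trpo_policy_model().load_weights('pol_weights.h5')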
Example #18
def main(env_name, num_episodes, gamma, lamda, kl_targ, batch_size, hid1_mult, init_pol_logvar, animate,\
        save_video, num_episodes_sim, task_params, task_name):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lamda: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)
        batch_size: number of episodes per policy training batch
        hid1_mult: hid1 size for policy and value_f (multiplier of obs dimension)
        init_pol_logvar: natural log of initial policy variance
        save_video: Boolean determining if videos of the agent will be saved
        num_episodes_sim: Number of episodes to simulate/save videos for
        task_params: list of parameters to modify each environment for a different task
        task_name: name user assigns to the task being used to modify the environment
    """

    # ****************  Environment Initialization and Paths  ***************
    env, obs_dim, act_dim = init_gym(env_name)
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())

    # Paths
    print("\n\n---- PATHS: ----")
    now = datetime.utcnow().strftime(
        "%b-%d_%H:%M:%S")  # create unique directories
    logger = Logger(logname=env_name, now=now)  # logger object
    aigym_path = os.path.join('./videos', env_name, task_name,
                              now)  # videos folders
    agent_path = os.path.join('agents', env_name,
                              now)  # agent / policy folders
    os.makedirs(agent_path)
    print("Path for Saved Videos: {}".format(aigym_path))
    print("Path for Saved Agents: {}\n".format(agent_path))

    # Initialize Policy, Value Networks and Scaler
    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim, hid1_mult)
    policy = Policy(obs_dim, act_dim, kl_targ, hid1_mult, init_pol_logvar)
    run_policy(env, policy, scaler, logger,
               episodes=5)  # run some episodes to initialize scaler

    # Start Training
    animate = True if animate == "True" else False
    save_video = True if save_video == "True" else False
    saver_perc = int(num_episodes * 0.02)  # determines when the agent and video should be saved
    saver_offset = saver_perc
    killer = GracefulKiller()
    episode = 0

    while episode < num_episodes:

        # Obtain 'batch_size' trajectories and add additional intermediate calculations
        trajectories = run_policy(env,
                                  policy,
                                  scaler,
                                  logger,
                                  episodes=batch_size,
                                  animate=animate)
        episode += len(trajectories)
        add_value(trajectories, val_func)  # add estimated values to episodes
        add_disc_sum_rew(trajectories,
                         gamma)  # calculated discounted sum of Rs
        add_gae(trajectories, gamma, lamda)  # calculate advantage

        # Concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(
            trajectories)

        # Logging Stats
        log_batch_stats(observes, actions, advantages, disc_sum_rew, logger,
                        episode)

        # Update Policy and Value Networks
        policy.update(observes, actions, advantages, logger)  # update policy
        val_func.fit(observes, disc_sum_rew, logger)  # update value function
        logger.write(display=True)  # write logger results to file and stdout

        # Store Policy, Value Network and Scaler: every 20% of total episodes or in first/last episode
        if episode >= saver_offset or episode >= num_episodes or episode <= batch_size or killer.kill_now:
            # TODO: Make saving agent/video a method so that it can be called in killer.kill_now
            saver_offset += saver_perc
            policy.tf_saver.save(policy.sess, "{}/policy_ep_{}".format(
                agent_path, episode))  # Save Policy Network
            val_func.tf_saver.save(val_func.sess, "{}/val_func_ep_{}".format(
                agent_path, episode))  # Save Value Network
            pickle.dump(
                scaler,
                open("{}/scaler_ep_{}.p".format(agent_path, episode), 'wb'))
            print("---- Saved Agent at Episode {} ----".format(episode))

            # Save video of current agent/policy
            if save_video:
                print("---- Saving Video at Episode {} ----".format(episode))
                _ = sim_agent(
                    env,
                    policy,
                    scaler,
                    num_episodes_sim,
                    save_video=True,
                    out_dir=aigym_path +
                    "/vid_ep_{}/{}_{}".format(episode, task_name, task))
                env.close()  # closes window open by monitor wrapper
                env, _, _ = init_gym(
                    env_name
                )  # Recreate env as it is killed when saving videos
            print("\n\n")

            # If Ctrl+C is pressed, ask the user whether training should be terminated
            if killer.kill_now:
                if input('Terminate training (y/[n])? ') == 'y':
                    break
                killer.kill_now = False

    # Terminate Sessions
    env.close()
    logger.close()
    policy.close_sess()
    val_func.close_sess()
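
Nearly every training loop in these examples polls killer.kill_now and then asks whether to terminate. GracefulKiller itself is never shown; a minimal sketch of the signal-handler pattern it is assumed to wrap:

import signal

class GracefulKiller:
    """Set kill_now when SIGINT/SIGTERM (e.g. Ctrl+C) arrives, instead of raising."""
    def __init__(self):
        self.kill_now = False
        signal.signal(signal.SIGINT, self._exit_gracefully)
        signal.signal(signal.SIGTERM, self._exit_gracefully)

    def _exit_gracefully(self, signum, frame):
        self.kill_now = True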
Example #19
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)
        batch_size: number of episodes per policy training batch
    """

    killer = GracefulKiller()
    env, obs_dim, act_dim = init_gym(env_name, False)
    if time_state:
        obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    now = datetime.utcnow().strftime(
        "%b-%d_%H-%M-%S")  # create unique directories
    logger = Logger(logname=env_name, now=now)

    scaler = Scaler(obs_dim, env_name)
    val_func = NNValueFunction(obs_dim, env_name, True)
    arg = [obs_dim, act_dim, kl_targ, time_state, env_name]
    policy = Policy(obs_dim, act_dim, kl_targ, env_name, True)

    episode = 0
    #progresses = None
    while episode < num_episodes:
        trajectories, progress = run_policy(env,
                                            policy,
                                            scaler,
                                            logger,
                                            arg,
                                            episodes=batch_size)
        #TODO change init setup
        try:
            progresses
        except NameError:
            progresses = progress
        else:
            progresses = np.concatenate([progresses, progress], 1)

        episode += len(trajectories)
        add_value(trajectories, val_func)  # add estimated values to episodes
        add_disc_sum_rew(trajectories,
                         gamma)  # calculated discounted sum of Rs
        add_gae(trajectories, gamma, lam)  # calculate advantage
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(
            trajectories)
        # add various stats to training log:
        log_batch_stats(observes, actions, advantages, disc_sum_rew, logger,
                        episode)
        policy.update(observes, actions, advantages, logger)  # update policy
        val_func.fit(observes, disc_sum_rew, logger)  # update value function
        logger.write(display=True)  # write logger results to file and stdout
        scaler.save()
        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False
    path = os.path.join('savedmodel/' + env_name)
    path = os.path.join(path, 'prog.dat')
    progresses.dump(path)
    logger.close()
    policy.close_sess()
    val_func.close_sess()
Example #20
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size, hid1_mult,
         policy_logvar, print_results, risk_targ):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)
        batch_size: number of episodes per policy training batch
        hid1_mult: hid1 size for policy and value_f (multiplier of obs dimension)
        policy_logvar: natural log of initial policy variance
    """
    killer = GracefulKiller()
    env, obs_dim, act_dim = init_gym(env_name)
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    now_utc = datetime.utcnow()  # create unique directories
    now = str(now_utc.day) + '-' + now_utc.strftime('%b') + '-' + str(
        now_utc.year) + '_' + str(
            ((now_utc.hour - 4) % 24)) + '.' + str(now_utc.minute) + '.' + str(
                now_utc.second)  # adjust for Montreal Time Zone
    logger = Logger(logname=env_name, now=now)
    aigym_path = os.path.join('/tmp', env_name, now)
    #env = wrappers.Monitor(env, aigym_path, force=True)
    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim, hid1_mult)
    policy = Policy(obs_dim, act_dim, kl_targ, hid1_mult, policy_logvar,
                    risk_targ, 'CVaR', batch_size, 1)
    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, policy, scaler, logger, episodes=5)
    episode = 0
    kl_terms = np.array([])
    beta_terms = np.array([])
    if print_results:
        rew_graph = np.array([])
        mean_rew_graph = np.array([])
    #big_li_rew_nodisc0 = np.array([])
    while episode < num_episodes:
        trajectories = run_policy(env,
                                  policy,
                                  scaler,
                                  logger,
                                  episodes=batch_size)
        episode += len(trajectories)
        add_value(trajectories, val_func)  # add estimated values to episodes
        #predicted_values_0 = [t['values'][0] for t in trajectories]
        add_disc_sum_rew(
            trajectories, gamma, scaler.mean_rew,
            np.sqrt(scaler.var_rew))  # calculated discounted sum of Rs
        add_gae(trajectories, gamma, lam, scaler.mean_rew,
                np.sqrt(scaler.var_rew))  # calculate advantage
        nodisc0 = -0.0001 * np.array(
            [t['rewards'].sum() for t in trajectories])  # scaled for gradients
        print(nodisc0)
        disc0 = [t['disc_sum_rew'][0] for t in trajectories]
        print('scaled sum rewards', nodisc0)
        observes, actions, advantages, disc_sum_rew = build_train_set(
            trajectories)
        # add various stats to training log:
        log_batch_stats(observes, actions, advantages, disc_sum_rew, logger,
                        episode)
        lamb = policy.update(observes, actions, advantages, nodisc0,
                             logger)  # update policy
        val_func.fit(observes, disc_sum_rew, logger)  # update value function
        logger.write(display=True)  # write logger results to file and stdout
        kl_terms = np.append(kl_terms, policy.check_kl)
        x1 = list(range(1, (len(kl_terms) + 1)))
        rewards = plt.plot(x1, kl_terms)
        plt.title('RAPPO')
        plt.xlabel("Episode")
        plt.ylabel("KL Divergence")
        plt.savefig("KL_curve.png")
        plt.close("KL_curve.png")
        beta_terms = np.append(beta_terms, policy.beta)
        x2 = list(range(1, (len(beta_terms) + 1)))
        mean_rewards = plt.plot(x2, beta_terms)
        plt.title('RAPPO')
        plt.xlabel("Batch")
        plt.ylabel("Beta Lagrange Multiplier")
        plt.savefig("lagrange_beta_curve.png")
        plt.close("lagrange_beta_curve.png")
        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False
        if print_results:
            rew_graph = np.append(rew_graph, disc0)
            x1 = list(range(1, (len(rew_graph) + 1)))
            rewards = plt.plot(x1, rew_graph)
            plt.title('RAPPO')
            plt.xlabel("Episode")
            plt.ylabel("Discounted sum of rewards")
            plt.savefig("learning_curve.png")
            plt.close()
            mean_rew_graph = np.append(mean_rew_graph, np.mean(disc0))
            x2 = list(range(1, (len(mean_rew_graph) + 1)))
            mean_rewards = plt.plot(x2, mean_rew_graph)
            plt.title('RAPPO')
            plt.xlabel("Batch")
            plt.ylabel("Mean of Last Batch")
            plt.savefig("learning_curve2.png")
            plt.close()
    if print_results:
        tr = run_policy(env, policy, scaler, logger, episodes=1000)
        sum_rewww = [t['rewards'].sum() for t in tr]
        hist_dat = np.array(sum_rewww)
        fig = plt.hist(hist_dat, bins=2000, edgecolor='b', linewidth=1.2)
        plt.title('RAPPO')
        plt.xlabel("Sum of Rewards")
        plt.ylabel("Frequency")
        plt.savefig("RA_ppo.png")
        plt.close()
        with open('sum_rew_final_policy.pkl', 'wb') as f:
            pickle.dump(sum_rewww, f)
        logger.final_log()
    logger.close()
    policy.close_sess()
    val_func.close_sess()
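
Example #20 logs both the per-update KL divergence and the beta Lagrange multiplier. The Policy class is not shown, but a common adaptive-KL-penalty rule consistent with a D_KL target looks like the sketch below (illustrative only, not necessarily the exact rule these projects use):

def adapt_kl_penalty(beta, kl, kl_targ, lr=None):
    # penalize harder when the observed KL overshoots the target, relax when it undershoots
    if kl > kl_targ * 2.0:
        beta = min(35.0, 1.5 * beta)
        if lr is not None and beta > 30.0:
            lr /= 1.5  # optionally back off the learning rate as well
    elif kl < kl_targ / 2.0:
        beta = max(1.0 / 35.0, beta / 1.5)
        if lr is not None and beta < 1.0 / 30.0:
            lr *= 1.5
    return beta, lr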
Example #21
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size, hid1_mult,
         policy_logvar, **kwargs):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)
        batch_size: number of episodes per policy training batch
        hid1_mult: hid1 size for policy and value_f (multiplier of obs dimension)
        policy_logvar: natural log of initial policy variance
    """
    memory = deque([])
    memory_size = kwargs['memory_size']
    killer = GracefulKiller()
    env, obs_dim, act_dim = init_gym(env_name)
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    now = datetime.utcnow().strftime(
        "%b-%d_%H:%M:%S")  # create unique directories
    logger = Logger(logname=env_name, now=now)
    aigym_path = os.path.join('/tmp', env_name, now)
    env = wrappers.Monitor(env, aigym_path, force=True)
    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim, hid1_mult)
    target_policy = Policy(obs_dim, act_dim, kl_targ, hid1_mult,
                           policy_logvar)  # kl_targ = 0?
    explore_policy = Policy(obs_dim, act_dim, kl_targ, hid1_mult,
                            policy_logvar)
    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, target_policy, scaler, logger, episodes=5, fix_drct_dist=0)
    run_policy(env,
               explore_policy,
               scaler,
               logger,
               episodes=5,
               fix_drct_dist=0)
    episode = 0
    fix_drct_dist_range = (0.3, 0)

    while episode < num_episodes:
        # save model
        if episode % 200 == 0:
            save_path = target_policy.saver.save(
                target_policy.sess,
                "/home/csc63182/testspace/models/halfcheetah-trpo/model-%d.ckpt"
                % (episode))

        # run a few episodes
        fix_drct_dist = (
            (episode * fix_drct_dist_range[1]) +
            (num_episodes - episode) * fix_drct_dist_range[0]) / num_episodes
        target_trajectories = run_policy(env,
                                         target_policy,
                                         scaler,
                                         logger,
                                         episodes=batch_size,
                                         fix_drct_dist=0)
        explore_trajectories = run_policy(env,
                                          explore_policy,
                                          scaler,
                                          logger,
                                          episodes=batch_size,
                                          fix_drct_dist=fix_drct_dist)

        # Add to memory
        n_explore = max(0, int(batch_size * (1 - episode / num_episodes)) - 1)
        trajectories = target_trajectories + explore_trajectories[:n_explore]
        episode += batch_size
        memory += trajectories
        while len(memory) > memory_size:
            memory.popleft()

        # train explore network
        add_value(explore_trajectories,
                  val_func)  # add estimated values to episodes
        add_disc_sum_rew(explore_trajectories,
                         gamma)  # calculated discounted sum of Rs
        add_gae(explore_trajectories, gamma, lam)  # calculate advantage
        observes, actions, advantages, disc_sum_rew = build_train_set(
            explore_trajectories)
        explore_policy.update(observes, actions, advantages,
                              logger)  # update policy
        val_func.fit(observes, disc_sum_rew, logger)  # update value function

        # train target network
        # re-sample trajectories
        trajectories = sample(memory, batch_size)
        add_value(trajectories, val_func)  # add estimated values to episodes
        add_disc_sum_rew(trajectories,
                         gamma)  # calculated discounted sum of Rs
        add_gae(trajectories, gamma, lam)  # calculate advantage
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(
            trajectories)
        # add various stats to training log:
        log_batch_stats(observes, actions, advantages, disc_sum_rew, logger,
                        episode)
        target_policy.update(observes, actions, advantages,
                             logger)  # update policy
        val_func.fit(observes, disc_sum_rew, logger)  # update value function
        logger.write(display=True)  # write logger results to file and stdout
        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False
    with open('rewards_%s.txt' % kwargs['log_postfix'], 'w') as f:
        for reward in rewards_record:
            f.write('%f\n' % reward)
    plt.plot((np.arange(len(rewards_record)) + 1) * batch_size, rewards_record)
    plt.savefig('learning_curve_%s.png' % kwargs['log_postfix'])
    logger.close()
    explore_policy.close_sess()
    target_policy.close_sess()
    val_func.close_sess()
Example #22
def main(num_episodes, gamma, lam, kl_targ, batch_size, hid1_mult,
         policy_logvar):
    """ Main training loop
    Args:
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)
        batch_size: number of episodes per policy training batch
        hid1_mult: hid1 size for policy and value_f (multiplier of obs dimension)
        policy_logvar: natural log of initial policy variance
    """
    killer = GracefulKiller()
    env, obs_dim, act_dim = init_gym()
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())

    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim, hid1_mult)
    policy = Policy(obs_dim, act_dim, kl_targ, hid1_mult, policy_logvar)
    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, policy, scaler, episodes=5)
    episode = 0
    # Initialize reward list (to keep track of improvements)
    avg_rew_list = []
    while episode < num_episodes:
        print(episode)
        trajectories = run_policy(env, policy, scaler, episodes=batch_size)
        episode += len(trajectories)
        add_value(trajectories, val_func)  # add estimated values to episodes
        add_disc_sum_rew(trajectories,
                         gamma)  # calculated discounted sum of Rs
        add_gae(trajectories, gamma, lam)  # calculate advantage
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(
            trajectories)
        # add various stats to training log:
        policy.update(observes, actions, advantages)  # update policy
        val_func.fit(observes, disc_sum_rew)  # update value function
        avg_rew_list.append(avg_rewards(trajectories))
        # Every 20000 episodes, save the models (value_func, policy, scaler) and average rewards
        if not episode % 20000:
            print("Saving models")
            policy.save(episode)
            val_func.save(episode)
            f = open("models/scaler-" + str(episode) + ".pkl", 'wb')
            pickle.dump(scaler, f, pickle.HIGHEST_PROTOCOL)
            f.close()
            f2 = open("models/rewards-" + str(episode) + ".pkl", 'wb')
            pickle.dump(deepcopy(avg_rew_list), f2, pickle.HIGHEST_PROTOCOL)
            f2.close()
        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False
    #Show animation at the end of training
    while True:
        obs = env.reset()
        step = 0.0
        scale, offset = scaler.get()
        scale[-1] = 1.0
        offset[-1] = 0.0
        done = False
        while not done:
            obs = obs.astype(np.float32).reshape((1, -1))
            obs = np.append(obs, [[step]], axis=1)
            obs = (obs - offset) * scale
            action = policy.sample(obs).reshape((1, -1)).astype(np.float32)
            obs, reward, done, _ = env.step(np.squeeze(action, axis=0))
            env.render1()
            env.render2()
            step += 1e-3
    policy.close_sess()
    val_func.close_sess()
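
The loop above checkpoints the scaler and the running reward list with pickle every 20000 episodes. A short sketch of reading those checkpoints back (20000 is just one example of a saved multiple of the interval):

import pickle

checkpoint = 20000  # any multiple of the save interval used above
with open("models/scaler-" + str(checkpoint) + ".pkl", 'rb') as f:
    scaler = pickle.load(f)
with open("models/rewards-" + str(checkpoint) + ".pkl", 'rb') as f:
    avg_rew_list = pickle.load(f)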
Example #23
File: train_v0.py Project: parksang21/RL
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size, hid1_mult,
         policy_logvar, clipping_range):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)
        batch_size: number of episodes per policy training batch
        hid1_mult: hid1 size for policy and value_f (multiplier of obs dimension)
        policy_logvar: natural log of initial policy variance
    """

    killer = GracefulKiller()
    env, obs_dim, act_dim = init_gym(env_name)
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    now = datetime.utcnow().strftime(
        "%b-%d_%H:%M:%S")  # create unique directories
    logger = Logger(logname=env_name, now=now)

    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim, hid1_mult)
    policy = Policy(obs_dim, act_dim, kl_targ, hid1_mult, policy_logvar,
                    clipping_range)
    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, policy, scaler, logger, episodes=5)
    episode = 0

    while episode < num_episodes:
        trajectories = run_policy(env,
                                  policy,
                                  scaler,
                                  logger,
                                  episodes=batch_size)
        episode += len(trajectories)
        add_value(trajectories, val_func)  # add estimated values to episodes
        add_disc_sum_rew(trajectories,
                         gamma)  # calculated discounted sum of Rs
        add_gae(trajectories, gamma, lam)  # calculate advantage
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(
            trajectories)
        # add various stats to training log:
        log_batch_stats(observes, actions, advantages, disc_sum_rew, logger,
                        episode)
        policy.update(observes, actions, advantages, logger)  # update policy
        val_func.fit(observes, disc_sum_rew, logger)  # update value function
        logger.write(display=True)  # write logger results to file and stdout

        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False

        if episode % 100 == 0:
            policy.save_sess()

    logger.close()
    policy.close_sess()
    val_func.close_sess()
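
This variant passes a clipping_range into Policy, i.e. the clipped-surrogate form of PPO rather than the adaptive-KL-penalty form. As a reference, a minimal NumPy sketch of the standard clipped objective (the actual Policy implementation is not shown here):

import numpy as np

def ppo_clipped_objective(ratio, advantages, clipping_range=0.2):
    # ratio = pi_new(a|s) / pi_old(a|s); take the elementwise minimum of the
    # unclipped and clipped surrogate terms and average over the batch
    unclipped = ratio * advantages
    clipped = np.clip(ratio, 1.0 - clipping_range, 1.0 + clipping_range) * advantages
    return np.mean(np.minimum(unclipped, clipped))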
Example #24
def main(env_name, num_iterations, gamma, lam, kl_targ, batch_size, hid1_mult,
         policy_logvar, coef, use_lr_adjust, ada_kl_penalty, seed, epochs,
         phi_epochs, max_timesteps, reg_scale, phi_lr, phi_hs, policy_size,
         phi_obj):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_iterations: maximum number of iterations to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)
        batch_size: number of episodes per policy training batch
        hid1_mult: hid1 size for policy and value_f (multiplier of obs dimension)
        policy_logvar: natural log of initial policy variance
        coef: coefficient of Stein control variate
        use_lr_adjust: whether adjust lr based on kl
        ada_kl_penalty: whether adjust kl penalty
        max_timesteps: maximum time steps per trajectory
        reg_scale: regularization coefficient 
        policy_size: policy network size
        phi_obj: FitQ or MinVar
    """

    env, obs_dim, act_dim = init_gym(env_name)
    set_global_seeds(seed)
    env.seed(seed)
    env._max_episode_steps = max_timesteps
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())

    now = datetime.utcnow().strftime("%b-%d_%H:%M:%S")
    aigym_path = os.path.join('log-files/', env_name, now)
    env = wrappers.Monitor(env, aigym_path, force=True, video_callable=False)

    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim, hid1_mult)

    policy = Policy(obs_dim,
                    act_dim,
                    kl_targ,
                    hid1_mult,
                    policy_logvar,
                    epochs,
                    phi_epochs,
                    policy_size=policy_size,
                    phi_hidden_sizes=phi_hs,
                    c_ph=coef,
                    reg_scale=reg_scale,
                    lr_phi=phi_lr,
                    phi_obj=phi_obj)

    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env,
               policy,
               scaler,
               batch_size=1000,
               max_timesteps=max_timesteps)

    for _ in range(num_iterations):
        logger.log("\n#Training Iter %d" % (_))
        logger.log("Draw Samples..")

        trajectories = run_policy(env,
                                  policy,
                                  scaler,
                                  batch_size=batch_size,
                                  max_timesteps=max_timesteps)

        add_value(trajectories, val_func)  # add estimated values to episodes
        add_disc_sum_rew(trajectories,
                         gamma)  # calculated discounted sum of Rs
        add_gae(trajectories, gamma, lam)  # calculate advantage

        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(
            trajectories)

        # add various stats to training log:
        log_batch_stats(observes, actions, advantages, disc_sum_rew)

        logger.log("Starting Training...")
        policy.update(observes, actions, advantages, \
                use_lr_adjust, ada_kl_penalty)  # update policy

        val_func.fit(observes, disc_sum_rew)  # update value function

        logger.log('--------------------------------\n')

    policy.close_sess()
    val_func.close_sess()
Example #25
class Experiment:
    def __init__(self, env_name, discount, num_iterations, lamb, animate,
                 kl_target, show):
        self.env_name = env_name
        self.env = gym.make(env_name)
        if env_name == "FetchReach-v0":
            self.env = gym.wrappers.FlattenDictWrapper(
                self.env, ['observation', 'desired_goal', 'achieved_goal'])
        gym.spaces.seed(1234)
        self.obs_dim = self.env.observation_space.shape[
            0] + 1  # adding time step as feature
        self.act_dim = self.env.action_space.shape[0]
        self.discount = discount
        self.num_iterations = num_iterations
        self.lamb = lamb
        self.animate = animate

        self.buffer = Buffer(50000, self.obs_dim, self.act_dim)
        self.episodes = 20
        self.killer = GracefulKiller()

        self.policy = QPropPolicy(self.obs_dim,
                                  self.act_dim,
                                  self.env.action_space,
                                  kl_target,
                                  epochs=20)
        self.critic = DeterministicCritic(self.obs_dim, self.act_dim,
                                          self.discount, OUTPATH)
        # using MC return would be more helpful
        self.value_func = l2TargetValueFunc(self.obs_dim, epochs=10)

        if not show:
            # save copies of file
            shutil.copy(inspect.getfile(self.policy.__class__), OUTPATH)
            shutil.copy(inspect.getfile(self.value_func.__class__), OUTPATH)
            shutil.copy(inspect.getfile(self.__class__), OUTPATH)

            self.log_file = open(OUTPATH + 'log.csv', 'w')
            self.write_header = True

        print('observation dimension:', self.obs_dim)
        print('action dimension:', self.act_dim)

        # Use of a scaler is crucial
        self.scaler = Scaler(self.obs_dim)
        self.init_scaler()

    def init_scaler(self):
        """
        5 episodes empirically determined.
        :return:
        """
        print('Fitting scaler')
        observation_samples = []
        for i in range(5):
            observation = []
            obs = self.env.reset()
            observation.append(obs)
            obs = obs.astype(np.float64).reshape((1, -1))
            done = False
            step = 0
            while not done:
                obs = np.append(obs, [[step]], axis=1)  # add time step feature
                action = self.policy.get_sample(obs).reshape(
                    (1, -1)).astype(np.float64)
                if self.env_name == "FetchReach-v0":
                    obs_new, reward, done, _ = self.env.step(
                        action.reshape(-1))
                else:
                    obs_new, reward, done, _ = self.env.step(action)
                observation.append(obs_new)
                obs = obs_new.astype(np.float64).reshape((1, -1))
                step += 1e-3
            observation_samples.append(observation)
        observation_samples = np.concatenate(observation_samples, axis=0)
        self.scaler.update(observation_samples)

    def normalize_obs(self, obs):
        """
        Normalize an observation with the current scaler statistics and update the scaler on the fly.
        :param obs: raw observation, shaped (1, obs_dim)
        :return: the scaled observation
        """
        scale, offset = self.scaler.get()
        obs_scaled = (obs - offset) * scale
        self.scaler.update(obs.astype(np.float64).reshape((1, -1)))
        return obs_scaled
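
    # Note (assumption, not part of the original class): Scaler is taken to keep running
    # per-dimension statistics over everything passed to update(), with offset ~ running
    # mean and scale ~ 1 / (running std + epsilon), so (obs - offset) * scale is roughly
    # zero-mean and unit-variance. Updating with the raw observation right after
    # normalizing, as above, keeps those statistics current as the policy changes.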

    def run_one_episode(self):
        """
        Run a single episode with the current policy and collect the data only.
        :return: (observes, actions, rewards) arrays for the episode
        """
        obs = self.env.reset()
        observes, actions, rewards = [], [], []
        done = False
        step = 0
        while not done:
            if self.animate:
                self.env.render()
            obs = obs.astype(np.float64).reshape((1, -1))
            obs = self.normalize_obs(obs)
            obs = np.append(obs, [[step]], axis=1)  # add time step feature
            observes.append(obs)
            action = self.policy.get_sample(obs).reshape(
                (1, -1)).astype(np.float64)
            actions.append(action)
            if self.env_name == "FetchReach-v0":
                obs_new, reward, done, _ = self.env.step(action.reshape(-1))
            else:
                obs_new, reward, done, _ = self.env.step(action)
            if not isinstance(reward, float):
                reward = np.asscalar(reward)
            rewards.append(reward)

            obs = obs_new
            step += 0.003

        return np.concatenate(observes), np.concatenate(actions), np.array(
            rewards)

    def discounted_sum(self, l, factor):
        # discounted cumulative sum, computed back-to-front
        discounted = []
        running = 0.0
        for x in reversed(l):
            running = factor * running + x
            discounted.append(running)
        return np.array(list(reversed(discounted)))
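
    # Equivalent vectorized form (sketch): the same discounted cumulative sum can be
    # computed without the Python loop as
    #     from scipy.signal import lfilter
    #     lfilter([1.0], [1.0, -factor], x[::-1])[::-1]
    # which is convenient for long episodes.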

    def run_policy(self, episodes):
        """
        gather a batch of samples.
        :param episodes:
        :return:
        """
        trajectories = []
        for e in range(episodes):
            observes, actions, rewards = self.run_one_episode()
            trajectory = {
                'observes': observes,
                'actions': actions,
                'rewards': rewards,
                'scaled_rewards': rewards * (1 - self.discount)
            }
            trajectories.append(trajectory)

        return trajectories

    def run_expr(self):
        ep_steps = []
        ep_rewards = []
        ep_entropy = []
        i = 0
        while i < self.num_iterations:
            trajectories = self.run_policy(20)
            # add to experience replay buffer
            self.buffer.append(trajectories)
            print('buffer size:', self.buffer.size())

            i += len(trajectories)

            # for E=20, T=50, the total number of samples would be 1000
            # In the future this needs to account for non-uniform episode lengths,
            # e.g. in the Hopper-v2 environment not every episode has the same number of time steps
            # E = len(trajectories)
            # num_samples = np.sum([len(t['rewards']) for t in trajectories])
            gradient_steps = np.sum([len(t['rewards']) for t in trajectories])
            if self.env_name == "FetchReach-v0":
                assert (gradient_steps == 20 * 50)
            """train critic"""
            # train all samples in the buffer, to the extreme
            # self.critic.fit(self.policy, self.buffer, epochs=10, num_samples=self.buffer.size())
            # train some samples minibatches only
            self.critic.another_fit_func(self.policy, self.buffer, 5000)
            """calculation of episodic discounted return only needs rewards"""
            mc_returns = np.concatenate([
                self.discounted_sum(t['scaled_rewards'], self.discount)
                for t in trajectories
            ])
            """using current batch of samples to update baseline"""
            observes = np.concatenate([t['observes'] for t in trajectories])
            actions = np.concatenate([t['actions'] for t in trajectories])
            value_func_loss = self.value_func.update(observes, mc_returns)
            """compute GAE"""
            for t in trajectories:
                t['values'] = self.value_func.predict(t['observes'])
                # Is it really legitimate to insert 0 as the value after the last obs?
                t['td_residual'] = t[
                    'scaled_rewards'] + self.discount * np.append(
                        t['values'][1:], 0) - t['values']
                t['gae'] = self.discounted_sum(t['td_residual'],
                                               self.discount * self.lamb)
            advantages = np.concatenate([t['gae'] for t in trajectories])
            advantages = (advantages - advantages.mean()) / (advantages.std() +
                                                             1e-6)
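            # GAE in a nutshell: the TD residual is delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)
            # (with the value after the final step taken as 0 above), and the advantage is the
            # discounted sum A_t = sum_k (gamma * lambda)^k * delta_{t+k}, which discounted_sum()
            # computes with factor gamma * lambda. Advantages are then standardized over the
            # batch to stabilize the policy update.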
            """compute control variate""" ""
            cv = self.critic.get_contorl_variate(self.policy, observes,
                                                 actions)
            """conservative control variate"""
            eta = [1 if i > 0 else 0 for i in advantages * cv]
            """center learning signal"""
            # advantages and the control variate should both be of size E*T
            # eta toggles the control variate on and off per sample
            learning_signal = advantages - eta * cv
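            # eta_i is 1 only when the advantage and the control variate agree in sign
            # (A_i * cv_i > 0); only then is the variate subtracted from the learning signal,
            # a conservative control variate in the spirit of Q-Prop / Stein control variates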
            # learning_signal = (learning_signal - learning_signal.mean()) / (learning_signal.std() + 1e-6)
            """controlled taylor eval term"""
            ctrl_taylor = np.concatenate(
                [[eta[i] * act] for i, act in enumerate(
                    self.critic.get_taylor_eval(self.policy, observes))])

            policy_loss, kl, entropy, beta = self.policy.update(
                observes, actions, learning_signal, ctrl_taylor)

            # normalize advantage estimates
            # advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-6)

            avg_rewards = np.sum(
                np.concatenate([t['rewards']
                                for t in trajectories])) / self.episodes
            avg_timesteps = np.average(
                [len(t['rewards']) for t in trajectories])
            log = {}

            # compute statistics such as mean and std
            log['steps'] = avg_timesteps
            log['rewards'] = avg_rewards
            log['policy_loss'] = policy_loss
            log['kl'] = kl
            log['entropy'] = entropy
            log['value_func_loss'] = value_func_loss
            log['beta'] = beta

            # display
            print('episode: ', i)
            print('average steps: {0}, average rewards: {1}'.format(
                log['steps'], log['rewards']))
            for key in [
                    'policy_loss', 'kl', 'entropy', 'beta', 'value_func_loss'
            ]:
                print('{:s}: {:.2g}'.format(key, log[key]))
            print('\n')
            ep_steps.append(log['steps'])
            ep_rewards.append(log['rewards'])
            ep_entropy.append(log['entropy'])

            # write to log.csv
            if self.write_header:
                fieldnames = [x for x in log.keys()]
                self.writer = csv.DictWriter(self.log_file,
                                             fieldnames=fieldnames)
                self.writer.writeheader()
                self.write_header = False
            self.writer.writerow(log)
            # we want the csv file to preserve information even if the program terminates earlier than scheduled.
            self.log_file.flush()

            # save model weights if stopped manually
            if self.killer.kill_now:
                if input('Terminate training (y/[n])? ') == 'y':
                    break
                self.killer.kill_now = False

            # if (i+1)%20 == 0:
            #     print('episode: ', i+1)
            #     print('average steps', np.average(steps))
            #     print('average rewards', np.average(rewards))

        self.policy.save(OUTPATH)
        self.value_func.save(OUTPATH)
        self.scaler.save(OUTPATH)

        plt.figure(figsize=(12, 9))

        if self.env_name.startswith('Fetch'):
            ax1 = plt.subplot(121)
            plt.xlabel('episodes')
            plt.ylabel('policy entropy')
            plt.plot(ep_entropy)
            scale_x = self.episodes
            ticks_x = ticker.FuncFormatter(
                lambda x, pos: '{0:g}'.format(x * scale_x))
            ax1.xaxis.set_major_formatter(ticks_x)
        else:
            ax1 = plt.subplot(121)
            plt.xlabel('episodes')
            plt.ylabel('steps')
            plt.plot(ep_steps)
            scale_x = self.episodes
            ticks_x = ticker.FuncFormatter(
                lambda x, pos: '{0:g}'.format(x * scale_x))
            ax1.xaxis.set_major_formatter(ticks_x)

        ax2 = plt.subplot(122)
        plt.xlabel('episodes')
        plt.ylabel('episodic rewards')
        plt.plot(ep_rewards)
        scale_x = self.episodes
        ticks_x = ticker.FuncFormatter(
            lambda x, pos: '{0:g}'.format(x * scale_x))
        ax2.xaxis.set_major_formatter(ticks_x)

        plt.savefig(OUTPATH + 'train.png')

    def load_model(self, load_from):
        from tensorflow.python.tools import inspect_checkpoint as chkp

        # # print all tensors in checkpoint file
        # chkp.print_tensors_in_checkpoint_file(load_from+'policy/policy.pl', tensor_name='', all_tensors=True, all_tensor_names=True)
        self.policy.load(load_from + 'policy/policy.pl')
        self.value_func.load(load_from + 'value_func/value_func.pl')

    def demonstrate_agent(self, load_from):
        self.load_model(load_from)
        with open(load_from + "scaler.pkl", 'rb') as file:
            self.scaler = pickle.load(file)
        self.animate = True
        for i in range(10):
            observes, actions, rewards = self.run_one_episode()
            ep_rewards = np.sum(rewards)
            ep_steps = len(rewards)
            print("Total steps: {0}, total rewards: {1}\n".format(
                ep_steps, ep_rewards))
Example #26
0
class Agent:
    #Warning! policy.py and critic.py are still work in progress and contain many global variables that should be converted to
    #class member variables. Before that is done, all instances of Agent must use the same values for the following:
    #PPOepsilon,nHidden,nUnitsPerLayer,activation,H,entropyLossWeight,sdLowLimit
    def __init__(self,
                 stateDim: int,
                 actionDim: int,
                 actionMin: np.array,
                 actionMax: np.array,
                 learningRate=0.0005,
                 gamma=0.99,
                 GAElambda=0.95,
                 PPOepsilon=0.2,
                 PPOentropyLossWeight=0,
                 nHidden: int = 2,
                 nUnitsPerLayer: int = 128,
                 mode="PPO-CMA-m",
                 activation="lrelu",
                 H: int = 9,
                 entropyLossWeight: float = 0,
                 sdLowLimit=0.01,
                 useScaler: bool = True,
                 criticTimestepScale=0.001):
        #Create policy network
        print("Creating policy")
        self.actionMin = actionMin.copy()
        self.actionMax = actionMax.copy()
        self.actionDim = actionDim
        self.stateDim = stateDim
        self.useScaler = useScaler
        if useScaler:
            self.scaler = Scaler(stateDim)
        self.scalerInitialized = False
        self.normalizeAdvantages = True
        self.gamma = gamma
        self.GAElambda = GAElambda
        self.criticTimestepScale = 0 if gamma == 0 else criticTimestepScale  #with gamma==0, no need for this
        piEpsilon = None
        nHistory = 1
        negativeAdvantageAvoidanceSigma = 0
        if mode == "PPO-CMA" or mode == "PPO-CMA-m":
            usePPOLoss = False  #if True, we use PPO's clipped surrogate loss function instead of the standard -A_i * log(pi(a_i | s_i))
            separateVarAdapt = True
            self.reluAdvantages = True if mode == "PPO-CMA" else False
            nHistory = H  #policy mean adapts immediately, policy covariance as an aggregate of this many past iterations
            useSigmaSoftClip = True
            negativeAdvantageAvoidanceSigma = 1 if mode == "PPO-CMA-m" else 0
        elif mode == "PPO":
            usePPOLoss = True  #if True, we use PPO's clipped surrogate loss function instead of the standard -A_i * log(pi(a_i | s_i))
            separateVarAdapt = False
            # separateSigmaAdapt=False
            self.reluAdvantages = False
            useSigmaSoftClip = True
            piEpsilon = 0
        else:
            raise ("Unknown mode {}".format(mode))
        self.policy = Policy(
            stateDim,
            actionDim,
            actionMin,
            actionMax,
            entropyLossWeight=PPOentropyLossWeight,
            networkActivation=activation,
            networkDepth=nHidden,
            networkUnits=nUnitsPerLayer,
            networkSkips=False,
            learningRate=learningRate,
            minSigma=sdLowLimit,
            PPOepsilon=PPOepsilon,
            usePPOLoss=usePPOLoss,
            separateVarAdapt=separateVarAdapt,
            nHistory=nHistory,
            useSigmaSoftClip=useSigmaSoftClip,
            piEpsilon=piEpsilon,
            negativeAdvantageAvoidanceSigma=negativeAdvantageAvoidanceSigma)

        #Create critic network, +1 stateDim because at least in OpenAI gym, episodes are time-limited and the value estimates thus depend on simulation time.
        #Thus, we use time step as an additional feature for the critic.
        #Note that this does not mess up generalization, as the feature is not used for the policy during training or at runtime
        print("Creating critic network")
        self.critic = Critic(stateDim=stateDim + 1,
                             learningRate=learningRate,
                             nHidden=nHidden,
                             networkUnits=nUnitsPerLayer,
                             networkActivation=activation,
                             useSkips=False,
                             lossType="L1")

        #Experience trajectory buffers for the memorize() and updateWithMemorized() methods
        self.experienceTrajectories = []
        self.currentTrajectory = []

    #call this after tensorflow's global variables initializer
    def init(self, sess: tf.Session, verbose=False):
        #Pretrain the policy to output the initial Gaussian for all states
        self.policy.init(
            sess, 0, 1,
            0.5 * (self.actionMin + self.actionMax) * np.ones(self.actionDim),
            0.5 * (self.actionMax - self.actionMin) * np.ones(self.actionDim),
            256, 2000, verbose)
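        # i.e. the initial policy mean is the midpoint of the action range and the initial sd is
        # half the range, so the exploration Gaussian roughly covers the whole action space;
        # the trailing 256 and 2000 are presumably the pretraining batch size and iteration count
        # (their exact meaning is not shown in this snippet)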

    #stateObs is an n-by-m tensor, where n = number of observations, m = number of observation variables
    def act(self,
            sess: tf.Session,
            stateObs: np.array,
            deterministic=False,
            clipActionToLimits=True):
        #Expand a single 1d-observation into a batch of 1 vectors
        if len(stateObs.shape) == 1:
            stateObs = np.reshape(stateObs, [1, stateObs.shape[0]])
        #Query the policy for the action, except for the first iteration where we sample directly from the initial exploration Gaussian
        #that covers the whole action space.
        #This is done because we don't know the scale of state observations a priori; thus, we can only init the state scaler in update(),
        #after we have collected some experience.
        if self.useScaler and (not self.scalerInitialized):
            actions = np.random.normal(
                0.5 * (self.actionMin + self.actionMax) *
                np.ones(self.actionDim),
                0.5 * (self.actionMax - self.actionMin) *
                np.ones(self.actionDim),
                size=[stateObs.shape[0], self.actionDim])
            if clipActionToLimits:
                actions = np.clip(
                    actions, np.reshape(self.actionMin, [1, self.actionDim]),
                    np.reshape(self.actionMax, [1, self.actionDim]))
            return actions
        else:
            if self.useScaler:
                scaledObs = self.scaler.process(stateObs)
            else:
                scaledObs = stateObs
            if deterministic:
                actions = self.policy.getExpectation(sess, scaledObs)
            else:
                actions = self.policy.sample(sess, scaledObs)
            if clipActionToLimits:
                actions = np.clip(actions, self.actionMin, self.actionMax)
            return actions

    def memorize(self, observation: np.array, action: np.array, reward: float,
                 nextObservation: np.array, done: bool):
        e = Experience(observation, action, reward, nextObservation, done)
        self.currentTrajectory.append(e)
        if done:
            self.experienceTrajectories.append(self.currentTrajectory)
            self.currentTrajectory = []

    def getAverageActionStdev(self):
        if self.useScaler and (not self.scalerInitialized):
            return np.mean(0.5 * (self.actionMax - self.actionMin))
        else:
            return self.policy.usedSigmaSum / (1e-20 +
                                               self.policy.usedSigmaSumCounter)

    #If you call memorize() after each action, you can update the agent with this method.
    #If you handle the experience buffers yourself, e.g., due to a multithreaded implementation, use the update() method instead.
    def updateWithMemorized(self,
                            sess: tf.Session,
                            batchSize: int = 512,
                            nBatches: int = 100,
                            verbose=True,
                            valuesValid=False,
                            timestepsValid=False):
        self.update(sess,
                    experienceTrajectories=self.experienceTrajectories,
                    batchSize=batchSize,
                    nBatches=nBatches,
                    verbose=verbose,
                    valuesValid=valuesValid,
                    timestepsValid=timestepsValid)
        averageEpisodeReturn = 0
        for t in self.experienceTrajectories:
            episodeReturn = 0
            for e in t:
                episodeReturn += e.r
            averageEpisodeReturn += episodeReturn
        averageEpisodeReturn /= len(self.experienceTrajectories)
        self.experienceTrajectories = []
        self.currentTrajectory = []
        return averageEpisodeReturn

    #experienceTrajectories is a list of lists of Experience instances such that each of the contained lists corresponds to an episode simulation trajectory
    def update(self,
               sess: tf.Session,
               experienceTrajectories,
               batchSize: int = 512,
               nBatches: int = 100,
               verbose=True,
               valuesValid=False,
               timestepsValid=False):
        trajectories = experienceTrajectories  #shorthand

        #Collect all data into linear arrays for training.
        nTrajectories = len(trajectories)
        nData = 0
        for trajectory in trajectories:
            nData += len(trajectory)
            #propagate values backwards along trajectory if not already done
            if not valuesValid:
                for i in reversed(range(len(trajectory) - 1)):
                    #value estimates, used for training the critic and estimating advantages
                    trajectory[i].V = trajectory[
                        i].r + self.gamma * trajectory[i + 1].V
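                # i.e. V_i = r_i + gamma * V_{i+1}, propagated backwards along the trajectory;
                # this assumes the final experience's V was already set (e.g. to its reward or a
                # bootstrap value) before update() is called, which is not shown in this snippet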
            #update time steps if not updated
            if not timestepsValid:
                for i in range(len(trajectory)):
                    trajectory[i].timeStep = i
        allStates = np.zeros([nData, self.stateDim])
        allActions = np.zeros([nData, self.actionDim])
        allValues = np.zeros([nData])
        allTimes = np.zeros([nData, 1])
        k = 0
        for trajectory in trajectories:
            for e in trajectory:
                allStates[k, :] = e.s
                allValues[k] = e.V
                allActions[k, :] = e.a
                allTimes[k, 0] = e.timeStep * self.criticTimestepScale
                k += 1

        #Update scalers
        if self.useScaler:
            self.scaler.update(allStates)
            scale, offset = self.scaler.get()
            self.scalerInitialized = True
        else:
            offset = 0
            scale = 1

        #Scale the observations for training the critic
        scaledStates = self.scaler.process(allStates)

        #Train critic
        def augmentCriticObs(obs: np.array, timeSteps: np.array):
            return np.concatenate([obs, timeSteps], axis=1)

        self.critic.train(sess,
                          augmentCriticObs(scaledStates, allTimes),
                          allValues,
                          batchSize,
                          nEpochs=0,
                          nBatches=nBatches,
                          verbose=verbose)

        #Policy training needs advantages, which depend on the critic we just trained.
        #We use Generalized Advantage Estimation by Schulman et al.
        if verbose:
            print("Estimating advantages...".format(len(trajectories)))
        for t in trajectories:
            #query the critic values of all states of this trajectory in one big batch
            nSteps = len(t)
            states = np.zeros([nSteps + 1, self.stateDim])
            timeSteps = np.zeros([nSteps + 1, 1])
            for i in range(nSteps):
                states[i, :] = t[i].s
                timeSteps[i, 0] = t[i].timeStep * self.criticTimestepScale
            states[nSteps, :] = t[nSteps - 1].s_next
            states = (states - offset) * scale
            values = self.critic.predict(sess,
                                         augmentCriticObs(states, timeSteps))

            #GAE loop, i.e., take the instantaneous advantage (how much value a single action brings, assuming that the
            #values given by the critic are unbiased), and smooth those along the trajectory using 1st-order IIR filter.
            for step in reversed(range(nSteps - 1)):
                delta_t = t[step].r + self.gamma * values[step +
                                                          1] - values[step]
                t[step].advantage = delta_t + self.GAElambda * self.gamma * t[
                    step + 1].advantage
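                # GAE recurrence: A_t = delta_t + gamma * lambda * A_{t+1}. Note the loop starts
                # at nSteps - 2, so t[nSteps - 1].advantage is assumed to have been initialized
                # elsewhere (e.g. by the Experience class); that initialization is not shown here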

        #Gather the advantages to linear array and apply ReLU and normalization if needed
        allAdvantages = np.zeros([nData])
        k = 0
        for trajectory in trajectories:
            for e in trajectory:
                allAdvantages[k] = e.advantage
                k += 1

        if self.reluAdvantages:
            allAdvantages = np.clip(allAdvantages, 0, np.inf)
        if self.normalizeAdvantages:
            aMean = np.mean(allAdvantages)
            aSd = np.std(allAdvantages)
            if verbose:
                print("Advantage mean {}, sd{}".format(aMean, aSd))
            allAdvantages /= 1e-10 + aSd
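            # note: only the scale is normalized (division by the sd); the mean is not subtracted,
            # so advantages keep their sign, which the sign-based logic above (ReLU'd advantages,
            # negative-advantage avoidance) presumably relies on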

        #Train policy. Note that this uses original unscaled states, because the PPO-CMA variance training needs a history of
        #states in the same scale
        self.policy.train(sess,
                          allStates,
                          allActions,
                          allAdvantages,
                          batchSize,
                          nEpochs=0,
                          nBatches=nBatches,
                          stateOffset=offset,
                          stateScale=scale,
                          verbose=verbose)
Example #27
0
def main(env_name, num_episodes, gamma, lamda, kl_targ, clipping_range, pol_loss_type, batch_size, init_pol_logvar, animate,\
        save_video, save_rate, num_episodes_sim, task_params, task_name, dims_core_hid, dims_head_hid, act_func_name,\
        episode_to_load, now_to_load):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lamda: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        clipping_range: max value to clip the policy gradient ratio
        pol_loss_type: string determining which type of loss to use for the Policy Network
        batch_size: number of episodes per policy training batch
        init_pol_logvar: natural log of initial policy variance
        animate: Boolean determining whether to render the environment during training
        save_video: Boolean determining if videos of the agent will be saved
        save_rate: Int determining how often (in episodes) videos are saved
        num_episodes_sim: Number of episodes to simulate/save videos for
        task_params: list of parameters to modify each environment for a different task
        task_name: name user assigns to the task being used to modify the environment
        dims_core_hid: list of hidden-layer sizes for the shared (core) network
        dims_head_hid: list of hidden-layer sizes for the task-specific heads
        act_func_name: name of the activation function
        episode_to_load: checkpoint episode to resume from (-1 loads the last saved one)
        now_to_load: date-time string of the experiment to resume from
    """


    # ****************  Environment Initialization and Paths  ***************
    task_params_str = ''.join(str(e) +', ' for e in task_params)
    num_tasks = len(task_params)
    envs = [None]*num_tasks
    scalers = [None]*num_tasks
    loggers = [None]*num_tasks

    print ("\n\n------ PATHS: ------")
    start_time = datetime.now()
    if episode_to_load is None: now = start_time.strftime("%b-%d_%H:%M:%S") # If NOT loading from Checkpoint -> used to create unique directories
    else:
        assert now_to_load is not None,\
            "\n\nWARNING: Date time to load ({}) was not provided. Please provide a valid date time of an experiment".format(now_to_load)
        now = now_to_load
    logs_path = os.path.join('log-files', env_name, task_name, task_params_str, now)

    for task in range(num_tasks):
        # Create task specific environment 
        envs[task], obs_dim, act_dim = init_gym(env_name, task_param = task_params[task])
        obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())

        # Create task specific Paths and logger object
        loggers[task] = Logger(logname= [env_name, task_name, task_params_str], now=now, \
                               logname_file= "_{}_{}".format(task_name, task_params[task])) 

        if episode_to_load is None: # If NOT loading from Checkpoint
            scalers[task] = Scaler(obs_dim)            

            # Auxiliary saver (because logger sometimes fails or takes too much time)
            with open(logs_path + '/aux_{}_{}.txt'.format(task_name, task_params[task]), 'w') as f: 
                f.write("_Episode" + "  " + "_MeanReward")

        
    aigym_path= os.path.join('./videos', env_name, task_name, task_params_str, now) # videos folders 
    agent_path = os.path.join('agents', env_name , task_name, task_params_str, now) # agent / policy folders  
    if episode_to_load is None: # If NOT loading from Checkpoint 
        os.makedirs(agent_path)
        with open(agent_path + '/commandline_args.txt', 'w') as f: f.write(' '.join(sys.argv[1:]))  # save commandline command
        with open(logs_path + '/commandline_args.txt', 'w') as f: f.write(' '.join(sys.argv[1:]))  # save commandline command

    print("\nPath for Saved Videos : {}".format(aigym_path)) 
    print("Path for Saved Agents: {}\n".format(agent_path))    


    # ****************  Initialize Policy, Value Networks and Scaler  ***************
    print ("\n\n------ NEURAL NETWORKS: ------")
    dims_core_hid.insert(0, obs_dim) # Modify dims list to have the size of the layer 'n-1' at position '0'
    dims_head_hid.insert(0, dims_head_hid[-1])
    
    val_func = NNValueFunction(obs_dim, dims_core_hid, dims_head_hid, num_tasks)#, act_func_name)
    policy = Policy(obs_dim, act_dim, dims_core_hid, dims_head_hid, num_tasks, pol_loss_type = pol_loss_type)

    # Load from Checkpoint:
    # Validate intended episode to load OR get last episode number if no target load episode was provided
    if episode_to_load is not None:
        load_agent_path = agent_path # agent / policy folders
        saved_ep_list = [file.split(".")[0].split("_")[-1] for file in os.listdir(load_agent_path) if "policy" in file]

        if episode_to_load == -1: # Get last saved episode
            episode_to_load = sorted([int(ep_string) for ep_string in saved_ep_list])[-1]

        else: # Validate if episode_to_load was indeed saved 
            assert str(episode_to_load) in saved_ep_list,\
            "\n\nWARNING: Episode you want to load ({}) was not stored during trainning".format(episode_to_load)

        # Load Policy Network's Ops and Variables & Load Scaler Object
        policy.tf_saver.restore(policy.sess, "{}/policy_ep_{}".format(load_agent_path, episode_to_load)) 
        val_func.tf_saver.restore(val_func.sess, "{}/val_func_ep_{}".format(load_agent_path, episode_to_load))
        scalers = pickle.load(open("{}/scalers_ep_{}.p".format(load_agent_path, episode_to_load), 'rb'))         
        print("\n\n ---- CHECKPOINT LOAD:  Episoded Loaded **{}**".format(episode_to_load))

        # Delete extra epochs that were logged to the auxiliary logs
        for task in range(num_tasks):
            aux_log_path = logs_path + '/aux_{}_{}.txt'.format(task_name, task_params[task])
            aux_log = pd.read_table(aux_log_path, delim_whitespace=True)
            idx_to_cut = aux_log.index[aux_log["_Episode"] == episode_to_load ].tolist()[0]
            aux_log[0:idx_to_cut+1].to_csv(aux_log_path, header=True, index=False, sep=' ', mode='w') # overwrite trimmed aux_log


    # If NOT loading from Checkpoint: run some episodes to initialize scalers and create Tensor board dirs
    elif episode_to_load is None:
        for task in range(num_tasks): run_policy(envs[task], policy, scalers[task], loggers[task], episodes=5, task=task)  

        # Tensor Board writer
        os.makedirs(agent_path + '/tensor_board/policy')
        os.makedirs(agent_path + '/tensor_board/valFunc')

    tb_pol_writer = tf.summary.FileWriter(agent_path + '/tensor_board/policy', graph=policy.g)
    tb_val_writer = tf.summary.FileWriter(agent_path + '/tensor_board/valFunc', graph=val_func.g)


    # ****************  Start Training  ***************
    print ("\n\n------ TRAINNING: ------")
    animate = True if animate == "True" else False
    save_video = True if save_video == "True" else False
    saver_offset = save_rate
    killer = GracefulKiller()

    if episode_to_load is None: episode = 0
    else: episode = episode_to_load
    
    # Episode is counted across all tasks, i.e. N episodes indicates each task has been run N times
    while episode < num_episodes and not killer.kill_now:

        # ****************  Obtain data (train set)  ***************         
        observes_all = [None]*num_tasks
        actions_all = [None]*num_tasks
        advantages_all = [None]*num_tasks
        disc_sum_rew_all = [None]*num_tasks

        episode += batch_size
        for task in range(num_tasks):

            # Obtain 'batch_size' trajectories and add additional intermediate calculations
            trajectories = run_policy(envs[task],policy, scalers[task], loggers[task],episodes=batch_size,task=task,animate=animate)
            
            add_value(trajectories, val_func, task)  # add estimated values to episodes
            add_disc_sum_rew(trajectories, gamma)  # calculated discounted sum of Rs
            add_gae(trajectories, gamma, lamda)  # calculate advantage

            # Concatenate all episodes into single NumPy arrays
            observes_all[task], actions_all[task], advantages_all[task], disc_sum_rew_all[task] = build_train_set(trajectories)

            # Logging Stats
            log_batch_stats(observes_all[task], actions_all[task], advantages_all[task], disc_sum_rew_all[task], \
                            loggers[task], episode)

        # ****************  Update Policy and Value Networks  ***************
        print ("*************************************")
        for task in range(num_tasks):
            pol_summary = policy.update(task, observes_all[task], actions_all[task], advantages_all[task], loggers[task])  # update policy
            val_summary = val_func.fit(task, observes_all[task], disc_sum_rew_all[task], loggers[task])  # update value function
            # Auxiliary saver (because logger sometimes fails or takes too much time)
            with open(logs_path + '/aux_{}_{}.txt'.format(task_name, task_params[task]), 'a') as f: 
                f.write("\n" + str(loggers[task].log_entry['_Episode']) + "  " + str(loggers[task].log_entry['_MeanReward'])) 
            loggers[task].write(display=True)  # write logger results to file and stdout

            tb_pol_writer.add_summary(pol_summary, global_step=episode)
            tb_val_writer.add_summary(val_summary, global_step=episode)


        # ****************  Storing NN and Videos  ***************
        # Store Policy, Value Network and Scaler: every 'save_rate' of total episodes or in first/last episode
        if episode >= saver_offset or episode >=num_episodes or episode <=batch_size or killer.kill_now:
        # TODO: Make saving agent/video a method so that it can be called in killer.kill_now 
            saver_offset += save_rate
            policy.tf_saver.save(policy.sess, "{}/policy_ep_{}".format(agent_path, episode)) # Save Policy Network
            val_func.tf_saver.save(val_func.sess, "{}/val_func_ep_{}".format(agent_path, episode)) # Save Value Network
            pickle.dump(scalers, open("{}/scalers_ep_{}.p".format(agent_path, episode), 'wb'))            
            print ("---- Saved Agent at Episode {} ----".format(episode))

            # Save video of current agent/policy
            if save_video: 
                print ("---- Saving Video at Episode {} ----".format(episode))
                for task in range(num_tasks):
                    print("Environment Wind: {}".format(envs[task].env.world.gravity))
                    _ = sim_agent(envs[task], policy, task, scalers[task], num_episodes_sim, save_video=True, 
                                    out_dir=aigym_path + "/vid_ep_{}/{}_{}".format(episode, task_name, task_params[task]))
                    envs[task].close() # closes window open by monitor wrapper
                    envs[task], _, _ = init_gym(env_name,task_param=task_params[task]) # Recreate env as it was killed
            print("\n\n")

            # If Ctrl + C is pressed, ask user whether training shall be terminated
            if killer.kill_now:
                if input('Terminate training (y/[n])? ') == 'y':
                    break
                killer.kill_now = False

    # ****************  Terminate Variables  **************
    for task in range(num_tasks):
        envs[task].close()
        loggers[task].close()
    policy.close_sess()
    val_func.close_sess()

    # Save elapsed time
    end_time = datetime.now()
    elapsed_time = end_time - start_time
    delta_time = divmod(elapsed_time.days * 86400 + elapsed_time.seconds, 60)
    delta_str = "Elapsed Time: {} min {} seconds".format(delta_time[0], delta_time[1])
    # save elapsed time, 'a' to append not overwrite
    with open(agent_path + '/commandline_args.txt', 'a') as f: f.write('\n\n' + delta_str) 
    with open(logs_path + '/commandline_args.txt', 'a') as f: f.write('\n\n' + delta_str)  
Example #28
0
class Recognizer(object):
    def __init__(self):
        self.resize_shape = (100, 100)
        shape_predictor_path = 'data/shape_predictor_68_face_landmarks.dat'
        self.shape_predictor = dlib.shape_predictor(shape_predictor_path)
        self.eye_and_mouth_indices = [39, 42, 57]
        self.template_landmarks = get_template_landmarks(
            self.eye_and_mouth_indices, self.resize_shape[0])
        npload = np.load('data/mean_std2.npz')
        mean, std = npload['mean'], npload['std']
        self.scaler = Scaler(mean=mean, std=std)

        # model_emb_path = 'data/epoch_17_test_eer0.191621.hdf5'

        model_path = 'data/cnn_model/epoch_66_val_loss1.206078.hdf5'
        model_emb_path = 'data/emb_model/model_10_epoch_10_test_eer0.169731_test2_err0.204908.hdf5'

        # model_path = 'data/cnn_model/epoch_16_val_loss1.231896.hdf5'
        # model_emb_path = 'data/emb_model/model_8_epoch_15_test_eer0.127431_test2_err0.218662.hdf5'
        # model_emb_path = 'data/emb_model/model_8_epoch_1_test_eer0.133520_test2_err0.216839.hdf5'
        # model_emb_path = 'data/emb_model/model_9_epoch_5_test_eer0.127574_test2_err0.229637.hdf5'

        # model_path = 'data/cnn_model/epoch_232_val_loss1.351451.hdf5'
        # model_emb_path = 'data/emb_model/model_1_epoch_0_test_eer0.114874.hdf5'
        #
        # model_path = 'data/cnn_model/epoch_57_val_loss1.699622.hdf5'
        # model_emb_path = 'data/emb_model/model_2_epoch_25_test_eer0.106689.hdf5'

        # model_path = 'data/cnn_model/epoch_29_val_loss1.441430.hdf5'
        # model_emb_path = 'data/emb_model/model_5_epoch_2_test_eer0.143211.hdf5'
        # model_emb_path = 'data/emb_model/model_6_epoch_6_test_eer_0.135497_test2_err0.254601.hdf5'

        # model_emb_path = '../data/Modeltpe2/epoch_0_test_eer0.139840.hdf5'
        # model_emb_path = '../data/Modeltpe3/epoch_12_test_eer0.107399.hdf5'
        # model_emb_path = 'data/emb_model/model_4_epoch_1_test_eer0.108006.hdf5'

        model = keras.models.load_model(model_path)
        self.model_emb = keras.models.load_model(model_emb_path)
        self.bottleneck = Bottleneck(model)

        npload = np.load('data/face_base.npz')
        self.x, self.y = npload['x'], npload['y']
        print(self.x.shape, self.y.shape)

        with open('data/labels_dict.pkl', 'rb') as file:
            self.labels_dict = pickle.load(file)

        self.knn = KNeighborsClassifier(n_neighbors=1, metric=metric, n_jobs=1)
        self.knn.fit(self.x, self.y)

    def iterate_similarities(self, emb):
        for i, person_emb in enumerate(self.x):
            sim = person_emb @ emb.T
            yield sim, i
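    # Note: the dot product above equals cosine similarity only if the stored embeddings and the
    # query embedding are L2-normalized; that normalization is assumed to happen upstream and is
    # not shown in this snippet.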

    def predict(self, img, img_gray, rect):
        img = align_img(img, img_gray, rect, self.shape_predictor,
                        self.template_landmarks, self.eye_and_mouth_indices,
                        self.resize_shape)
        batch_x = [img]
        # import matplotlib.pyplot as plt
        # plt.imshow(img)
        # plt.show()
        batch_x = self.scaler.transform(batch_x)
        batch_x = self.bottleneck.predict(transpose_matrix(batch_x))
        batch_x = self.model_emb.predict(batch_x)

        # batch_x = self.model_emb.predict(transpose_matrix(batch_x))

        pred_labels = self.knn.predict(batch_x)
        neighbors = self.knn.kneighbors(batch_x)
        label_neighbors = [
            self.labels_dict[self.y[ind]] for ind in neighbors[1][0]
        ]
        print(label_neighbors, neighbors[0])

        # label_ind = max(self.iterate_similarities(batch_x[0]), key=lambda x: x[0])[1]
        # label = self.y[label_ind]
        label = pred_labels[0]
        return self.labels_dict[label], label_neighbors
Example #29
0
    def __init__(self,
                 stateDim: int,
                 actionDim: int,
                 actionMin: np.array,
                 actionMax: np.array,
                 learningRate=0.0005,
                 gamma=0.99,
                 GAElambda=0.95,
                 PPOepsilon=0.2,
                 PPOentropyLossWeight=0,
                 nHidden: int = 2,
                 nUnitsPerLayer: int = 128,
                 mode="PPO-CMA-m",
                 activation="lrelu",
                 H: int = 9,
                 entropyLossWeight: float = 0,
                 sdLowLimit=0.01,
                 useScaler: bool = True,
                 criticTimestepScale=0.001):
        #Create policy network
        print("Creating policy")
        self.actionMin = actionMin.copy()
        self.actionMax = actionMax.copy()
        self.actionDim = actionDim
        self.stateDim = stateDim
        self.useScaler = useScaler
        if useScaler:
            self.scaler = Scaler(stateDim)
        self.scalerInitialized = False
        self.normalizeAdvantages = True
        self.gamma = gamma
        self.GAElambda = GAElambda
        self.criticTimestepScale = 0 if gamma == 0 else criticTimestepScale  #with gamma==0, no need for this
        piEpsilon = None
        nHistory = 1
        negativeAdvantageAvoidanceSigma = 0
        if mode == "PPO-CMA" or mode == "PPO-CMA-m":
            usePPOLoss = False  #if True, we use PPO's clipped surrogate loss function instead of the standard -A_i * log(pi(a_i | s_i))
            separateVarAdapt = True
            self.reluAdvantages = True if mode == "PPO-CMA" else False
            nHistory = H  #policy mean adapts immediately, policy covariance as an aggregate of this many past iterations
            useSigmaSoftClip = True
            negativeAdvantageAvoidanceSigma = 1 if mode == "PPO-CMA-m" else 0
        elif mode == "PPO":
            usePPOLoss = True  #if True, we use PPO's clipped surrogate loss function instead of the standard -A_i * log(pi(a_i | s_i))
            separateVarAdapt = False
            # separateSigmaAdapt=False
            self.reluAdvantages = False
            useSigmaSoftClip = True
            piEpsilon = 0
        else:
            raise ("Unknown mode {}".format(mode))
        self.policy = Policy(
            stateDim,
            actionDim,
            actionMin,
            actionMax,
            entropyLossWeight=PPOentropyLossWeight,
            networkActivation=activation,
            networkDepth=nHidden,
            networkUnits=nUnitsPerLayer,
            networkSkips=False,
            learningRate=learningRate,
            minSigma=sdLowLimit,
            PPOepsilon=PPOepsilon,
            usePPOLoss=usePPOLoss,
            separateVarAdapt=separateVarAdapt,
            nHistory=nHistory,
            useSigmaSoftClip=useSigmaSoftClip,
            piEpsilon=piEpsilon,
            negativeAdvantageAvoidanceSigma=negativeAdvantageAvoidanceSigma)

        #Create critic network, +1 stateDim because at least in OpenAI gym, episodes are time-limited and the value estimates thus depend on simulation time.
        #Thus, we use time step as an additional feature for the critic.
        #Note that this does not mess up generalization, as the feature is not used for the policy during training or at runtime
        print("Creating critic network")
        self.critic = Critic(stateDim=stateDim + 1,
                             learningRate=learningRate,
                             nHidden=nHidden,
                             networkUnits=nUnitsPerLayer,
                             networkActivation=activation,
                             useSkips=False,
                             lossType="L1")

        #Experience trajectory buffers for the memorize() and updateWithMemorized() methods
        self.experienceTrajectories = []
        self.currentTrajectory = []
Example #30
0
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size, eval):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        batch_size: number of episodes per policy training batch
    """

    if eval:
        print("Evaluating: ")
        evaluate(env_name, num_episodes)
        exit()

    killer = GracefulKiller()
    env, obs_dim, act_dim = init_gym(env_name)
    now = datetime.utcnow().strftime(
        "%b-%d_%H:%M:%S")  # create unique directories
    logger = Logger(logname=env_name, now=now)
    aigym_path = os.path.join('/tmp', env_name, now)
    #env = wrappers.Monitor(env, aigym_path, force=True)
    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim)
    policy = Policy(obs_dim, act_dim, kl_targ)
    #policy.restore_weights() ## -------------
    #val_func.restore_weights() ## -------------
    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, policy, scaler, logger, episodes=5)
    episode = 0
    while episode < num_episodes:
        trajectories = run_policy(env,
                                  policy,
                                  scaler,
                                  logger,
                                  episodes=batch_size)
        episode += len(trajectories)
        add_value(trajectories, val_func)  # add estimated values to episodes
        add_disc_sum_rew(trajectories,
                         gamma)  # calculated discounted sum of Rs
        add_gae(trajectories, gamma, lam)  # calculate advantage
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(
            trajectories)
        # add various stats to training log:
        log_batch_stats(observes, actions, advantages, disc_sum_rew, logger,
                        episode)
        policy.update(observes, actions, advantages, logger)  # update policy
        val_func.fit(observes, disc_sum_rew, logger)  # update value function
        logger.write(display=True)  # write logger results to file and stdout

        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False

    print("Scaler vars,means: ")
    print(scaler.vars, scaler.means)

    for i in range(3):
        run_episode(env, policy, scaler, animate=True)

    #policy.save_weights()
    #val_func.save_weights()

    #WARNING: scaler is disabled

    logger.close()
    policy.close_sess()
    val_func.close_sess()
Example #31
0
def eval_models(env_name, num_episodes, gamma, lam, kl_targ, coef,
                use_lr_adjust, ada_kl_penalty, seed, epochs, phi_epochs,
                max_timesteps, reg_scale, phi_lr, phi_hs, policy_size, phi_obj,
                load_model):

    env, obs_dim, act_dim = init_gym(env_name)
    set_global_seeds(seed)
    env.seed(seed)
    env._max_episode_steps = max_timesteps
    obs_dim += 1
    now = datetime.utcnow().strftime("%b-%d_%H:%M:%S")
    aigym_path = os.path.join('log-files/', env_name, now)
    env = wrappers.Monitor(env, aigym_path, force=True, video_callable=False)
    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim)
    policy = Policy(obs_dim,
                    act_dim,
                    kl_targ,
                    epochs,
                    phi_epochs,
                    policy_size=policy_size,
                    phi_hidden_sizes=phi_hs,
                    reg_scale=reg_scale,
                    lr_phi=phi_lr,
                    phi_obj=phi_obj)

    logger.log("loading model")
    load_dir = "models/"
    policy.load_model(load_dir)
    val_func.load_val_model(load_dir)

    run_policy(env, policy, scaler, num_episodes, max_timesteps=max_timesteps)
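    # presumably a warm-up pass to initialize the scaler's running statistics
    # (the returned trajectories are discarded here)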

    episode = 0

    trajectories, traj_len_list = run_policy(env,
                                             policy,
                                             scaler,
                                             num_episodes,
                                             max_timesteps=max_timesteps)

    num_traj = len(trajectories)
    logger.log("Avg Length %d total Length %d"%( \
            np.mean(traj_len_list), \
            np.sum(traj_len_list)))

    episode += len(trajectories)
    add_value(trajectories, val_func)
    add_disc_sum_rew(trajectories, gamma)
    add_gae(trajectories, gamma, lam)

    observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)

    sub_folder = "eval_data/%s_%s_data_seed=%d_max-steps=%d"%(\
                        env_name, phi_obj,
                        seed, max_timesteps)
    if not os.path.exists(sub_folder):
        os.mkdir(sub_folder)

    # save original gradient
    mc_grad_info = policy.get_batch_gradient(observes,
                                             actions,
                                             advantages,
                                             c=0.)
    mc_grad_info['traj_lens'] = traj_len_list
    with open(sub_folder + '/mc_num_episode=%d.pkl' % (num_episodes),
              'wb') as fp:
        pickle.dump(mc_grad_info, fp)

    policy.update(load_model,
                  observes,
                  actions,
                  advantages,
                  use_lr_adjust,
                  ada_kl_penalty,
                  c=1)  # update policy

    stein_grad_info = policy.get_batch_gradient(observes, \
                    actions, advantages, c=1.)

    stein_grad_info['traj_lens'] = traj_len_list
    with open(sub_folder + '/stein_num_episode=%d.pkl' % (num_episodes),
              'wb') as fp:
        pickle.dump(stein_grad_info, fp)
Example #32
0
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size, restore_path,
         out_path, thread_count, animation_mode, gait_name, gait_length,
         gaits_config_path, reward_mask, log_rewards, gait_reward_weight,
         g_colab, progress_reward_weight, phase_time_limit):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        batch_size: number of episodes per policy training batch
    """
    killer = GracefulKiller()
    # restore_path = os.path.abspath(restore_path)
    env, obs_dim, act_dim = init_gym(env_name)
    log_rewards = log_rewards or (num_episodes == 0)
    env_list = []
    if thread_count > 1:
        env_list, obs_dim, act_dim = init_gyms(env_name, batch_size)
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    start_time = datetime.now()  # create unique directories
    start_time_str = start_time.strftime("%b-%d/%H.%M.%S")
    logger = Logger(logname=env_name, now=start_time_str, out_path=out_path)
    env.env.set_params(gaits_config_path=gaits_config_path,
                       gait_name=gait_name,
                       gait_cycle_len=gait_length,
                       out_path=logger.path,
                       log_rewards=log_rewards,
                       render_mode=animation_mode,
                       reward_mask=reward_mask,
                       contact_reward=gait_reward_weight,
                       g_colab=g_colab,
                       progress_weight=progress_reward_weight,
                       phase_time_limit=phase_time_limit)
    scaler = Scaler(obs_dim)

    val_func = NNValueFunction(obs_dim, logger, restore_path)
    policy = Policy(obs_dim, act_dim, kl_targ, logger, restore_path)

    log_train_info(logger, num_episodes, start_time_str, gait_name,
                   gait_length, batch_size, restore_path, reward_mask,
                   gait_reward_weight, progress_reward_weight,
                   phase_time_limit)

    # run a few episodes of untrained policy to initialize scaler:
    episode = 0
    try:
        if restore_path is None:
            print("\nInitializing scaler (may take some time)... ")
            run_policy(env, policy, scaler, logger, episodes=5)
            print("Done\n")
        else:
            scaler.load(restore_path, obs_dim)

        while episode < num_episodes:
            sim_time = datetime.now()
            if thread_count > 1:
                trajectories = run_policy_parallel(env_list,
                                                   policy,
                                                   scaler,
                                                   logger,
                                                   episodes=batch_size,
                                                   thread_num=thread_count)
            else:
                trajectories = run_policy(env,
                                          policy,
                                          scaler,
                                          logger,
                                          episodes=batch_size)
            sim_time = datetime.now() - sim_time

            episode += len(trajectories)
            add_value(trajectories,
                      val_func)  # add estimated values to episodes
            add_disc_sum_rew(trajectories,
                             gamma)  # calculated discounted sum of Rs
            add_gae(trajectories, gamma, lam)  # calculate advantage
            # concatenate all episodes into single NumPy arrays
            observes, actions, advantages, disc_sum_rew = build_train_set(
                trajectories)
            # add various stats to training log:
            train_time = datetime.now() - start_time
            policy_time = datetime.now()
            policy.update(observes, actions, advantages,
                          logger)  # update policy
            policy_time = datetime.now() - policy_time
            val_time = datetime.now()
            val_func.fit(observes, disc_sum_rew,
                         logger)  # update value function
            val_time = datetime.now() - val_time

            log_batch_stats(observes, actions, advantages, disc_sum_rew,
                            logger, episode, train_time, sim_time, policy_time,
                            val_time)
            logger.write(
                display=True)  # write logger results to file and stdout
            print("Estimated time left: {}\n".format(
                estimate_time_left(episode, num_episodes, train_time)))

            if episode % 1000 == 0:
                policy.save()
                val_func.save()
                scaler.save(logger.path)
                print("Data saved at {}\n".format(logger.path))
                update_train_info(logger, episode)
                if animation_mode > 0:
                    run_policy(env,
                               policy,
                               scaler,
                               logger,
                               episodes=1,
                               animate=True,
                               anim_name='epizode_{}'.format(episode))
            if episode % 5000 == 0:
                os.rename(
                    os.path.join(logger.path, 'value_dump'),
                    os.path.join(logger.path, 'value_dump_' + str(episode)))
                os.rename(
                    os.path.join(logger.path, 'policy_dump'),
                    os.path.join(logger.path, 'policy_dump_' + str(episode)))
                # if episode == 20000:
                #     reward_mask = 63
                #     env.env.set_params(gaits_config_path=gaits_config_path, gait_name=gait_name, gait_cycle_len=gait_length,
                #                        out_path=logger.path, log_rewards=log_rewards, render_mode=animation_mode,
                #                        reward_mask=reward_mask, contact_reward=gait_reward_weight, g_colab=g_colab)
                print("Progress Enabled")
            if killer.kill_now:
                # if input('Terminate training (y/[n])? ') == 'y':
                #     break
                # killer.kill_now = False
                break
    finally:
        if animation_mode > 0 or num_episodes == 0:
            print("Rendering result video")
            try:
                trajectories = run_policy(
                    env,
                    policy,
                    scaler,
                    logger,
                    episodes=1,
                    animate=True,
                    anim_name='final_epizode_{}'.format(episode))
                # for walk analysis
                for t in trajectories:
                    logger.log_trajectory(t)
            except Exception as e:
                print("Failed to animate results, error: {}".format(e))
                raise e

        scaler.save(logger.path)
        policy.close_sess()
        val_func.close_sess()
        update_train_info(logger, episode)
        logger.close()