Example #1
def sample_paths(N,
    policy,
    T=1e6,
    env=None,
    env_name=None,
    pegasus_seed=None,
    num_cpu='max',
    paths_per_call=5,
    mode='sample'):
    """
    params:
    N               : number of sample points
    policy          : policy to be used to sample the data
    T               : maximum length of trajectory
    env             : env object to sample from
    env_name        : name of env to be sampled from 
                      (one of env or env_name must be specified)
    pegasus_seed    : seed for environment (numpy speed must be set externally)
    """

    if num_cpu == 1:
        return sample_paths_one_core(N, policy, T, env, env_name, pegasus_seed, mode)
    else:
        start_time = timer.time()
        print("####### Gathering Samples #######")
        sampled_so_far = 0
        paths_so_far = 0
        paths = []
        while sampled_so_far <= N:
            if pegasus_seed is None:
                new_paths = trajectory_sampler.sample_paths_parallel(paths_per_call,
                            policy, T, env_name, pegasus_seed, num_cpu, suppress_print=True, mode=mode)

            else:
                pegasus_seed += paths_so_far
                new_paths = trajectory_sampler.sample_paths_parallel(paths_per_call,
                            policy, T, env_name, pegasus_seed, num_cpu, suppress_print=True, mode=mode)

            paths.extend(new_paths)
            paths_so_far += paths_per_call
            new_samples = np.sum([len(p['rewards']) for p in new_paths])
            sampled_so_far += new_samples
        print("======= Samples Gathered  ======= | >>>> Time taken = %f " % (timer.time()-start_time) )
        print("................................. | >>>> # samples = %i # trajectories = %i " % (sampled_so_far, paths_so_far) )
        return paths
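
A minimal usage sketch for the sampler above; the policy object and the environment name are placeholders for whatever policy class and registered Gym task the surrounding codebase provides, so treat them as assumptions rather than part of the original example.

# Hypothetical driver: 'policy' must expose the interface expected by the
# trajectory sampler, and 'Hopper-v2' stands in for any registered Gym env.
paths = sample_paths(N=10000,               # collect roughly 10k transitions
                     policy=policy,
                     T=500,                 # cap each trajectory at 500 steps
                     env_name='Hopper-v2',
                     pegasus_seed=123,
                     num_cpu=4,
                     paths_per_call=5)

total_transitions = sum(len(p['rewards']) for p in paths)
mean_return = np.mean([np.sum(p['rewards']) for p in paths])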
Example #2
    def train_step(self, N,
                   sample_mode='trajectories',
                   env_name=None,
                   T=1e6,
                   gamma=0.995,
                   gae_lambda=0.98,
                   num_cpu='max'):

        # Clean up input arguments
        if env_name is None: env_name = self.env.env_id
        if sample_mode not in ('trajectories', 'samples'):
            print("sample_mode in NPG must be either 'trajectories' or 'samples'")
            quit()

        ts = timer.time()

        if sample_mode == 'trajectories':
            paths = trajectory_sampler.sample_paths_parallel(N, self.policy, T, env_name,
                                                             self.seed, num_cpu)
        elif sample_mode == 'samples':
            paths = batch_sampler.sample_paths(N, self.policy, T, env_name=env_name,
                                               pegasus_seed=self.seed, num_cpu=num_cpu)

        if self.save_logs:
            self.logger.log_kv('time_sampling', timer.time() - ts)

        self.seed = self.seed + N if self.seed is not None else self.seed

        # compute returns
        process_samples.compute_returns(paths, gamma)
        # compute advantages
        process_samples.compute_advantages(paths, self.baseline, gamma, gae_lambda)
        # train from paths
        eval_statistics = self.train_from_paths(paths)
        eval_statistics.append(N)
        # fit baseline
        if self.save_logs:
            ts = timer.time()
            error_before, error_after = self.baseline.fit(paths, return_errors=True)
            self.logger.log_kv('time_VF', timer.time()-ts)
            self.logger.log_kv('VF_error_before', error_before)
            self.logger.log_kv('VF_error_after', error_after)
        else:
            self.baseline.fit(paths)

        return eval_statistics
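
Called from a training loop, a single update with this method might look like the sketch below; the agent is assumed to be an already constructed NPG-style object (see Example #4 for a full loop) and the numbers are illustrative only.

# One policy update using trajectory-based sampling.
stats = agent.train_step(N=50,                    # 50 trajectories this iteration
                         sample_mode='trajectories',
                         gamma=0.995,
                         gae_lambda=0.97,
                         num_cpu='max')

# Alternatively, budget by number of transitions instead of trajectories.
stats = agent.train_step(N=50000, sample_mode='samples', num_cpu=4)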
Example #3
def _evaluation_rollout(agent, num_rollouts, num_cpu):
    """Performs an evaluation rollout.

    Args:
        agent: The MJRL agent to perform the rollout with.
        num_rollouts: The number of rollouts.
        num_cpu: The number of CPUs to perform the rollout on.

    Returns:
        The mean return over the rollout.
    """
    eval_paths = sample_paths_parallel(
        N=num_rollouts,
        policy=agent.policy,
        num_cpu=num_cpu,
        env_name=agent.env.env_id,
        mode='evaluation',
    )
    return np.mean([np.sum(path['rewards']) for path in eval_paths])
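
A quick way to exercise this helper, assuming agent is an already trained mjrl-style agent exposing the .policy and .env attributes the docstring requires:

# Average return of the current policy over 10 evaluation rollouts.
mean_return = _evaluation_rollout(agent, num_rollouts=10, num_cpu=4)
print('mean evaluation return: %.2f' % mean_return)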
Example #4
def train_agent(
    job_name,
    agent,
    seed=0,
    niter=101,
    gamma=0.995,
    gae_lambda=None,
    num_cpu=1,
    sample_mode='trajectories',
    num_traj=50,
    num_samples=50000,  # has precedence, used with sample_mode = 'samples'
    save_freq=10,
    evaluation_rollouts=None,
    plot_keys=['stoc_pol_mean'],
):

    np.random.seed(seed)
    if not os.path.isdir(job_name):
        os.mkdir(job_name)
    previous_dir = os.getcwd()
    os.chdir(job_name)  # important! we are now in the directory to save data
    if not os.path.isdir('iterations'): os.mkdir('iterations')
    if not os.path.isdir('logs') and agent.save_logs:
        os.mkdir('logs')
    best_policy = copy.deepcopy(agent.policy)
    best_perf = -1e8
    train_curve = best_perf * np.ones(niter)
    mean_pol_perf = 0.0
    e = GymEnv(agent.env.env_id)

    for i in range(niter):
        print(
            "......................................................................................"
        )
        print("ITERATION : %i " % i)
        if train_curve[i - 1] > best_perf:
            best_policy = copy.deepcopy(agent.policy)
            best_perf = train_curve[i - 1]
        N = num_traj if sample_mode == 'trajectories' else num_samples
        args = dict(N=N,
                    sample_mode=sample_mode,
                    gamma=gamma,
                    gae_lambda=gae_lambda,
                    num_cpu=num_cpu)
        stats = agent.train_step(**args)
        train_curve[i] = stats[0]
        if evaluation_rollouts is not None and evaluation_rollouts > 0:
            print("Performing evaluation rollouts ........")
            eval_paths = sample_paths_parallel(N=evaluation_rollouts,
                                               policy=agent.policy,
                                               num_cpu=num_cpu,
                                               env_name=e.env_id,
                                               mode='evaluation',
                                               pegasus_seed=seed)
            mean_pol_perf = np.mean(
                [np.sum(path['rewards']) for path in eval_paths])
            if agent.save_logs:
                agent.logger.log_kv('eval_score', mean_pol_perf)
        if i % save_freq == 0 and i > 0:
            if agent.save_logs:
                agent.logger.save_log('logs/')
                make_train_plots(log=agent.logger.log,
                                 keys=plot_keys,
                                 save_loc='logs/')
            policy_file = 'policy_%i.pickle' % i
            baseline_file = 'baseline_%i.pickle' % i
            pickle.dump(agent.policy, open('iterations/' + policy_file, 'wb'))
            pickle.dump(agent.baseline,
                        open('iterations/' + baseline_file, 'wb'))
            pickle.dump(best_policy, open('iterations/best_policy.pickle',
                                          'wb'))
        # print results to console
        if i == 0:
            result_file = open('results.txt', 'w')
            print("Iter | Stoc Pol | Mean Pol | Best (Stoc) \n")
            result_file.write(
                "Iter | Sampling Pol | Evaluation Pol | Best (Sampled) \n")
            result_file.close()
        print("[ %s ] %4i %5.2f %5.2f %5.2f " %
              (timer.asctime(timer.localtime(
                  timer.time())), i, train_curve[i], mean_pol_perf, best_perf))
        result_file = open('results.txt', 'a')
        result_file.write("%4i %5.2f %5.2f %5.2f \n" %
                          (i, train_curve[i], mean_pol_perf, best_perf))
        result_file.close()
        if agent.save_logs:
            print_data = sorted(
                filter(lambda v: np.asarray(v[1]).size == 1,
                       agent.logger.get_current_log().items()))
            print(tabulate(print_data))

    # final save
    pickle.dump(best_policy, open('iterations/best_policy.pickle', 'wb'))
    if agent.save_logs:
        agent.logger.save_log('logs/')
        make_train_plots(log=agent.logger.log,
                         keys=plot_keys,
                         save_loc='logs/')
    os.chdir(previous_dir)
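
For context, a driver that wires these pieces together might look roughly like the sketch below. Only GymEnv, NPG, and train_agent appear in the examples on this page; the import paths and the MLP / MLPBaseline classes are assumptions about an mjrl-style package layout, shown purely for illustration.

# Sketch of an end-to-end training script. Import paths and the policy /
# baseline classes are assumed, not taken from the examples above.
from mjrl.utils.gym_env import GymEnv
from mjrl.policies.gaussian_mlp import MLP
from mjrl.baselines.mlp_baseline import MLPBaseline
from mjrl.algos.npg_cg import NPG

SEED = 123
e = GymEnv('Hopper-v2')                       # any registered Gym task
policy = MLP(e.spec, hidden_sizes=(32, 32), seed=SEED)
baseline = MLPBaseline(e.spec)
agent = NPG(e, policy, baseline,
            normalized_step_size=0.01, seed=SEED, save_logs=True)

train_agent(job_name='hopper_npg_run',
            agent=agent,
            seed=SEED,
            niter=101,
            gamma=0.995,
            num_cpu=4,
            sample_mode='trajectories',
            num_traj=50,
            evaluation_rollouts=5,
            save_freq=10)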
Example #5
        f = open(iterdir + 'policy_199.pickle', 'rb')
        policy_stl = pickle.load(f)
        f.close()
        f = open(iterdir + 'baseline_199.pickle', 'rb')
        baseline_stl[task_id] = pickle.load(f)
        f.close()

        agent_stl = NPG(e[task_id],
                        policy_stl,
                        baseline_stl[task_id],
                        normalized_step_size=0.01,
                        seed=SEED,
                        save_logs=False)
        agent_stl.set_task(task_id)
        eval_paths = trajectory_sampler.sample_paths_parallel(
            N=10,
            policy=policy_stl,
            num_cpu=num_cpu,
            env_name=e[task_id].env_id,
            mode='evaluation',
            pegasus_seed=SEED)

        forward_transfer_results[task_id] = np.mean(
            [np.sum(path['rewards']) for path in eval_paths])

    result_file = open(job_name_stl_seed + '/finetune_results.txt', 'w')
    result_file.write(str(forward_transfer_results))
    result_file.close()

    SEED += 10
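
The open/load/close triplets at the top of this example can be written more defensively with context managers, so the files are closed even if unpickling fails; a minimal equivalent sketch (identifiers such as iterdir, baseline_stl, and task_id come from the unshown surrounding context of the snippet):

import pickle

with open(iterdir + 'policy_199.pickle', 'rb') as f:
    policy_stl = pickle.load(f)
with open(iterdir + 'baseline_199.pickle', 'rb') as f:
    baseline_stl[task_id] = pickle.load(f)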
Example #6
def train_agent(job_name, agent,
                seed = 0,
                niter = 101,
                gamma = 0.995,
                gae_lambda = None,
                num_cpu = 1,
                sample_mode = 'trajectories',
                num_traj = 50,
                num_samples = 50000, # has precedence, used with sample_mode = 'samples'
                save_freq = 10,
                evaluation_rollouts = None,
                plot_keys = ['stoc_pol_mean'],
                ):

    np.random.seed(seed)
    if not os.path.isdir(job_name):
        os.mkdir(job_name)
    previous_dir = os.getcwd()
    os.chdir(job_name) # important! we are now in the directory to save data
    if not os.path.isdir('iterations'): os.mkdir('iterations')
    if not os.path.isdir('logs') and agent.save_logs: os.mkdir('logs')
    best_policy = copy.deepcopy(agent.policy)
    best_perf = -1e8
    train_curve = best_perf*np.ones(niter)
    mean_pol_perf = 0.0
    e = GymEnv(agent.env.env_id)

    for i in range(niter):
        print("......................................................................................")
        print("ITERATION : %i " % i)
        if train_curve[i-1] > best_perf:
            best_policy = copy.deepcopy(agent.policy)
            best_perf = train_curve[i-1]
        N = num_traj if sample_mode == 'trajectories' else num_samples
        args = dict(N=N, sample_mode=sample_mode, gamma=gamma, gae_lambda=gae_lambda, num_cpu=num_cpu)
        stats = agent.train_step(**args)
        train_curve[i] = stats[0]
        if evaluation_rollouts is not None and evaluation_rollouts > 0:
            print("Performing evaluation rollouts ........")
            eval_paths = sample_paths_parallel(N=evaluation_rollouts, policy=agent.policy, num_cpu=num_cpu,
                                               env_name=e.env_id, mode='evaluation', pegasus_seed=seed)
            mean_pol_perf = np.mean([np.sum(path['rewards']) for path in eval_paths])
            if agent.save_logs:
                agent.logger.log_kv('eval_score', mean_pol_perf)
        if i % save_freq == 0 and i > 0:
            if agent.save_logs:
                agent.logger.save_log('logs/')
                make_train_plots(log=agent.logger.log, keys=plot_keys, save_loc='logs/')
            policy_file = 'policy_%i.pickle' % i
            baseline_file = 'baseline_%i.pickle' % i
            pickle.dump(agent.policy, open('iterations/' + policy_file, 'wb'))
            pickle.dump(agent.baseline, open('iterations/' + baseline_file, 'wb'))
            pickle.dump(best_policy, open('iterations/best_policy.pickle', 'wb'))
        # print results to console
        if i == 0:
            result_file = open('results.txt', 'w')
            print("Iter | Stoc Pol | Mean Pol | Best (Stoc) \n")
            result_file.write("Iter | Sampling Pol | Evaluation Pol | Best (Sampled) \n")
            result_file.close()
        print("[ %s ] %4i %5.2f %5.2f %5.2f " % (timer.asctime(timer.localtime(timer.time())),
                                                 i, train_curve[i], mean_pol_perf, best_perf))
        result_file = open('results.txt', 'a')
        result_file.write("%4i %5.2f %5.2f %5.2f \n" % (i, train_curve[i], mean_pol_perf, best_perf))
        result_file.close()
        if agent.save_logs:
            print_data = sorted(filter(lambda v: np.asarray(v[1]).size == 1,
                                       agent.logger.get_current_log().items()))
            print(tabulate(print_data))

    # final save
    pickle.dump(best_policy, open('iterations/best_policy.pickle', 'wb'))
    if agent.save_logs:
        agent.logger.save_log('logs/')
        make_train_plots(log=agent.logger.log, keys=plot_keys, save_loc='logs/')
    os.chdir(previous_dir)