def test_params(comm: MPI.Comm,
                n: int,
                policy: Policy,
                nt: NoiseTable,
                gen_obstat: ObStat,
                fit_fn: Callable[[Module], TrainingResult],
                rs: RandomState) -> Tuple[np.ndarray, np.ndarray, np.ndarray, int]:
    """
    Tests `n` different perturbations of `policy`'s params and returns the positive and negative results
    (from all processes).

    positive_result[i] is the fitness when the noise at nt[noise_inds[i]] is added to policy.flat_params
    and negative_result[i] is the fitness when the same noise is subtracted.

    :returns: tuple(positive results, negative results, noise inds, total steps)
    """
    results_pos, results_neg, inds = [], [], []
    for _ in range(n):
        idx, noise = nt.sample(rs)
        inds.append(idx)
        # for each noise ind sampled, both add and subtract the noise
        results_pos.append(fit_fn(policy.pheno(noise)))
        results_neg.append(fit_fn(policy.pheno(-noise)))
        gen_obstat.inc(*results_pos[-1].ob_sum_sq_cnt)
        gen_obstat.inc(*results_neg[-1].ob_sum_sq_cnt)

    n_objectives = len(results_pos[0].result)
    results = _share_results(comm, [tr.result for tr in results_pos], [tr.result for tr in results_neg], inds)
    gen_obstat.mpi_inc(comm)
    steps = comm.allreduce(sum(tr.steps for tr in results_pos + results_neg), op=MPI.SUM)

    return results[:, 0:n_objectives], results[:, n_objectives:2 * n_objectives], results[:, -1], steps
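# --- illustrative sketch, not part of the original code ---
# `_share_results` is not shown here; judging by how its return value is sliced above, it is assumed to
# gather one row per perturbation from every process, laid out as
# [positive results..., negative results..., noise index]. The helper below is a hypothetical
# re-implementation of that layout for illustration only; the real function may differ.
def _share_results_sketch(comm: MPI.Comm,
                          res_pos: List[List[float]],
                          res_neg: List[List[float]],
                          inds: List[int]) -> np.ndarray:
    local_rows = np.array([list(p) + list(n) + [i] for p, n, i in zip(res_pos, res_neg, inds)])
    # every process receives every other process' rows (same pattern as comm.alltoall([my_seed] * comm.size))
    all_rows = comm.alltoall([local_rows] * comm.size)
    return np.concatenate(all_rows)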
def _log_gen(self, fits: np.ndarray, noiseless_tr: TrainingResult, policy: Policy, steps: int):
    super()._log_gen(fits, noiseless_tr, policy, steps)

    if self.comm.rank == MpiReporter.MAIN:
        # saving policy and all fits to files
        dist, rew = calc_dist_rew(noiseless_tr)

        save_policy = (rew > self.best_rew or dist > self.best_dist)
        self.best_rew = max(rew, self.best_rew)
        self.best_dist = max(dist, self.best_dist)

        if save_policy:  # saving policy if it obtained a better reward or distance
            policy.save(self.policy_folder, str(self.gen))
            self.print(f'saving policy with rew:{rew:0.2f} and dist:{dist:0.2f}')

        np.save(path.join(f'{self.fit_folder}', f'{self.gen}.np'), fits)
def step(cfg,
         comm: MPI.Comm,
         policy: Policy,
         nt: NoiseTable,
         env: gym.Env,
         fit_fn: Callable[[Module], TrainingResult],
         rs: RandomState = np.random.RandomState(),
         ranker: Ranker = CenteredRanker(),
         reporter: Reporter = StdoutReporter(MPI.COMM_WORLD)) -> Tuple[TrainingResult, ObStat]:
    """
    Runs a single generation of ES

    :param fit_fn: Evaluates the policy and returns a :class:`TrainingResult`
    :param ranker: A subclass of :class:`Ranker` that is able to rank the fitnesses
    :returns: :class:`TrainingResult` of the noiseless policy at that generation and the generation's :class:`ObStat`
    """
    assert cfg.general.policies_per_gen % comm.size == 0 and (cfg.general.policies_per_gen / comm.size) % 2 == 0
    eps_per_proc = int((cfg.general.policies_per_gen / comm.size) / 2)
    gen_obstat = ObStat(env.observation_space.shape, 0)

    pos_res, neg_res, inds, steps = test_params(comm, eps_per_proc, policy, nt, gen_obstat, fit_fn, rs)
    reporter.print(f'n dupes: {len(inds) - len(set(inds))}')

    ranker.rank(pos_res, neg_res, inds)
    approx_grad(policy, ranker, nt, policy.flat_params, cfg.general.batch_size, cfg.policy.l2coeff)
    noiseless_result = fit_fn(policy.pheno(np.zeros(len(policy))), False)
    reporter.log_gen(ranker.fits, noiseless_result, policy, steps)

    return noiseless_result, gen_obstat
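# --- illustrative sketch, not part of the original code ---
# A minimal way to drive `step` for several generations, assuming `cfg`, `comm`, `policy`, `nt`, `env`,
# `r_fn`, `rs`, `ranker` and `reporter` have already been constructed as in the main() examples below.
for gen in range(cfg.general.gens):
    reporter.start_gen()
    tr, gen_obstat = step(cfg, comm, policy, nt, env, r_fn, rs, ranker, reporter)
    policy.update_obstat(gen_obstat)  # fold this generation's observation statistics into the policy
    reporter.end_gen()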
def mean_behv(policy: Policy, r_fn: Callable[[torch.nn.Module], NSResult], rollouts: int):
    behvs = [r_fn(policy.pheno(np.zeros(len(policy)))).behaviour for _ in range(rollouts)]
    return np.mean(behvs, axis=0)
def run_saved_policy(policy_path: str, env: gym.Env, steps: int):
    run_saved(Policy.load(policy_path).pheno(), env, steps)
def approx_grad(policy: Policy, ranker: Ranker, nt: NoiseTable, params: ndarray, batch_size: int, l2coeff: float):
    """Approximates the gradient and updates the policy params"""
    grad = scale_noise(ranker.ranked_fits, ranker.noise_inds, nt, len(policy), batch_size) / ranker.n_fits_ranked
    policy.optim_step(l2coeff * params - grad)
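# --- illustrative sketch, not part of the original code ---
# A worked illustration of the estimate `approx_grad` relies on. `scale_noise` (not shown) is assumed to
# return the fitness-weighted sum of the sampled noise vectors, so dividing by `ranker.n_fits_ranked`
# gives the standard ES gradient estimate grad ≈ (1/n) * sum_i rank(f_i) * eps_i. The numbers below are
# made up purely to show the arithmetic.
ranked_fits = np.array([0.5, -0.5, 0.25])               # hypothetical centered-ranked fitnesses
eps = np.array([[1.0, 0.0], [0.0, 1.0], [1.0, 1.0]])    # hypothetical noise vectors, one per row
grad_estimate = ranked_fits @ eps / len(ranked_fits)     # == (1/n) * sum_i rank(f_i) * eps_i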
cfg_file = utils.parse_args()
cfg = utils.load_config(cfg_file)

env: gym.Env = gym.make(cfg.env.name)

# seeding; this must be done before creating the neural network so that params are deterministic across processes
rs, my_seed, global_seed = utils.seed(comm, cfg.general.seed, env)
all_seeds = comm.alltoall([my_seed] * comm.size)  # simply for saving/viewing the seeds used on each proc
print(f'seeds:{all_seeds}')

# initializing obstat, policy, optimizer, noise and ranker
nn = FeedForward(cfg.policy.layer_sizes, torch.nn.Tanh(), env, cfg.policy.ac_std, cfg.policy.ob_clip)
policy: Policy = Policy(nn, cfg.noise.std, Adam(len(Policy.get_flat(nn)), cfg.policy.lr))
nt: NoiseTable = NoiseTable.create_shared(comm, cfg.noise.tbl_size, len(policy), None, cfg.general.seed)
ranker = CenteredRanker()


def r_fn(model: torch.nn.Module) -> TrainingResult:
    save_obs = (rs.random() if rs is not None else np.random.random()) < cfg.policy.save_obs_chance
    rews, behv, obs, steps = gym_runner.run_model(model, env, 10000, rs)
    return RewardResult(rews, behv, obs if save_obs else np.array([np.zeros(env.observation_space.shape)]), steps)


assert cfg.general.policies_per_gen % comm.size == 0 and (cfg.general.policies_per_gen / comm.size) % 2 == 0
def main(cfg: Munch):
    full_name = f'{cfg.env.name}-{cfg.general.name}'
    comm: MPI.Comm = MPI.COMM_WORLD

    env: gym.Env = gym.make(cfg.env.name)

    mlflow_reporter = MLFlowReporter(comm, cfg) if cfg.general.mlflow else None
    reporter = DefaultMpiReporterSet(comm, full_name, LoggerReporter(comm, full_name), StdoutReporter(comm),
                                     mlflow_reporter)

    # seeding
    rs, my_seed, global_seed = utils.seed(comm, cfg.general.seed, env)
    all_seeds = comm.alltoall([my_seed] * comm.size)  # simply for saving the seeds used on each proc
    reporter.print(f'seeds:{all_seeds}')

    if cfg.nsr.adaptive:
        reporter.print("NSRA")
    elif cfg.nsr.progressive:
        reporter.print("P-NSRA")

    archive: Optional[np.ndarray] = None

    def ns_fn(model: torch.nn.Module, use_ac_noise=True) -> NSRResult:
        """Reward function"""
        save_obs = rs.random() < cfg.policy.save_obs_chance
        rews, behv, obs, steps = gym_runner.run_model(model, env, cfg.env.max_steps, rs if use_ac_noise else None)
        return NSRResult(rews, behv[-3:], obs if save_obs else np.array([np.zeros(env.observation_space.shape)]),
                         steps, archive, cfg.novelty.k)

    # init population
    population = []
    nns = []
    for _ in range(cfg.general.n_policies):
        nns.append(FeedForward(cfg.policy.layer_sizes, torch.nn.Tanh(), env, cfg.policy.ac_std, cfg.policy.ob_clip))
        population.append(Policy(nns[-1], cfg.noise.std, Adam(len(Policy.get_flat(nns[-1])), cfg.policy.lr)))

    # init optimizer and noise table
    nt: NoiseTable = NoiseTable.create_shared(comm, cfg.noise.tbl_size, len(population[0]), reporter,
                                              cfg.general.seed)

    policies_best_rewards = [-np.inf] * cfg.general.n_policies
    time_since_best = [0 for _ in range(cfg.general.n_policies)]  # TODO should this be per individual?
    obj_weight = [cfg.nsr.initial_w for _ in range(cfg.general.n_policies)]

    best_rew = -np.inf
    best_dist = -np.inf

    archive, policies_novelties = init_archive(comm, cfg, population, ns_fn)

    for gen in range(cfg.general.gens):  # main loop
        # picking the policy from the population
        idx = random.choices(list(range(len(policies_novelties))), weights=policies_novelties, k=1)[0]
        if cfg.nsr.progressive:
            idx = gen % cfg.general.n_policies
        idx = comm.scatter([idx] * comm.size)
        ranker = MultiObjectiveRanker(CenteredRanker(), obj_weight[idx])

        # reporting
        if cfg.general.mlflow:
            mlflow_reporter.set_active_run(idx)
        reporter.start_gen()
        reporter.log({'idx': idx})
        reporter.log({'w': obj_weight[idx]})
        reporter.log({'time since best': time_since_best[idx]})

        # running es
        tr, gen_obstat = es.step(cfg, comm, population[idx], nt, env, ns_fn, rs, ranker, reporter)
        for policy in population:
            policy.update_obstat(gen_obstat)  # shared obstat
        tr = comm.scatter([tr] * comm.size)  # sharing result

        # updating the weighting for choosing the next policy to be evaluated
        behv = comm.scatter([mean_behv(population[idx], ns_fn, cfg.novelty.rollouts)] * comm.size)
        nov = comm.scatter([novelty(behv, archive, cfg.novelty.k)] * comm.size)
        archive = update_archive(comm, behv, archive)  # adding new behaviour and sharing archive
        policies_novelties[idx] = nov

        dist = np.linalg.norm(np.array(tr.positions[-3:-1]))
        rew = tr.reward

        if cfg.nsr.adaptive:
            obj_weight[idx], policies_best_rewards[idx], time_since_best[idx] = \
                nsra(cfg, rew, obj_weight[idx], policies_best_rewards[idx], time_since_best[idx])
        elif cfg.nsr.progressive:
            obj_weight[idx] = 1 if gen > cfg.nsr.end_progression_gen else gen / cfg.nsr.end_progression_gen

        # saving if a better reward or distance was obtained
        if (rew > best_rew or dist > best_dist) and comm.rank == 0:
            best_rew = max(rew, best_rew)
            best_dist = max(dist, best_dist)
            # only the archive needs to be saved here; the policy is saved by DefaultMpiReporterSet
            archive_path = path.join('saved', full_name, 'archives')
            if not path.exists(archive_path):
                os.makedirs(archive_path)
            np.save(path.join(archive_path, f'{gen}.np'), archive)

        reporter.end_gen()

    mlflow.end_run()  # ending the outer mlflow run
def main(cfg):
    comm: MPI.Comm = MPI.COMM_WORLD

    full_name = f'{cfg.env.name}-{cfg.general.name}'
    mlflow_reporter = MLFlowReporter(comm, cfg) if cfg.general.mlflow else None
    reporter = DefaultMpiReporterSet(comm, full_name, LoggerReporter(comm, full_name), StdoutReporter(comm),
                                     mlflow_reporter)

    env: gym.Env = gym.make(cfg.env.name)

    # seeding
    rs, my_seed, global_seed = utils.seed(comm, cfg.general.seed, env)
    all_seeds = comm.alltoall([my_seed] * comm.size)  # simply for saving the seeds used on each proc
    reporter.print(f'seeds:{all_seeds}')

    # initializing policy, optimizer, noise and env
    if 'load' in cfg.policy:
        policy: Policy = Policy.load(cfg.policy.load)
        nn: BaseNet = policy._module
    else:
        nn: BaseNet = FeedForward(cfg.policy.layer_sizes, torch.nn.Tanh(), env, cfg.policy.ac_std, cfg.policy.ob_clip)
        policy: Policy = Policy(nn, cfg.noise.std, Adam(len(Policy.get_flat(nn)), cfg.policy.lr))

    nt: NoiseTable = NoiseTable.create_shared(comm, cfg.noise.tbl_size, len(policy), reporter, global_seed)

    ranker = CenteredRanker()
    if 0 < cfg.experimental.elite < 1:
        ranker = EliteRanker(CenteredRanker(), cfg.experimental.elite)

    best_max_rew = -np.inf  # highest achieved in any gen

    def r_fn(model: torch.nn.Module, use_ac_noise=True) -> TrainingResult:
        save_obs = rs.random() < cfg.policy.save_obs_chance
        rews = np.zeros(cfg.env.max_steps)
        for _ in range(max(1, cfg.general.eps_per_policy)):
            rew, behv, obs, steps = gym_runner.run_model(model, env, cfg.env.max_steps, rs if use_ac_noise else None)
            rews[:len(rew)] += np.array(rew)
        rews /= max(1, cfg.general.eps_per_policy)
        return RewardResult(rews.tolist(), behv,
                            obs if save_obs else np.array([np.zeros(env.observation_space.shape)]), steps)

    time_since_best = 0
    noise_std_inc = 0.08

    for gen in range(cfg.general.gens):
        if cfg.general.mlflow:
            mlflow_reporter.set_active_run(0)
        reporter.start_gen()

        if cfg.noise.std_decay != 1:
            reporter.log({'noise std': policy.std})
        if cfg.policy.lr_decay != 1:
            reporter.log({'lr': policy.optim.lr})
        if cfg.policy.ac_std_decay != 1:
            reporter.log({'ac std': nn._action_std})

        tr, gen_obstat = es.step(cfg, comm, policy, nt, env, r_fn, rs, ranker, reporter)
        policy.update_obstat(gen_obstat)

        cfg.policy.ac_std = nn._action_std = nn._action_std * cfg.policy.ac_std_decay
        cfg.noise.std = policy.std = max(cfg.noise.std * cfg.noise.std_decay, cfg.noise.std_limit)
        cfg.policy.lr = policy.optim.lr = max(cfg.policy.lr * cfg.policy.lr_decay, cfg.policy.lr_limit)

        reporter.log({'obs recorded': policy.obstat.count})

        max_rew_ind = np.argmax(ranker.fits[:, 0])
        max_rew = ranker.fits[:, 0][max_rew_ind]

        time_since_best = 0 if max_rew > best_max_rew else time_since_best + 1
        reporter.log({'time since best': time_since_best})

        # increasing noise std if the policy is stuck
        if time_since_best > cfg.experimental.max_time_since_best and cfg.experimental.explore_with_large_noise:
            cfg.noise.std = policy.std = policy.std + noise_std_inc

        if 0 < cfg.experimental.elite < 1:  # using elite extension
            if time_since_best > cfg.experimental.max_time_since_best and cfg.experimental.elite < 1:
                ranker.elite_percent = cfg.experimental.elite
            if time_since_best == 0:
                ranker.elite_percent = 1
            reporter.print(f'elite percent: {ranker.elite_percent}')

        # saving the max-reward perturbation if it obtained the best reward ever seen
        if max_rew > best_max_rew and comm.rank == 0:
            best_max_rew = max_rew
            coeff = 1 if max_rew_ind < ranker.n_fits_ranked // 2 else -1  # checking if pos or neg noise was used
            # TODO save this as a policy
            torch.save(policy.pheno(coeff * ranker.noise_inds[max_rew_ind % (ranker.n_fits_ranked // 2)]),
                       path.join('saved', full_name, 'weights', f'gen{gen}-rew{best_max_rew:0.0f}.pt'))
            reporter.print(f'saving max policy with rew:{best_max_rew:0.2f}')

        reporter.end_gen()

    mlflow.end_run()  # if mlflow is one of the reporters, this ends its run
# seeding; this must be done before creating the neural network so that params are deterministic across processes
cfg.general.seed = (generate_seed(comm) if cfg.general.seed is None else cfg.general.seed)
rs = utils.seed(comm, cfg.general.seed, env)

# initializing obstat, policy, optimizer, noise and ranker
obstats: List[ObStat] = [ObStat(env.observation_space[i].shape, 1e-2) for i in range(2)]
neuralnets = [FeedForward(cfg.policy.layer_sizes, torch.nn.Tanh(), env, cfg.policy.ac_std, cfg.policy.ob_clip)]
policies: List[Policy] = [Policy(nn, cfg.noise.std, Adam(len(Policy.get_flat(nn)), cfg.policy.lr))
                          for nn in neuralnets]
nt: NoiseTable = NoiseTable.create_shared(comm, cfg.noise.tbl_size, len(policies[0]), None, cfg.general.seed)
ranker = CenteredRanker()


def r_fn(models: List[torch.nn.Module], use_ac_noise=True) -> TrainingResult:
    save_obs = rs.random() < cfg.policy.save_obs_chance
    rews, behv, obs, steps = gym_runner.multi_agent_gym_runner(models, env, cfg.env.max_steps,
                                                               rs if use_ac_noise else None)
    return MultiAgentTrainingResult(rews, behv,
                                    obs if save_obs else np.array([np.zeros(env.observation_space.shape)]), steps)