def test_params(comm: MPI.Comm,
                n: int,
                policy: Policy,
                nt: NoiseTable,
                gen_obstat: ObStat,
                fit_fn: Callable[[Module], TrainingResult],
                rs: RandomState) -> Tuple[np.ndarray, np.ndarray, np.ndarray, int]:
    """
    Tests `n` different perturbations of `policy`'s params and returns the positive and negative results
    (from all processes).

    positive_result[i] is the fitness when the noise at nt[noise_inds[i]] is added to policy.flat_params
    and negative_result[i] is the fitness when the same noise is subtracted.

    :returns: tuple(positive results, negative results, noise inds, total steps)
    """
    results_pos, results_neg, inds = [], [], []
    for _ in range(n):
        idx, noise = nt.sample(rs)
        inds.append(idx)
        # for each noise ind sampled, both add and subtract the noise
        results_pos.append(fit_fn(policy.pheno(noise)))
        results_neg.append(fit_fn(policy.pheno(-noise)))
        gen_obstat.inc(*results_pos[-1].ob_sum_sq_cnt)
        gen_obstat.inc(*results_neg[-1].ob_sum_sq_cnt)

    n_objectives = len(results_pos[0].result)
    results = _share_results(comm, [tr.result for tr in results_pos], [tr.result for tr in results_neg], inds)
    gen_obstat.mpi_inc(comm)
    steps = comm.allreduce(sum(tr.steps for tr in results_pos + results_neg), op=MPI.SUM)

    return results[:, 0:n_objectives], results[:, n_objectives:2 * n_objectives], results[:, -1], steps
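# --- illustrative sketch, not part of the original code ---
# `_share_results` is not shown here; judging by how its return value is sliced above, it is assumed to
# gather one row per perturbation from every process, laid out as
# [positive results..., negative results..., noise index]. The helper below is a hypothetical
# re-implementation of that layout for illustration only; the real function may differ.
def _share_results_sketch(comm: MPI.Comm,
                          res_pos: List[List[float]],
                          res_neg: List[List[float]],
                          inds: List[int]) -> np.ndarray:
    local_rows = np.array([list(p) + list(n) + [i] for p, n, i in zip(res_pos, res_neg, inds)])
    # every process receives every other process' rows (same pattern as comm.alltoall([my_seed] * comm.size))
    all_rows = comm.alltoall([local_rows] * comm.size)
    return np.concatenate(all_rows)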
def _log_gen(self, fits: np.ndarray, noiseless_tr: TrainingResult, policy: Policy, steps: int):
    super()._log_gen(fits, noiseless_tr, policy, steps)

    if self.comm.rank == MpiReporter.MAIN:
        # saving policy and all fits to files
        dist, rew = calc_dist_rew(noiseless_tr)

        save_policy = (rew > self.best_rew or dist > self.best_dist)
        self.best_rew = max(rew, self.best_rew)
        self.best_dist = max(dist, self.best_dist)

        if save_policy:  # saving policy if it obtained a better reward or distance
            policy.save(self.policy_folder, str(self.gen))
            self.print(f'saving policy with rew:{rew:0.2f} and dist:{dist:0.2f}')

        np.save(path.join(f'{self.fit_folder}', f'{self.gen}.np'), fits)
def step(cfg,
         comm: MPI.Comm,
         policy: Policy,
         nt: NoiseTable,
         env: gym.Env,
         fit_fn: Callable[[Module], TrainingResult],
         rs: RandomState = np.random.RandomState(),
         ranker: Ranker = CenteredRanker(),
         reporter: Reporter = StdoutReporter(MPI.COMM_WORLD)) -> Tuple[TrainingResult, ObStat]:
    """
    Runs a single generation of ES

    :param fit_fn: Evaluates the policy and returns a :class:`TrainingResult`
    :param ranker: A subclass of :class:`Ranker` that is able to rank the fitnesses
    :returns: :class:`TrainingResult` of the noiseless policy at that generation and the generation's :class:`ObStat`
    """
    assert cfg.general.policies_per_gen % comm.size == 0 and (cfg.general.policies_per_gen / comm.size) % 2 == 0
    eps_per_proc = int((cfg.general.policies_per_gen / comm.size) / 2)
    gen_obstat = ObStat(env.observation_space.shape, 0)

    pos_res, neg_res, inds, steps = test_params(comm, eps_per_proc, policy, nt, gen_obstat, fit_fn, rs)
    reporter.print(f'n dupes: {len(inds) - len(set(inds))}')

    ranker.rank(pos_res, neg_res, inds)
    approx_grad(policy, ranker, nt, policy.flat_params, cfg.general.batch_size, cfg.policy.l2coeff)
    noiseless_result = fit_fn(policy.pheno(np.zeros(len(policy))), False)
    reporter.log_gen(ranker.fits, noiseless_result, policy, steps)

    return noiseless_result, gen_obstat
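# --- illustrative sketch, not part of the original code ---
# A minimal way to drive `step` for several generations, assuming `cfg`, `comm`, `policy`, `nt`, `env`,
# `r_fn`, `rs`, `ranker` and `reporter` have already been constructed as in the main() examples below.
for gen in range(cfg.general.gens):
    reporter.start_gen()
    tr, gen_obstat = step(cfg, comm, policy, nt, env, r_fn, rs, ranker, reporter)
    policy.update_obstat(gen_obstat)  # fold this generation's observation statistics into the policy
    reporter.end_gen()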
def mean_behv(policy: Policy, r_fn: Callable[[torch.nn.Module], NSResult], rollouts: int):
    behvs = [r_fn(policy.pheno(np.zeros(len(policy)))).behaviour for _ in range(rollouts)]
    return np.mean(behvs, axis=0)
def run_saved_policy(policy_path: str, env: gym.Env, steps: int):
    run_saved(Policy.load(policy_path).pheno(), env, steps)
def approx_grad(policy: Policy, ranker: Ranker, nt: NoiseTable, params: ndarray, batch_size: int, l2coeff: float):
    """Approximates the gradient and updates the policy params"""
    grad = scale_noise(ranker.ranked_fits, ranker.noise_inds, nt, len(policy), batch_size) / ranker.n_fits_ranked
    policy.optim_step(l2coeff * params - grad)
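# --- illustrative sketch, not part of the original code ---
# A worked illustration of the estimate `approx_grad` relies on. `scale_noise` (not shown) is assumed to
# return the fitness-weighted sum of the sampled noise vectors, so dividing by `ranker.n_fits_ranked`
# gives the standard ES gradient estimate grad ≈ (1/n) * sum_i rank(f_i) * eps_i. The numbers below are
# made up purely to show the arithmetic.
ranked_fits = np.array([0.5, -0.5, 0.25])               # hypothetical centered-ranked fitnesses
eps = np.array([[1.0, 0.0], [0.0, 1.0], [1.0, 1.0]])    # hypothetical noise vectors, one per row
grad_estimate = ranked_fits @ eps / len(ranked_fits)     # == (1/n) * sum_i rank(f_i) * eps_i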
cfg_file = utils.parse_args()
cfg = utils.load_config(cfg_file)

env: gym.Env = gym.make(cfg.env.name)

# seeding; this must be done before creating the neural network so that params are deterministic across processes
rs, my_seed, global_seed = utils.seed(comm, cfg.general.seed, env)
all_seeds = comm.alltoall([my_seed] * comm.size)  # simply for saving/viewing the seeds used on each proc
print(f'seeds:{all_seeds}')

# initializing obstat, policy, optimizer, noise and ranker
nn = FeedForward(cfg.policy.layer_sizes, torch.nn.Tanh(), env, cfg.policy.ac_std, cfg.policy.ob_clip)
policy: Policy = Policy(nn, cfg.noise.std, Adam(len(Policy.get_flat(nn)), cfg.policy.lr))
nt: NoiseTable = NoiseTable.create_shared(comm, cfg.noise.tbl_size, len(policy), None, cfg.general.seed)
ranker = CenteredRanker()


def r_fn(model: torch.nn.Module) -> TrainingResult:
    save_obs = (rs.random() if rs is not None else np.random.random()) < cfg.policy.save_obs_chance
    rews, behv, obs, steps = gym_runner.run_model(model, env, 10000, rs)
    return RewardResult(rews, behv, obs if save_obs else np.array([np.zeros(env.observation_space.shape)]), steps)


assert cfg.general.policies_per_gen % comm.size == 0 and (cfg.general.policies_per_gen / comm.size) % 2 == 0
def main(cfg: Munch):
    full_name = f'{cfg.env.name}-{cfg.general.name}'
    comm: MPI.Comm = MPI.COMM_WORLD

    env: gym.Env = gym.make(cfg.env.name)

    mlflow_reporter = MLFlowReporter(comm, cfg) if cfg.general.mlflow else None
    reporter = DefaultMpiReporterSet(comm, full_name, LoggerReporter(comm, full_name), StdoutReporter(comm),
                                     mlflow_reporter)

    # seeding
    rs, my_seed, global_seed = utils.seed(comm, cfg.general.seed, env)
    all_seeds = comm.alltoall([my_seed] * comm.size)  # simply for saving the seeds used on each proc
    reporter.print(f'seeds:{all_seeds}')

    if cfg.nsr.adaptive:
        reporter.print("NSRA")
    elif cfg.nsr.progressive:
        reporter.print("P-NSRA")

    archive: Optional[np.ndarray] = None

    def ns_fn(model: torch.nn.Module, use_ac_noise=True) -> NSRResult:
        """Reward function"""
        save_obs = rs.random() < cfg.policy.save_obs_chance
        rews, behv, obs, steps = gym_runner.run_model(model, env, cfg.env.max_steps, rs if use_ac_noise else None)
        return NSRResult(rews, behv[-3:], obs if save_obs else np.array([np.zeros(env.observation_space.shape)]),
                         steps, archive, cfg.novelty.k)

    # init population
    population = []
    nns = []
    for _ in range(cfg.general.n_policies):
        nns.append(FeedForward(cfg.policy.layer_sizes, torch.nn.Tanh(), env, cfg.policy.ac_std, cfg.policy.ob_clip))
        population.append(Policy(nns[-1], cfg.noise.std, Adam(len(Policy.get_flat(nns[-1])), cfg.policy.lr)))

    # init optimizer and noise table
    nt: NoiseTable = NoiseTable.create_shared(comm, cfg.noise.tbl_size, len(population[0]), reporter,
                                              cfg.general.seed)

    policies_best_rewards = [-np.inf] * cfg.general.n_policies
    time_since_best = [0 for _ in range(cfg.general.n_policies)]  # TODO should this be per individual?
    obj_weight = [cfg.nsr.initial_w for _ in range(cfg.general.n_policies)]

    best_rew = -np.inf
    best_dist = -np.inf

    archive, policies_novelties = init_archive(comm, cfg, population, ns_fn)

    for gen in range(cfg.general.gens):  # main loop
        # picking the policy from the population
        idx = random.choices(list(range(len(policies_novelties))), weights=policies_novelties, k=1)[0]
        if cfg.nsr.progressive:
            idx = gen % cfg.general.n_policies
        idx = comm.scatter([idx] * comm.size)
        ranker = MultiObjectiveRanker(CenteredRanker(), obj_weight[idx])

        # reporting
        if cfg.general.mlflow:
            mlflow_reporter.set_active_run(idx)
        reporter.start_gen()
        reporter.log({'idx': idx})
        reporter.log({'w': obj_weight[idx]})
        reporter.log({'time since best': time_since_best[idx]})

        # running es
        tr, gen_obstat = es.step(cfg, comm, population[idx], nt, env, ns_fn, rs, ranker, reporter)
        for policy in population:
            policy.update_obstat(gen_obstat)  # shared obstat
        tr = comm.scatter([tr] * comm.size)  # sharing result

        # updating the weighting for choosing the next policy to be evaluated
        behv = comm.scatter([mean_behv(population[idx], ns_fn, cfg.novelty.rollouts)] * comm.size)
        nov = comm.scatter([novelty(behv, archive, cfg.novelty.k)] * comm.size)
        archive = update_archive(comm, behv, archive)  # adding new behaviour and sharing archive
        policies_novelties[idx] = nov

        dist = np.linalg.norm(np.array(tr.positions[-3:-1]))
        rew = tr.reward

        if cfg.nsr.adaptive:
            obj_weight[idx], policies_best_rewards[idx], time_since_best[idx] = \
                nsra(cfg, rew, obj_weight[idx], policies_best_rewards[idx], time_since_best[idx])
        elif cfg.nsr.progressive:
            obj_weight[idx] = 1 if gen > cfg.nsr.end_progression_gen else gen / cfg.nsr.end_progression_gen

        # saving if a better reward or distance was obtained
        if (rew > best_rew or dist > best_dist) and comm.rank == 0:
            best_rew = max(rew, best_rew)
            best_dist = max(dist, best_dist)
            # only the archive needs to be saved here; the policy is saved by DefaultMpiReporterSet
            archive_path = path.join('saved', full_name, 'archives')
            if not path.exists(archive_path):
                os.makedirs(archive_path)
            np.save(path.join(archive_path, f'{gen}.np'), archive)

        reporter.end_gen()

    mlflow.end_run()  # ending the outer mlflow run
def main(cfg):
    comm: MPI.Comm = MPI.COMM_WORLD

    full_name = f'{cfg.env.name}-{cfg.general.name}'
    mlflow_reporter = MLFlowReporter(comm, cfg) if cfg.general.mlflow else None
    reporter = DefaultMpiReporterSet(comm, full_name, LoggerReporter(comm, full_name), StdoutReporter(comm),
                                     mlflow_reporter)

    env: gym.Env = gym.make(cfg.env.name)

    # seeding
    rs, my_seed, global_seed = utils.seed(comm, cfg.general.seed, env)
    all_seeds = comm.alltoall([my_seed] * comm.size)  # simply for saving the seeds used on each proc
    reporter.print(f'seeds:{all_seeds}')

    # initializing policy, optimizer, noise and env
    if 'load' in cfg.policy:
        policy: Policy = Policy.load(cfg.policy.load)
        nn: BaseNet = policy._module
    else:
        nn: BaseNet = FeedForward(cfg.policy.layer_sizes, torch.nn.Tanh(), env, cfg.policy.ac_std, cfg.policy.ob_clip)
        policy: Policy = Policy(nn, cfg.noise.std, Adam(len(Policy.get_flat(nn)), cfg.policy.lr))

    nt: NoiseTable = NoiseTable.create_shared(comm, cfg.noise.tbl_size, len(policy), reporter, global_seed)

    ranker = CenteredRanker()
    if 0 < cfg.experimental.elite < 1:
        ranker = EliteRanker(CenteredRanker(), cfg.experimental.elite)

    best_max_rew = -np.inf  # highest achieved in any gen

    def r_fn(model: torch.nn.Module, use_ac_noise=True) -> TrainingResult:
        save_obs = rs.random() < cfg.policy.save_obs_chance
        rews = np.zeros(cfg.env.max_steps)
        for _ in range(max(1, cfg.general.eps_per_policy)):
            rew, behv, obs, steps = gym_runner.run_model(model, env, cfg.env.max_steps, rs if use_ac_noise else None)
            rews[:len(rew)] += np.array(rew)
        rews /= max(1, cfg.general.eps_per_policy)
        return RewardResult(rews.tolist(), behv,
                            obs if save_obs else np.array([np.zeros(env.observation_space.shape)]), steps)

    time_since_best = 0
    noise_std_inc = 0.08

    for gen in range(cfg.general.gens):
        if cfg.general.mlflow:
            mlflow_reporter.set_active_run(0)
        reporter.start_gen()

        if cfg.noise.std_decay != 1:
            reporter.log({'noise std': policy.std})
        if cfg.policy.lr_decay != 1:
            reporter.log({'lr': policy.optim.lr})
        if cfg.policy.ac_std_decay != 1:
            reporter.log({'ac std': nn._action_std})

        tr, gen_obstat = es.step(cfg, comm, policy, nt, env, r_fn, rs, ranker, reporter)
        policy.update_obstat(gen_obstat)

        cfg.policy.ac_std = nn._action_std = nn._action_std * cfg.policy.ac_std_decay
        cfg.noise.std = policy.std = max(cfg.noise.std * cfg.noise.std_decay, cfg.noise.std_limit)
        cfg.policy.lr = policy.optim.lr = max(cfg.policy.lr * cfg.policy.lr_decay, cfg.policy.lr_limit)

        reporter.log({'obs recorded': policy.obstat.count})

        max_rew_ind = np.argmax(ranker.fits[:, 0])
        max_rew = ranker.fits[:, 0][max_rew_ind]

        time_since_best = 0 if max_rew > best_max_rew else time_since_best + 1
        reporter.log({'time since best': time_since_best})

        # increasing noise std if the policy is stuck
        if time_since_best > cfg.experimental.max_time_since_best and cfg.experimental.explore_with_large_noise:
            cfg.noise.std = policy.std = policy.std + noise_std_inc

        if 0 < cfg.experimental.elite < 1:  # using elite extension
            if time_since_best > cfg.experimental.max_time_since_best and cfg.experimental.elite < 1:
                ranker.elite_percent = cfg.experimental.elite
            if time_since_best == 0:
                ranker.elite_percent = 1
            reporter.print(f'elite percent: {ranker.elite_percent}')

        # saving the max-reward perturbation if it obtained the best reward ever seen
        if max_rew > best_max_rew and comm.rank == 0:
            best_max_rew = max_rew
            coeff = 1 if max_rew_ind < ranker.n_fits_ranked // 2 else -1  # checking if pos or neg noise was used
            # TODO save this as a policy
            torch.save(policy.pheno(coeff * ranker.noise_inds[max_rew_ind % (ranker.n_fits_ranked // 2)]),
                       path.join('saved', full_name, 'weights', f'gen{gen}-rew{best_max_rew:0.0f}.pt'))
            reporter.print(f'saving max policy with rew:{best_max_rew:0.2f}')

        reporter.end_gen()

    mlflow.end_run()  # if mlflow is one of the reporters, this ends its run
# seeding; this must be done before creating the neural network so that params are deterministic across processes
cfg.general.seed = (generate_seed(comm) if cfg.general.seed is None else cfg.general.seed)
rs = utils.seed(comm, cfg.general.seed, env)

# initializing obstat, policy, optimizer, noise and ranker
obstats: List[ObStat] = [ObStat(env.observation_space[i].shape, 1e-2) for i in range(2)]
neuralnets = [FeedForward(cfg.policy.layer_sizes, torch.nn.Tanh(), env, cfg.policy.ac_std, cfg.policy.ob_clip)]
policies: List[Policy] = [Policy(nn, cfg.noise.std, Adam(len(Policy.get_flat(nn)), cfg.policy.lr))
                          for nn in neuralnets]
nt: NoiseTable = NoiseTable.create_shared(comm, cfg.noise.tbl_size, len(policies[0]), None, cfg.general.seed)
ranker = CenteredRanker()


def r_fn(models: List[torch.nn.Module], use_ac_noise=True) -> TrainingResult:
    save_obs = rs.random() < cfg.policy.save_obs_chance
    rews, behv, obs, steps = gym_runner.multi_agent_gym_runner(models, env, cfg.env.max_steps,
                                                               rs if use_ac_noise else None)
    return MultiAgentTrainingResult(rews, behv,
                                    obs if save_obs else np.array([np.zeros(env.observation_space.shape)]), steps)