def test_create_shared(comm):
    size = 5
    seed = 1
    noise = np.array(NoiseTable.create_shared(comm, size, 0, seed=seed).noise)
    all_noise = comm.alltoall([noise] * comm.size)
    # every process must see an identical shared noise block
    assert np.isclose(all_noise, all_noise[0]).all()
    # and that block must match the noise generated directly from the seed
    assert np.isclose(
        all_noise[0],
        np.random.RandomState(seed).randn(size).astype(np.float32)).all()
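# A minimal sketch of how this test could be supplied with its `comm` argument,
# assuming pytest and mpi4py. The fixture below and the idea of launching the
# test file on several ranks (e.g. `mpiexec -n 4 python -m pytest tests/`) are
# illustrative assumptions, not part of the original repo.
import pytest
from mpi4py import MPI


@pytest.fixture
def comm() -> MPI.Comm:
    # every rank runs the same test body, so the world communicator suffices here
    return MPI.COMM_WORLD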
def main(cfg: Munch):
    full_name = f'{cfg.env.name}-{cfg.general.name}'
    comm: MPI.Comm = MPI.COMM_WORLD

    env: gym.Env = gym.make(cfg.env.name)

    mlflow_reporter = MLFlowReporter(comm, cfg) if cfg.general.mlflow else None
    reporter = DefaultMpiReporterSet(comm, full_name,
                                     LoggerReporter(comm, full_name),
                                     StdoutReporter(comm),
                                     mlflow_reporter)

    # seeding
    rs, my_seed, global_seed = utils.seed(comm, cfg.general.seed, env)
    all_seeds = comm.alltoall([my_seed] * comm.size)  # simply for saving the seeds used on each proc
    reporter.print(f'seeds:{all_seeds}')

    if cfg.nsr.adaptive:
        reporter.print("NSRA")
    elif cfg.nsr.progressive:
        reporter.print("P-NSRA")

    archive: Optional[np.ndarray] = None

    def ns_fn(model: torch.nn.Module, use_ac_noise=True) -> NSRResult:
        """Reward function"""
        save_obs = rs.random() < cfg.policy.save_obs_chance
        rews, behv, obs, steps = gym_runner.run_model(
            model, env, cfg.env.max_steps, rs if use_ac_noise else None)
        return NSRResult(rews, behv[-3:],
                         obs if save_obs else np.array([np.zeros(env.observation_space.shape)]),
                         steps, archive, cfg.novelty.k)

    # init population
    population = []
    nns = []
    for _ in range(cfg.general.n_policies):
        nns.append(
            FeedForward(cfg.policy.layer_sizes, torch.nn.Tanh(), env,
                        cfg.policy.ac_std, cfg.policy.ob_clip))
        population.append(
            Policy(nns[-1], cfg.noise.std,
                   Adam(len(Policy.get_flat(nns[-1])), cfg.policy.lr)))

    # init optimizer and noise table
    nt: NoiseTable = NoiseTable.create_shared(comm, cfg.noise.tbl_size,
                                              len(population[0]), reporter,
                                              cfg.general.seed)

    policies_best_rewards = [-np.inf] * cfg.general.n_policies
    time_since_best = [0 for _ in range(cfg.general.n_policies)]  # TODO should this be per individual?
    obj_weight = [cfg.nsr.initial_w for _ in range(cfg.general.n_policies)]

    best_rew = -np.inf
    best_dist = -np.inf

    archive, policies_novelties = init_archive(comm, cfg, population, ns_fn)

    for gen in range(cfg.general.gens):  # main loop
        # picking the policy from the population, weighted by novelty
        idx = random.choices(list(range(len(policies_novelties))),
                             weights=policies_novelties, k=1)[0]
        if cfg.nsr.progressive:
            idx = gen % cfg.general.n_policies
        idx = comm.scatter([idx] * comm.size)
        ranker = MultiObjectiveRanker(CenteredRanker(), obj_weight[idx])

        # reporting
        if cfg.general.mlflow:
            mlflow_reporter.set_active_run(idx)
        reporter.start_gen()
        reporter.log({'idx': idx})
        reporter.log({'w': obj_weight[idx]})
        reporter.log({'time since best': time_since_best[idx]})

        # running es
        tr, gen_obstat = es.step(cfg, comm, population[idx], nt, env, ns_fn,
                                 rs, ranker, reporter)
        for policy in population:
            policy.update_obstat(gen_obstat)  # shared obstat
        tr = comm.scatter([tr] * comm.size)  # sharing result

        # updating the weighting for choosing the next policy to be evaluated
        behv = comm.scatter(
            [mean_behv(population[idx], ns_fn, cfg.novelty.rollouts)] * comm.size)
        nov = comm.scatter([novelty(behv, archive, cfg.novelty.k)] * comm.size)
        archive = update_archive(comm, behv, archive)  # adding new behaviour and sharing archive
        policies_novelties[idx] = nov

        dist = np.linalg.norm(np.array(tr.positions[-3:-1]))
        rew = tr.reward

        if cfg.nsr.adaptive:
            obj_weight[idx], policies_best_rewards[idx], time_since_best[idx] = nsra(
                cfg, rew, obj_weight[idx], policies_best_rewards[idx],
                time_since_best[idx])
        elif cfg.nsr.progressive:
            obj_weight[idx] = (1 if gen > cfg.nsr.end_progression_gen
                               else gen / cfg.nsr.end_progression_gen)

        # Saving the archive if the policy obtained a better reward or distance
        if (rew > best_rew or dist > best_dist) and comm.rank == 0:
            best_rew = max(rew, best_rew)
            best_dist = max(dist, best_dist)

            # Only need to save the archive, the policy is saved by DefaultMpiReporterSet
            archive_path = path.join('saved', full_name, 'archives')
            if not path.exists(archive_path):
                os.makedirs(archive_path)
            np.save(path.join(archive_path, f'{gen}.np'), archive)

        reporter.end_gen()

    mlflow.end_run()  # ending the outer mlflow run
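# A sketch of the nsra() weight update called in the adaptive branch above,
# following the NSRA-ES rule (Conti et al., 2018): weight the reward objective
# more while reward keeps improving, and shift back towards novelty when
# progress stalls. The cfg fields `nsr.weight_delta` and `nsr.max_time_since_best`
# are assumed names for illustration, not confirmed against this repo.
def nsra(cfg, rew: float, w: float, best_rew: float, time_since_best: int):
    if rew > best_rew:  # reward improved: exploit (increase the reward weight)
        return min(1., w + cfg.nsr.weight_delta), rew, 0
    time_since_best += 1
    if time_since_best >= cfg.nsr.max_time_since_best:  # stagnating: explore (increase the novelty weight)
        w = max(0., w - cfg.nsr.weight_delta)
        time_since_best = 0
    return w, best_rew, time_since_best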
env: gym.Env = gym.make(cfg.env.name)

# seeding; this must be done before creating the neural network so that params are deterministic across processes
rs, my_seed, global_seed = utils.seed(comm, cfg.general.seed, env)
all_seeds = comm.alltoall([my_seed] * comm.size)  # simply for saving/viewing the seeds used on each proc
print(f'seeds:{all_seeds}')

# initializing obstat, policy, optimizer, noise and ranker
nn = FeedForward(cfg.policy.layer_sizes, torch.nn.Tanh(), env,
                 cfg.policy.ac_std, cfg.policy.ob_clip)
policy: Policy = Policy(nn, cfg.noise.std,
                        Adam(len(Policy.get_flat(nn)), cfg.policy.lr))
nt: NoiseTable = NoiseTable.create_shared(comm, cfg.noise.tbl_size,
                                          len(policy), None, cfg.general.seed)
ranker = CenteredRanker()


def r_fn(model: torch.nn.Module) -> TrainingResult:
    save_obs = (rs.random() if rs is not None else np.random.random()) < cfg.policy.save_obs_chance
    rews, behv, obs, steps = gym_runner.run_model(model, env, 10000, rs)
    return RewardResult(rews, behv,
                        obs if save_obs else np.array([np.zeros(env.observation_space.shape)]),
                        steps)


# each process must evaluate a whole, even number of policies so that the
# antithetic (positive/negative) noise pairs can be split evenly across procs
assert cfg.general.policies_per_gen % comm.size == 0 and (
    cfg.general.policies_per_gen / comm.size) % 2 == 0
eps_per_proc = int((cfg.general.policies_per_gen / comm.size) / 2)

for gen in range(cfg.general.gens):  # main loop
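# A generic sketch (not this repo's es.step, whose loop body is elided above)
# of why eps_per_proc counts noise *pairs*: with mirrored (antithetic) sampling
# each noise vector eps is evaluated at both theta + sigma*eps and
# theta - sigma*eps, so every process contributes 2 * eps_per_proc evaluations
# per generation. All names here are illustrative assumptions.
import numpy as np


def evaluate_mirrored(theta: np.ndarray, sigma: float, eps_per_proc: int,
                      fitness_fn, rs: np.random.RandomState):
    fits = []
    for _ in range(eps_per_proc):
        eps = rs.randn(len(theta)).astype(np.float32)
        fits.append(fitness_fn(theta + sigma * eps))  # positive perturbation
        fits.append(fitness_fn(theta - sigma * eps))  # mirrored perturbation
    return fits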
def main(cfg):
    comm: MPI.Comm = MPI.COMM_WORLD

    full_name = f'{cfg.env.name}-{cfg.general.name}'
    mlflow_reporter = MLFlowReporter(comm, cfg) if cfg.general.mlflow else None
    reporter = DefaultMpiReporterSet(comm, full_name,
                                     LoggerReporter(comm, full_name),
                                     StdoutReporter(comm),
                                     mlflow_reporter)

    env: gym.Env = gym.make(cfg.env.name)

    # seeding
    rs, my_seed, global_seed = utils.seed(comm, cfg.general.seed, env)
    all_seeds = comm.alltoall([my_seed] * comm.size)  # simply for saving the seeds used on each proc
    reporter.print(f'seeds:{all_seeds}')

    # initializing policy, optimizer, noise and env
    if 'load' in cfg.policy:
        policy: Policy = Policy.load(cfg.policy.load)
        nn: BaseNet = policy._module
    else:
        nn: BaseNet = FeedForward(cfg.policy.layer_sizes, torch.nn.Tanh(), env,
                                  cfg.policy.ac_std, cfg.policy.ob_clip)
        policy: Policy = Policy(nn, cfg.noise.std,
                                Adam(len(Policy.get_flat(nn)), cfg.policy.lr))

    nt: NoiseTable = NoiseTable.create_shared(comm, cfg.noise.tbl_size,
                                              len(policy), reporter, global_seed)

    ranker = CenteredRanker()
    if 0 < cfg.experimental.elite < 1:
        ranker = EliteRanker(CenteredRanker(), cfg.experimental.elite)

    best_max_rew = -np.inf  # highest achieved in any gen

    def r_fn(model: torch.nn.Module, use_ac_noise=True) -> TrainingResult:
        save_obs = rs.random() < cfg.policy.save_obs_chance
        # averaging per-step rewards over eps_per_policy episodes; behv, obs and
        # steps are taken from the last episode run
        rews = np.zeros(cfg.env.max_steps)
        for _ in range(max(1, cfg.general.eps_per_policy)):
            rew, behv, obs, steps = gym_runner.run_model(
                model, env, cfg.env.max_steps, rs if use_ac_noise else None)
            rews[:len(rew)] += np.array(rew)
        rews /= max(1, cfg.general.eps_per_policy)
        return RewardResult(rews.tolist(), behv,
                            obs if save_obs else np.array([np.zeros(env.observation_space.shape)]),
                            steps)

    time_since_best = 0
    noise_std_inc = 0.08

    for gen in range(cfg.general.gens):  # main loop
        if cfg.general.mlflow:
            mlflow_reporter.set_active_run(0)
        reporter.start_gen()

        if cfg.noise.std_decay != 1:
            reporter.log({'noise std': policy.std})
        if cfg.policy.lr_decay != 1:
            reporter.log({'lr': policy.optim.lr})
        if cfg.policy.ac_std_decay != 1:
            reporter.log({'ac std': nn._action_std})

        tr, gen_obstat = es.step(cfg, comm, policy, nt, env, r_fn, rs, ranker, reporter)
        policy.update_obstat(gen_obstat)

        # decaying action noise, parameter noise and learning rate
        cfg.policy.ac_std = nn._action_std = nn._action_std * cfg.policy.ac_std_decay
        cfg.noise.std = policy.std = max(cfg.noise.std * cfg.noise.std_decay, cfg.noise.std_limit)
        cfg.policy.lr = policy.optim.lr = max(cfg.policy.lr * cfg.policy.lr_decay, cfg.policy.lr_limit)

        reporter.log({'obs recorded': policy.obstat.count})

        max_rew_ind = np.argmax(ranker.fits[:, 0])
        max_rew = ranker.fits[:, 0][max_rew_ind]

        time_since_best = 0 if max_rew > best_max_rew else time_since_best + 1
        reporter.log({'time since best': time_since_best})

        # increasing noise std if policy is stuck
        if time_since_best > cfg.experimental.max_time_since_best and cfg.experimental.explore_with_large_noise:
            cfg.noise.std = policy.std = policy.std + noise_std_inc

        if 0 < cfg.experimental.elite < 1:  # using elite extension
            if time_since_best > cfg.experimental.max_time_since_best and cfg.experimental.elite < 1:
                ranker.elite_percent = cfg.experimental.elite
            if time_since_best == 0:
                ranker.elite_percent = 1
            reporter.print(f'elite percent: {ranker.elite_percent}')

        # Saving the policy if it obtained the best reward seen so far
        if max_rew > best_max_rew and comm.rank == 0:
            best_max_rew = max_rew
            coeff = 1 if max_rew_ind < ranker.n_fits_ranked // 2 else -1  # checking if pos or neg noise ind used
            # TODO save this as a policy
            torch.save(
                policy.pheno(coeff * ranker.noise_inds[max_rew_ind % (ranker.n_fits_ranked // 2)]),
                path.join('saved', full_name, 'weights', f'gen{gen}-rew{best_max_rew:0.0f}.pt'))
            reporter.print(f'saving max policy with rew:{best_max_rew:0.2f}')

        reporter.end_gen()

    mlflow.end_run()  # in the case where mlflow is the reporter, just ending its run
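# A minimal, assumed entry point for launching the training loop above on every
# MPI rank, e.g. `mpiexec -n 24 python main.py configs/hopper.json`. The config
# file path and its json format are illustrative; only the Munch config type is
# taken from the source.
if __name__ == '__main__':
    import json
    import sys

    from munch import munchify

    with open(sys.argv[1]) as f:
        main(munchify(json.load(f)))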