def experiment(variant):
    """Build and train a Double-DQN agent on DeerEnv from a `variant` config.

    Expects `variant['env_kwargs']` (DeerEnv constructor args) and
    `variant['algo_kwargs']` (network size, trainer/algorithm kwargs,
    replay buffer size, epsilon decay rate).
    """
    from rlkit.envs.gym_minigrid.gym_minigrid import envs

    algo_kwargs = variant['algo_kwargs']
    env_kwargs = variant['env_kwargs']

    expl_env = DeerEnv(**env_kwargs)
    eval_env = DeerEnv(**env_kwargs)
    obs_dim = expl_env.observation_space.low.size
    action_dim = eval_env.action_space.n
    layer_size = algo_kwargs['layer_size']
    # A zero time horizon marks a "lifetime" (non-episodic) task —
    # presumably 0 means "no fixed horizon"; confirm against DeerEnv.
    lifetime = env_kwargs.get('time_horizon', 0) == 0

    qf = gen_network(algo_kwargs, action_dim, layer_size)
    target_qf = gen_network(algo_kwargs, action_dim, layer_size)
    qf_criterion = nn.MSELoss()

    eval_policy = ArgmaxDiscretePolicy(qf)
    expl_policy = PolicyWrappedWithExplorationStrategy(
        EpsilonGreedyDecay(expl_env.action_space,
                           algo_kwargs['eps_decay_rate'], 1, 0.1),
        eval_policy,
    )
    # Lifetime tasks are evaluated with the exploring policy as well.
    if lifetime:
        eval_policy = expl_policy

    # NOTE(review): this horizon check duplicates the `lifetime` flag above
    # but reads the env attribute instead of the variant — confirm the two
    # always agree.
    if eval_env.time_horizon == 0:
        collector_class = LifetimeMdpPathCollector if lifetime else MdpPathCollector
    else:
        collector_class = MdpPathCollectorConfig
    eval_path_collector = collector_class(eval_env, eval_policy)
    expl_path_collector = collector_class(expl_env, expl_policy)

    trainer = DoubleDQNTrainer(qf=qf,
                               target_qf=target_qf,
                               qf_criterion=qf_criterion,
                               **algo_kwargs['trainer_kwargs'])
    replay_buffer = EnvReplayBuffer(algo_kwargs['replay_buffer_size'],
                                    expl_env)

    algo_class = TorchLifetimeRLAlgorithm if lifetime else TorchBatchRLAlgorithm
    algorithm = algo_class(trainer=trainer,
                           exploration_env=expl_env,
                           evaluation_env=eval_env,
                           exploration_data_collector=expl_path_collector,
                           evaluation_data_collector=eval_path_collector,
                           replay_buffer=replay_buffer,
                           **algo_kwargs['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
def gen_validation_envs(n, filename, **kwargs):
    """Generate `n` randomly-seeded 8x8 DeerEnv validation envs.

    Pickles ``{'envs': envs, 'seeds': seeds}`` to `filename` and writes the
    (last) env's kwargs as a sibling ``.json`` file for reference.

    :param n: number of environments to generate.
    :param filename: output pickle path (conventionally ending in ``.pkl``).
    :param kwargs: overrides merged into the default DeerEnv kwargs.
    """
    envs = []
    seeds = np.random.randint(0, 100000, n).tolist()
    # Keep defined even when n == 0 so the json dump below cannot NameError.
    env_kwargs = dict(kwargs)
    for idx in range(n):
        env_kwargs = dict(
            grid_size=8,
            # start agent at random pos
            agent_start_pos=None,
            health_cap=1000,
            gen_resources=True,
            fully_observed=False,
            task='make food',
            make_rtype='sparse',
            fixed_reset=False,
            only_partial_obs=True,
            init_resources={
                'axe': 1,
                'deer': 1
            },
            deer_move_prob=0.1,
            fixed_expected_resources=True,
            end_on_task_completion=True,
            time_horizon=100,
            seed=seeds[idx])
        env_kwargs.update(**kwargs)
        env = DeerEnv(**env_kwargs)
        envs.append(env)
    # Context managers ensure the handles are flushed and closed.
    with open(filename, 'wb') as f:
        pickle.dump({'envs': envs, 'seeds': seeds}, f)
    # BUG FIX: filename.strip('.pkl') strips any of the characters '.', 'p',
    # 'k', 'l' from BOTH ends (e.g. 'pkl_envs.pkl' -> '_envs'); remove the
    # '.pkl' suffix explicitly instead.
    base = filename[:-len('.pkl')] if filename.endswith('.pkl') else filename
    with open(base + '.json', 'w') as f:
        json.dump(env_kwargs, f, indent=4, sort_keys=True)
    print('Generated %d envs at file: %s' % (n, filename))
def gen_validation_envs(n, filename, **kwargs):
    """Generate `n` randomly-seeded 10x10 replenishing DeerEnv validation envs.

    Pickles ``{'envs': envs, 'seeds': seeds}`` to `filename`.

    NOTE(review): this re-definition shadows an earlier
    ``gen_validation_envs`` in this file (8x8 variant) when both are at
    module level — confirm which one callers are meant to get.

    :param n: number of environments to generate.
    :param filename: output pickle path.
    :param kwargs: overrides merged into the default DeerEnv kwargs.
    """
    envs = []
    seeds = np.random.randint(0, 100000, n).tolist()
    for idx in range(n):
        env_kwargs = dict(
            grid_size=10,
            agent_start_pos=None,
            health_cap=1000,
            gen_resources=True,
            fully_observed=False,
            task='make food',
            make_rtype='sparse',
            fixed_reset=False,
            only_partial_obs=True,
            init_resources={
                'metal': 3,
                'wood': 3,
                'deer': 3
            },
            default_lifespan=0,
            fixed_expected_resources=True,
            end_on_task_completion=False,
            time_horizon=1000,
            replenish_low_resources={
                'metal': 3,
                'wood': 3,
                'deer': 3
            },
            seed=seeds[idx])
        env_kwargs.update(**kwargs)
        env = DeerEnv(**env_kwargs)
        envs.append(env)
    # FIX: use a context manager so the pickle is flushed and the handle
    # closed (the original open() was never closed).
    with open(filename, 'wb') as f:
        pickle.dump({'envs': envs, 'seeds': seeds}, f)
    print('Generated %d envs at file: %s' % (n, filename))