def experiment_factory(opt, only_env=False):
    env = gym_wrapper.GymFromDMEnv(bsuite.load_from_id(opt.env.name))
    env = TorchWrapper(env, opt.device)
    if only_env:
        return env

    replay = ExperienceReplay(**opt.replay)
    layers = [
        reduce(lambda x, y: x * y, env.observation_space.shape),  # input
        *opt.estimator["layers"],  # hidden
        env.action_space.n,  # output
    ]
    estimator = MLP(layers, spectral=opt.spectral, **opt.estimator)
    estimator.to(opt.device)
    optimizer = getattr(torch.optim, opt.optim.name)(
        estimator.parameters(), **opt.optim.kwargs
    )
    policy_improvement = C51PolicyImprovement(
        estimator, opt.epsilon, env.action_space.n
    )
    policy_evaluation = C51PolicyEvaluation(estimator, optimizer, opt.gamma)
    rlog.info(replay)
    rlog.info(estimator)
    return env, (replay, policy_improvement, policy_evaluation)
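
# For concreteness, the first entry of `layers` above is just the flattened
# observation size. A minimal sketch, using catch's 10x5 board as an
# assumed example shape:
from functools import reduce
print(reduce(lambda x, y: x * y, (10, 5)))  # 50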
def main(_):
    # Create an environment and grab the spec.
    raw_environment = bsuite.load_from_id(FLAGS.bsuite_id)
    environment = single_precision.SinglePrecisionWrapper(raw_environment)
    environment_spec = specs.make_environment_spec(environment)

    # Build demonstration dataset.
    if hasattr(raw_environment, 'raw_env'):
        raw_environment = raw_environment.raw_env
    batch_dataset = bsuite_demonstrations.make_dataset(raw_environment)

    # Combine with demonstration dataset.
    transition = functools.partial(_n_step_transition_from_episode,
                                   n_step=1, additional_discount=1.)
    dataset = batch_dataset.map(transition)

    # Batch and prefetch.
    dataset = dataset.batch(FLAGS.batch_size, drop_remainder=True)
    dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

    # Create the networks to optimize.
    policy_network = make_policy_network(environment_spec.actions)

    # If the agent is non-autoregressive use epsilon=0 which will be a greedy
    # policy.
    evaluator_network = snt.Sequential([
        policy_network,
        lambda q: trfl.epsilon_greedy(q, epsilon=FLAGS.epsilon).sample(),
    ])

    # Ensure that we create the variables before proceeding (maybe not needed).
    tf2_utils.create_variables(policy_network, [environment_spec.observations])

    counter = counting.Counter()
    learner_counter = counting.Counter(counter, prefix='learner')

    # Create the actor which defines how we take actions.
    evaluation_network = actors_tf2.FeedForwardActor(evaluator_network)

    eval_loop = acme.EnvironmentLoop(
        environment=environment,
        actor=evaluation_network,
        counter=counter,
        logger=loggers.TerminalLogger('evaluation', time_delta=1.))

    # The learner updates the parameters (and initializes them).
    learner = learning.BCLearner(
        network=policy_network,
        learning_rate=FLAGS.learning_rate,
        dataset=dataset,
        counter=learner_counter)

    # Run the environment loop.
    while True:
        for _ in range(FLAGS.evaluate_every):
            learner.step()
        learner_counter.increment(learner_steps=FLAGS.evaluate_every)
        eval_loop.run(FLAGS.evaluation_episodes)
def __init__(self, name):
    super().__init__()
    self.env = bsuite.load_from_id(name)
    self.cache = None
    self.reset()
    print('Env', name, 'observation spec:', self.env.observation_spec())
    print('Env', name, 'action spec:', self.env.action_spec())
def __init__(self, env_id):
    self.id = env_id
    self.env = bsuite.load_from_id(env_id)
    self.action_space = ActionSpace(np.random.RandomState(0),
                                    self.env.action_spec())
    shape = self.env.observation_spec().shape
    # Collapse a leading singleton dimension, e.g. (1, N) -> (N,).
    if shape[0] < 2:
        self.observation_space = np.zeros(shape=shape[1:])
    else:
        self.observation_space = np.zeros(shape=shape)
def main(_):
    # Create an environment and grab the spec.
    environment = bsuite.load_from_id('catch/0')
    environment = wrappers.SinglePrecisionWrapper(environment)
    environment_spec = specs.make_environment_spec(environment)

    network = snt.Sequential([
        snt.Flatten(),
        snt.nets.MLP([50, 50, environment_spec.actions.num_values]),
    ])

    # Construct the agent.
    agent = dqn.DQN(environment_spec=environment_spec, network=network)

    # Run the environment loop.
    loop = acme.EnvironmentLoop(environment, agent)
    loop.run(num_episodes=environment.bsuite_num_episodes)  # pytype: disable=attribute-error
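
# For reference, a minimal sketch of the interaction that
# acme.EnvironmentLoop.run automates, written directly against the dm_env
# API and acme's core Actor interface. `environment` and `agent` are
# assumed to be constructed as in the example above.
def run_episode(environment, agent):
    timestep = environment.reset()
    agent.observe_first(timestep)
    episode_return = 0.
    while not timestep.last():
        action = agent.select_action(timestep.observation)
        timestep = environment.step(action)
        agent.observe(action, next_timestep=timestep)
        agent.update()
        episode_return += timestep.reward
    return episode_return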
def main(_):
    # Create an environment and grab the spec.
    raw_environment = bsuite.load_from_id(FLAGS.bsuite_id)
    environment = wrappers.SinglePrecisionWrapper(raw_environment)
    environment_spec = specs.make_environment_spec(environment)

    # Construct the agent.
    agent = dqfd.DQfD(
        environment_spec=environment_spec,
        network=make_network(environment_spec.actions),
        demonstration_dataset=bsuite_demonstrations.make_dataset(raw_environment),
        demonstration_ratio=FLAGS.demonstration_ratio,
        samples_per_insert=FLAGS.samples_per_insert,
        learning_rate=FLAGS.learning_rate)

    # Run the environment loop.
    loop = acme.EnvironmentLoop(environment, agent)
    loop.run(num_episodes=environment.bsuite_num_episodes)  # pytype: disable=attribute-error
def __init__(self, id: str, exp_kwargs: dict = None,
             external_logging: str = 'none', save_path: str = '',
             overwrite: bool = True):
    # Either use one of the preset sweep ids, or a base experiment id
    # together with explicit settings in exp_kwargs.
    assert (id in VALID_ENV_SWEEP_IDS) or (
        id in VALID_ENV_IDS and exp_kwargs is not None
    )
    aug_path = osp.join(LOG_DIR, save_path)  # LOG_DIR + save_path
    if id in VALID_ENV_SWEEP_IDS:  # Pre-parameterized experiments
        if external_logging == 'none':
            env = bsuite.load_from_id(id)  # No recording
        else:
            # Record to sqlite or csv; the same database is used for each id.
            env = bsuite.load_and_record(
                id, aug_path, external_logging, overwrite=overwrite
            )
        self.num_episodes = env.bsuite_num_episodes
    else:
        noise_scale = exp_kwargs.pop('noise_scale', 0.)
        noise_scale_seed = exp_kwargs.pop('noise_scale_seed', 0.)
        reward_scale = exp_kwargs.pop('reward_scale', 0.)
        env = bsuite.load(id, **exp_kwargs)
        if noise_scale:
            env = RewardNoise(env, noise_scale, noise_scale_seed)
        if reward_scale:
            env = RewardScale(env, reward_scale)
        self.num_episodes = 1e4  # Default
    self.env = env
    self._action_space = IntBox(low=0, high=self.env.action_spec().num_values)
    o_spec = self.env.observation_spec()
    if isinstance(o_spec, specs.BoundedArray):
        self._observation_space = FloatBox(
            low=o_spec.minimum.item(), high=o_spec.maximum.item(),
            shape=o_spec.shape, dtype=o_spec.dtype)
    else:
        self._observation_space = FloatBox(
            low=-float('inf'), high=float('inf'),
            shape=o_spec.shape, dtype=o_spec.dtype)
    self._last_observation = None
    self.game_over = False
    self.viewer = None
def load_offline_bsuite_dataset(
        bsuite_id: str,
        random_prob: float,
        path: str,
        batch_size: int,
        valid_batch_size: int,
        num_shards: int = 1,
        num_valid_shards: int = 1,
        num_threads: int = 1,
        single_precision_wrapper: bool = True,
        shuffle_buffer_size: int = 100000,
        shuffle: bool = True,
        repeat: bool = True
) -> Tuple[tf.data.Dataset, tf.data.Dataset, dm_env.Environment]:
    """Load bsuite offline dataset."""
    # Data file path format: {path}-?????-of-{num_shards:05d}
    # The dataset is not deterministic and not repeated if shuffle = False.
    environment = bsuite.load_from_id(bsuite_id)
    if single_precision_wrapper:
        environment = single_precision.SinglePrecisionWrapper(environment)
    if random_prob > 0.:
        environment = RandomActionWrapper(environment, random_prob)
    params = bsuite_offline_dataset.dataset_params(environment)
    if os.path.basename(path):
        path += '_'
    train_path = path + 'train'
    train_dataset = bsuite_offline_dataset.dataset(
        path=train_path,
        num_threads=num_threads,
        batch_size=batch_size,
        num_shards=num_shards,
        shuffle_buffer_size=shuffle_buffer_size,
        shuffle=shuffle,
        repeat=repeat,
        **params)
    valid_path = path + 'valid'
    valid_dataset = bsuite_offline_dataset.dataset(
        path=valid_path,
        num_threads=num_threads,
        batch_size=valid_batch_size,
        num_shards=num_valid_shards,
        shuffle=False,
        repeat=False,
        **params)
    return train_dataset, valid_dataset, environment
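
# Hypothetical usage of the loader above; the bsuite_id, shard path and
# batch sizes are placeholders, and the element structure is whatever
# bsuite_offline_dataset.dataset yields for this environment.
train_ds, valid_ds, env = load_offline_bsuite_dataset(
    bsuite_id='catch/0',
    random_prob=0.1,
    path='/tmp/bsuite_offline/catch',
    batch_size=32,
    valid_batch_size=32)
for batch in train_ds.take(1):
    print(tf.nest.map_structure(lambda t: t.shape, batch))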
def make_env_and_model(
        bsuite_id: str,
        results_dir: str,
        overwrite: bool) -> Tuple[dm_env.Environment, models.Model]:
    """Create environment and corresponding model (learned or simulator)."""
    raw_env = bsuite.load_from_id(bsuite_id)
    if FLAGS.simulator:
        model = simulator.Simulator(raw_env)  # pytype: disable=attribute-error
    else:
        model = mlp.MLPModel(
            specs.make_environment_spec(raw_env),
            replay_capacity=1000,
            batch_size=16,
            hidden_sizes=(50,),
        )
    environment = csv_logging.wrap_environment(
        raw_env, bsuite_id, results_dir, overwrite)
    environment = wrappers.SinglePrecisionWrapper(environment)
    return environment, model
def _thunk():
    random_seed(seed)
    if env_id.startswith('bsuite'):
        id = env_id.split('bsuite-')[1]
        self.video_enabled = False
        bsuite_env = bsuite.load_from_id(id)
        env = gym_wrapper.GymFromDMEnv(bsuite_env)
    elif env_id.startswith("dm"):
        import dm_control2gym
        _, domain, task = env_id.split('-')
        env = dm_control2gym.make(domain_name=domain, task_name=task)
    else:
        if special_args is not None and 'NChain' in special_args[0]:
            print('starting chain N = ', special_args[1])
            env = gym.make(env_id, n=special_args[1])
        else:
            # Fall back to the standard constructor when no special args apply.
            env = gym.make(env_id)
        if self.video_enabled:
            env = Monitor(env, self.log_dir, video_callable=self.video_callable)
    is_atari = hasattr(gym.envs, 'atari') and isinstance(
        env.unwrapped, gym.envs.atari.atari_env.AtariEnv)
    if is_atari:
        env = make_atari(env_id)
    env.seed(seed + rank)
    env = OriginalReturnWrapper(env)
    if is_atari:
        env = wrap_deepmind(env,
                            episode_life=episode_life,
                            clip_rewards=False,
                            frame_stack=False,
                            scale=False)
        obs_shape = env.observation_space.shape
        if len(obs_shape) == 3:
            env = TransposeImage(env)
        env = FrameStack(env, 4)
    return env
import bsuite
from bsuite import sweep

# Valid ids across all experiments:
print('All possible values for bsuite_id:')
print(sweep.SWEEP)

# Ids for an example experiment:
print('List bsuite_id for "bandit_noise" experiment:')
print(sweep.BANDIT_NOISE)

# List the configurations for the given experiment.
for bsuite_id in sweep.BANDIT_NOISE:
    env = bsuite.load_from_id(bsuite_id)
    print('bsuite_id={}, settings={}, num_episodes={}'.format(
        bsuite_id, sweep.SETTINGS[bsuite_id], env.bsuite_num_episodes))
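
# Going one step further (a sketch): bsuite can also record results to CSV
# while a sweep runs, via bsuite.load_and_record_to_csv. The results
# directory and the uniform-random policy here are illustrative only.
import numpy as np

for bsuite_id in sweep.BANDIT_NOISE:
    env = bsuite.load_and_record_to_csv(bsuite_id,
                                        results_dir='/tmp/bsuite',
                                        overwrite=True)
    for _ in range(env.bsuite_num_episodes):
        timestep = env.reset()
        while not timestep.last():
            action = np.random.randint(env.action_spec().num_values)
            timestep = env.step(action)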
def load_env(env_id):
    env = bsuite.load_from_id(env_id)
    env = gym_wrapper.GymFromDMEnv(env)
    return env
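
# Usage sketch for the helper above; 'catch/0' is just an example id. The
# wrapper exposes the classic Gym API (reset/step/action_space).
env = load_env('catch/0')
obs = env.reset()
obs, reward, done, info = env.step(env.action_space.sample())
print(obs.shape, reward, done)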
#
# Gym Wrapper (Incomplete)
#
import bsuite
from bsuite import sweep
import gym
from bsuite.utils import gym_wrapper

raw_env = bsuite.load_from_id(bsuite_id='memory_len/0')
env = gym_wrapper.GymWrapper(raw_env)
isinstance(env, gym.Env)
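
# Completing the incomplete sketch above with a short random rollout,
# assuming the wrapped environment follows the classic Gym step API.
obs = env.reset()
done = False
while not done:
    obs, reward, done, info = env.step(env.action_space.sample())
print(env.observation_space, env.action_space)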
import torch
import os
from base_dqns import Agent
import numpy as np
import bsuite
from utils import save_results

if __name__ == '__main__':
    path = 'saves/'
    env = bsuite.load_from_id('deep_sea/0')
    num_actions = env.action_spec().num_values
    agent = Agent(gamma=0.99, eps=1.0, lr=(0.0002 * 0.0001), input_dims=100,
                  output_dims=2, batch_size=128, n_actions=2,
                  max_mem_size=100000, eps_end=0.01, eps_dec=1e-4,
                  langevin=True)
    scores = []
    avg_scores = []
    eps_history = []
    episodes = 10000
    try:
        for i in range(episodes):
            score = 0
            eps_history.append(agent.eps)
            timestep = env.reset()
            while not timestep.last():
                observation = timestep.observation
                observation = np.reshape(observation, (-1))
                action = agent.choose_action(observation)
                timestep_ = env.step(action)
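                # Hypothetical continuation of the truncated episode loop
                # above: the Agent methods store_transition/learn and the
                # save_results arguments are assumptions, not from the source.
                reward = timestep_.reward if timestep_.reward is not None else 0.0
                score += reward
                observation_ = np.reshape(timestep_.observation, (-1))
                agent.store_transition(observation, action, reward,
                                       observation_, timestep_.last())
                agent.learn()
                timestep = timestep_
            scores.append(score)
            avg_scores.append(np.mean(scores[-100:]))
    finally:
        save_results(path, scores, avg_scores, eps_history)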