def build_policy(args, env, latent_sampler=None):
    if args.use_infogail:
        if latent_sampler is None:
            latent_sampler = UniformlyRandomLatentSampler(
                scheduler=ConstantIntervalScheduler(k=args.scheduler_k),
                name='latent_sampler',
                dim=args.latent_dim)
        if args.policy_recurrent:
            policy = GaussianLatentVarGRUPolicy(
                name="policy",
                latent_sampler=latent_sampler,
                env_spec=env.spec,
                hidden_dim=args.recurrent_hidden_dim,
            )
        else:
            policy = GaussianLatentVarMLPPolicy(
                name="policy",
                latent_sampler=latent_sampler,
                env_spec=env.spec,
                hidden_sizes=args.policy_mean_hidden_layer_dims,
                std_hidden_sizes=args.policy_std_hidden_layer_dims)
    else:
        if args.policy_recurrent:
            policy = GaussianGRUPolicy(
                name="policy",
                env_spec=env.spec,
                hidden_dim=args.recurrent_hidden_dim,
                output_nonlinearity=None,
                learn_std=True)
        else:
            policy = GaussianMLPPolicy(
                name="policy",
                env_spec=env.spec,
                hidden_sizes=args.policy_mean_hidden_layer_dims,
                std_hidden_sizes=args.policy_std_hidden_layer_dims,
                adaptive_std=True,
                output_nonlinearity=None,
                learn_std=True)
    return policy
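# Usage sketch (hypothetical field values; `args` is normally the parsed CLI
# namespace -- only the fields read by build_policy are shown):
#
#   from argparse import Namespace
#   args = Namespace(use_infogail=True, scheduler_k=20, latent_dim=2,
#                    policy_recurrent=True, recurrent_hidden_dim=64,
#                    policy_mean_hidden_layer_dims=(128, 128),
#                    policy_std_hidden_layer_dims=(128, 64))
#   policy = build_policy(args, env)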
def rllab_envpolicy_parser(env, args):
    if isinstance(args, dict):
        args = tonamedtuple(args)

    env = RLLabEnv(env, mode=args.control)
    if args.algo[:2] == 'tf':
        env = TfEnv(env)

        # Policy
        if args.recurrent:
            if args.feature_net:
                feature_network = MLP(
                    name='feature_net',
                    input_shape=(env.spec.observation_space.flat_dim +
                                 env.spec.action_space.flat_dim, ),
                    output_dim=args.feature_output,
                    hidden_sizes=tuple(args.feature_hidden),
                    hidden_nonlinearity=tf.nn.tanh,
                    output_nonlinearity=None)
            elif args.conv:
                strides = tuple(args.conv_strides)
                chans = tuple(args.conv_channels)
                filts = tuple(args.conv_filters)
                assert len(strides) == len(chans) == len(filts), \
                    "strides, chans and filts not equal"
                # Only discrete actions are supported; extending to
                # continuous actions should be straightforward.
                assert isinstance(env.spec.action_space, Discrete), \
                    "Only discrete action spaces support conv"
                feature_network = ConvNetwork(
                    name='feature_net',
                    input_shape=env.spec.observation_space.shape,
                    output_dim=args.feature_output,
                    conv_filters=chans,
                    conv_filter_sizes=filts,
                    conv_strides=strides,
                    conv_pads=('VALID', ) * len(chans),
                    hidden_sizes=tuple(args.feature_hidden),
                    hidden_nonlinearity=tf.nn.relu,
                    output_nonlinearity=None)
            else:
                feature_network = None

            if args.recurrent == 'gru':
                if isinstance(env.spec.action_space, Box):
                    policy = GaussianGRUPolicy(
                        env_spec=env.spec,
                        feature_network=feature_network,
                        hidden_dim=int(args.policy_hidden[0]),
                        name='policy')
                elif isinstance(env.spec.action_space, Discrete):
                    policy = CategoricalGRUPolicy(
                        env_spec=env.spec,
                        feature_network=feature_network,
                        hidden_dim=int(args.policy_hidden[0]),
                        name='policy',
                        state_include_action=False if args.conv else True)
                else:
                    # This dispatch is on the action space, so report it
                    # (not the observation space) in the error.
                    raise NotImplementedError(env.spec.action_space)
            elif args.recurrent == 'lstm':
                if isinstance(env.spec.action_space, Box):
                    policy = GaussianLSTMPolicy(
                        env_spec=env.spec,
                        feature_network=feature_network,
                        hidden_dim=int(args.policy_hidden),
                        name='policy')
                elif isinstance(env.spec.action_space, Discrete):
                    policy = CategoricalLSTMPolicy(
                        env_spec=env.spec,
                        feature_network=feature_network,
                        hidden_dim=int(args.policy_hidden),
                        name='policy')
                else:
                    raise NotImplementedError(env.spec.action_space)
            else:
                raise NotImplementedError(args.recurrent)
        elif args.conv:
            strides = tuple(args.conv_strides)
            chans = tuple(args.conv_channels)
            filts = tuple(args.conv_filters)
            assert len(strides) == len(chans) == len(filts), \
                "strides, chans and filts not equal"
            assert isinstance(env.spec.action_space, Discrete), \
                "Only discrete action spaces support conv"
            feature_network = ConvNetwork(
                name='feature_net',
                input_shape=env.spec.observation_space.shape,
                output_dim=env.spec.action_space.n,
                conv_filters=chans,
                conv_filter_sizes=filts,
                conv_strides=strides,
                conv_pads=('VALID', ) * len(chans),
                hidden_sizes=tuple(args.policy_hidden),
                hidden_nonlinearity=tf.nn.relu,
                output_nonlinearity=tf.nn.softmax)
            policy = CategoricalMLPPolicy(name='policy',
                                          env_spec=env.spec,
                                          prob_network=feature_network)
        else:
            if isinstance(env.spec.action_space, Box):
                policy = GaussianMLPPolicy(
                    env_spec=env.spec,
                    hidden_sizes=tuple(args.policy_hidden),
                    min_std=args.min_std,
                    name='policy')
            elif isinstance(env.spec.action_space, Discrete):
                policy = CategoricalMLPPolicy(
                    env_spec=env.spec,
                    hidden_sizes=tuple(args.policy_hidden),
                    name='policy')
            else:
                raise NotImplementedError(env.spec.action_space)
    elif args.algo[:2] == 'th':
        # Policy (Theano backend)
        if args.recurrent:
            if args.feature_net:
                feature_network = thMLP(
                    input_shape=(env.spec.observation_space.flat_dim +
                                 env.spec.action_space.flat_dim, ),
                    output_dim=args.feature_output,
                    hidden_sizes=tuple(args.feature_hidden),
                    # Assumption: a Theano MLP takes a Lasagne nonlinearity
                    # (NL.tanh), not tf.nn.tanh as originally written.
                    hidden_nonlinearity=NL.tanh,
                    output_nonlinearity=None)
            else:
                feature_network = None
            if args.recurrent == 'gru':
                # Dispatch on the action space, consistent with the TF branch.
                if isinstance(env.spec.action_space, thBox):
                    policy = thGaussianGRUPolicy(
                        env_spec=env.spec,
                        feature_network=feature_network,
                        hidden_dim=int(args.policy_hidden[0]),
                    )
                elif isinstance(env.spec.action_space, thDiscrete):
                    policy = thCategoricalGRUPolicy(
                        env_spec=env.spec,
                        feature_network=feature_network,
                        hidden_dim=int(args.policy_hidden[0]),
                    )
                else:
                    raise NotImplementedError(env.spec.action_space)
            # elif args.recurrent == 'lstm':
            #     if isinstance(env.spec.action_space, thBox):
            #         policy = thGaussianLSTMPolicy(env_spec=env.spec,
            #                                       feature_network=feature_network,
            #                                       hidden_dim=int(args.policy_hidden),
            #                                       name='policy')
            #     elif isinstance(env.spec.action_space, thDiscrete):
            #         policy = thCategoricalLSTMPolicy(env_spec=env.spec,
            #                                          feature_network=feature_network,
            #                                          hidden_dim=int(args.policy_hidden),
            #                                          name='policy')
            #     else:
            #         raise NotImplementedError(env.spec.action_space)
            else:
                raise NotImplementedError(args.recurrent)
        else:
            if args.algo == 'thddpg':
                assert isinstance(env.spec.action_space, thBox)
                policy = thDeterministicMLPPolicy(
                    env_spec=env.spec,
                    hidden_sizes=tuple(args.policy_hidden),
                )
            else:
                if isinstance(env.spec.action_space, thBox):
                    policy = thGaussianMLPPolicy(
                        env_spec=env.spec,
                        hidden_sizes=tuple(args.policy_hidden),
                        min_std=args.min_std)
                elif isinstance(env.spec.action_space, thDiscrete):
                    # min_std removed: categorical policies have no std
                    # parameter.
                    policy = thCategoricalMLPPolicy(
                        env_spec=env.spec,
                        hidden_sizes=tuple(args.policy_hidden))
                else:
                    raise NotImplementedError(env.spec.action_space)

    if args.control == 'concurrent':
        # Per-agent policies are never constructed in this parser, so the
        # concurrent case cannot return a `policies` list here (cf.
        # parse_env_args, which does build one per agent).
        raise NotImplementedError(args.control)
    return env, policy
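# Usage sketch (hypothetical argument values; `args` may also be a dict,
# which is converted to a namedtuple internally):
#
#   env, policy = rllab_envpolicy_parser(raw_env, {
#       'control': 'decentralized', 'algo': 'tftrpo', 'recurrent': 'gru',
#       'feature_net': None, 'conv': False, 'policy_hidden': [64],
#       'min_std': 1e-4})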
from sandbox.rocky.tf.algos.trpo import TRPO
from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
from rllab.envs.box2d.cartpole_env import CartpoleEnv
from rllab.envs.normalized_env import normalize
from sandbox.rocky.tf.policies.gaussian_gru_policy import GaussianGRUPolicy
from sandbox.rocky.tf.envs.base import TfEnv
from sandbox.rocky.tf.optimizers.conjugate_gradient_optimizer import (
    ConjugateGradientOptimizer, FiniteDifferenceHvp)
from rllab.misc.instrument import stub, run_experiment_lite

stub(globals())

env = TfEnv(normalize(CartpoleEnv()))

policy = GaussianGRUPolicy(
    name="policy",
    env_spec=env.spec,
)

baseline = LinearFeatureBaseline(env_spec=env.spec)

algo = TRPO(env=env,
            policy=policy,
            baseline=baseline,
            batch_size=4000,
            max_path_length=100,
            n_itr=10,
            discount=0.99,
            step_size=0.01,
            # Recurrent policies need the finite-difference HVP approximation
            # for the conjugate-gradient optimizer.
            optimizer=ConjugateGradientOptimizer(
                hvp_approach=FiniteDifferenceHvp(base_eps=1e-5)))

# The original snippet was truncated after `run_experiment_lite(`; the
# arguments below follow the standard rllab launcher pattern and are an
# assumption.
run_experiment_lite(
    algo.train(),
    n_parallel=1,
    snapshot_mode="last",
    seed=1,
)
def main():
    now = datetime.datetime.now(dateutil.tz.tzlocal())
    rand_id = str(uuid.uuid4())[:5]
    timestamp = now.strftime('%Y_%m_%d_%H_%M_%S_%f_%Z')
    default_exp_name = 'experiment_%s_%s' % (timestamp, rand_id)

    parser = argparse.ArgumentParser()
    parser.add_argument('--exp_name', type=str, default=default_exp_name,
                        help='Name of the experiment.')
    parser.add_argument('--discount', type=float, default=0.95)
    parser.add_argument('--gae_lambda', type=float, default=0.99)
    parser.add_argument('--reward_scale', type=float, default=1.0)
    parser.add_argument('--enable_obsnorm', action='store_true', default=False)
    parser.add_argument('--chunked', action='store_true', default=False)
    parser.add_argument('--n_iter', type=int, default=250)
    parser.add_argument('--sampler_workers', type=int, default=1)
    parser.add_argument('--max_traj_len', type=int, default=250)
    parser.add_argument('--update_curriculum', action='store_true',
                        default=False)
    parser.add_argument('--anneal_step_size', type=int, default=0)
    parser.add_argument('--n_timesteps', type=int, default=8000)
    parser.add_argument('--control', type=str, default='centralized')
    parser.add_argument('--buffer_size', type=int, default=1)
    parser.add_argument('--radius', type=float, default=0.015)
    parser.add_argument('--n_evaders', type=int, default=10)
    parser.add_argument('--n_pursuers', type=int, default=8)
    parser.add_argument('--n_poison', type=int, default=10)
    parser.add_argument('--n_coop', type=int, default=4)
    parser.add_argument('--n_sensors', type=int, default=30)
    parser.add_argument('--sensor_range', type=str, default='0.2')
    parser.add_argument('--food_reward', type=float, default=5)
    parser.add_argument('--poison_reward', type=float, default=-1)
    parser.add_argument('--encounter_reward', type=float, default=0.05)
    parser.add_argument('--reward_mech', type=str, default='local')
    parser.add_argument('--recurrent', type=str, default=None)
    parser.add_argument('--baseline_type', type=str, default='linear')
    parser.add_argument('--policy_hidden_sizes', type=str, default='128,128')
    parser.add_argument('--baseline_hidden_sizes', type=str, default='128,128')
    parser.add_argument('--max_kl', type=float, default=0.01)
    parser.add_argument('--log_dir', type=str, required=False)
    parser.add_argument('--tabular_log_file', type=str, default='progress.csv',
                        help='Name of the tabular log file (in csv).')
    parser.add_argument('--text_log_file', type=str, default='debug.log',
                        help='Name of the text log file (in pure text).')
    parser.add_argument('--params_log_file', type=str, default='params.json',
                        help='Name of the parameter log file (in json).')
    parser.add_argument('--seed', type=int, help='Random seed for numpy')
    parser.add_argument('--args_data', type=str,
                        help='Pickled data for stub objects')
    parser.add_argument('--snapshot_mode', type=str, default='all',
                        help='Mode to save the snapshot. Can be either "all" '
                        '(all iterations will be saved), "last" (only '
                        'the last iteration will be saved), or "none" '
                        '(do not save snapshots)')
    parser.add_argument('--log_tabular_only', type=ast.literal_eval,
                        default=False,
                        help='Whether to only print the tabular log '
                        'information (in a horizontal format)')

    args = parser.parse_args()

    parallel_sampler.initialize(n_parallel=args.sampler_workers)

    if args.seed is not None:
        set_seed(args.seed)
        parallel_sampler.set_seed(args.seed)

    args.hidden_sizes = tuple(map(int, args.policy_hidden_sizes.split(',')))

    centralized = args.control == 'centralized'

    # list() keeps this Python-3 safe: np.array over a bare map object would
    # produce a useless 0-d object array.
    sensor_range = np.array(list(map(float, args.sensor_range.split(','))))
    if len(sensor_range) == 1:
        sensor_range = sensor_range[0]
    else:
        assert sensor_range.shape == (args.n_pursuers, )

    env = MAWaterWorld(args.n_pursuers, args.n_evaders, args.n_coop,
                       args.n_poison,
                       radius=args.radius,
                       n_sensors=args.n_sensors,
                       food_reward=args.food_reward,
                       poison_reward=args.poison_reward,
                       encounter_reward=args.encounter_reward,
                       reward_mech=args.reward_mech,
                       sensor_range=sensor_range,
                       obstacle_loc=None)

    env = TfEnv(
        RLLabEnv(StandardizedEnv(env,
                                 scale_reward=args.reward_scale,
                                 enable_obsnorm=args.enable_obsnorm),
                 mode=args.control))

    if args.buffer_size > 1:
        env = ObservationBuffer(env, args.buffer_size)

    if args.recurrent:
        feature_network = MLP(
            name='feature_net',
            input_shape=(env.spec.observation_space.flat_dim +
                         env.spec.action_space.flat_dim, ),
            output_dim=16,
            hidden_sizes=(128, 64, 32),
            hidden_nonlinearity=tf.nn.tanh,
            output_nonlinearity=None)
        # int(args.policy_hidden_sizes) would fail on the default '128,128';
        # use the first parsed hidden size instead.
        if args.recurrent == 'gru':
            policy = GaussianGRUPolicy(env_spec=env.spec,
                                       feature_network=feature_network,
                                       hidden_dim=args.hidden_sizes[0],
                                       name='policy')
        elif args.recurrent == 'lstm':
            policy = GaussianLSTMPolicy(env_spec=env.spec,
                                        feature_network=feature_network,
                                        hidden_dim=args.hidden_sizes[0],
                                        name='policy')
    else:
        policy = GaussianMLPPolicy(
            name='policy',
            env_spec=env.spec,
            hidden_sizes=tuple(map(int, args.policy_hidden_sizes.split(','))),
            min_std=10e-5)

    if args.baseline_type == 'linear':
        baseline = LinearFeatureBaseline(env_spec=env.spec)
    elif args.baseline_type == 'mlp':
        raise NotImplementedError()
        # baseline = GaussianMLPBaseline(
        #     env_spec=env.spec,
        #     hidden_sizes=tuple(map(int, args.baseline_hidden_sizes.split(','))))
    else:
        baseline = ZeroBaseline(env_spec=env.spec)

    # Logger
    default_log_dir = config.LOG_DIR
    if args.log_dir is None:
        log_dir = osp.join(default_log_dir, args.exp_name)
    else:
        log_dir = args.log_dir

    tabular_log_file = osp.join(log_dir, args.tabular_log_file)
    text_log_file = osp.join(log_dir, args.text_log_file)
    params_log_file = osp.join(log_dir, args.params_log_file)

    logger.log_parameters_lite(params_log_file, args)
    logger.add_text_output(text_log_file)
    logger.add_tabular_output(tabular_log_file)
    prev_snapshot_dir = logger.get_snapshot_dir()
    prev_mode = logger.get_snapshot_mode()
    logger.set_snapshot_dir(log_dir)
    logger.set_snapshot_mode(args.snapshot_mode)
    logger.set_log_tabular_only(args.log_tabular_only)
    logger.push_prefix("[%s] " % args.exp_name)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=args.n_timesteps,
        max_path_length=args.max_traj_len,
        # max_path_length_limit=args.max_path_length_limit,
        update_max_path_length=args.update_curriculum,
        anneal_step_size=args.anneal_step_size,
        n_itr=args.n_iter,
        discount=args.discount,
        gae_lambda=args.gae_lambda,
        step_size=args.max_kl,
        optimizer=ConjugateGradientOptimizer(
            hvp_approach=FiniteDifferenceHvp(base_eps=1e-5))
        if args.recurrent else None,
        mode=args.control
        if not args.chunked else 'chunk_{}'.format(args.control),
    )

    algo.train()
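# Standard script entry point (an assumption; the original excerpt ends at
# algo.train() inside main()):
if __name__ == '__main__':
    main()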
def parse_env_args(self, env, args):
    if isinstance(args, dict):
        args = to_named_tuple(args)

    # Multi-agent wrapper
    env = RLLabEnv(env, ma_mode=args.control)
    env = MATfEnv(env)

    # Policy
    if args.recurrent:
        if args.feature_net:
            feature_network = MLP(
                name='feature_net',
                input_shape=(env.spec.observation_space.flat_dim +
                             env.spec.action_space.flat_dim, ),
                output_dim=args.feature_output,
                hidden_sizes=tuple(args.feature_hidden),
                hidden_nonlinearity=tf.nn.tanh,
                output_nonlinearity=None)
        elif args.conv:
            strides = tuple(args.conv_strides)
            chans = tuple(args.conv_channels)
            filts = tuple(args.conv_filters)
            assert len(strides) == len(chans) == len(filts), \
                "strides, chans and filts not equal"
            # Only discrete actions are supported; extending to continuous
            # actions should be straightforward.
            assert isinstance(env.spec.action_space, Discrete), \
                "Only discrete action spaces support conv"
            feature_network = ConvNetwork(
                name='feature_net',
                input_shape=env.spec.observation_space.shape,
                output_dim=args.feature_output,
                conv_filters=chans,
                conv_filter_sizes=filts,
                conv_strides=strides,
                conv_pads=('VALID', ) * len(chans),
                hidden_sizes=tuple(args.feature_hidden),
                hidden_nonlinearity=tf.nn.relu,
                output_nonlinearity=None)
        else:
            feature_network = None

        if args.recurrent == 'gru':
            if isinstance(env.spec.action_space, Box):
                if args.control == 'concurrent':
                    policies = [
                        GaussianGRUPolicy(env_spec=env.spec,
                                          feature_network=feature_network,
                                          hidden_dim=int(args.policy_hidden[0]),
                                          name='policy_{}'.format(agid))
                        for agid in range(len(env.agents))
                    ]
                policy = GaussianGRUPolicy(env_spec=env.spec,
                                           feature_network=feature_network,
                                           hidden_dim=int(args.policy_hidden[0]),
                                           name='policy')
            elif isinstance(env.spec.action_space, Discrete):
                if args.control == 'concurrent':
                    policies = [
                        CategoricalGRUPolicy(
                            env_spec=env.spec,
                            feature_network=feature_network,
                            hidden_dim=int(args.policy_hidden[0]),
                            name='policy_{}'.format(agid),
                            state_include_action=False if args.conv else True)
                        for agid in range(len(env.agents))
                    ]
                q_network = CategoricalGRUPolicy(
                    env_spec=env.spec,
                    feature_network=feature_network,
                    hidden_dim=int(args.policy_hidden[0]),
                    name='q_network',
                    state_include_action=False if args.conv else True)
                target_q_network = CategoricalGRUPolicy(
                    env_spec=env.spec,
                    feature_network=feature_network,
                    hidden_dim=int(args.policy_hidden[0]),
                    name='target_q_network',
                    state_include_action=False if args.conv else True)
                policy = {
                    'q_network': q_network,
                    'target_q_network': target_q_network
                }
            else:
                # Dispatch is on the action space, so report it (not the
                # observation space) in the error.
                raise NotImplementedError(env.spec.action_space)
        elif args.recurrent == 'lstm':
            if isinstance(env.spec.action_space, Box):
                if args.control == 'concurrent':
                    policies = [
                        GaussianLSTMPolicy(env_spec=env.spec,
                                           feature_network=feature_network,
                                           hidden_dim=int(args.policy_hidden),
                                           name='policy_{}'.format(agid))
                        for agid in range(len(env.agents))
                    ]
                policy = GaussianLSTMPolicy(env_spec=env.spec,
                                            feature_network=feature_network,
                                            hidden_dim=int(args.policy_hidden),
                                            name='policy')
            elif isinstance(env.spec.action_space, Discrete):
                if args.control == 'concurrent':
                    policies = [
                        CategoricalLSTMPolicy(
                            env_spec=env.spec,
                            feature_network=feature_network,
                            hidden_dim=int(args.policy_hidden),
                            name='policy_{}'.format(agid))
                        for agid in range(len(env.agents))
                    ]
                q_network = CategoricalLSTMPolicy(
                    env_spec=env.spec,
                    feature_network=feature_network,
                    hidden_dim=int(args.policy_hidden),
                    name='q_network')
                target_q_network = CategoricalLSTMPolicy(
                    env_spec=env.spec,
                    feature_network=feature_network,
                    hidden_dim=int(args.policy_hidden),
                    name='target_q_network')
                policy = {
                    'q_network': q_network,
                    'target_q_network': target_q_network
                }
            else:
                raise NotImplementedError(env.spec.action_space)
        else:
            raise NotImplementedError(args.recurrent)
    elif args.conv:
        strides = tuple(args.conv_strides)
        chans = tuple(args.conv_channels)
        filts = tuple(args.conv_filters)
        assert len(strides) == len(chans) == len(filts), \
            "strides, chans and filts not equal"
        assert isinstance(env.spec.action_space, Discrete), \
            "Only discrete action spaces support conv"
        feature_network = ConvNetwork(
            name='feature_net',
            input_shape=env.spec.observation_space.shape,
            output_dim=env.spec.action_space.n,
            conv_filters=chans,
            conv_filter_sizes=filts,
            conv_strides=strides,
            conv_pads=(args.conv_pads, ) * len(chans),
            hidden_sizes=tuple(args.policy_hidden),
            hidden_nonlinearity=tf.nn.relu,
            output_nonlinearity=tf.nn.softmax,
            batch_normalization=args.batch_normalization)
        if args.algo == 'dqn':
            q_network = CategoricalMLPPolicy(name='q_network',
                                             env_spec=env.spec,
                                             prob_network=feature_network)
            target_q_network = CategoricalMLPPolicy(
                name='target_q_network',
                env_spec=env.spec,
                prob_network=feature_network)
            policy = {
                'q_network': q_network,
                'target_q_network': target_q_network
            }
        else:
            policy = CategoricalMLPPolicy(name='policy',
                                          env_spec=env.spec,
                                          prob_network=feature_network)
    else:
        if env.spec is None:
            networks = [
                DQNNetwork(i, env,
                           target_network_update_freq=self.args.target_network_update,
                           discount_factor=self.args.discount,
                           batch_size=self.args.batch_size,
                           learning_rate=self.args.qfunc_lr)
                for i in range(env.n)
            ]
            policy = networks
        elif isinstance(env.spec.action_space, Box):
            policy = GaussianMLPPolicy(env_spec=env.spec,
                                       hidden_sizes=tuple(args.policy_hidden),
                                       min_std=args.min_std,
                                       name='policy')
        elif isinstance(env.spec.action_space, Discrete):
            policy = CategoricalMLPPolicy(env_spec=env.spec,
                                          hidden_sizes=tuple(args.policy_hidden),
                                          name='policy')
        else:
            raise NotImplementedError(env.spec.action_space)

    # NOTE: in concurrent mode the per-agent `policies` built above are not
    # returned; only the shared `policy` (or q-network dict) is.
    return env, policy
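# Usage sketch (hypothetical values; this is a method, and when env.spec is
# None the DQN hyperparameters are read from `self.args`):
#
#   env, policy = runner.parse_env_args(raw_env, {
#       'control': 'decentralized', 'recurrent': 'lstm', 'feature_net': None,
#       'conv': False, 'policy_hidden': 64, 'algo': 'matrpo',
#       'min_std': 1e-4})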
from madrl_environments import StandardizedEnv
from madrl_environments.pursuit import MAWaterWorld
from rllabwrapper import RLLabEnv
from rllab.sampler import parallel_sampler
from sandbox.rocky.tf.algos.ma_trpo import MATRPO
from sandbox.rocky.tf.envs.base import MATfEnv
from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
from sandbox.rocky.tf.policies.gaussian_gru_policy import GaussianGRUPolicy
from sandbox.rocky.tf.optimizers.conjugate_gradient_optimizer import (
    ConjugateGradientOptimizer, FiniteDifferenceHvp)

parallel_sampler.initialize(n_parallel=2)

env = StandardizedEnv(MAWaterWorld(3, 10, 2, 5))
env = MATfEnv(RLLabEnv(env, ma_mode='decentralized'))

# Decentralized control: all agents share a single policy and baseline.
policy = GaussianGRUPolicy(env_spec=env.spec, name='policy')
baseline = LinearFeatureBaseline(env_spec=env.spec)

algo = MATRPO(env=env,
              policy_or_policies=policy,
              baseline_or_baselines=baseline,
              batch_size=8000,
              max_path_length=200,
              n_itr=500,
              discount=0.99,
              step_size=0.01,
              optimizer=ConjugateGradientOptimizer(
                  hvp_approach=FiniteDifferenceHvp(base_eps=1e-5)),
              ma_mode='decentralized')

# Concurrent alternative: one policy and one baseline per agent.
# policies = [GaussianGRUPolicy(env_spec=env.spec, name='policy_{}'.format(i))
#             for i in range(3)]
# baselines = [LinearFeatureBaseline(env_spec=env.spec) for _ in range(3)]
# algo = MATRPO(env=env, policy_or_policies=policies,
#               baseline_or_baselines=baselines,
#               batch_size=8000, max_path_length=200, n_itr=500,
#               discount=0.99, step_size=0.01,
#               optimizer=ConjugateGradientOptimizer(
#                   hvp_approach=FiniteDifferenceHvp(base_eps=1e-5)),
#               ma_mode='concurrent')

algo.train()
# Imports were not part of the original excerpt; these paths are assumptions
# consistent with the Theano rllab stack (NL.rectify / NL.softmax are Lasagne
# nonlinearities). RandomTabularMDPEnv is assumed to be provided by the
# surrounding project.
import lasagne.nonlinearities as NL
from rllab.algos.trpo import TRPO
from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
from rllab.envs.normalized_env import normalize
from rllab.policies.gaussian_gru_policy import GaussianGRUPolicy
from sandbox.rocky.tf.envs.base import TfEnv

# Tabular MDP parameters from the paper
N_STATES = 10
N_ACTIONS = 5
EPISODE_HORIZON = 10
N_EPISODES = 50  # they try several different numbers here
EPISODE_LENGTH = 32  # they don't have a number for this parameter in the paper?

# These training hyperparameters were not defined in the original excerpt;
# the values below are placeholder assumptions.
GRU_UNITS = 32
BATCH_SIZE = 4000
POLICY_ITERS = 100
DISCOUNT = 0.99

# Mean parameters sampled from N(1, 1)
tabular_env = RandomTabularMDPEnv(N_STATES, N_ACTIONS, N_EPISODES,
                                  EPISODE_LENGTH)
env = TfEnv(normalize(tabular_env))

policy = GaussianGRUPolicy(
    name="policy",
    env_spec=env.spec,
    hidden_sizes=(GRU_UNITS, ),
    hidden_nonlinearity=NL.rectify,
    output_nonlinearity=NL.softmax,
    # gru_layer_cls=L.GRULayer,
)

baseline = LinearFeatureBaseline(env_spec=env.spec)

algo = TRPO(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=BATCH_SIZE,
    max_path_length=100,
    n_itr=POLICY_ITERS,
    discount=DISCOUNT,
    step_size=0.01,
)
# The original excerpt was truncated mid-call; closing the TRPO constructor
# and invoking training is an assumption.
algo.train()