def doit(mode):
    from rllab.envs.box2d.cartpole_env import CartpoleEnv
    from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
    from rllab.baselines.zero_baseline import ZeroBaseline
    from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy
    from rllab.envs.normalized_env import normalize
    import numpy as np
    import theano
    import theano.tensor as TT
    from lasagne.updates import adam

    # normalize() makes sure that the actions for the environment lie
    # within the range [-1, 1] (only works for environments with continuous actions)
    env = normalize(CartpoleEnv())
    # Initialize a neural network policy with a single hidden layer of 8 hidden units
    policy = GaussianMLPPolicy(env.spec, hidden_sizes=(8,))
    # Initialize a linear baseline estimator using default hand-crafted features
    if "linbaseline" in mode:
        print('linear baseline')
        baseline = LinearFeatureBaseline(env.spec)
    elif "vanilla" in mode:
        print("zero baseline")
        baseline = ZeroBaseline(env.spec)
    elif mode == "batchavg":
        print('batch average baseline')
        # use a zero baseline but subtract the mean of the discounted returns (see below)
        baseline = ZeroBaseline(env.spec)

    if "_ztrans" in mode:
        print('z transform advantages')
    else:
        print('no z transform')

    # We will collect 50 trajectories per iteration
    N = 50
    # Each trajectory will have at most 50 time steps
    T = 50
    # Number of iterations
    n_itr = 50
    # Set the discount factor for the problem
    discount = 0.99
    # Learning rate for the gradient update
    learning_rate = 0.1

    # Construct the computation graph

    # Create a Theano variable for storing the observations
    # We could have simply written `observations_var = TT.matrix('observations')` instead for this example.
    # However, doing it in a slightly more abstract way allows us to delegate to the environment for handling
    # the correct data type for the variable. For instance, for an environment with discrete observations,
    # we might want to use integer types if the observations are represented as one-hot vectors.
    observations_var = env.observation_space.new_tensor_variable(
        'observations',
        # It should have 1 extra dimension since we want to represent a list of observations
        extra_dims=1
    )
    actions_var = env.action_space.new_tensor_variable(
        'actions',
        extra_dims=1
    )
    advantages_var = TT.vector('advantages')

    # policy.dist_info_sym returns a dictionary, whose values are symbolic expressions for quantities related
    # to the distribution of the actions. For a Gaussian policy, it contains the mean and (log) standard deviation.
    dist_info_vars = policy.dist_info_sym(observations_var)

    # policy.distribution returns a distribution object under rllab.distributions. It contains many utilities
    # for computing distribution-related quantities, given the computed dist_info_vars. Below we use
    # dist.log_likelihood_sym to compute the symbolic log-likelihood. For this example, the corresponding
    # distribution is an instance of the class rllab.distributions.DiagonalGaussian
    dist = policy.distribution

    # Note that we negate the objective, since most optimizers assume a
    # minimization problem
    surr = - TT.mean(dist.log_likelihood_sym(actions_var, dist_info_vars) * advantages_var)

    # Get the list of trainable parameters.
    params = policy.get_params(trainable=True)
    grads = theano.grad(surr, params)

    f_train = theano.function(
        inputs=[observations_var, actions_var, advantages_var],
        outputs=None,
        updates=adam(grads, params, learning_rate=learning_rate),
        allow_input_downcast=True
    )

    results = []
    for _ in range(n_itr):

        paths = []

        for _ in range(N):
            observations = []
            actions = []
            rewards = []

            observation = env.reset()

            for _ in range(T):
                # policy.get_action() returns a pair of values. The second one is a dictionary, whose values
                # contain sufficient statistics for the action distribution. It should at least contain entries
                # that would be returned by calling policy.dist_info(), which is the non-symbolic analog of
                # policy.dist_info_sym(). Storing these statistics is useful, e.g., when forming importance
                # sampling ratios. In our case it is not needed.
                action, _ = policy.get_action(observation)
                # Recall that the last entry of the tuple stores diagnostic information about the environment.
                # In our case it is not needed.
                next_observation, reward, terminal, _ = env.step(action)
                observations.append(observation)
                actions.append(action)
                rewards.append(reward)
                observation = next_observation
                if terminal:
                    # Finish rollout if terminal state reached
                    break

            # We need to compute the empirical return for each time step along the
            # trajectory
            path = dict(
                observations=np.array(observations),
                actions=np.array(actions),
                rewards=np.array(rewards),
            )
            path_baseline = baseline.predict(path)
            advantages = []
            returns = []
            return_so_far = 0
            for t in range(len(rewards) - 1, -1, -1):
                return_so_far = rewards[t] + discount * return_so_far
                returns.append(return_so_far)
                advantage = return_so_far - path_baseline[t]
                advantages.append(advantage)
            # The advantages are stored backwards in time, so we need to reverse them
            advantages = np.array(advantages[::-1])
            # And we need to do the same thing for the list of returns
            returns = np.array(returns[::-1])

            if "_ztrans" in mode:
                advantages = (advantages - np.mean(advantages)) / (np.std(advantages) + 1e-8)

            path["advantages"] = advantages
            path["returns"] = returns

            paths.append(path)

        baseline.fit(paths)

        observations = np.concatenate([p["observations"] for p in paths])
        actions = np.concatenate([p["actions"] for p in paths])
        advantages = np.concatenate([p["advantages"] for p in paths])

        if mode == 'batchavg':
            # in this case `advantages` up to here are just our good old returns, without baseline or
            # z-transformation. Now we subtract their mean across all episodes.
            advantages = advantages - np.mean(advantages)

        f_train(observations, actions, advantages)
        avgr = np.mean([sum(p["rewards"]) for p in paths])
        print(('Average Return:', avgr))
        results.append(avgr)
    return results
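

# A minimal usage sketch (not part of the original script, added for illustration): `doit` dispatches on
# substrings of `mode`, so strings such as "vanilla", "linbaseline", "linbaseline_ztrans", or "batchavg"
# select the baseline/advantage variant. The driver below is an assumption about how it might be invoked.
if __name__ == '__main__':
    for mode in ("vanilla", "linbaseline", "linbaseline_ztrans", "batchavg"):
        returns = doit(mode)
        print(mode, 'final average return:', returns[-1])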
def train(variant): set_global_seeds(variant['seed']) if variant['mode'] == 'local': import colored_traceback.always ''' Set-up folder and files ''' snapshot_dir = logger.get_snapshot_dir() working_dir = config.PROJECT_PATH param_path = os.path.join(working_dir, 'params/params.json') # copyfile(param_path, os.path.join(snapshot_dir,'params.json')) try: ''' Save parameters ''' if 'params' in variant: logger.log('Load params from variant.') params = variant['params'] else: logger.log('Load params from file.') with open(param_path, 'r') as f: params = json.load(f) # Save to snapshot dir new_param_path = os.path.join(snapshot_dir, 'params.json') with open(new_param_path, 'w') as f: json.dump(params, f, sort_keys=True, indent=4, separators=(',', ': ')) # TODO: can use variant to modify here. dynamics_opt_params = params['dynamics_opt_params'] dynamics_opt_params['stop_critereon'] = stop_critereon( threshold=dynamics_opt_params['stop_critereon']['threshold'], offset=dynamics_opt_params['stop_critereon']['offset']) dynamics_opt_params = Dynamics_opt_params(**dynamics_opt_params) policy_opt_params = params['policy_opt_params'] policy_opt_params['stop_critereon'] = stop_critereon( threshold=policy_opt_params['stop_critereon']['threshold'], offset=policy_opt_params['stop_critereon']['offset'], percent_models_threshold=policy_opt_params['stop_critereon'] ['percent_models_threshold']) policy_opt_params = Policy_opt_params(**policy_opt_params) rollout_params = params['rollout_params'] rollout_params['monitorpath'] = os.path.join(snapshot_dir, 'videos') rollout_params = Rollout_params(**rollout_params) assert params['rollout_params']['max_timestep'] == \ params['policy_opt_params']['oracle_maxtimestep'] == \ params['policy_opt_params']['T'] ''' Policy model ''' def build_policy_from_rllab(scope_name='training_policy'): ''' Return both rllab policy and policy model function. ''' sess = tf.get_default_session() ### Initialize training_policy to copy from policy from sandbox.rocky.tf.policies.gaussian_mlp_policy import GaussianMLPPolicy output_nonlinearity = eval(params['policy']['output_nonlinearity']) training_policy = GaussianMLPPolicy( name=scope_name, env_spec=env.spec, hidden_sizes=params['policy']['hidden_layers'], init_std=policy_opt_params.trpo['init_std'], output_nonlinearity=output_nonlinearity) training_policy_vars = tf.get_collection( tf.GraphKeys.GLOBAL_VARIABLES, scope='training_policy') sess.run([tf.variables_initializer(training_policy_vars)]) ### Compute policy model function using the same weights. 
training_layers = training_policy._mean_network.layers def policy_model(x, stochastic=0.0, collect_summary=False): assert (training_layers[0].shape[1] == x.shape[1]) h = x for i, layer in enumerate(training_layers[1:]): w = layer.W b = layer.b pre_h = tf.matmul(h, w) + b h = layer.nonlinearity(pre_h, name='policy_out') if collect_summary: with tf.name_scope(scope_name + '/observation'): variable_summaries(x) with tf.name_scope(scope_name + '/layer%d' % i): with tf.name_scope('weights'): variable_summaries(w) with tf.name_scope('biases'): variable_summaries(b) with tf.name_scope('Wx_plus_b'): tf.summary.histogram('pre_activations', pre_h) tf.summary.histogram('activations', h) std = training_policy._l_std_param.param h += stochastic * tf.random_normal( shape=(tf.shape(x)[0], n_actions)) * tf.exp(std) return h return training_policy, policy_model ''' Dynamics model ''' def get_value(key, dict): return key in dict and dict[key] def prepare_input(xgu, xgu_norm, scope_name, variable_name, collect_summary, prediction_type): name_scope = '%s/%s' % (scope_name, variable_name) assert n_states > 1 and n_actions > 1 \ and xgu.shape[1] == n_states + n_actions + n_goals xu = tf.concat([xgu[:, :n_states], xgu[:, n_states + n_goals:]], axis=1) xu_norm = tf.concat( [xgu_norm[:, :n_states], xgu_norm[:, n_states + n_goals:]], axis=1) # Collect data summaries if collect_summary: with tf.name_scope(name_scope + '/inputs'): with tf.name_scope('states'): data_summaries(xgu[:, :n_states]) with tf.name_scope('goals'): data_summaries(xgu[:, n_states:n_states + n_goals]) with tf.name_scope('actions'): data_summaries(xgu[:, n_states + n_goals:]) # Ignore xy in the current state. if get_value('ignore_xy_input', params['dynamics_model']): n_inputs = n_states + n_actions - 2 nn_input = xu_norm[:, 2:] elif get_value('ignore_x_input', params['dynamics_model']): n_inputs = n_states + n_actions - 1 nn_input = xu_norm[:, 1:] else: n_inputs = n_states + n_actions nn_input = xu_norm hidden_layers = list(params['dynamics_model']['hidden_layers']) nonlinearity = [ eval(_x) for _x in params['dynamics_model']['nonlinearity'] ] assert (len(nonlinearity) == len(hidden_layers)) # Verify if the input type is valid. 
if prediction_type == 'state_change' or \ prediction_type == 'state_change_goal': n_outputs = n_states else: assert prediction_type == 'second_derivative' or \ prediction_type == 'second_derivative_goal' n_outputs = int(n_states / 2) nonlinearity.append(tf.identity) hidden_layers.append(n_outputs) return xu, nn_input, n_inputs, n_outputs, \ nonlinearity, hidden_layers def build_ff_neural_net(nn_input, n_inputs, hidden_layers, nonlinearity, scope_name, variable_name, collect_summary, logit_weights=None, initializer=layers.xavier_initializer(), dropout=False): assert len(hidden_layers) == len(nonlinearity) name_scope = '%s/%s' % (scope_name, variable_name) h = nn_input n_hiddens = n_inputs n_hiddens_next = hidden_layers[0] for i in range(len(hidden_layers)): w = get_scope_variable(scope_name, "%s/layer%d/weights" % (variable_name, i), shape=(n_hiddens, n_hiddens_next), initializer=initializer) b = get_scope_variable(scope_name, "%s/layer%d/biases" % (variable_name, i), shape=(n_hiddens_next), initializer=initializer) if collect_summary: with tf.name_scope(name_scope + '/layer%d' % i): with tf.name_scope('weights'): variable_summaries(w) with tf.name_scope('biases'): variable_summaries(b) with tf.name_scope('Wx_plus_b'): pre_h = tf.matmul(h, w) + b # Yunfei: dropout option is useless now if dropout: # if i == 0: # pre_h = tf.nn.dropout(tf.matmul(h,w), keep_prob=0.8) + b # else: pre_h = tf.nn.dropout(tf.matmul(h, w), keep_prob=dropout) + b tf.summary.histogram('pre_activations', pre_h) h = nonlinearity[i](pre_h, name='activation') tf.summary.histogram('activations', h) else: pre_h = tf.matmul(h, w) + b h = nonlinearity[i](pre_h, name='activation') n_hiddens = hidden_layers[i] if i + 1 < len(hidden_layers): n_hiddens_next = hidden_layers[i + 1] if logit_weights is not None and i == len(hidden_layers) - 2: h *= logit_weights return h def build_dynamics_model(n_states, n_actions, n_goals, dt=None, input_rms=None, diff_rms=None): prediction_type = params['dynamics_model']['prediction_type'] def dynamics_model(xgu, scope_name, variable_name, collect_summary=False): ''' :param xu: contains states, goals, actions :param scope_name: :param variable_name: :param dt: :return: ''' xu, nn_input, n_inputs, n_outputs, nonlinearity, hidden_layers = \ prepare_input(xgu, (xgu - input_rms.mean)/input_rms.std, scope_name, variable_name, collect_summary, prediction_type) if "use_logit_weights" in params["dynamics_model"] and params[ "dynamics_model"]["use_logit_weights"]: logit_weights = build_ff_neural_net( nn_input, n_inputs, hidden_layers[:-1], nonlinearity[:-2] + [tf.nn.sigmoid], scope_name, variable_name + '_sig', collect_summary) else: logit_weights = None if "dropout" in params["dynamics_model"]: dropout_keep_prob = params["dynamics_model"]["dropout"] else: dropout_keep_prob = False nn_output = build_ff_neural_net(nn_input, n_inputs, hidden_layers, nonlinearity, scope_name, variable_name, collect_summary, logit_weights=logit_weights, dropout=dropout_keep_prob) # predict the delta instead (x_next-x_current) if 'state_change' in prediction_type: next_state = tf.add( diff_rms.mean[:n_states] + diff_rms.std[:n_outputs] * nn_output, xu[:, :n_states]) else: assert 'second_derivative' in prediction_type # We train 'out' to match state_dot_dot # Currently only works for swimmer. 
qpos = xu[:, :n_outputs] + dt * xu[:, n_outputs:n_states] qvel = xu[:, n_outputs:n_states] + dt * nn_output next_state = tf.concat([qpos, qvel], axis=1) if '_goal' in prediction_type: assert n_goals > 1 g = xgu[:, n_states:n_states + n_goals] next_state = tf.concat([next_state, g], axis=1) return tf.identity(next_state, name='%s/%s/dynamics_out' % (scope_name, variable_name)) return dynamics_model def get_regularizer_loss(scope_name, variable_name): if params['dynamics_model']['regularization']['method'] in [ None, '' ]: return tf.constant(0.0, dtype=tf.float32) constant = params['dynamics_model']['regularization']['constant'] regularizer = eval( params['dynamics_model']['regularization']['method']) hidden_layers = params['dynamics_model']['hidden_layers'] reg_loss = 0.0 for i in range(len(hidden_layers) + 1): w = get_scope_variable( scope_name, "%s/layer%d/weights" % (variable_name, i)) b = get_scope_variable( scope_name, "%s/layer%d/biases" % (variable_name, i)) reg_loss += regularizer(w) + regularizer(b) return constant * reg_loss ''' Main ''' # with get_session() as sess: if variant['mode'] == 'local': sess = get_session(interactive=True, mem_frac=0.1) else: sess = get_session(interactive=True, mem_frac=1.0, use_gpu=variant['use_gpu']) # data = joblib.load(os.path.join(working_dir, params['trpo_path'])) env = get_env(variant['params']['env']) # policy = data['policy'] training_policy, policy_model = build_policy_from_rllab() if hasattr(env._wrapped_env, '_wrapped_env'): inner_env = env._wrapped_env._wrapped_env else: inner_env = env._wrapped_env.env.unwrapped n_obs = inner_env.observation_space.shape[0] n_actions = inner_env.action_space.shape[0] cost_np = inner_env.cost_np cost_tf = inner_env.cost_tf cost_np_vec = inner_env.cost_np_vec if hasattr(inner_env, 'n_goals'): n_goals = inner_env.n_goals n_states = inner_env.n_states assert n_goals + n_states == n_obs else: n_goals = 0 n_states = n_obs dt = None # Only necessary for second_derivative if hasattr(inner_env, 'model') and hasattr(inner_env, 'frame_skip'): dt = inner_env.model.opt.timestep * inner_env.frame_skip from running_mean_std import RunningMeanStd with tf.variable_scope('input_rms'): input_rms = RunningMeanStd(epsilon=0.0, shape=(n_states + n_goals + n_actions)) with tf.variable_scope('diff_rms'): diff_rms = RunningMeanStd(epsilon=0.0, shape=(n_states + n_goals)) dynamics_model = build_dynamics_model(n_states=n_states, n_actions=n_actions, n_goals=n_goals, dt=dt, input_rms=input_rms, diff_rms=diff_rms) kwargs = {} kwargs['input_rms'] = input_rms kwargs['diff_rms'] = diff_rms kwargs['mode'] = variant['mode'] if params['algo'] == 'vpg': from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline from algos.vpg import VPG baseline = LinearFeatureBaseline(env_spec=env.spec) algo = VPG( env=env, policy=training_policy, baseline=baseline, batch_size=policy_opt_params.vpg['batch_size'], max_path_length=policy_opt_params.T, discount=policy_opt_params.vpg['discount'], ) kwargs['rllab_algo'] = algo if params["policy_opt_params"]["vpg"]["reset"]: kwargs['reset_opt'] = tf.assign( training_policy._l_std_param.param, np.log(params["policy_opt_params"]["vpg"]["init_std"]) * np.ones(n_actions)) elif params['algo'] == 'trpo': ### Write down baseline and algo from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline from algos.trpo import TRPO baseline = LinearFeatureBaseline(env_spec=env.spec) algo = TRPO( env=env, policy=training_policy, baseline=baseline, batch_size=policy_opt_params.trpo['batch_size'], 
max_path_length=policy_opt_params.T, discount=policy_opt_params.trpo['discount'], step_size=policy_opt_params.trpo['step_size'], ) kwargs['rllab_algo'] = algo if params["policy_opt_params"]["trpo"]["reset"]: kwargs['reset_opt'] = tf.assign( training_policy._l_std_param.param, np.log(params["policy_opt_params"]["trpo"]["init_std"]) * np.ones(n_actions)) # if "decay_rate" in params["policy_opt_params"]["trpo"]: # kwargs['trpo_std_decay'] = tf.assign_sub(training_policy._l_std_param.param, # np.log(params["policy_opt_params"]["trpo"]["decay_rate"])*np.ones(n_actions)) kwargs['inner_env'] = inner_env kwargs['algo_name'] = params['algo'] kwargs['logstd'] = training_policy._l_std_param.param # Save initial policy joblib.dump(training_policy, os.path.join(snapshot_dir, 'params-initial.pkl')) train_models(env=env, dynamics_model=dynamics_model, dynamics_opt_params=dynamics_opt_params, get_regularizer_loss=get_regularizer_loss, policy_model=policy_model, policy_opt_params=policy_opt_params, rollout_params=rollout_params, cost_np=cost_np, cost_np_vec=cost_np_vec, cost_tf=cost_tf, snapshot_dir=snapshot_dir, working_dir=working_dir, n_models=params['n_models'], sweep_iters=params['sweep_iters'], sample_size=params['sample_size'], verbose=False, variant=variant, saved_policy=training_policy, **kwargs) # Make sure not to reinitialize TRPO policy. # Save the final policy joblib.dump(training_policy, os.path.join(snapshot_dir, 'params.pkl')) except Exception as e: rmtree(snapshot_dir) import sys, traceback # traceback.print_exception(*sys.exc_info()) from IPython.core.ultratb import ColorTB c = ColorTB() exc = sys.exc_info() print(''.join(c.structured_traceback(*exc))) print('Removed the experiment folder %s.' % snapshot_dir)
# policies
if load_policy is None:
    policy_list = [BMAMLGaussianMLPPolicy(
        name="policy",
        env_spec=env.spec,
        grad_step_size=fast_lr,
        hidden_nonlinearity=tf.nn.relu,
        hidden_sizes=(100, 100),
        particle_idx=n) for n in range(num_particles)]
else:
    # will be loaded
    policy_list = [0] * num_particles

# baseline
baseline_list = [LinearFeatureBaseline(env_spec=env.spec) for n in range(num_particles)]

# meta learning methods
if meta_method == 'chaser':
    algo = BMAMLCHASER(
        env=env,
        policy_list=policy_list,
        baseline_list=baseline_list,
        batch_size=fast_batch_size,  # number of trajs for grad update
        max_path_length=max_path_length,
        meta_batch_size=meta_batch_size,
        num_grad_updates=num_grad_updates,
        num_leader_grad_updates=num_leader_grad_updates,
        random_seed=random_seed,
        svpg=svpg,
        svpg_alpha=svpg_alpha,
from rllab.envs.box2d.cartpole_env import CartpoleEnv
from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy
from rllab.envs.normalized_env import normalize
import numpy as np
import theano
import theano.tensor as TT
from lasagne.updates import adam

# normalize() makes sure that the actions for the environment lie
# within the range [-1, 1] (only works for environments with continuous actions)
env = normalize(CartpoleEnv())
# Initialize a neural network policy with a single hidden layer of 8 hidden units
policy = GaussianMLPPolicy(env.spec, hidden_sizes=(8, ))
# Initialize a linear baseline estimator using default hand-crafted features
baseline = LinearFeatureBaseline(env.spec)
# We will collect 100 trajectories per iteration
N = 100
# Each trajectory will have at most 100 time steps
T = 100
# Number of iterations
n_itr = 100
# Set the discount factor for the problem
discount = 0.99
# Learning rate for the gradient update
learning_rate = 0.1

# Construct the computation graph

# Create a Theano variable for storing the observations
def __init__( self, optimizer=None, optimizer_args=None, step_size=0.003, num_latents=6, latents=None, # some sort of iterable of the actual latent vectors period=10, # how often I choose a latent truncate_local_is_ratio=None, epsilon=0.1, train_pi_iters=10, use_skill_dependent_baseline=False, mlp_skill_dependent_baseline=False, freeze_manager=False, freeze_skills=False, **kwargs): if optimizer is None: if optimizer_args is None: # optimizer_args = dict() optimizer_args = dict(batch_size=None) self.optimizer = FirstOrderOptimizer(learning_rate=step_size, max_epochs=train_pi_iters, **optimizer_args) self.step_size = step_size self.truncate_local_is_ratio = truncate_local_is_ratio self.epsilon = epsilon super(ConcurrentContinuousPPO, self).__init__(**kwargs) # not sure if this line is correct self.num_latents = kwargs['policy'].latent_dim self.latents = latents self.period = period self.freeze_manager = freeze_manager self.freeze_skills = freeze_skills assert (not freeze_manager) or (not freeze_skills) # todo: fix this sampler stuff # import pdb; pdb.set_trace() self.sampler = HierBatchSampler(self, self.period) # self.sampler = BatchSampler(self) # i hope this is right self.diagonal = DiagonalGaussian( self.policy.low_policy.action_space.flat_dim) self.debug_fns = [] assert isinstance(self.policy, HierarchicalPolicy) self.period = self.policy.period assert self.policy.period == self.period self.continuous_latent = self.policy.continuous_latent assert self.continuous_latent # self.old_policy = copy.deepcopy(self.policy) # skill dependent baseline self.use_skill_dependent_baseline = use_skill_dependent_baseline self.mlp_skill_dependent_baseline = mlp_skill_dependent_baseline if use_skill_dependent_baseline: curr_env = kwargs['env'] skill_dependent_action_space = curr_env.action_space new_obs_space_no_bi = curr_env.observation_space.shape[ 0] + 1 # 1 for the t_remaining skill_dependent_obs_space_dim = (new_obs_space_no_bi * (self.num_latents + 1) + self.num_latents, ) skill_dependent_obs_space = Box( -1.0, 1.0, shape=skill_dependent_obs_space_dim) skill_dependent_env_spec = EnvSpec(skill_dependent_obs_space, skill_dependent_action_space) if self.mlp_skill_dependent_baseline: self.skill_dependent_baseline = GaussianMLPBaseline( env_spec=skill_dependent_env_spec) else: self.skill_dependent_baseline = LinearFeatureBaseline( env_spec=skill_dependent_env_spec)
def run_task(*_):
    env = normalize(GymEnv(args.env))
    # env.wrapped_env.env.env.env.reward_flag = 'absolute'
    env.wrapped_env.env.env.reward_flag = args.reward

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    learn_std = True
    init_std = 2

    # hidden_sizes=(8,)
    hidden_sizes = (32, 32)
    # hidden_sizes=(100, 50, 25)

    policy = GaussianMLPPolicy(env_spec=env.spec,
                               hidden_sizes=hidden_sizes,
                               learn_std=learn_std,
                               init_std=init_std)

    # =======================
    # Defining the algorithm
    # =======================
    batch_size = 5000
    n_itr = args.n_itr
    gamma = .9
    step_size = 0.01

    if args.algorithm == 0:
        algo = VPG(env=env,
                   policy=policy,
                   baseline=baseline,
                   batch_size=batch_size,
                   n_itr=n_itr,
                   discount=gamma,
                   step_size=step_size)
    if args.algorithm == 1:
        algo = TRPO(env=env,
                    policy=policy,
                    baseline=baseline,
                    batch_size=batch_size,
                    n_itr=n_itr,
                    discount=gamma,
                    step_size=step_size)
    if args.algorithm == 2:
        algo = TNPG(env=env,
                    policy=policy,
                    baseline=baseline,
                    batch_size=batch_size,
                    n_itr=n_itr,
                    discount=gamma,
                    step_size=step_size)
    # if args.algorithm == 4:
    #     algo = DDPG(env=env,
    #                 policy=policy,
    #                 baseline=baseline,
    #                 batch_size=batch_size,
    #                 n_itr=n_itr,
    #                 discount=gamma,
    #                 step_size=step_size)

    algo.train()
    return algo
def run_task(*_): v_enter = 30 inner_length = 800 long_length = 100 short_length = 800 n = 1 m = 5 num_cars_left = 3 num_cars_right = 3 num_cars_top = 15 num_cars_bot = 15 tot_cars = (num_cars_left + num_cars_right) * m \ + (num_cars_bot + num_cars_top) * n grid_array = { "short_length": short_length, "inner_length": inner_length, "long_length": long_length, "row_num": n, "col_num": m, "cars_left": num_cars_left, "cars_right": num_cars_right, "cars_top": num_cars_top, "cars_bot": num_cars_bot } sumo_params = SumoParams(sim_step=1, sumo_binary="sumo-gui") vehicles = Vehicles() vehicles.add(veh_id="idm", acceleration_controller=(SumoCarFollowingController, {}), sumo_car_following_params=SumoCarFollowingParams( minGap=2.5, max_speed=v_enter, ), routing_controller=(GridRouter, {}), num_vehicles=tot_cars, speed_mode="all_checks") additional_env_params = { "target_velocity": 50, "num_steps": 500, "control-length": 150, "switch_time": 3.0 } env_params = EnvParams(additional_params=additional_env_params) additional_net_params = { "speed_limit": 35, "grid_array": grid_array, "horizontal_lanes": 1, "vertical_lanes": 1, "traffic_lights": True } initial_config, net_params = get_non_flow_params(10, additional_net_params) scenario = SimpleGridScenario(name="grid-intersection", generator_class=SimpleGridGenerator, vehicles=vehicles, net_params=net_params, initial_config=initial_config) env_name = "GreenWaveEnv" pass_params = (env_name, sumo_params, vehicles, env_params, net_params, initial_config, scenario) env = GymEnv(env_name, record_video=False, register_params=pass_params) horizon = env.horizon env = normalize(env) policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32)) baseline = LinearFeatureBaseline(env_spec=env.spec) algo = TRPO( env=env, policy=policy, baseline=baseline, batch_size=40000, max_path_length=horizon, # whole_paths=True, n_itr=800, discount=0.999, # step_size=0.01, ) algo.train()
def run_task(v): random.seed(v['seed']) np.random.seed(v['seed']) sampling_res = 2 if 'sampling_res' not in v.keys() else v['sampling_res'] samples_per_cell = 10 # for the oracle rejection sampling # Log performance of randomly initialized policy with FIXED goal [0.1, 0.1] logger.log("Initializing report and plot_policy_reward...") log_dir = logger.get_snapshot_dir() # problem with logger module here!! if log_dir is None: log_dir = "/home/davheld/repos/rllab_goal_rl/data/local/debug" report = HTMLReport(osp.join(log_dir, 'report.html'), images_per_row=5) report.add_header("{}".format(EXPERIMENT_TYPE)) report.add_text(format_dict(v)) inner_env = normalize(PointMazeEnv(maze_id=v['maze_id'])) fixed_goal_generator = FixedStateGenerator(state=v['ultimate_goal']) uniform_start_generator = UniformStateGenerator(state_size=v['start_size'], bounds=v['start_range'], center=v['start_center']) env = GoalStartExplorationEnv( env=inner_env, start_generator=uniform_start_generator, obs2start_transform=lambda x: x[:v['start_size']], goal_generator=fixed_goal_generator, obs2goal_transform=lambda x: x[:v['goal_size']], terminal_eps=v['terminal_eps'], distance_metric=v['distance_metric'], extend_dist_rew=v['extend_dist_rew'], only_feasible=v['only_feasible'], terminate_env=True, ) policy = GaussianMLPPolicy( env_spec=env.spec, hidden_sizes=(64, 64), # Fix the variance since different goals will require different variances, making this parameter hard to learn. learn_std=v['learn_std'], adaptive_std=v['adaptive_std'], std_hidden_sizes=(16, 16), # this is only used if adaptive_std is true! output_gain=v['output_gain'], init_std=v['policy_init_std'], ) baseline = LinearFeatureBaseline(env_spec=env.spec) # initialize all logging arrays on itr0 outer_iter = 0 logger.log('Generating the Initial Heatmap...') plot_policy_means(policy, env, sampling_res=sampling_res, report=report, limit=v['goal_range'], center=v['goal_center']) test_and_plot_policy(policy, env, as_goals=False, max_reward=v['max_reward'], sampling_res=sampling_res, n_traj=v['n_traj'], itr=outer_iter, report=report, center=v['goal_center'], limit=v['goal_range']) report.new_row() all_starts = StateCollection(distance_threshold=v['coll_eps']) # Use asymmetric self-play to run Alice to generate starts for Bob. # Use a double horizon because the horizon is shared between Alice and Bob. env_alice = AliceEnv(env_alice=env, env_bob=env, policy_bob=policy, max_path_length=v['alice_horizon'], alice_factor=v['alice_factor'], alice_bonus=v['alice_bonus'], gamma=1, stop_threshold=v['stop_threshold']) policy_alice = GaussianMLPPolicy( env_spec=env_alice.spec, hidden_sizes=(64, 64), # Fix the variance since different goals will require different variances, making this parameter hard to learn. learn_std=v['learn_std'], adaptive_std=v['adaptive_std'], std_hidden_sizes=(16, 16), # this is only used if adaptive_std is true! 
output_gain=v['output_gain_alice'], init_std=v['policy_init_std_alice'], ) baseline_alice = LinearFeatureBaseline(env_spec=env_alice.spec) algo_alice = TRPO( env=env_alice, policy=policy_alice, baseline=baseline_alice, batch_size=v['pg_batch_size_alice'], max_path_length=v['alice_horizon'], n_itr=v['inner_iters_alice'], step_size=0.01, discount=v['discount_alice'], plot=False, ) for outer_iter in range(1, v['outer_iters']): logger.log("Outer itr # %i" % outer_iter) logger.log("Sampling starts") starts, t_alices = generate_starts_alice( env_alice=env_alice, algo_alice=algo_alice, start_states=[v['start_goal']], num_new_starts=v['num_new_starts'], log_dir=log_dir) labels = label_states(starts, env, policy, v['horizon'], as_goals=False, n_traj=v['n_traj'], key='goal_reached') plot_labeled_states(starts, labels, report=report, itr=outer_iter, limit=v['goal_range'], center=v['goal_center'], maze_id=v['maze_id'], summary_string_base='initial starts labels:\n') report.save() if v['replay_buffer'] and outer_iter > 0 and all_starts.size > 0: old_starts = all_starts.sample(v['num_old_starts']) starts = np.vstack([starts, old_starts]) with ExperimentLogger(log_dir, 'last', snapshot_mode='last', hold_outter_log=True): logger.log("Updating the environment start generator") env.update_start_generator( UniformListStateGenerator( starts.tolist(), persistence=v['persistence'], with_replacement=v['with_replacement'], )) logger.log("Training the algorithm") algo = TRPO( env=env, policy=policy, baseline=baseline, batch_size=v['pg_batch_size'], max_path_length=v['horizon'], n_itr=v['inner_iters'], step_size=v['step_size'], discount=v['discount'], plot=False, ) # We don't use these labels anyway, so we might as well take them from training. #trpo_paths = algo.train() algo.train() # logger.log("labeling starts with trpo rollouts") # [starts, labels] = label_states_from_paths(trpo_paths, n_traj=2, key='goal_reached', # using the min n_traj # as_goal=False, env=env) # paths = [path for paths in trpo_paths for path in paths] with logger.tabular_prefix('Outer_'): logger.record_tabular('t_alices', np.mean(t_alices)) logger.log('Generating the Heatmap...') plot_policy_means(policy, env, sampling_res=sampling_res, report=report, limit=v['goal_range'], center=v['goal_center']) test_and_plot_policy(policy, env, as_goals=False, max_reward=v['max_reward'], sampling_res=sampling_res, n_traj=v['n_traj'], itr=outer_iter, report=report, center=v['goal_center'], limit=v['goal_range']) logger.log("Labeling the starts") labels = label_states(starts, env, policy, v['horizon'], as_goals=False, n_traj=v['n_traj'], key='goal_reached') plot_labeled_states(starts, labels, report=report, itr=outer_iter, limit=v['goal_range'], center=v['goal_center'], maze_id=v['maze_id']) # ###### extra for deterministic: # logger.log("Labeling the goals deterministic") # with policy.set_std_to_0(): # labels_det = label_states(goals, env, policy, v['horizon'], n_traj=v['n_traj'], n_processes=1) # plot_labeled_states(goals, labels_det, report=report, itr=outer_iter, limit=v['goal_range'], center=v['goal_center']) labels = np.logical_and(labels[:, 0], labels[:, 1]).astype(int).reshape((-1, 1)) logger.dump_tabular(with_prefix=False) report.new_row() # append new states to list of all starts (replay buffer): Not the low reward ones!! filtered_raw_starts = [ start for start, label in zip(starts, labels) if label[0] == 1 ] if len( filtered_raw_starts ) == 0: # add a tone of noise if all the states I had ended up being high_reward! logger.log("Bad Alice! 
All goals are high reward!") # seed_starts = filtered_raw_starts # else: # seed_starts = generate_starts(env, starts=starts, horizon=v['horizon'] * 2, subsample=v['num_new_starts'], # variance=v['brownian_variance'] * 10) all_starts.append(filtered_raw_starts)
def run_task(*_): """Implement the run_task method needed to run experiments with rllab.""" sim_params = SumoParams(sim_step=0.1, render=True) vehicles = VehicleParams() vehicles.add( veh_id="rl", acceleration_controller=(RLController, {}), routing_controller=(ContinuousRouter, {}), car_following_params=SumoCarFollowingParams( speed_mode="obey_safe_speed", ), num_vehicles=1) vehicles.add( veh_id="idm", acceleration_controller=(IDMController, { "noise": 0.2 }), routing_controller=(ContinuousRouter, {}), car_following_params=SumoCarFollowingParams( speed_mode="obey_safe_speed", ), num_vehicles=13) additional_env_params = { "target_velocity": 20, "max_accel": 3, "max_decel": 3, "sort_vehicles": False } env_params = EnvParams( horizon=HORIZON, additional_params=additional_env_params) additional_net_params = { "radius_ring": 30, "lanes": 1, "speed_limit": 30, "resolution": 40 } net_params = NetParams( no_internal_links=False, additional_params=additional_net_params) initial_config = InitialConfig(spacing="uniform") print("XXX name", exp_tag) scenario = Figure8Scenario( exp_tag, vehicles, net_params, initial_config=initial_config) env_name = "AccelEnv" pass_params = (env_name, sim_params, vehicles, env_params, net_params, initial_config, scenario) env = GymEnv(env_name, record_video=False, register_params=pass_params) horizon = env.horizon env = normalize(env) policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(16, 16)) baseline = LinearFeatureBaseline(env_spec=env.spec) algo = TRPO( env=env, policy=policy, baseline=baseline, batch_size=15000, max_path_length=horizon, n_itr=500, # whole_paths=True, discount=0.999, # step_size=v["step_size"], ) algo.train(),
def run_task(*_): """Implement the run_task method needed to run experiments with rllab.""" sumo_params = SumoParams(sim_step=0.1, render=False, seed=0) vehicles = Vehicles() vehicles.add(veh_id="rl", acceleration_controller=(RLController, {}), routing_controller=(ContinuousRouter, {}), num_vehicles=1) vehicles.add(veh_id="idm", acceleration_controller=(IDMController, {}), routing_controller=(ContinuousRouter, {}), num_vehicles=21) additional_env_params = { "target_velocity": 8, "ring_length": [220, 270], "max_accel": 1, "max_decel": 1 } env_params = EnvParams(horizon=HORIZON, additional_params=additional_env_params, warmup_steps=750) additional_net_params = { "length": 260, "lanes": 1, "speed_limit": 30, "resolution": 40 } net_params = NetParams(additional_params=additional_net_params) initial_config = InitialConfig(spacing="uniform", bunching=50) print("XXX name", exp_tag) scenario = LoopScenario(exp_tag, vehicles, net_params, initial_config=initial_config) env_name = "WaveAttenuationPOEnv" pass_params = (env_name, sumo_params, vehicles, env_params, net_params, initial_config, scenario) env = GymEnv(env_name, record_video=False, register_params=pass_params) horizon = env.horizon env = normalize(env) policy = GaussianGRUPolicy( env_spec=env.spec, hidden_sizes=(5, ), ) baseline = LinearFeatureBaseline(env_spec=env.spec) algo = TRPO( env=env, policy=policy, baseline=baseline, batch_size=3600 * 72 * 2, max_path_length=horizon, n_itr=5, # whole_paths=True, discount=0.999, # step_size=v["step_size"], ) algo.train(),
def experiment_compare_scratch_100(): # k = 100 for seed in range(1, 10): env = StandardControllerEnv(k=4, seed=seed, noise=0.05, num_dynamics=4, num_points=100) now = datetime.datetime.now() timestamp = now.strftime('%Y_%m_%d_%H_%M_%S') policy = GaussianMLPPolicy( env_spec=env.spec, hidden_sizes=( 32, 32, ), ) baseline = LinearFeatureBaseline(env_spec=env.spec) algo = TRPO( env=env, policy=policy, baseline=baseline, batch_size=1000, max_path_length=env.horizon, n_itr=100, discount=0.995, step_size=0.001, plot=False, ) run_experiment_lite( algo.train(), # Number of parallel workers for sampling n_parallel=4, # Only keep the snapshot parameters for the last iteration snapshot_mode="last", # script="scripts/run_experiment_lite_rl.py", script="scripts/run_experiment_lite.py", exp_name=os.path.join("Baseline %d" % seed, timestamp), log_dir=os.path.join( "Results/Controls/Seed_Baseline/Baseline/%d" % seed, timestamp) # Specifies the seed for the experiment. If this is not provided, a random seed # will be used # plot=True, ) env = ControllerEnv(k=4, seed=seed, noise=0.05, num_dynamics=4, num_points=100) now = datetime.datetime.now() timestamp = now.strftime('%Y_%m_%d_%H_%M_%S') policy = CategoricalMLPPolicy( env_spec=env.spec, hidden_sizes=( 32, 32, ), ) baseline = LinearFeatureBaseline(env_spec=env.spec) algo = TRPO( env=env, policy=policy, baseline=baseline, batch_size=1000, max_path_length=env.horizon, n_itr=100, discount=0.995, step_size=0.001, plot=False, ) run_experiment_lite( algo.train(), # Number of parallel workers for sampling n_parallel=4, # Only keep the snapshot parameters for the last iteration snapshot_mode="last", # script="scripts/run_experiment_lite_rl.py", script="scripts/run_experiment_lite.py", exp_name=os.path.join("Meta %d" % seed, timestamp), log_dir=os.path.join( "Results/Controls/Seed_Baseline/Meta/%d" % seed, timestamp) # Specifies the seed for the experiment. If this is not provided, a random seed # will be used # plot=True, )
def run_task(_): """Implement the run_task method needed to run experiments with rllab.""" sumo_params = SumoParams( render=True, sim_step=0.2, restart_instance=True) # RL vehicles constitute 5% of the total number of vehicles vehicles = Vehicles() vehicles.add( veh_id="human", acceleration_controller=(IDMController, { "noise": 0.2 }), speed_mode="no_collide", num_vehicles=5) vehicles.add( veh_id="rl", acceleration_controller=(RLController, {}), speed_mode="no_collide", num_vehicles=0) # Vehicles are introduced from both sides of merge, with RL vehicles # entering from the highway portion as well inflow = InFlows() inflow.add( veh_type="human", edge="inflow_highway", vehs_per_hour=(1 - RL_PENETRATION) * FLOW_RATE, departLane="free", departSpeed=10) inflow.add( veh_type="rl", edge="inflow_highway", vehs_per_hour=RL_PENETRATION * FLOW_RATE, departLane="free", departSpeed=10) inflow.add( veh_type="human", edge="inflow_merge", vehs_per_hour=100, departLane="free", departSpeed=7.5) additional_env_params = { "target_velocity": 25, "num_rl": NUM_RL, "max_accel": 1.5, "max_decel": 1.5 } env_params = EnvParams( horizon=HORIZON, sims_per_step=5, warmup_steps=0, additional_params=additional_env_params) additional_net_params = ADDITIONAL_NET_PARAMS.copy() additional_net_params["merge_lanes"] = 1 additional_net_params["highway_lanes"] = 1 additional_net_params["pre_merge_length"] = 500 net_params = NetParams( inflows=inflow, no_internal_links=False, additional_params=additional_net_params) initial_config = InitialConfig( spacing="uniform", lanes_distribution=float("inf")) scenario = MergeScenario( name="merge-rl", vehicles=vehicles, net_params=net_params, initial_config=initial_config) env_name = "WaveAttenuationMergePOEnv" pass_params = (env_name, sumo_params, vehicles, env_params, net_params, initial_config, scenario) env = GymEnv(env_name, record_video=False, register_params=pass_params) env = normalize(env) policy = GaussianMLPPolicy( env_spec=env.spec, hidden_sizes=(32, 32, 32), ) baseline = LinearFeatureBaseline(env_spec=env.spec) algo = TRPO( env=env, policy=policy, baseline=baseline, batch_size=HORIZON * N_ROLLOUTS, max_path_length=HORIZON, n_itr=1000, # whole_paths=True, discount=0.999, ) algo.train(),
def experiment(variant): seed = variant['seed'] ; log_dir = variant['log_dir'] ; n_parallel = variant['n_parallel'] setup(seed, n_parallel , log_dir) init_file = variant['init_file'] ; taskIndex = variant['taskIndex'] n_itr = variant['n_itr'] ; default_step = variant['default_step'] policyType = variant['policyType'] ; envType = variant['envType'] tasksFile = path_to_multiworld+'/multiworld/envs/goals/' + variant['tasksFile']+'.pkl' tasks = pickle.load(open(tasksFile, 'rb')) max_path_length = variant['max_path_length'] use_images = 'conv' in policyType if 'MultiDomain' in envType: baseEnv = Sawyer_MultiDomainEnv(tasks = tasks , image = use_images , mpl = max_path_length) elif 'Push' in envType: baseEnv = SawyerPushEnv(tasks = tasks , image = use_images , mpl = max_path_length) elif 'PickPlace' in envType: baseEnv = SawyerPickPlaceEnv( tasks = tasks , image = use_images , mpl = max_path_length) elif 'Door' in envType: baseEnv = SawyerDoorOpenEnv(tasks = tasks , image = use_images , mpl = max_path_length) elif 'Ant' in envType: env = TfEnv(normalize(AntEnvRandGoalRing())) elif 'Coffee' in envType: baseEnv = SawyerCoffeeEnv(mpl = max_path_length) else: raise AssertionError('') if envType in ['Push', 'PickPlace', 'Door']: if use_images: obs_keys = ['img_observation'] else: obs_keys = ['state_observation'] env = TfEnv(NormalizedBoxEnv(FinnMamlEnv(FlatGoalEnv(baseEnv, obs_keys=obs_keys), reset_mode='idx'))) baseline = ZeroBaseline(env_spec=env.spec) #baseline = LinearFeatureBaseline(env_spec = env.spec) batch_size = variant['batch_size'] if policyType == 'fullAda_Bias': baseline = LinearFeatureBaseline(env_spec = env.spec) algo = vpg_fullADA( env=env, policy=None, load_policy = init_file, baseline=baseline, batch_size = batch_size, # 2x max_path_length=max_path_length, n_itr=n_itr, #noise_opt = True, default_step = default_step, sampler_cls=VectorizedSampler, # added by RK 6/19 sampler_args = dict(n_envs=1), #reset_arg=np.asscalar(taskIndex), reset_arg = taskIndex, log_dir = log_dir ) elif policyType == 'biasAda_Bias': algo = vpg_biasADA( env=env, policy=None, load_policy = init_file, baseline=baseline, batch_size= batch_size, # 2x max_path_length=max_path_length, n_itr=n_itr, #noise_opt = True, default_step = default_step, sampler_cls=VectorizedSampler, # added by RK 6/19 sampler_args = dict(n_envs=1), #reset_arg=np.asscalar(taskIndex), reset_arg = taskIndex, log_dir = log_dir ) elif policyType == 'basic': algo = vpg_basic( env=env, policy=None, load_policy=init_file, baseline=baseline, batch_size=batch_size, max_path_length=max_path_length, n_itr=n_itr, #step_size=10.0, sampler_cls=VectorizedSampler, # added by RK 6/19 sampler_args = dict(n_envs=1), reset_arg=taskIndex, optimizer=None, optimizer_args={'init_learning_rate': default_step, 'tf_optimizer_args': {'learning_rate': 0.5*default_step}, 'tf_optimizer_cls': tf.train.GradientDescentOptimizer}, log_dir = log_dir # extra_input="onehot_exploration", # added by RK 6/19 # extra_input_dim=5, # added by RK 6/19 ) elif 'conv' in policyType: algo = vpg_conv( env=env, policy=None, load_policy = init_file, baseline=baseline, batch_size=batch_size, # 2x max_path_length=max_path_length, n_itr=n_itr, sampler_cls=VectorizedSampler, # added by RK 6/19 sampler_args = dict(n_envs=1), #noise_opt = True, default_step = default_step, #reset_arg=np.asscalar(taskIndex), reset_arg = taskIndex, log_dir = log_dir ) else: raise AssertionError('Policy Type must be fullAda_Bias or biasAda_Bias') algo.train()
def run_task(v): random.seed(v['seed']) np.random.seed(v['seed']) sampling_res = 2 if 'sampling_res' not in v.keys() else v['sampling_res'] # Log performance of randomly initialized policy with FIXED goal [0.1, 0.1] logger.log("Initializing report and plot_policy_reward...") log_dir = logger.get_snapshot_dir() # problem with logger module here!! if log_dir is None: log_dir = "/home/davheld/repos/rllab_goal_rl/data/local/debug" debug = True else: debug = False report = HTMLReport(osp.join(log_dir, 'report.html'), images_per_row=5) report.add_header("{}".format(EXPERIMENT_TYPE)) report.add_text(format_dict(v)) inner_env = normalize(AntMazeEnv(maze_id=v['maze_id'])) uniform_goal_generator = UniformStateGenerator(state_size=v['goal_size'], bounds=v['goal_range'], center=v['goal_center']) env = GoalExplorationEnv( env=inner_env, goal_generator=uniform_goal_generator, obs2goal_transform=lambda x: x[:v['goal_size']], terminal_eps=v['terminal_eps'], distance_metric=v['distance_metric'], extend_dist_rew=v['extend_dist_rew'], only_feasible=v['only_feasible'], goal_weight=v['goal_weight'], terminate_env=True, ) policy = GaussianMLPPolicy( env_spec=env.spec, hidden_sizes=(64, 64), # Fix the variance since different goals will require different variances, making this parameter hard to learn. learn_std=v['learn_std'], adaptive_std=v['adaptive_std'], std_hidden_sizes=(16, 16), # this is only used if adaptive_std is true! output_gain=v['output_gain'], init_std=v['policy_init_std'], ) baseline = LinearFeatureBaseline(env_spec=env.spec) outer_iter = 0 if not debug and not v['fast_mode']: logger.log('Generating the Initial Heatmap...') test_and_plot_policy(policy, env, max_reward=v['max_reward'], sampling_res=sampling_res, n_traj=v['n_traj'], itr=outer_iter, report=report, limit=v['goal_range'], center=v['goal_center']) report.new_row() sagg_riac = SaggRIAC(state_size=v['goal_size'], state_range=v['goal_range'], state_center=v['goal_center'], max_goals=v['max_goals'], max_history=v['max_history']) for outer_iter in range(1, v['outer_iters']): logger.log("Outer itr # %i" % outer_iter) raw_goals = sagg_riac.sample_states(num_samples=v['num_new_goals']) goals = raw_goals with ExperimentLogger(log_dir, 'last', snapshot_mode='last', hold_outter_log=True): logger.log("Updating the environment goal generator") env.update_goal_generator( UniformListStateGenerator( goals, persistence=v['persistence'], with_replacement=v['with_replacement'], )) logger.log("Training the algorithm") algo = TRPO( env=env, policy=policy, baseline=baseline, batch_size=v['pg_batch_size'], max_path_length=v['horizon'], n_itr=v['inner_iters'], step_size=0.01, discount=v['discount'], plot=False, ) all_paths = algo.train() if v['use_competence_ratio']: [goals, rewards ] = compute_rewards_from_paths(all_paths, key='competence', as_goal=True, env=env, terminal_eps=v['terminal_eps']) else: [goals, rewards] = compute_rewards_from_paths(all_paths, key='rewards', as_goal=True, env=env) [goals_with_labels, labels] = label_states_from_paths(all_paths, n_traj=v['n_traj'], key='goal_reached') plot_labeled_states(goals_with_labels, labels, report=report, itr=outer_iter, limit=v['goal_range'], center=v['goal_center'], maze_id=v['maze_id']) logger.log('Generating the Heatmap...') test_and_plot_policy(policy, env, max_reward=v['max_reward'], sampling_res=sampling_res, n_traj=v['n_traj'], itr=outer_iter, report=report, limit=v['goal_range'], center=v['goal_center']) sagg_riac.plot_regions_interest(maze_id=v['maze_id'], report=report) 
sagg_riac.plot_regions_states(maze_id=v['maze_id'], report=report) logger.log("Updating SAGG-RIAC") sagg_riac.add_states(goals, rewards) # Find final states "accidentally" reached by the agent. final_goals = compute_final_states_from_paths(all_paths, as_goal=True, env=env) sagg_riac.add_accidental_states(final_goals, v['extend_dist_rew']) logger.dump_tabular(with_prefix=False) report.new_row()
def run_task(v): env, _ = create_env(v["which_agent"]) fw_learning_rate = v['fw_learning_rate'] # 0.0005! yaml_path = os.path.abspath('yaml_files/' + v['yaml_file'] + '.yaml') assert (os.path.exists(yaml_path)) with open(yaml_path, 'r') as f: params = yaml.load(f) num_fc_layers = params['dyn_model']['num_fc_layers'] depth_fc_layers = params['dyn_model']['depth_fc_layers'] batchsize = params['dyn_model']['batchsize'] lr = params['dyn_model']['lr'] print_minimal = v['print_minimal'] nEpoch = params['dyn_model']['nEpoch'] save_dir = os.path.join(args.save_dir, v['exp_name']) inputSize = env.spec.action_space.flat_dim + env.spec.observation_space.flat_dim outputSize = env.spec.observation_space.flat_dim #Initialize the forward policy policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(64, 64)) #learn_std=False, #v['learn_std'], #adaptive_std=False, #v['adaptive_std'], #output_gain=1, #v['output_gain'], #init_std=1) #v['polic) baseline = LinearFeatureBaseline(env_spec=env.spec) #Update function for the forward policy (immitation learning loss!) fwd_obs = TT.matrix('fwd_obs') fwd_act_out = TT.matrix('act_out') policy_dist = policy.dist_info_sym(fwd_obs) fw_loss = -TT.sum( policy.distribution.log_likelihood_sym(fwd_act_out, policy_dist)) fw_params = policy.get_params_internal() fw_update = lasagne.updates.adam(fw_loss, fw_params, learning_rate=fw_learning_rate) fw_func = theano.function([fwd_obs, fwd_act_out], fw_loss, updates=fw_update, allow_input_downcast=True) log_dir = v['yaml_file'] print('Logging Tensorboard to: %s' % log_dir) hist_logger = hist_logging(log_dir) optimizer_params = dict(base_eps=1e-5) if not os.path.exists(save_dir): os.makedirs(save_dir) os.makedirs(save_dir + '/losses') os.makedirs(save_dir + '/models') os.makedirs(save_dir + '/saved_forwardsim') os.makedirs(save_dir + '/saved_trajfollow') os.makedirs(save_dir + '/training_data') x_index, y_index, z_index, yaw_index,\ joint1_index, joint2_index, frontleg_index,\ frontshin_index, frontfoot_index, xvel_index, orientation_index = get_indices(v['which_agent']) dyn_model = Bw_Trans_Model(inputSize, outputSize, env, v, lr, batchsize, v['which_agent'], x_index, y_index, num_fc_layers, depth_fc_layers, print_minimal) for outer_iter in range(1, v['outer_iters']): algo = TRPO( env=env, policy=policy, baseline=baseline, batch_size=v["batch_size"], max_path_length=v["steps_per_rollout"], n_itr=v["num_trpo_iters"], discount=0.995, optimizer=v["ConjugateGradientOptimizer"]( hvp_approach=v["FiniteDifferenceHvp"](**optimizer_params)), step_size=0.05, plot_true=True) all_paths = algo.train() #Collect the trajectories, using these trajectories which leads to high value states # learn a backwards model! observations_list = [] actions_list = [] rewards_list = [] returns_list = [] for indexing in all_paths: for paths in indexing: observations = [] actions = [] returns = [] reward_for_rollout = 0 for i_ in range(len(paths['observations'])): #since, we are building backwards model using trajectories, #so, reversing the trajectories. 
index_ = len(paths['observations']) - i_ - 1 observations.append(paths['observations'][index_]) actions.append(paths['actions'][index_]) returns.append(paths['returns'][index_]) reward_for_rollout += paths['rewards'][index_] #if something_ == 1: # actions_bw.append(path['actions'][::-1]) # observations_bw.append(path['observations'][::-1]) observations_list.append(observations) actions_list.append(actions) rewards_list.append(reward_for_rollout) returns_list.append(returns) hist_logger.log_scalar(save_dir, np.sum(rewards_list) / len(rewards_list), outer_iter * v["num_trpo_iters"]) selected_observations_list = [] selected_observations_list_for_state_seletection = [] selected_actions_list = [] selected_returns_list = [] #Figure out how to build the backwards model. #Conjecture_1 #------- Take quantile sample of trajectories which recieves highest cumulative rewards! number_of_trajectories = int( np.floor(v['top_k_trajectories'] * len(rewards_list) / 100)) rewards_list_np = np.asarray(rewards_list) trajectory_indices = rewards_list_np.argsort( )[-number_of_trajectories:][::-1] for index_ in range(len(trajectory_indices)): selected_observations_list.append( observations_list[trajectory_indices[index_]]) selected_actions_list.append( actions_list[trajectory_indices[index_]]) selected_observations_list_for_state_selection = [] number_of_trajectories = int( np.floor(v['top_k_trajectories_state_selection'] * len(rewards_list) / 100)) rewards_list_np = np.asarray(rewards_list) trajectory_indices = rewards_list_np.argsort( )[-number_of_trajectories:][::-1] for index_ in range(len(trajectory_indices)): selected_observations_list_for_state_seletection.append( observations_list[trajectory_indices[index_]]) selected_returns_list.append( returns_list[trajectory_indices[index_]]) #Figure out from where to start the backwards model. #Conjecture_1 #------ Take quantile sample of high value states, and start the backwards model from them! #which amounts to just taking a non parametric buffer of high values states, which should be #fine! if v['use_good_trajectories'] == 1: returns_list = selected_returns_list observations_list = selected_observations_list_for_state_selection flatten_ret_list = np.asarray(returns_list).flatten() flatten_obs_list = np.vstack(np.asarray(observations_list)) number_of_bw_samples = int( np.floor(v['top_k_bw_samples'] * len(flatten_ret_list) / 100)) samples_indices = flatten_ret_list.argsort( )[-number_of_bw_samples:][::-1] bw_samples = [] for bw_index in range(len(samples_indices)): bw_samples.append(flatten_obs_list[samples_indices[bw_index]]) #Not all parts of the state are actually used. states = from_observation_to_usablestate(selected_observations_list, v["which_agent"], False) controls = selected_actions_list dataX, dataY = generate_training_data_inputs(states, controls) states = np.asarray(states) dataZ = generate_training_data_outputs(states, v['which_agent']) #every component (i.e. 
x position) should become mean 0, std 1 dataX, mean_x, std_x = zero_mean_unit_std(dataX) dataY, mean_y, std_y = zero_mean_unit_std(dataY) dataZ, mean_z, std_z = zero_mean_unit_std(dataZ) ## concatenate state and action, to be used for training dynamics inputs = np.concatenate((dataX, dataY), axis=1) outputs = np.copy(dataZ) assert inputs.shape[0] == outputs.shape[0] if v['num_imagination_steps'] == 10: nEpoch = 20 elif v['num_imagination_steps'] == 50: nEpoch = 20 elif v['num_imagination_steps'] == 100: nEpoch = 30 else: nEpoch = 20 nEpoch = v['nEpoch'] training_loss = dyn_model.train(inputs, outputs, inputs, outputs, nEpoch, save_dir, 1) print("Training Loss for Backwards model", training_loss) if v['running_baseline'] == False: for goal_ind in range(min(v['fw_iter'], len(bw_samples))): #train the backwards model #Give inital state, perform rollouts from backwards model.Right now, state is random, but it should #be selected from some particular list forwardsim_x_true = bw_samples[goal_ind] state_list, action_list = dyn_model.do_forward_sim( forwardsim_x_true, v['num_imagination_steps'], False, env, v['which_agent'], mean_x, mean_y, mean_z, std_x, std_y, std_z) #Incorporate the backwards trace into model based system. fw_func(np.vstack(state_list), np.vstack(action_list)) #print("Immitation Learning loss", loss) else: print('running TRPO baseline')
def main(): parser = argparse.ArgumentParser() # Hyperparameters parser.add_argument('--fw_ratio', type=float, default=0.1) parser.add_argument('--init_lr', type=float, default=5e-4) parser.add_argument('--tfboard_path', type=str, default='/tmp/tfboard') parser.add_argument('--gpu_ratio', type=float, default=0.99) args = parser.parse_args() # Param ranges seeds = range(2) for seed in seeds: mdp = TfEnv(normalize(env=GymEnv('Box3dReach-v12',record_video=False, \ log_dir='/tmp/gym_test',record_log=False))) name = 'trpo-state-v12-tf-icm-fw{}-initlr-{}'.format( args.fw_ratio, args.init_lr) policy = GaussianMLPPolicy( "mlp_policy", env_spec=mdp.spec, hidden_sizes=(64, 64, 32), output_nonlinearity=tf.nn.tanh, clip_action=False, ) baseline = LinearFeatureBaseline(mdp.spec, ) batch_size = 50000 algo = TRPO( env=mdp, policy=policy, baseline=baseline, batch_size=batch_size, whole_paths=True, max_path_length=1000, n_itr=2000, step_size=0.01, subsample_factor=1.0, sampler_cls=BatchSampler, ) algorithm = ICM( mdp, algo, args.tfboard_path + "/%s_%d" % (name, seed), feature_dim=mdp.spec.observation_space.flat_dim, forward_weight=args.fw_ratio, external_reward_weight=0.0, replay_pool_size=1000000, init_learning_rate=args.init_lr, n_updates_per_iter=1000, ) run_experiment_lite(algorithm.train(), exp_prefix=name, n_parallel=8, snapshot_mode="gap", snapshot_gap=200, seed=seed, mode="local")
def main(): np.random.seed(args.seed) tf.set_random_seed(args.seed) # create the environment env = _create_env(args) # create expert data expert_data_T, expert_data_V = _create_expert_data(args) expert_data = dict( train = expert_data_T, valid = expert_data_V ) # create policy policy, init_ops = _create_policy(args, env) # create auxiliary networks (invdyn, reward, variational posterior) invdyn_model, reward_model, info_model, env = _create_aux_networks(args, env) # create baseline if args.baseline_type == "linear": baseline = LinearFeatureBaseline(env_spec=None) else: assert False # use date and time to create new logging directory for each run date= calendar.datetime.date.today().strftime('%y-%m-%d') if date not in os.listdir(model_path): os.mkdir(model_path+'/'+date) c = 0 exp_name = '{}-'.format(args.exp_name) + str(c) while exp_name in os.listdir(model_path+'/'+date+'/'): c += 1 exp_name = '{}-'.format(args.exp_name)+str(c) exp_dir = date+'/'+exp_name log_dir = osp.join(config.LOG_DIR, exp_dir) policy.set_log_dir(log_dir) if info_model is not None: info_model.set_log_dir(log_dir) _create_log(args) # run GAIL algorithm models = {"policy":policy, "info":info_model, "reward":reward_model} bpo_args = dict( n_itr=args.n_itr, env=env, policy=policy, baseline=baseline, batch_size=args.trpo_batch_size, max_path_length=args.max_path_length, discount=args.discount, step_size=args.trpo_step_size, force_batch_sampler=True, whole_paths=True, init_ops=init_ops, optimizer=ConjugateGradientOptimizer(hvp_approach=FiniteDifferenceHvp(base_eps=1e-5)), save_models=[models[model_name] for model_name in args.save_models] ) vae_args = dict( kl_weight=args.kl_weight, ) curriculum = dict( start = args.curr_start, add = args.curr_add, step = args.curr_step ) if not args.model_all : curriculum = {} kwargs = {k:v for k, v in bpo_args.items() + vae_args.items()} algo = GAIL( args.exp_name, exp_name, expert_data, reward_model, args.gail_batch_size, invdyn_model=invdyn_model, info_model=info_model, debug=args.debug, model_all=args.model_all, curriculum=curriculum, rew_aug=args.rew_aug, use_replay_buffer=args.use_replay_buffer, **kwargs ) runner = RLLabRunner(algo, args, exp_dir) runner.train()
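# --- Hedged note: `{k: v for k, v in bpo_args.items() + vae_args.items()}` above relies on
# Python 2, where dict.items() returns a list. On Python 3 an equivalent merge would be:
# kwargs = {**bpo_args, **vae_args}
# or, keeping the comprehension style:
# kwargs = {k: v for d in (bpo_args, vae_args) for k, v in d.items()}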
def run_task(*_): """Implement the run_task method needed to run experiments with rllab.""" sim_params = SumoParams(sim_step=0.2, render=True) # note that the vehicles are added sequentially by the scenario, # so place the merging vehicles after the vehicles in the ring vehicles = VehicleParams() # Inner ring vehicles vehicles.add( veh_id="human", acceleration_controller=(IDMController, { "noise": 0.2 }), lane_change_controller=(SimLaneChangeController, {}), routing_controller=(ContinuousRouter, {}), num_vehicles=6, car_following_params=SumoCarFollowingParams(minGap=0.0, tau=0.5), lane_change_params=SumoLaneChangeParams()) # A single learning agent in the inner ring vehicles.add( veh_id="rl", acceleration_controller=(RLController, {}), lane_change_controller=(SimLaneChangeController, {}), routing_controller=(ContinuousRouter, {}), num_vehicles=1, car_following_params=SumoCarFollowingParams( minGap=0.01, tau=0.5, speed_mode="obey_safe_speed" ), lane_change_params=SumoLaneChangeParams()) # Outer ring vehicles vehicles.add( veh_id="merge-human", acceleration_controller=(IDMController, { "noise": 0.2 }), lane_change_controller=(SimLaneChangeController, {}), routing_controller=(ContinuousRouter, {}), num_vehicles=10, car_following_params=SumoCarFollowingParams(minGap=0.0, tau=0.5), lane_change_params=SumoLaneChangeParams()) env_params = EnvParams( horizon=HORIZON, additional_params={ "target_velocity": 10, "max_accel": 3, "max_decel": 3, "sort_vehicles": False }) additional_net_params = ADDITIONAL_NET_PARAMS.copy() additional_net_params["ring_radius"] = 50 additional_net_params["inner_lanes"] = 1 additional_net_params["outer_lanes"] = 1 additional_net_params["lane_length"] = 75 net_params = NetParams( no_internal_links=False, additional_params=additional_net_params) initial_config = InitialConfig(x0=50, spacing="uniform") scenario = TwoLoopsOneMergingScenario( name=exp_tag, vehicles=vehicles, net_params=net_params, initial_config=initial_config) env_name = "AccelEnv" pass_params = (env_name, sim_params, vehicles, env_params, net_params, initial_config, scenario) env = GymEnv(env_name, record_video=False, register_params=pass_params) horizon = env.horizon env = normalize(env) policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(100, 50, 25)) baseline = LinearFeatureBaseline(env_spec=env.spec) algo = TRPO( env=env, policy=policy, baseline=baseline, batch_size=64 * 3 * horizon, max_path_length=horizon, # whole_paths=True, n_itr=1000, discount=0.999, # step_size=0.01, ) algo.train()
def run_task(*_): """Implement the run_task method needed to run experiments with rllab.""" v_enter = 10 inner_length = 300 long_length = 100 short_length = 300 n = 3 m = 3 num_cars_left = 1 num_cars_right = 1 num_cars_top = 1 num_cars_bot = 1 tot_cars = (num_cars_left + num_cars_right) * m \ + (num_cars_bot + num_cars_top) * n grid_array = { "short_length": short_length, "inner_length": inner_length, "long_length": long_length, "row_num": n, "col_num": m, "cars_left": num_cars_left, "cars_right": num_cars_right, "cars_top": num_cars_top, "cars_bot": num_cars_bot } sumo_params = SumoParams(sim_step=1, render=True) vehicles = Vehicles() vehicles.add(veh_id="idm", acceleration_controller=(SumoCarFollowingController, {}), sumo_car_following_params=SumoCarFollowingParams( min_gap=2.5, tau=1.1, max_speed=v_enter), routing_controller=(GridRouter, {}), num_vehicles=tot_cars, speed_mode="all_checks") tl_logic = TrafficLights(baseline=False) additional_env_params = { "target_velocity": 50, "switch_time": 3.0, "num_observed": 2, "discrete": False, "tl_type": "controlled" } env_params = EnvParams(additional_params=additional_env_params) additional_net_params = { "speed_limit": 35, "grid_array": grid_array, "horizontal_lanes": 1, "vertical_lanes": 1 } initial_config, net_params = get_flow_params(10, 300, n, m, additional_net_params) scenario = SimpleGridScenario(name="grid-intersection", vehicles=vehicles, net_params=net_params, initial_config=initial_config, traffic_lights=tl_logic) env_name = "PO_TrafficLightGridEnv" pass_params = (env_name, sumo_params, vehicles, env_params, net_params, initial_config, scenario) env = GymEnv(env_name, record_video=False, register_params=pass_params) horizon = env.horizon env = normalize(env) policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32)) baseline = LinearFeatureBaseline(env_spec=env.spec) algo = TRPO( env=env, policy=policy, baseline=baseline, batch_size=40000, max_path_length=horizon, # whole_paths=True, n_itr=800, discount=0.999, # step_size=0.01, ) algo.train()
def experiment(variant): seed = variant['seed'] tf.set_random_seed(seed) np.random.seed(seed) random.seed(seed) fast_learning_rate = variant['flr'] fast_batch_size = variant[ 'fbs'] # 10 works for [0.1, 0.2], 20 doesn't improve much for [0,0.2] meta_batch_size = 20 # 10 also works, but much less stable, 20 is fairly stable, 40 is more stable max_path_length = 150 num_grad_updates = 1 meta_step_size = variant['mlr'] regionSize = variant['regionSize'] if regionSize == '20X20': tasksFile = '/root/code/multiworld/multiworld/envs/goals/pickPlace_20X20_6_8.pkl' else: assert regionSize == '60X30' tasksFile = '/root/code/multiworld/multiworld/envs/goals/pickPlace_60X30.pkl' tasks = pickle.load(open(tasksFile, 'rb')) envType = variant['envType'] if envType == 'Push': baseEnv = SawyerPushEnv(tasks=tasks) else: assert (envType) == 'PickPlace' baseEnv = SawyerPickPlaceEnv(tasks=tasks) env = FinnMamlEnv( FlatGoalEnv(baseEnv, obs_keys=['state_observation', 'state_desired_goal'])) env = TfEnv(NormalizedBoxEnv(env)) baseline = LinearFeatureBaseline(env_spec=env.spec) algo = MAMLTRPO( env=env, policy=None, load_policy=variant['init_param_file'], baseline=baseline, batch_size=fast_batch_size, # number of trajs for grad update max_path_length=max_path_length, meta_batch_size=meta_batch_size, num_grad_updates=num_grad_updates, n_itr=1000, use_maml=True, step_size=meta_step_size, plot=False, ) import os saveDir = variant['saveDir'] if os.path.isdir(saveDir) == False: os.mkdir(saveDir) logger.set_snapshot_dir(saveDir) logger.add_tabular_output(saveDir + 'progress.csv') algo.train()
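# --- Hedged sketch: MAMLTRPO above draws `fast_batch_size` rollouts per task for the inner
# (alpha) adaptation step and `meta_batch_size` tasks for the outer (meta) step. The toy below
# only illustrates that two-level gradient structure on 1-D quadratic "tasks"; it is schematic
# and unrelated to the rllab implementation.
import numpy as np

def maml_toy_step(theta, task_centers, inner_lr=0.1, meta_lr=0.01):
    """One meta-update for per-task losses L_i(theta) = (theta - c_i)**2."""
    meta_grad = 0.0
    for c in task_centers:                       # meta-batch of tasks
        inner_grad = 2.0 * (theta - c)           # gradient of the task loss at theta
        theta_adapted = theta - inner_lr * inner_grad
        # gradient of the post-adaptation loss w.r.t. the original theta (chain rule)
        meta_grad += 2.0 * (theta_adapted - c) * (1.0 - 2.0 * inner_lr)
    return theta - meta_lr * meta_grad / len(task_centers)

# theta = 0.0
# for _ in range(100):
#     theta = maml_toy_step(theta, task_centers=np.random.uniform(-1.0, 1.0, size=20))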
# Param ranges seeds = range(5) for seed in seeds: mdp = TfEnv(normalize(env=GymEnv('Box3dReach-v11',record_video=False, \ log_dir='/tmp/gym_test',record_log=False))) policy = GaussianMLPPolicy( "mlp_policy", env_spec=mdp.spec, hidden_sizes=(64, 64, 32), output_nonlinearity=tf.nn.tanh, ) baseline = LinearFeatureBaseline( mdp.spec, ) batch_size = 50000 algo = TRPO( env=mdp, policy=policy, baseline=baseline, batch_size=batch_size, whole_paths=True, max_path_length=200, n_itr=1000, step_size=0.01, subsample_factor=1.0, sampler_cls=BatchSampler, )
def experiment(variant, comet_logger=comet_logger): from sandbox.rocky.tf.algos.maml_il import MAMLIL from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline from rllab.baselines.gaussian_mlp_baseline import GaussianMLPBaseline from rllab.baselines.maml_gaussian_mlp_baseline import MAMLGaussianMLPBaseline from rllab.baselines.zero_baseline import ZeroBaseline from rllab.envs.normalized_env import normalize from rllab.misc.instrument import stub, run_experiment_lite from sandbox.rocky.tf.policies.maml_minimal_gauss_mlp_policy import MAMLGaussianMLPPolicy as basic_policy #from sandbox.rocky.tf.policies.maml_minimal_gauss_mlp_policy_adaptivestep import MAMLGaussianMLPPolicy as fullAda_basic_policy from sandbox.rocky.tf.policies.maml_minimal_gauss_mlp_policy_adaptivestep_biastransform import MAMLGaussianMLPPolicy as fullAda_Bias_policy from sandbox.rocky.tf.policies.maml_minimal_gauss_mlp_policy_biasonlyadaptivestep_biastransform import MAMLGaussianMLPPolicy as biasAda_Bias_policy from sandbox.rocky.tf.policies.maml_minimal_conv_gauss_mlp_policy import MAMLGaussianMLPPolicy as conv_policy from sandbox.rocky.tf.optimizers.quad_dist_expert_optimizer import QuadDistExpertOptimizer from sandbox.rocky.tf.optimizers.first_order_optimizer import FirstOrderOptimizer from sandbox.rocky.tf.envs.base import TfEnv import sandbox.rocky.tf.core.layers as L from rllab.envs.mujoco.ant_env_rand_goal_ring import AntEnvRandGoalRing from multiworld.envs.mujoco.sawyer_xyz.push.sawyer_push import SawyerPushEnv from multiworld.envs.mujoco.sawyer_xyz.pickPlace.sawyer_pick_and_place import SawyerPickPlaceEnv from multiworld.envs.mujoco.sawyer_xyz.door.sawyer_door_open import SawyerDoorOpenEnv from multiworld.core.flat_goal_env import FlatGoalEnv from multiworld.core.finn_maml_env import FinnMamlEnv from multiworld.core.wrapper_env import NormalizedBoxEnv import tensorflow as tf import time from rllab.envs.gym_env import GymEnv from maml_examples.maml_experiment_vars import MOD_FUNC import numpy as np import random as rd import pickle import rllab.misc.logger as logger from rllab.misc.ext import set_seed import os seed = variant['seed'] n_parallel = 1 log_dir = variant['log_dir'] def setup(seed, n_parallel, log_dir): if seed is not None: set_seed(seed) if n_parallel > 0: from rllab.sampler import parallel_sampler parallel_sampler.initialize(n_parallel=n_parallel) if seed is not None: parallel_sampler.set_seed(seed) if os.path.isdir(log_dir) == False: os.makedirs(log_dir, exist_ok=True) logger.set_snapshot_dir(log_dir) logger.add_tabular_output(log_dir + '/progress.csv') setup(seed, n_parallel, log_dir) fast_batch_size = variant['fbs'] meta_batch_size = variant['mbs'] adam_steps = variant['adam_steps'] max_path_length = variant['max_path_length'] dagger = variant['dagger'] expert_policy_loc = variant['expert_policy_loc'] ldim = variant['ldim'] init_flr = variant['init_flr'] policyType = variant['policyType'] use_maesn = variant['use_maesn'] EXPERT_TRAJ_LOCATION = variant['expertDataLoc'] envType = variant['envType'] tasksFile = path_to_multiworld + 'multiworld/envs/goals/' + variant[ 'tasksFile'] + '.pkl' all_tasks = pickle.load(open(tasksFile, 'rb')) assert meta_batch_size <= len(all_tasks) tasks = all_tasks[:meta_batch_size] use_images = 'conv' in policyType if 'Push' == envType: baseEnv = SawyerPushEnv(tasks=tasks, image=use_images, mpl=max_path_length) elif envType == 'sparsePush': baseEnv = SawyerPushEnv(tasks=tasks, image=use_images, mpl=max_path_length, rewMode='l2Sparse') elif 'PickPlace' in 
envType: baseEnv = SawyerPickPlaceEnv(tasks=tasks, image=use_images, mpl=max_path_length) elif 'Door' in envType: baseEnv = SawyerDoorOpenEnv(tasks=tasks, image=use_images, mpl=max_path_length) elif 'Ant' in envType: env = TfEnv(normalize(AntEnvRandGoalRing())) elif 'claw' in envType: env = TfEnv(DClawScrewRandGoal()) else: assert True == False if envType in ['Push', 'PickPlace', 'Door']: if use_images: obs_keys = ['img_observation'] else: obs_keys = ['state_observation'] env = TfEnv( NormalizedBoxEnv( FinnMamlEnv(FlatGoalEnv(baseEnv, obs_keys=obs_keys), reset_mode='idx'))) algoClass = MAMLIL baseline = LinearFeatureBaseline(env_spec=env.spec) load_policy = variant['load_policy'] if load_policy != None: policy = None load_policy = variant['load_policy'] # if 'conv' in load_policy: # baseline = ZeroBaseline(env_spec=env.spec) elif 'fullAda_Bias' in policyType: policy = fullAda_Bias_policy(name="policy", env_spec=env.spec, grad_step_size=init_flr, hidden_nonlinearity=tf.nn.relu, hidden_sizes=(100, 100), init_flr_full=init_flr, latent_dim=ldim) elif 'biasAda_Bias' in policyType: policy = biasAda_Bias_policy(name="policy", env_spec=env.spec, grad_step_size=init_flr, hidden_nonlinearity=tf.nn.relu, hidden_sizes=(100, 100), init_flr_full=init_flr, latent_dim=ldim) elif 'basic' in policyType: policy = basic_policy( name="policy", env_spec=env.spec, grad_step_size=init_flr, hidden_nonlinearity=tf.nn.relu, hidden_sizes=(100, 100), extra_input_dim=(0 if extra_input is "" else extra_input_dim), ) elif 'conv' in policyType: baseline = ZeroBaseline(env_spec=env.spec) policy = conv_policy( name="policy", latent_dim=ldim, policyType=policyType, env_spec=env.spec, init_flr=init_flr, hidden_nonlinearity=tf.nn.relu, hidden_sizes=(100, 100), extra_input_dim=(0 if extra_input is "" else extra_input_dim), ) algo = algoClass( env=env, policy=policy, load_policy=load_policy, baseline=baseline, batch_size=fast_batch_size, # number of trajs for alpha grad update max_path_length=max_path_length, meta_batch_size= meta_batch_size, # number of tasks sampled for beta grad update num_grad_updates=num_grad_updates, # number of alpha grad updates n_itr=variant['iterations'], make_video=False, use_maml=True, use_pooled_goals=True, use_corr_term=use_corr_term, test_on_training_goals=test_on_training_goals, metalearn_baseline=False, # metalearn_baseline=False, limit_demos_num=limit_demos_num, test_goals_mult=1, step_size=meta_step_size, plot=False, beta_steps=beta_steps, adam_curve=None, adam_steps=adam_steps, pre_std_modifier=pre_std_modifier, l2loss_std_mult=l2loss_std_mult, importance_sampling_modifier=MOD_FUNC[''], post_std_modifier=post_std_modifier, expert_trajs_dir=EXPERT_TRAJ_LOCATION, expert_trajs_suffix='', seed=seed, extra_input=extra_input, extra_input_dim=(0 if extra_input is "" else extra_input_dim), plotDirPrefix=None, latent_dim=ldim, dagger=dagger, expert_policy_loc=expert_policy_loc, comet_logger=comet_logger) algo.train() tf.reset_default_graph()
def train(num_experiments, thread_id, queue): ############ DEFAULT PARAMETERS ############ env_name = None #Name of adversarial environment path_length = 1000 #Maximum episode length layer_size = tuple([100, 100, 100]) #Layer definition ifRender = False #Should we render? afterRender = 100 #After how many to animate n_exps = 1 #Number of training instances to run n_itr = 25 #Number of iterations of the alternating optimization n_pro_itr = 1 #Number of iterations for the protagonist n_adv_itr = 1 #Number of iterations for the adversary batch_size = 4000 #Number of training samples for each iteration ifSave = True #Should we save? save_every = 100 #Save checkpoint every save_every iterations n_process = 1 #Number of parallel threads for sampling environment adv_fraction = 0.25 #Fraction of maximum adversarial force to be applied step_size = 0.01 #kl step size for TRPO gae_lambda = 0.97 #gae_lambda for learner save_dir = './results' #folder to save results in ############ ENV SPECIFIC PARAMETERS ############ env_name = 'Walker2dAdv-v1' layer_size = tuple([64, 64]) step_size = 0.1 gae_lambda = 0.97 batch_size = 25000 n_exps = num_experiments n_itr = 500 ifSave = False n_process = 4 adv_fraction = 5.0 adv_strengths = [] for i in range(0, int(adv_fraction) + 1, 1): adv_strengths.append(i) save_dir = './../results/AdvWalker' args = [ env_name, path_length, layer_size, ifRender, afterRender, n_exps, n_itr, n_pro_itr, n_adv_itr, batch_size, save_every, n_process, adv_fraction, step_size, gae_lambda, save_dir ] ############ ADVERSARIAL POLICY LOAD ############ filepath = './../initial_results/Walker/env-Walker2dAdv-v1_Exp1_Itr1500_BS25000_Adv0.25_stp0.01_lam0.97_507500.p' res_D = pickle.load(open(filepath, 'rb')) pretrained_adv_policy = res_D['adv_policy'] ############ MAIN LOOP ############ ## Initializing summaries for the tests ## const_test_rew_summary = [] rand_test_rew_summary = [] step_test_rew_summary = [] rand_step_test_rew_summary = [] adv_test_rew_summary = [] ## Preparing file to save results in ## save_prefix = 'static_env-{}_Exp{}_Itr{}_BS{}_Adv{}_stp{}_lam{}_{}'.format( env_name, n_exps, n_itr, batch_size, adv_fraction, step_size, gae_lambda, random.randint(0, 1000000)) save_name = save_dir + '/' + save_prefix ## Looping over experiments to carry out ## for ne in range(n_exps): ## Environment definition ## ## The second argument in GymEnv defines the relative magnitude of the adversary. For testing we set this to 1.0.
env = normalize(GymEnv(env_name, adv_fraction)) env_orig = normalize(GymEnv(env_name, 1.0)) ## Protagonist policy definition ## pro_policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=layer_size, is_protagonist=True) pro_baseline = LinearFeatureBaseline(env_spec=env.spec) ## Zero Adversary for the protagonist training ## zero_adv_policy = ConstantControlPolicy(env_spec=env.spec, is_protagonist=False, constant_val=0.0) ## Adversary policy definition ## adv_policy = pretrained_adv_policy adv_baseline = LinearFeatureBaseline(env_spec=env.spec) ## Initializing the parallel sampler ## parallel_sampler.initialize(n_process) ## Setting up summaries for testing for a specific training instance ## pro_rews = [] adv_rews = [] all_rews = [] const_testing_rews = [] const_testing_rews.append( test_const_adv(env_orig, pro_policy, path_length=path_length)) rand_testing_rews = [] rand_testing_rews.append( test_rand_adv(env_orig, pro_policy, path_length=path_length)) step_testing_rews = [] step_testing_rews.append( test_step_adv(env_orig, pro_policy, path_length=path_length)) rand_step_testing_rews = [] rand_step_testing_rews.append( test_rand_step_adv(env_orig, pro_policy, path_length=path_length)) adv_testing_rews = [] adv_testing_rews.append( test_learnt_adv(env, pro_policy, adv_policy, path_length=path_length)) ## Loops through adversary strength levels n_loopsize = int(n_itr / len(adv_strengths)) for adv_index, adv_strength in enumerate(adv_strengths): env = normalize(GymEnv(env_name, adv_strength)) ## Optimizer for the Protagonist ## pro_algo = TRPO(env=env, pro_policy=pro_policy, adv_policy=adv_policy, pro_baseline=pro_baseline, adv_baseline=adv_baseline, batch_size=batch_size, max_path_length=path_length, n_itr=n_pro_itr, discount=0.995, gae_lambda=gae_lambda, step_size=step_size, is_protagonist=True) logger.log( '\n\nAdversarial Level: {} Adversarial Strength: {}\n'.format( adv_index, adv_strength)) ## Beginning alternating optimization ## for ni in range(n_loopsize): logger.log( '\n\nThread: {} Experiment: {} Iteration: {}\n'.format( thread_id, ne, ni + n_loopsize * adv_index, )) ## Train Protagonist pro_algo.train() pro_rews += pro_algo.rews all_rews += pro_algo.rews logger.log('Protag Reward: {}'.format( np.array(pro_algo.rews).mean())) ## Test the learnt policies const_testing_rews.append( test_const_adv(env, pro_policy, path_length=path_length)) rand_testing_rews.append( test_rand_adv(env, pro_policy, path_length=path_length)) step_testing_rews.append( test_step_adv(env, pro_policy, path_length=path_length)) rand_step_testing_rews.append( test_rand_step_adv(env, pro_policy, path_length=path_length)) adv_testing_rews.append( test_learnt_adv(env, pro_policy, adv_policy, path_length=path_length)) if ni % afterRender == 0 and ifRender == True: test_const_adv(env, pro_policy, path_length=path_length, n_traj=1, render=True) if ni != 0 and ni % save_every == 0 and ifSave == True: ## SAVING CHECKPOINT INFO ## pickle.dump( { 'args': args, 'pro_policy': pro_policy, 'adv_policy': adv_policy, 'zero_test': [const_testing_rews], 'rand_test': [rand_testing_rews], 'step_test': [step_testing_rews], 'rand_step_test': [rand_step_testing_rews], 'iter_save': ni, 'exp_save': ne, 'adv_test': [adv_testing_rews] }, open( save_name + '_' + str(ni + n_loopsize * adv_index) + '.p', 'wb')) ## Shutting down the optimizer ## pro_algo.shutdown_worker() ## Updating the test summaries over all training instances const_test_rew_summary.append(const_testing_rews) rand_test_rew_summary.append(rand_testing_rews) 
step_test_rew_summary.append(step_testing_rews) rand_step_test_rew_summary.append(rand_step_testing_rews) adv_test_rew_summary.append(adv_testing_rews) queue.put([ const_test_rew_summary, rand_test_rew_summary, step_test_rew_summary, rand_step_test_rew_summary, adv_test_rew_summary ]) ############ SAVING MODEL ############
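# --- Hedged usage sketch: train() above pushes its five test-reward summaries into a queue.
# A parent process might launch several workers and collect the results roughly like this
# (standard multiprocessing, shown only as an illustration):
# import multiprocessing as mp
# queue = mp.Queue()
# workers = [mp.Process(target=train, args=(1, tid, queue)) for tid in range(4)]
# for w in workers: w.start()
# summaries = [queue.get() for _ in workers]
# for w in workers: w.join()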
z.append(y * t) t *= discount return np.array(z) load_policy = True # normalize() makes sure that the actions for the environment lies # within the range [-1, 1] (only works for environments with continuous actions) env = normalize(CartpoleEnv()) #env = GymEnv("InvertedPendulum-v1") # Initialize neural network policies with hidden layers of (100, 50, 25) units policy = GaussianMLPPolicy(env.spec, hidden_sizes=(100, 50, 25)) snap_policy = GaussianMLPPolicy(env.spec, hidden_sizes=(100, 50, 25)) back_up_policy = GaussianMLPPolicy(env.spec, hidden_sizes=(100, 50, 25)) parallel_sampler.populate_task(env, policy) baseline = LinearFeatureBaseline(env.spec) baseline_snap = LinearFeatureBaseline(env.spec) # policy.distribution returns a distribution object under rllab.distributions. It contains many utilities for computing # distribution-related quantities, given the computed dist_info_vars. Below we use dist.log_likelihood_sym to compute # the symbolic log-likelihood. For this example, the corresponding distribution is an instance of the class # rllab.distributions.DiagonalGaussian dist = policy.distribution snap_dist = snap_policy.distribution # We will collect 100 trajectories per iteration N = 100 # Each trajectory will have at most 500 time steps T = 500 #We will collect M secondary trajectories M = 20 #Number of sub-iterations
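# --- Hedged reconstruction: the fragment at the top of this snippet ("z.append(y * t);
# t *= discount; return np.array(z)") is the tail of a helper whose header was cut off.
# A completion consistent with that loop body (names are guesses) would weight each reward
# by discount**i:
import numpy as np

def discount_weights(rewards, discount):
    z = []
    t = 1.0
    for y in rewards:
        z.append(y * t)
        t *= discount
    return np.array(z)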
def run_FaReLI(input_feed=None): beta_adam_steps_list = [(1,50)] # beta_curve = [250,250,250,250,250,5,5,5,5,1,1,1,1,] # make sure to check maml_experiment_vars # beta_curve = [1000] # make sure to check maml_experiment_vars adam_curve = [250,249,248,247,245,50,50,10] # make sure to check maml_experiment_vars # adam_curve = None fast_learning_rates = [1.0] baselines = ['linear',] # linear GaussianMLP MAMLGaussianMLP zero env_option = '' # mode = "ec2" mode = "local" extra_input = "onehot_exploration" # "onehot_exploration" "gaussian_exploration" # extra_input = None extra_input_dim = 5 # extra_input_dim = None goals_suffixes = ["_200_40_1"] #,"_200_40_2", "_200_40_3","_200_40_4"] # goals_suffixes = ["_1000_40"] fast_batch_size_list = [20] # 20 # 10 works for [0.1, 0.2], 20 doesn't improve much for [0,0.2] #inner grad update size meta_batch_size_list = [40] # 40 @ 10 also works, but much less stable, 20 is fairly stable, 40 is more stable max_path_length = 100 # 100 num_grad_updates = 1 meta_step_size = 0.01 pre_std_modifier_list = [1.0] post_std_modifier_train_list = [0.00001] post_std_modifier_test_list = [0.00001] l2loss_std_mult_list = [1.0] importance_sampling_modifier_list = [''] #'', 'clip0.5_' limit_demos_num_list = [1] # 40 test_goals_mult = 1 bas_lr = 0.01 # baseline learning rate momentum=0.5 bas_hnl = tf.nn.relu baslayers_list = [(32,32), ] basas = 60 # baseline adam steps use_corr_term = True seeds = [1] #,2,3,4,5] envseeds = [6] use_maml = True test_on_training_goals = False for goals_suffix in goals_suffixes: for envseed in envseeds: for seed in seeds: for baslayers in baslayers_list: for fast_batch_size in fast_batch_size_list: for meta_batch_size in meta_batch_size_list: for ism in importance_sampling_modifier_list: for limit_demos_num in limit_demos_num_list: for l2loss_std_mult in l2loss_std_mult_list: for post_std_modifier_train in post_std_modifier_train_list: for post_std_modifier_test in post_std_modifier_test_list: for pre_std_modifier in pre_std_modifier_list: for fast_learning_rate in fast_learning_rates: for beta_steps, adam_steps in beta_adam_steps_list: for bas in baselines: stub(globals()) tf.set_random_seed(seed) np.random.seed(seed) rd.seed(seed) env = TfEnv(normalize(Reacher7DofMultitaskEnv(envseed=envseed))) exp_name = str( 'R7_IL' # +time.strftime("%D").replace("/", "")[0:4] + goals_suffix + "_" + str(seed) # + str(envseed) + ("" if use_corr_term else "nocorr") # + str(int(use_maml)) + ('_fbs' + str(fast_batch_size) if fast_batch_size!=20 else "") + ('_mbs' + str(meta_batch_size) if meta_batch_size!=40 else "") + ('_flr' + str(fast_learning_rate) if fast_learning_rate!=1.0 else "") + '_dem' + str(limit_demos_num) + ('_ei' + str(extra_input_dim) if type( extra_input_dim) == int else "") # + '_tgm' + str(test_goals_mult) # +'metalr_'+str(meta_step_size) # +'_ngrad'+str(num_grad_updates) + ("_bs" + str(beta_steps) if beta_steps != 1 else "") + "_as" + str(adam_steps) # +"_net" + str(net_size[0]) # +"_L2m" + str(l2loss_std_mult) + ("_prsm" + str( pre_std_modifier) if pre_std_modifier != 1 else "") # + "_pstr" + str(post_std_modifier_train) # + "_posm" + str(post_std_modifier_test) # + "_l2m" + str(l2loss_std_mult) + ("_" + ism if len(ism) > 0 else "") + "_bas" + bas[0] # +"_tfbe" # TF backend for baseline # +"_qdo" # quad dist optimizer + (("_bi" if bas_hnl == tf.identity else ( "_brel" if bas_hnl == tf.nn.relu else "_bth")) # identity or relu or tanh for baseline # + "_" + str(baslayers) # size + "_baslr" + str(bas_lr) + "_basas" + str(basas) if bas[0] in 
["G", "M"] else "") # baseline adam steps + ("r" if test_on_training_goals else "") + "_" + time.strftime("%d%m_%H_%M")) policy = MAMLGaussianMLPPolicy( name="policy", env_spec=env.spec, grad_step_size=fast_learning_rate, hidden_nonlinearity=tf.nn.relu, hidden_sizes=(100, 100), std_modifier=pre_std_modifier, # metalearn_baseline=(bas == "MAMLGaussianMLP"), extra_input_dim=(0 if extra_input is None else extra_input_dim), ) if bas == 'zero': baseline = ZeroBaseline(env_spec=env.spec) elif bas == 'MAMLGaussianMLP': baseline = MAMLGaussianMLPBaseline(env_spec=env.spec, learning_rate=bas_lr, hidden_sizes=baslayers, hidden_nonlinearity=bas_hnl, repeat=basas, repeat_sym=basas, momentum=momentum, extra_input_dim=( 0 if extra_input is None else extra_input_dim), # learn_std=False, # use_trust_region=False, # optimizer=QuadDistExpertOptimizer( # name="bas_optimizer", # # tf_optimizer_cls=tf.train.GradientDescentOptimizer, # # tf_optimizer_args=dict( # # learning_rate=bas_lr, # # ), # # # tf_optimizer_cls=tf.train.AdamOptimizer, # # max_epochs=200, # # batch_size=None, # adam_steps=basas # ) ) elif bas == 'linear': baseline = LinearFeatureBaseline(env_spec=env.spec) elif "GaussianMLP" in bas: baseline = GaussianMLPBaseline(env_spec=env.spec, regressor_args=dict( hidden_sizes=baslayers, hidden_nonlinearity=bas_hnl, learn_std=False, # use_trust_region=False, # normalize_inputs=False, # normalize_outputs=False, optimizer=QuadDistExpertOptimizer( name="bas_optimizer", # tf_optimizer_cls=tf.train.GradientDescentOptimizer, # tf_optimizer_args=dict( # learning_rate=bas_lr, # ), # # tf_optimizer_cls=tf.train.AdamOptimizer, # max_epochs=200, # batch_size=None, adam_steps=basas, use_momentum_optimizer=True, ))) algo = MAMLIL( env=env, policy=policy, baseline=baseline, batch_size=fast_batch_size, # number of trajs for alpha grad update max_path_length=max_path_length, meta_batch_size=meta_batch_size, # number of tasks sampled for beta grad update num_grad_updates=num_grad_updates, # number of alpha grad updates n_itr=800, #100 make_video=True, use_maml=use_maml, use_pooled_goals=True, use_corr_term=use_corr_term, test_on_training_goals=test_on_training_goals, metalearn_baseline=(bas=="MAMLGaussianMLP"), # metalearn_baseline=False, limit_demos_num=limit_demos_num, test_goals_mult=test_goals_mult, step_size=meta_step_size, plot=False, beta_steps=beta_steps, adam_curve=adam_curve, adam_steps=adam_steps, pre_std_modifier=pre_std_modifier, l2loss_std_mult=l2loss_std_mult, importance_sampling_modifier=MOD_FUNC[ism], post_std_modifier_train=post_std_modifier_train, post_std_modifier_test=post_std_modifier_test, expert_trajs_dir=EXPERT_TRAJ_LOCATION_DICT[env_option+"."+mode+goals_suffix+("_"+str(extra_input_dim) if type(extra_input_dim) == int else "")], expert_trajs_suffix=("_"+str(extra_input_dim) if type(extra_input_dim) == int else ""), seed=seed, extra_input=extra_input, extra_input_dim=(0 if extra_input is None else extra_input_dim), input_feed=input_feed, run_on_pr2=False, ) run_experiment_lite( algo.train(), n_parallel=1, snapshot_mode="last", python_command='python3', seed=seed, exp_prefix=str('R7_IL_' +time.strftime("%D").replace("/", "")[0:4]), exp_name=exp_name, plot=False, sync_s3_pkl=True, mode=mode, terminate_machine=True, )
def run_task(v): random.seed(v['seed']) np.random.seed(v['seed']) sampling_res = 2 if 'sampling_res' not in v.keys() else v['sampling_res'] # Log performance of randomly initialized policy with FIXED goal [0.1, 0.1] logger.log("Initializing report and plot_policy_reward...") log_dir = logger.get_snapshot_dir() # problem with logger module here!! report = HTMLReport(osp.join(log_dir, 'report.html'), images_per_row=4) report.add_header("{}".format(EXPERIMENT_TYPE)) report.add_text(format_dict(v)) tf_session = tf.Session() inner_env = normalize(PointMazeEnv(maze_id=v['maze_id'])) fixed_goal_generator = FixedStateGenerator(state=v['ultimate_goal']) uniform_start_generator = UniformStateGenerator(state_size=v['start_size'], bounds=v['start_range'], center=v['start_center']) env = GoalStartExplorationEnv( env=inner_env, start_generator=uniform_start_generator, obs2start_transform=lambda x: x[:v['start_size']], goal_generator=fixed_goal_generator, obs2goal_transform=lambda x: x[:v['goal_size']], terminal_eps=v['terminal_eps'], distance_metric=v['distance_metric'], extend_dist_rew=v['extend_dist_rew'], only_feasible=v['only_feasible'], terminate_env=True, ) policy = GaussianMLPPolicy( env_spec=env.spec, hidden_sizes=(64, 64), # Fix the variance since different goals will require different variances, making this parameter hard to learn. learn_std=v['learn_std'], adaptive_std=v['adaptive_std'], std_hidden_sizes=(16, 16), # this is only used if adaptive_std is true! output_gain=v['output_gain'], init_std=v['policy_init_std'], ) baseline = LinearFeatureBaseline(env_spec=env.spec) # initialize all logging arrays on itr0 outer_iter = 0 logger.log('Generating the Initial Heatmap...') plot_policy_means(policy, env, sampling_res=2, report=report, limit=v['start_range'], center=v['start_center']) # test_and_plot_policy(policy, env, as_goals=False, max_reward=v['max_reward'], sampling_res=sampling_res, n_traj=v['n_traj'], # itr=outer_iter, report=report, limit=v['goal_range'], center=v['goal_center']) # GAN logger.log("Instantiating the GAN...") gan_configs = {key[4:]: value for key, value in v.items() if 'GAN_' in key} for key, value in gan_configs.items(): if value is tf.train.AdamOptimizer: gan_configs[key] = tf.train.AdamOptimizer(gan_configs[key + '_stepSize']) if value is tflearn.initializations.truncated_normal: gan_configs[key] = tflearn.initializations.truncated_normal( stddev=gan_configs[key + '_stddev']) gan = StateGAN( state_size=v['start_size'], evaluater_size=v['num_labels'], state_range=v['start_range'], state_center=v['start_center'], state_noise_level=v['start_noise_level'], generator_layers=v['gan_generator_layers'], discriminator_layers=v['gan_discriminator_layers'], noise_size=v['gan_noise_size'], tf_session=tf_session, configs=gan_configs, ) logger.log("pretraining the GAN...") if v['smart_init']: feasible_starts = generate_starts( env, starts=[v['ultimate_goal']], horizon=50) # without giving the policy it does brownian mo. 
labels = np.ones((feasible_starts.shape[0], 2)).astype(np.float32) # make them all good goals plot_labeled_states(feasible_starts, labels, report=report, itr=outer_iter, limit=v['goal_range'], center=v['goal_center'], maze_id=v['maze_id']) dis_loss, gen_loss = gan.pretrain(states=feasible_starts, outer_iters=v['gan_outer_iters']) print("Loss of Gen and Dis: ", gen_loss, dis_loss) else: gan.pretrain_uniform(outer_iters=500, report=report) # v['gan_outer_iters']) # log first samples form the GAN initial_starts, _ = gan.sample_states_with_noise(v['num_new_starts']) logger.log("Labeling the starts") labels = label_states(initial_starts, env, policy, v['horizon'], as_goals=False, n_traj=v['n_traj'], key='goal_reached') plot_labeled_states(initial_starts, labels, report=report, itr=outer_iter, limit=v['goal_range'], center=v['goal_center'], maze_id=v['maze_id']) report.new_row() all_starts = StateCollection(distance_threshold=v['coll_eps']) for outer_iter in range(1, v['outer_iters']): logger.log("Outer itr # %i" % outer_iter) # Sample GAN logger.log("Sampling starts from the GAN") raw_starts, _ = gan.sample_states_with_noise(v['num_new_starts']) if v['replay_buffer'] and outer_iter > 0 and all_starts.size > 0: old_starts = all_starts.sample(v['num_old_starts']) starts = np.vstack([raw_starts, old_starts]) else: starts = raw_starts with ExperimentLogger(log_dir, 'last', snapshot_mode='last', hold_outter_log=True): logger.log("Updating the environment start generator") env.update_start_generator( UniformListStateGenerator( starts.tolist(), persistence=v['persistence'], with_replacement=v['with_replacement'], )) logger.log("Training the algorithm") algo = TRPO( env=env, policy=policy, baseline=baseline, batch_size=v['pg_batch_size'], max_path_length=v['horizon'], n_itr=v['inner_iters'], step_size=0.01, discount=v['discount'], plot=False, ) trpo_paths = algo.train() if v['use_trpo_paths']: logger.log("labeling starts with trpo rollouts") [starts, labels] = label_states_from_paths( trpo_paths, n_traj=2, key='goal_reached', # using the min n_traj as_goal=False, env=env) paths = [path for paths in trpo_paths for path in paths] else: logger.log("labeling starts manually") labels, paths = label_states(starts, env, policy, v['horizon'], as_goals=False, n_traj=v['n_traj'], key='goal_reached', full_path=True) with logger.tabular_prefix("OnStarts_"): env.log_diagnostics(paths) plot_labeled_states(starts, labels, report=report, itr=outer_iter, limit=v['goal_range'], center=v['goal_center'], maze_id=v['maze_id']) logger.log('Generating the Heatmap...') plot_policy_means(policy, env, sampling_res=2, report=report, limit=v['start_range'], center=v['start_center']) test_and_plot_policy(policy, env, as_goals=False, max_reward=v['max_reward'], sampling_res=sampling_res, n_traj=v['n_traj'], itr=outer_iter, report=report, limit=v['goal_range'], center=v['goal_center']) # ###### extra for deterministic: # logger.log("Labeling the goals deterministic") # with policy.set_std_to_0(): # labels_det = label_states(goals, env, policy, v['horizon'], n_traj=v['n_traj'], n_processes=1) # plot_labeled_states(goals, labels_det, report=report, itr=outer_iter, limit=v['goal_range'], center=v['goal_center']) labels = np.logical_and(labels[:, 0], labels[:, 1]).astype(int).reshape((-1, 1)) logger.log("Training the GAN") if np.any(labels): gan.train( starts, labels, v['gan_outer_iters'], ) logger.dump_tabular(with_prefix=False) report.new_row() # append new goals to list of all goals (replay buffer): Not the low reward ones!! 
filtered_raw_start = [ start for start, label in zip(starts, labels) if label[0] == 1 ] all_starts.append(filtered_raw_start)
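# --- Hedged sketch: `label_states` above returns two binary columns per start state, and the
# logical AND keeps only states of intermediate difficulty (reached often enough, but not
# trivially). A minimal version of that labeling rule, assuming a per-state success fraction
# is available, could look like this (the thresholds r_min/r_max are illustrative):
import numpy as np

def intermediate_difficulty_labels(success_rate, r_min=0.1, r_max=0.9):
    """Two-column labels whose AND marks states that are neither too hard nor too easy."""
    success_rate = np.asarray(success_rate)
    return np.stack([success_rate >= r_min, success_rate <= r_max], axis=1).astype(np.float32)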
def run_task(v): random.seed(v['seed']) np.random.seed(v['seed']) sampling_res = 0 if 'sampling_res' not in v.keys() else v['sampling_res'] # Log performance of randomly initialized policy with FIXED goal [0.1, 0.1] logger.log("Initializing report and plot_policy_reward...") log_dir = logger.get_snapshot_dir() # problem with logger module here!! report = HTMLReport(osp.join(log_dir, 'report.html'), images_per_row=3) report.add_header("{}".format(EXPERIMENT_TYPE)) report.add_text(format_dict(v)) tf_session = tf.Session() inner_env = normalize(AntEnv()) uniform_goal_generator = UniformStateGenerator(state_size=v['goal_size'], bounds=v['goal_range'], center=v['goal_center']) env = GoalExplorationEnv( env=inner_env, goal_generator=uniform_goal_generator, obs2goal_transform=lambda x: x[-3:-1], terminal_eps=v['terminal_eps'], distance_metric=v['distance_metric'], extend_dist_rew=v['extend_dist_rew'], append_transformed_obs=v['append_transformed_obs'], append_extra_info=v['append_extra_info'], terminate_env=True, ) policy = GaussianMLPPolicy( env_spec=env.spec, hidden_sizes=(64, 64), # Fix the variance since different goals will require different variances, making this parameter hard to learn. learn_std=v['learn_std'], adaptive_std=v['adaptive_std'], std_hidden_sizes=(16, 16), # this is only used if adaptive_std is true! output_gain=v['output_gain'], init_std=v['policy_init_std'], ) baseline = LinearFeatureBaseline(env_spec=env.spec) if v['baseline'] == 'g_mlp': baseline = GaussianMLPBaseline(env_spec=env.spec) # initialize all logging arrays on itr0 outer_iter = 0 logger.log('Generating the Initial Heatmap...') test_and_plot_policy(policy, env, max_reward=v['max_reward'], sampling_res=sampling_res, n_traj=v['n_traj'], itr=outer_iter, report=report, limit=v['goal_range'], center=v['goal_center'], bounds=v['goal_range']) # GAN logger.log("Instantiating the GAN...") gan_configs = {key[4:]: value for key, value in v.items() if 'GAN_' in key} for key, value in gan_configs.items(): if value is tf.train.AdamOptimizer: gan_configs[key] = tf.train.AdamOptimizer(gan_configs[key + '_stepSize']) if value is tflearn.initializations.truncated_normal: gan_configs[key] = tflearn.initializations.truncated_normal( stddev=gan_configs[key + '_stddev']) gan = StateGAN( state_size=v['goal_size'], evaluater_size=v['num_labels'], state_range=v['goal_range'], state_center=v['goal_center'], state_noise_level=v['goal_noise_level'], generator_layers=v['gan_generator_layers'], discriminator_layers=v['gan_discriminator_layers'], noise_size=v['gan_noise_size'], tf_session=tf_session, configs=gan_configs, ) # log first samples form the GAN initial_goals, _ = gan.sample_states_with_noise(v['num_new_goals']) logger.log("Labeling the goals") labels = label_states(initial_goals, env, policy, v['horizon'], n_traj=v['n_traj'], key='goal_reached') plot_labeled_states(initial_goals, labels, report=report, itr=outer_iter, limit=v['goal_range'], center=v['goal_center']) report.new_row() all_goals = StateCollection(distance_threshold=v['coll_eps']) for outer_iter in range(1, v['outer_iters']): logger.log("Outer itr # %i" % outer_iter) feasible_goals = generate_initial_goals(env, policy, v['goal_range'], goal_center=v['goal_center'], horizon=v['horizon']) labels = np.ones((feasible_goals.shape[0], 2)).astype(np.float32) # make them all good goals plot_labeled_states(feasible_goals, labels, report=report, itr=outer_iter, limit=v['goal_range'], center=v['goal_center'], summary_string_base='On-policy Goals:\n') if v['only_on_policy']: goals 
= feasible_goals[np.random.choice( feasible_goals.shape[0], v['num_new_goals'], replace=False), :] else: logger.log("Training the GAN") gan.pretrain(feasible_goals, v['gan_outer_iters']) # Sample GAN logger.log("Sampling goals from the GAN") raw_goals, _ = gan.sample_states_with_noise(v['num_new_goals']) if v['replay_buffer'] and outer_iter > 0 and all_goals.size > 0: old_goals = all_goals.sample(v['num_old_goals']) goals = np.vstack([raw_goals, old_goals]) else: goals = raw_goals with ExperimentLogger(log_dir, 'last', snapshot_mode='last', hold_outter_log=True): logger.log("Updating the environment goal generator") env.update_goal_generator( UniformListStateGenerator( goals.tolist(), persistence=v['persistence'], with_replacement=v['with_replacement'], )) logger.log("Training the algorithm") algo = TRPO( env=env, policy=policy, baseline=baseline, batch_size=v['pg_batch_size'], max_path_length=v['horizon'], n_itr=v['inner_iters'], step_size=0.01, plot=False, ) trpo_paths = algo.train() if v['use_trpo_paths']: logger.log("labeling starts with trpo rollouts") [goals, labels] = label_states_from_paths( trpo_paths, n_traj=2, key='goal_reached', # using the min n_traj as_goal=True, env=env) paths = [path for paths in trpo_paths for path in paths] else: logger.log("labeling starts manually") labels, paths = label_states(goals, env, policy, v['horizon'], as_goals=True, n_traj=v['n_traj'], key='goal_reached', full_path=True) with logger.tabular_prefix("OnStarts_"): env.log_diagnostics(paths) logger.log('Generating the Heatmap...') test_and_plot_policy(policy, env, max_reward=v['max_reward'], sampling_res=sampling_res, n_traj=v['n_traj'], itr=outer_iter, report=report, limit=v['goal_range'], center=v['goal_center'], bounds=v['goal_range']) plot_labeled_states(goals, labels, report=report, itr=outer_iter, limit=v['goal_range'], center=v['goal_center']) logger.dump_tabular(with_prefix=False) report.new_row() # append new goals to list of all goals (replay buffer): Not the low reward ones!! filtered_raw_goals = [ goal for goal, label in zip(goals, labels) if label[0] == 1 ] # this is not used if no replay buffer all_goals.append(filtered_raw_goals) if v['add_on_policy']: logger.log("sampling on policy") feasible_goals = generate_initial_goals( env, policy, v['goal_range'], goal_center=v['goal_center'], horizon=v['horizon']) # downsampled_feasible_goals = feasible_goals[np.random.choice(feasible_goals.shape[0], v['add_on_policy']),:] all_goals.append(feasible_goals)
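# --- Hedged sketch: `StateCollection(distance_threshold=...)` above is used as a replay buffer
# of previously proposed goals that rejects near-duplicates. A minimal stand-in with the same
# append/sample/size interface (the class below is an assumption, not the rllab implementation):
import numpy as np

class SimpleStateBuffer(object):
    def __init__(self, distance_threshold=0.1):
        self.distance_threshold = distance_threshold
        self.states = []

    def append(self, new_states):
        for s in np.asarray(new_states):
            # keep a state only if it is not too close to one already stored
            if not self.states or np.min(np.linalg.norm(
                    np.asarray(self.states) - s, axis=1)) > self.distance_threshold:
                self.states.append(s)

    def sample(self, n):
        idx = np.random.choice(len(self.states), size=min(n, len(self.states)), replace=False)
        return np.asarray(self.states)[idx]

    @property
    def size(self):
        return len(self.states)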
def run_task(*_): auton_cars = 20 sumo_params = SumoParams(time_step=0.1, human_speed_mode="no_collide", rl_speed_mode="no_collide", sumo_binary="sumo-gui") vehicles = Vehicles() vehicles.add_vehicles("idm", (RLController, {}), None, None, 0, 20) intensity = .2 v_enter = 10 env_params = EnvParams(additional_params={ "target_velocity": v_enter, "control-length": 150, "max_speed": v_enter }) additional_net_params = { "horizontal_length_in": 400, "horizontal_length_out": 800, "horizontal_lanes": 1, "vertical_length_in": 400, "vertical_length_out": 800, "vertical_lanes": 1, "speed_limit": { "horizontal": v_enter, "vertical": v_enter } } net_params = NetParams(no_internal_links=False, additional_params=additional_net_params) cfg_params = {"start_time": 0, "end_time": 3000, "cfg_path": "debug/cfg/"} initial_config = InitialConfig(spacing="custom", additional_params={ "intensity": intensity, "enter_speed": v_enter }) scenario = TwoWayIntersectionScenario("two-way-intersection", TwoWayIntersectionGenerator, vehicles, net_params, initial_config=initial_config) env = TwoIntersectionEnvironment(env_params, sumo_params, scenario) env_name = "TwoIntersectionEnvironment" pass_params = (env_name, sumo_params, vehicles, env_params, net_params, initial_config, scenario) env = GymEnv(env_name, record_video=False, register_params=pass_params) horizon = env.horizon env = normalize(env) logging.info("Experiment Set Up complete") print("experiment initialized") env = normalize(env) policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(64, 64)) baseline = LinearFeatureBaseline(env_spec=env.spec) algo = TRPO( env=env, policy=policy, baseline=baseline, batch_size=30000, max_path_length=horizon, # whole_paths=True, n_itr=200, discount=0.999, # step_size=0.01, ) algo.train()
if env.spec.action_space == 'Discrete': policy = CategoricalMLPPolicy( name="policy", env_spec=env.spec, # The neural network policy should have two hidden layers, each with 32 hidden units. hidden_sizes=(32, 32) ) else: policy = GaussianMLPPolicy( name="policy", env_spec=env.spec, hidden_sizes=(100, 50, 25) ) baseline = LinearFeatureBaseline(env_spec=env.spec) iters = args.num_iters algo = TRPO( env=env, policy=policy, baseline=baseline, batch_size=args.batch_size, # Mujoco tasks need 20000-50000 max_path_length=env.horizon, # And 500 n_itr=iters, discount=0.99, step_size=0.01, optimizer=ConjugateGradientOptimizer(hvp_approach=FiniteDifferenceHvp(base_eps=1e-5)) )
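# --- Hedged note: `env.spec.action_space == 'Discrete'` compares the space object against a
# string, which is unlikely to ever be true under rllab/gym. The usual pattern is an isinstance
# check (assuming rllab's spaces module):
# from rllab.spaces import Discrete
# if isinstance(env.spec.action_space, Discrete):
#     policy = CategoricalMLPPolicy(name="policy", env_spec=env.spec, hidden_sizes=(32, 32))
# else:
#     policy = GaussianMLPPolicy(name="policy", env_spec=env.spec, hidden_sizes=(100, 50, 25))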
def run_task(*_): """Implement the run_task method needed to run experiments with rllab.""" V_ENTER = 30 INNER_LENGTH = 300 LONG_LENGTH = 100 SHORT_LENGTH = 300 N_ROWS = 3 N_COLUMNS = 3 NUM_CARS_LEFT = 1 NUM_CARS_RIGHT = 1 NUM_CARS_TOP = 1 NUM_CARS_BOT = 1 tot_cars = (NUM_CARS_LEFT + NUM_CARS_RIGHT) * N_COLUMNS \ + (NUM_CARS_BOT + NUM_CARS_TOP) * N_ROWS grid_array = { "short_length": SHORT_LENGTH, "inner_length": INNER_LENGTH, "long_length": LONG_LENGTH, "row_num": N_ROWS, "col_num": N_COLUMNS, "cars_left": NUM_CARS_LEFT, "cars_right": NUM_CARS_RIGHT, "cars_top": NUM_CARS_TOP, "cars_bot": NUM_CARS_BOT } sim_params = SumoParams(sim_step=1, render=True) vehicles = VehicleParams() vehicles.add(veh_id="idm", acceleration_controller=(SimCarFollowingController, {}), car_following_params=SumoCarFollowingParams( min_gap=2.5, tau=1.1, max_speed=V_ENTER, speed_mode="all_checks"), routing_controller=(GridRouter, {}), num_vehicles=tot_cars) tl_logic = TrafficLightParams(baseline=False) additional_env_params = { "target_velocity": 50, "switch_time": 3.0, "num_observed": 2, "discrete": False, "tl_type": "controlled" } env_params = EnvParams(additional_params=additional_env_params) additional_net_params = { "speed_limit": 35, "grid_array": grid_array, "horizontal_lanes": 1, "vertical_lanes": 1 } if USE_INFLOWS: initial_config, net_params = get_flow_params( v_enter=V_ENTER, vehs_per_hour=EDGE_INFLOW, col_num=N_COLUMNS, row_num=N_ROWS, add_net_params=additional_net_params) else: initial_config, net_params = get_non_flow_params( V_ENTER, additional_net_params) scenario = SimpleGridScenario(name="grid-intersection", vehicles=vehicles, net_params=net_params, initial_config=initial_config, traffic_lights=tl_logic) env_name = "PO_TrafficLightGridEnv" pass_params = (env_name, sim_params, vehicles, env_params, net_params, initial_config, scenario) env = GymEnv(env_name, record_video=False, register_params=pass_params) horizon = env.horizon env = normalize(env) policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32)) baseline = LinearFeatureBaseline(env_spec=env.spec) algo = TRPO( env=env, policy=policy, baseline=baseline, batch_size=40000, max_path_length=horizon, # whole_paths=True, n_itr=800, discount=0.999, # step_size=0.01, ) algo.train()
from rllab.envs.box2d.cartpole_env import CartpoleEnv from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy from rllab.envs.normalized_env import normalize import numpy as np import theano import theano.tensor as TT from lasagne.updates import adam # normalize() makes sure that the actions for the environment lies # within the range [-1, 1] (only works for environments with continuous actions) env = normalize(CartpoleEnv()) # Initialize a neural network policy with a single hidden layer of 8 hidden units policy = GaussianMLPPolicy(env.spec, hidden_sizes=(8,)) # Initialize a linear baseline estimator using default hand-crafted features baseline = LinearFeatureBaseline(env.spec) # We will collect 100 trajectories per iteration N = 100 # Each trajectory will have at most 100 time steps T = 100 # Number of iterations n_itr = 100 # Set the discount factor for the problem discount = 0.99 # Learning rate for the gradient update learning_rate = 0.1 # Construct the computation graph # Create a Theano variable for storing the observations
def train(self): expert_env = TfEnv( self.expert_env ) #TfEnv(GymEnv("Pusher3DOF-v1", force_reset=True, record_video=False)) # expert_env = TfEnv(normalize(ReacherEnv())) novice_env = TfEnv( self.novice_env ) #TfEnv(GymEnv("Pusher3DOFNoChange-v1", force_reset=True, record_video=True)) # novice_env = TfEnv(normalize(ReacherTwoEnv(), normalize_obs=True)) expert_fail_pol = RandomPolicy(expert_env.spec) policy = GaussianMLPPolicy( name="novice_policy", env_spec=novice_env.spec, init_std=10, # The neural network policy should have two hidden layers, each with 32 hidden units. hidden_sizes=(32, 32)) baseline = LinearFeatureBaseline(env_spec=expert_env.spec) algo = TRPO(env=novice_env, policy=policy, baseline=baseline, batch_size=4000, max_path_length=self.horizon, n_itr=self.itrs, discount=0.99, step_size=0.01, optimizer=ConjugateGradientOptimizer( hvp_approach=FiniteDifferenceHvp(base_eps=1e-5))) config = tf.ConfigProto() config.gpu_options.allow_growth = True with tf.Session(config=config) as sess: #What do the n_itr and start_itr mean? algo.n_itr = 0 algo.start_itr = 0 algo.train(sess=sess) #TODO: What is happening here? im_height = self.imsize[0] im_width = self.imsize[1] im_channels = 3 dim_input = [im_height, im_width, im_channels] disc = ConvDiscriminator(input_dim=dim_input) #data = joblib.load(self.expert_pkl)#"/home/andrewliu/research/viewpoint/rllab-tpil/third_person_im/data/local/experiment/experiment_2017_05_07_20_58_39_0001/itr_123.pkl")#"/home/abhigupta/abhishek_sandbox/viewpoint/third_person_im/data/local/experiment/experiment_2017_05_06_18_07_38_0001/itr_900.pkl") #expert_policy = data['policy'] with open(self.expert_pkl, 'rb') as pfile: expert_policy = pickle.load(pfile) # expert_policy = load_expert_reacher(expert_env, sess) #Load the expert #TODO: Need to train the expert #from rllab.sampler.utils import rollout #while True: # t = rollout(env=expert_env, agent=expert_policy, max_path_length=50, animated=True) algo.n_itr = self.itrs trainer = CyberPunkTrainerGAIL(disc=disc, novice_policy_env=novice_env, expert_env=expert_env, novice_policy=policy, novice_policy_opt_algo=algo, expert_success_pol=expert_policy, im_width=im_width, im_height=im_height, im_channels=im_channels, tf_sess=sess, horizon=self.horizon) iterations = self.itrs for iter_step in range(0, iterations): logger.record_tabular('Iteration', iter_step) trainer.take_iteration(n_trajs_cost=self.trajs, n_trajs_policy=self.trajs) logger.dump_tabular(with_prefix=False) trainer.log_and_finish()
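# --- Hedged sketch: the trainer above alternates discriminator updates with TRPO policy updates,
# GAIL-style. The policy is typically rewarded with a surrogate derived from the discriminator's
# probability that a frame came from the expert; one common form is shown below (illustrative,
# not this codebase's exact formulation).
import numpy as np

def discriminator_reward(d_expert_prob, eps=1e-8):
    """Reward the policy for frames the discriminator attributes to the expert."""
    d = np.clip(np.asarray(d_expert_prob), eps, 1.0 - eps)
    return -np.log(1.0 - d)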