def doit(mode):
    from rllab.envs.box2d.cartpole_env import CartpoleEnv
    from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
    from rllab.baselines.zero_baseline import ZeroBaseline
    from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy
    from rllab.envs.normalized_env import normalize
    import numpy as np
    import theano
    import theano.tensor as TT
    from lasagne.updates import adam

    # normalize() makes sure that the actions for the environment lie
    # within the range [-1, 1] (only works for environments with continuous actions)
    env = normalize(CartpoleEnv())
    # Initialize a neural network policy with a single hidden layer of 8 hidden units
    policy = GaussianMLPPolicy(env.spec, hidden_sizes=(8,))

    # Choose the baseline estimator according to the mode
    if "linbaseline" in mode:
        print('linear baseline')
        # a linear baseline estimator using default hand-crafted features
        baseline = LinearFeatureBaseline(env.spec)
    elif "vanilla" in mode:
        print("zero baseline")
        baseline = ZeroBaseline(env.spec)
    elif mode == "batchavg":
        print('batch average baseline')
        # use a zero baseline but subtract the mean of the discounted returns (see below)
        baseline = ZeroBaseline(env.spec)

    if "_ztrans" in mode:
        print('z transform advantages')
    else:
        print('no z transform')

    # We will collect 50 trajectories per iteration
    N = 50
    # Each trajectory will have at most 50 time steps
    T = 50
    # Number of iterations
    n_itr = 50
    # Set the discount factor for the problem
    discount = 0.99
    # Learning rate for the gradient update
    learning_rate = 0.1

    # Construct the computation graph

    # Create a Theano variable for storing the observations
    # We could have simply written `observations_var = TT.matrix('observations')` instead for this example.
    # However, doing it in a slightly more abstract way allows us to delegate to the environment for handling
    # the correct data type for the variable. For instance, for an environment with discrete observations,
    # we might want to use integer types if the observations are represented as one-hot vectors.
    observations_var = env.observation_space.new_tensor_variable(
        'observations',
        # It should have 1 extra dimension since we want to represent a list of observations
        extra_dims=1
    )
    actions_var = env.action_space.new_tensor_variable(
        'actions',
        extra_dims=1
    )
    advantages_var = TT.vector('advantages')

    # policy.dist_info_sym returns a dictionary, whose values are symbolic expressions for quantities related
    # to the distribution of the actions. For a Gaussian policy, it contains the mean and (log) standard deviation.
    dist_info_vars = policy.dist_info_sym(observations_var)

    # policy.distribution returns a distribution object under rllab.distributions. It contains many utilities
    # for computing distribution-related quantities, given the computed dist_info_vars. Below we use
    # dist.log_likelihood_sym to compute the symbolic log-likelihood. For this example, the corresponding
    # distribution is an instance of the class rllab.distributions.DiagonalGaussian
    dist = policy.distribution

    # Note that we negate the objective, since most optimizers assume a
    # minimization problem
    surr = -TT.mean(dist.log_likelihood_sym(actions_var, dist_info_vars) * advantages_var)

    # Get the list of trainable parameters.
    params = policy.get_params(trainable=True)
    grads = theano.grad(surr, params)

    f_train = theano.function(
        inputs=[observations_var, actions_var, advantages_var],
        outputs=None,
        updates=adam(grads, params, learning_rate=learning_rate),
        allow_input_downcast=True
    )

    results = []
    for _ in range(n_itr):
        paths = []
        for _ in range(N):
            observations = []
            actions = []
            rewards = []
            observation = env.reset()
            for _ in range(T):
                # policy.get_action() returns a pair of values. The second one is a dictionary, whose values
                # contain sufficient statistics for the action distribution. It should at least contain entries
                # that would be returned by calling policy.dist_info(), which is the non-symbolic analog of
                # policy.dist_info_sym(). Storing these statistics is useful, e.g., when forming importance
                # sampling ratios. In our case it is not needed.
                action, _ = policy.get_action(observation)
                # Recall that the last entry of the tuple stores diagnostic information about the environment.
                # In our case it is not needed.
                next_observation, reward, terminal, _ = env.step(action)
                observations.append(observation)
                actions.append(action)
                rewards.append(reward)
                observation = next_observation
                if terminal:
                    # Finish rollout if terminal state reached
                    break

            # We need to compute the empirical return for each time step along the
            # trajectory
            path = dict(
                observations=np.array(observations),
                actions=np.array(actions),
                rewards=np.array(rewards),
            )
            path_baseline = baseline.predict(path)
            advantages = []
            returns = []
            return_so_far = 0
            for t in range(len(rewards) - 1, -1, -1):
                return_so_far = rewards[t] + discount * return_so_far
                returns.append(return_so_far)
                advantage = return_so_far - path_baseline[t]
                advantages.append(advantage)
            # The advantages are stored backwards in time, so we need to reverse them
            advantages = np.array(advantages[::-1])
            # And we need to do the same thing for the list of returns
            returns = np.array(returns[::-1])

            if "_ztrans" in mode:
                advantages = (advantages - np.mean(advantages)) / (np.std(advantages) + 1e-8)

            path["advantages"] = advantages
            path["returns"] = returns
            paths.append(path)

        baseline.fit(paths)

        observations = np.concatenate([p["observations"] for p in paths])
        actions = np.concatenate([p["actions"] for p in paths])
        advantages = np.concatenate([p["advantages"] for p in paths])

        if mode == 'batchavg':
            # in this case `advantages` up to here are just our good old returns, without baseline
            # or z transformation. Now we subtract their mean across all episodes.
            advantages = advantages - np.mean(advantages)

        f_train(observations, actions, advantages)
        avgr = np.mean([sum(p["rewards"]) for p in paths])
        print('Average Return:', avgr)
        results.append(avgr)
    return results
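
# Usage sketch (a hypothetical convenience wrapper, not part of the original script):
# the mode string selects the baseline ("linbaseline", "vanilla", or "batchavg") and may
# carry the "_ztrans" suffix to z-transform the advantages, e.g. "linbaseline_ztrans".
# Each call to doit() returns the list of per-iteration average returns.
def compare_modes(modes=("vanilla", "linbaseline", "linbaseline_ztrans", "batchavg")):
    # Run each configuration once and collect its learning curve.
    return {mode: doit(mode) for mode in modes}
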
# Imports assumed by this script (the original excerpt omits them).
import numpy as np
import theano
import theano.tensor as TT
from lasagne.updates import adam

from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
from rllab.envs.gym_env import GymEnv
from rllab.envs.normalized_env import normalize
from rllab.misc import logger
from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy


def run_task(*_):
    # normalize() makes sure that the actions for the environment lie
    # within the range [-1, 1] (only works for environments with continuous actions)
    env = normalize(
        GymEnv(env_name="LunarLanderContinuous-v2", force_reset=True))
    # Initialize a neural network policy with two hidden layers of 64 units each
    policy = GaussianMLPPolicy(env.spec, hidden_sizes=(64, 64))
    # Initialize a linear baseline estimator using default hand-crafted features
    baseline = LinearFeatureBaseline(env.spec)

    # We will collect 3 trajectories per iteration
    N = 3
    # Each trajectory will have at most 400 time steps
    T = 400
    # Number of iterations
    n_itr = 1000
    # Set the discount factor for the problem
    discount = 0.99
    # Learning rate for the gradient update
    learning_rate = 0.001

    # Construct the computation graph

    # Create a Theano variable for storing the observations
    # We could have simply written `observations_var = TT.matrix('observations')` instead for this example.
    # However, doing it in a slightly more abstract way allows us to delegate to the environment for handling
    # the correct data type for the variable. For instance, for an environment with discrete observations,
    # we might want to use integer types if the observations are represented as one-hot vectors.
    observations_var = env.observation_space.new_tensor_variable(
        'observations',
        # It should have 1 extra dimension since we want to represent a list of observations
        extra_dims=1)
    actions_var = env.action_space.new_tensor_variable('actions', extra_dims=1)
    advantages_var = TT.vector('advantages')

    # policy.dist_info_sym returns a dictionary, whose values are symbolic expressions for quantities related
    # to the distribution of the actions. For a Gaussian policy, it contains the mean and (log) standard deviation.
    dist_info_vars = policy.dist_info_sym(observations_var)

    # policy.distribution returns a distribution object under rllab.distributions. It contains many utilities
    # for computing distribution-related quantities, given the computed dist_info_vars. Below we use
    # dist.log_likelihood_sym to compute the symbolic log-likelihood. For this example, the corresponding
    # distribution is an instance of the class rllab.distributions.DiagonalGaussian
    dist = policy.distribution

    # Note that we negate the objective, since most optimizers assume a
    # minimization problem
    surr = -TT.mean(
        dist.log_likelihood_sym(actions_var, dist_info_vars) * advantages_var)

    # Get the list of trainable parameters.
    params = policy.get_params(trainable=True)
    grads = theano.grad(surr, params)

    f_train = theano.function(
        inputs=[observations_var, actions_var, advantages_var],
        outputs=None,
        updates=adam(grads, params, learning_rate=learning_rate),
        allow_input_downcast=True)

    for epoch in range(n_itr):
        logger.push_prefix('epoch #%d | ' % epoch)
        logger.log("Training started")
        paths = []
        for _ in range(N):
            observations = []
            actions = []
            rewards = []
            observation = env.reset()
            for _ in range(T):
                # policy.get_action() returns a pair of values. The second one is a dictionary, whose values
                # contain sufficient statistics for the action distribution. It should at least contain entries
                # that would be returned by calling policy.dist_info(), which is the non-symbolic analog of
                # policy.dist_info_sym(). Storing these statistics is useful, e.g., when forming importance
                # sampling ratios. In our case it is not needed.
                action, _ = policy.get_action(observation)
                # Recall that the last entry of the tuple stores diagnostic information about the environment.
                # In our case it is not needed.
                next_observation, reward, terminal, _ = env.step(action)
                observations.append(observation)
                actions.append(action)
                rewards.append(reward)
                observation = next_observation
                if terminal:
                    # Finish rollout if terminal state reached
                    break

            # We need to compute the empirical return for each time step along the
            # trajectory
            path = dict(
                observations=np.array(observations),
                actions=np.array(actions),
                rewards=np.array(rewards),
            )
            path_baseline = baseline.predict(path)
            advantages = []
            returns = []
            return_so_far = 0
            for t in range(len(rewards) - 1, -1, -1):
                return_so_far = rewards[t] + discount * return_so_far
                returns.append(return_so_far)
                advantage = return_so_far - path_baseline[t]
                advantages.append(advantage)
            # The advantages are stored backwards in time, so we need to reverse them
            advantages = np.array(advantages[::-1])
            # And we need to do the same thing for the list of returns
            returns = np.array(returns[::-1])

            # z-transform the advantages of each trajectory
            advantages = (advantages - np.mean(advantages)) / (np.std(advantages) + 1e-8)

            path["advantages"] = advantages
            path["returns"] = returns
            paths.append(path)

        baseline.fit(paths)

        observations = np.concatenate([p["observations"] for p in paths])
        actions = np.concatenate([p["actions"] for p in paths])
        advantages = np.concatenate([p["advantages"] for p in paths])

        f_train(observations, actions, advantages)

        returns_to_check = [sum(p["rewards"]) for p in paths]
        print('Average Return:', np.mean(returns_to_check))

        ########################################################################
        logger.log("Training finished")
        # Record this epoch's statistics first, so that dump_tabular() below
        # writes them out for this epoch rather than the next one.
        logger.record_tabular('Epoch', epoch)
        logger.record_tabular('Steps', epoch * N * T)
        logger.record_tabular('AverageReturn', np.mean(returns_to_check))
        logger.record_tabular('StdReturn', np.std(returns_to_check))
        logger.record_tabular('MaxReturn', np.max(returns_to_check))
        logger.record_tabular('MinReturn', np.min(returns_to_check))
        logger.save_itr_params(epoch, params)
        logger.dump_tabular(with_prefix=False)
        logger.pop_prefix()
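
# Launch sketch: run_task has the signature that rllab's run_experiment_lite expects,
# and run_experiment_lite is what sets up the logger used above (push_prefix,
# record_tabular, dump_tabular). This launcher is an assumption; the original excerpt
# does not show how run_task is invoked.
if __name__ == "__main__":
    from rllab.misc.instrument import run_experiment_lite

    run_experiment_lite(
        run_task,
        n_parallel=1,
        # keep only the parameters saved by save_itr_params() for the last epoch
        snapshot_mode="last",
        seed=1,
    )
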
# Imports assumed by this module (the original excerpt omits them).
import math
import time

import numpy as np
import numpy.random as npr
import theano
import theano.tensor as TT
import lasagne
import lasagne.nonlinearities as NL

from rllab.envs.env_spec import EnvSpec
from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy
from rllab.spaces.box import Box


class Bw_Trans_Model:
    def __init__(self, inputSize, outputSize, env, v, learning_rate, batchsize,
                 which_agent, x_index, y_index, num_fc_layers, depth_fc_layers,
                 print_minimal):

        # init vars
        # self.sess = sess
        self.batchsize = batchsize
        self.which_agent = which_agent
        self.x_index = x_index
        self.y_index = y_index
        self.inputSize = inputSize
        self.outputSize = outputSize
        self.print_minimal = print_minimal

        LOW = -1000000
        HIGH = 1000000
        self.act_dim = env.spec.action_space.flat_dim
        self.obs_dim = env.spec.observation_space.flat_dim

        obs_to_act_spec = env.spec
        obsact_to_obs_spec = EnvSpec(
            observation_space=Box(LOW, HIGH,
                                  shape=(self.obs_dim + self.act_dim,)),
            action_space=Box(LOW, HIGH, shape=(self.obs_dim,)))

        # TODO: think about whether to learn the std of the backwards policy or not.
        self.bw_act_pol = GaussianMLPPolicy(
            env_spec=obs_to_act_spec,
            hidden_sizes=(64, 64),
            learn_std=v['bw_variance_learn'],
        )

        self.bw_obs_pol = GaussianMLPPolicy(
            env_spec=obsact_to_obs_spec,
            hidden_sizes=(v['bw_model_hidden_size'],
                          v['bw_model_hidden_size']),
            learn_std=v['bw_variance_learn'],
            hidden_nonlinearity=NL.rectify,
        )

        self.obs_in = TT.matrix('obs_in')
        self.obsact_in = TT.matrix('obsact_in')
        self.act_out = TT.matrix('act_out')
        self.diff_out = TT.matrix('diff_out')

        bw_learning_rate = v['bw_learning_rate']
        self.bw_act_dist = self.bw_act_pol.dist_info_sym(self.obs_in)
        self.bw_obs_dist = self.bw_obs_pol.dist_info_sym(self.obsact_in)

        self.bw_act_loss = -TT.sum(
            self.bw_act_pol.distribution.log_likelihood_sym(
                self.act_out, self.bw_act_dist))
        bw_obs_loss = -TT.sum(
            self.bw_obs_pol.distribution.log_likelihood_sym(
                self.diff_out, self.bw_obs_dist))

        bw_act_params = self.bw_act_pol.get_params_internal()
        bw_obs_params = self.bw_obs_pol.get_params_internal()
        # bw_params = bw_act_params + bw_obs_params

        bw_s_to_a_update = lasagne.updates.adam(
            self.bw_act_loss, bw_act_params, learning_rate=bw_learning_rate)
        bw_sa_to_s_update = lasagne.updates.adam(
            bw_obs_loss, bw_obs_params, learning_rate=bw_learning_rate)

        self.bw_act_train = theano.function([self.obs_in, self.act_out],
                                            self.bw_act_loss,
                                            updates=bw_s_to_a_update,
                                            allow_input_downcast=True)
        self.bw_obs_train = theano.function([self.obsact_in, self.diff_out],
                                            bw_obs_loss,
                                            updates=bw_sa_to_s_update,
                                            allow_input_downcast=True)

    def train(self, dataX, dataZ, dataX_new, dataZ_new, nEpoch, save_dir,
              fraction_use_new):

        # init vars
        start = time.time()
        training_loss_list = []
        nData_old = dataX.shape[0]
        num_new_pts = dataX_new.shape[0]

        # how much of the new data to use per batch
        if num_new_pts < (self.batchsize * fraction_use_new):
            batchsize_new_pts = num_new_pts  # use all of the new ones
        else:
            batchsize_new_pts = int(self.batchsize * fraction_use_new)

        # how much of the old data to use per batch
        batchsize_old_pts = int(self.batchsize - batchsize_new_pts)

        # training loop
        for i in range(nEpoch):

            # reset to 0
            avg_loss = 0
            num_batches = 0

            if batchsize_old_pts > 0:
                print("nothing is going on")
            # train completely from the new set
            else:
                for batch in range(
                        int(math.floor(num_new_pts / batchsize_new_pts))):

                    # walk through the shuffled new data
                    dataX_batch = dataX_new[batch * batchsize_new_pts:
                                            (batch + 1) * batchsize_new_pts, :]
                    dataZ_batch = dataZ_new[batch * batchsize_new_pts:
                                            (batch + 1) * batchsize_new_pts, :]

                    data_x = dataX_batch[:, 0:self.obs_dim]
                    data_y = dataX_batch[:, self.obs_dim:]
                    loss = self.bw_act_train(data_x, data_y)
                    bw_obs_losses = self.bw_obs_train(dataX_batch, dataZ_batch)
                    training_loss_list.append(loss)
                    avg_loss += bw_obs_losses  # [0]
                    num_batches += 1
                # shuffle the new data set after an epoch (if training only on it)
                p = npr.permutation(dataX_new.shape[0])
                dataX_new = dataX_new[p]
                dataZ_new = dataZ_new[p]

            # save losses after an epoch
            np.save(save_dir + '/training_losses.npy', training_loss_list)
            if not self.print_minimal:
                if (i % 10) == 0:
                    print("\n=== Epoch {} ===".format(i))
                    print("loss: ", avg_loss / num_batches)

        if not self.print_minimal:
            print("Training set size: ", (nData_old + dataX_new.shape[0]))
            print("Training duration: {:0.2f} s".format(time.time() - start))

        # done
        return avg_loss / num_batches  # , old_loss, new_loss

    # multistep prediction using the learned dynamics model at each step
    def do_forward_sim(self, forwardsim_x_true, num_step, many_in_parallel,
                       env_inp, which_agent, mean_x, mean_y, mean_z, std_x,
                       std_y, std_z):

        # init vars
        state_list = []
        action_list = []

        if many_in_parallel:
            # init vars
            print("Future work..")
        else:
            # curr_state is of the dimension of the network input
            curr_state = np.copy(forwardsim_x_true)

            for i in range(num_step):
                curr_state_preprocessed = curr_state - mean_x
                curr_state_preprocessed = np.nan_to_num(
                    curr_state_preprocessed / std_x)
                action = self.bw_act_pol.get_action(curr_state_preprocessed)[0]
                action_ = action * std_y + mean_y
                state_difference = self.bw_obs_pol.get_action(
                    np.concatenate((curr_state_preprocessed, action)))[0]
                state_differences = (state_difference * std_z) + mean_z
                next_state = curr_state + state_differences

                # copy the state info
                curr_state = np.copy(next_state)
                state_list.append(np.copy(curr_state))
                action_list.append(np.copy(action_))

        return state_list, action_list
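
# Instantiation sketch (hypothetical, not from the original file): the `v` dict supplies
# the three keys read in __init__ above; all other constructor arguments here are
# placeholder values chosen for a small rllab environment.
if __name__ == "__main__":
    from rllab.envs.box2d.cartpole_env import CartpoleEnv
    from rllab.envs.normalized_env import normalize

    env = normalize(CartpoleEnv())
    v = dict(bw_variance_learn=True, bw_model_hidden_size=64,
             bw_learning_rate=1e-3)
    model = Bw_Trans_Model(inputSize=env.spec.observation_space.flat_dim,
                           outputSize=env.spec.observation_space.flat_dim,
                           env=env, v=v, learning_rate=1e-3, batchsize=512,
                           which_agent=0, x_index=0, y_index=1,
                           num_fc_layers=2, depth_fc_layers=64,
                           print_minimal=False)
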