def init_network(self):
    """
    [input]
        self.obs
        self.action_n
        self.advant
        self.old_dist_means_n
        self.old_dist_logstds_n
    [output]
        self.action_dist_means_n
        self.action_dist_logstds_n
        var_list
    """
    self.net = NetworkContinous("network_continous")
    if pms.min_std is not None:
        log_std_var = tf.maximum(self.net.action_dist_logstds_n,
                                 np.log(pms.min_std))
        self.action_dist_stds_n = tf.exp(log_std_var)
    self.old_dist_info_vars = dict(mean=self.net.old_dist_means_n,
                                   log_std=self.net.old_dist_logstds_n)
    self.new_dist_info_vars = dict(mean=self.net.action_dist_means_n,
                                   log_std=self.net.action_dist_logstds_n)
    self.likehood_action_dist = self.distribution.log_likelihood_sym(
        self.net.action_n, self.new_dist_info_vars)
    self.ratio_n = self.distribution.likelihood_ratio_sym(
        self.net.action_n, self.new_dist_info_vars, self.old_dist_info_vars)
    surr = -tf.reduce_mean(self.ratio_n * self.net.advant)  # surrogate loss
    batch_size = tf.shape(self.net.obs)[0]
    batch_size_float = tf.cast(batch_size, tf.float32)
    kl = tf.reduce_mean(self.distribution.kl_sym(self.old_dist_info_vars,
                                                 self.new_dist_info_vars))
    ent = self.distribution.entropy(self.old_dist_info_vars)
    # ent = tf.reduce_sum(-p_n * tf.log(p_n + eps)) / Nf
    self.losses = [surr, kl, ent]
    var_list = self.net.var_list
    self.gf = GetFlat(var_list)        # read theta from var_list
    self.gf.session = self.session
    self.sff = SetFromFlat(var_list)   # write theta into var_list
    self.sff.session = self.session
    # get g
    self.pg = flatgrad(surr, var_list)
    # get A: KL divergence where the first argument is held fixed
    # (replace old -> tf.stop_gradient of the current distribution)
    kl_firstfixed = self.distribution.kl_sym_firstfixed(
        self.new_dist_info_vars) / batch_size_float
    grads = tf.gradients(kl_firstfixed, var_list)
    self.flat_tangent = tf.placeholder(dtype, shape=[None])
    shapes = map(var_shape, var_list)
    start = 0
    tangents = []
    for shape in shapes:
        size = np.prod(shape)
        param = tf.reshape(self.flat_tangent[start:(start + size)], shape)
        tangents.append(param)
        start += size
    self.gvp = [tf.reduce_sum(g * t) for (g, t) in zip(grads, tangents)]
    self.fvp = flatgrad(tf.reduce_sum(self.gvp), var_list)  # get kl'' * p
    self.session.run(tf.initialize_all_variables())
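# Aside: the gvp/fvp construction above is the standard Fisher-vector-product
# trick -- differentiate the first-argument-fixed KL once, dot the gradient
# with an arbitrary tangent vector, and differentiate again. A minimal
# standalone sketch (the toy `kl_toy` below is a stand-in for the real KL
# term; TF1-style API, matching the rest of this code):
import tensorflow as tf

theta = tf.constant([1.0, 2.0])
v = tf.placeholder(tf.float32, shape=[2])   # tangent vector
kl_toy = tf.reduce_sum(theta ** 2)          # toy scalar whose Hessian is 2*I
g = tf.gradients(kl_toy, theta)[0]          # first backprop: grad of KL
gvp_toy = tf.reduce_sum(g * v)              # dot the gradient with the tangent
fvp_toy = tf.gradients(gvp_toy, theta)[0]   # second backprop: Hessian * v
# sess.run(fvp_toy, {v: [1.0, 0.0]}) -> [2.0, 0.0], i.e. H*v without ever
# materializing the Hessian itself.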
def __init__(self):
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.1 / 3.0)
    self.session = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
    self.network = NetworkContinous("master")
    self.gf = GetFlat(self.session, self.network.var_list)      # read theta from var_list
    self.sff = SetFromFlat(self.session, self.network.var_list)  # write theta into var_list
    self.session.run(tf.initialize_all_variables())
    self.saver = tf.train.Saver(max_to_keep=10)
    self.init_jobs()
    if pms.train_flag:
        self.init_logger()
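# Aside: GetFlat/SetFromFlat pack all trainable variables into a single flat
# parameter vector and scatter one back; this flat theta is what the CG and
# line-search machinery operates on. Hypothetical minimal versions (the repo's
# real helpers precompile the assign ops instead of building them per call):
import numpy as np
import tensorflow as tf

def get_flat(session, var_list):
    # Concatenate every variable's values into one 1-D theta vector.
    return np.concatenate([session.run(v).ravel() for v in var_list])

def set_from_flat(session, var_list, theta):
    # Slice theta back into each variable, shape by shape.
    start = 0
    for v in var_list:
        shape = v.get_shape().as_list()
        size = int(np.prod(shape))
        session.run(tf.assign(v, theta[start:start + size].reshape(shape)))
        start += size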
def init_network(self):
    """
    [input]
        self.obs
        self.action_n
        self.advant
        self.old_dist_means_n
        self.old_dist_logstds_n
    [output]
        self.action_dist_means_n
        self.action_dist_logstds_n
        var_list
    """
    self.net = NetworkContinous("network_continous_ac")
    if pms.min_std is not None:
        log_std_var = tf.maximum(self.net.action_dist_logstds_n,
                                 np.log(pms.min_std))
        self.action_dist_stds_n = tf.exp(log_std_var)
    self.old_dist_info_vars = dict(mean=self.net.old_dist_means_n,
                                   log_std=self.net.old_dist_logstds_n)
    self.new_dist_info_vars = dict(mean=self.net.action_dist_means_n,
                                   log_std=self.net.action_dist_logstds_n)
    self.likehood_new_action_dist = self.distribution.log_likelihood_sym(
        self.net.action_n, self.new_dist_info_vars)
    # surr = -log(\pi_\theta) * (Q^\pi - V^\pi)
    value_loss = 0.5 * tf.square(self.net.advant)
    surr = -tf.reduce_sum(
        self.likehood_new_action_dist * tf.stop_gradient(self.net.advant)
        + value_loss)  # surrogate loss
    batch_size = tf.shape(self.net.obs)[0]
    batch_size_float = tf.cast(batch_size, tf.float32)
    kl = tf.reduce_mean(self.distribution.kl_sym(self.old_dist_info_vars,
                                                 self.new_dist_info_vars))
    ent = self.distribution.entropy(self.old_dist_info_vars)
    # ent = tf.reduce_sum(-p_n * tf.log(p_n + eps)) / Nf
    self.losses = [surr, kl, ent]
    var_list = self.net.var_list
    self.gf = GetFlat(var_list)        # read theta from var_list
    self.gf.session = self.session
    self.sff = SetFromFlat(var_list)   # write theta into var_list
    self.sff.session = self.session
    # get g
    self.pg = flatgrad(surr, var_list)
    self.session.run(tf.initialize_all_variables())
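# Aside: tf.stop_gradient on the advantage is what separates the actor and
# critic updates in the surrogate above -- the advantage weights the
# log-likelihood term, but no policy gradient may flow back into the critic
# through it. A tiny illustration (hypothetical values, TF1-style API):
import tensorflow as tf

adv = tf.constant(3.0)    # stand-in for the advantage estimate
logp = tf.constant(0.5)   # stand-in for the action log-likelihood
loss = -logp * tf.stop_gradient(adv)
# d loss / d logp evaluates to -3.0, while d loss / d adv is None
# (the gradient is blocked at the stop_gradient boundary):
grads = tf.gradients(loss, [logp, adv])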
def run(self):
    self.env = gym.make(self.args.environment_name)
    self.env.seed(randint(0, 999999))
    if self.monitor:
        self.env.monitor.start('monitor/', force=True)
    self.net = NetworkContinous("rollout_network" + str(self.actor_id))
    config = tf.ConfigProto(device_count={'GPU': 0})
    self.session = tf.Session(config=config)
    var_list = self.net.var_list
    self.session.run(tf.initialize_all_variables())
    self.set_policy = SetFromFlat(var_list)
    self.set_policy.session = self.session
    while True:
        # Get a task, or wait until one arrives.
        next_task = self.task_q.get(block=True)
        if type(next_task) is int and next_task == 1:
            # The task is a request to collect experience.
            path = self.rollout()
            # print "single rollout time:" + str(end - start)
            self.task_q.task_done()
            self.result_q.put(path)
        elif type(next_task) is int and next_task == 2:
            print "kill message"
            if self.monitor:
                self.env.monitor.close()
            self.task_q.task_done()
            break
        else:
            # The task is to set the parameters of the actor policy.
            next_task = np.array(next_task)
            self.set_policy(next_task)
            # Super hacky method to make sure that when we fill the queue
            # with set-parameter tasks, an actor doesn't finish updating
            # before the other actors can accept their own tasks.
            time.sleep(0.1)
            self.task_q.task_done()
    return
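# Aside: the worker loop above implements a small task protocol over the two
# queues. A hypothetical master-side driver showing the three message kinds
# (num_actors, task_q, result_q, and flat_params are assumed names, not part
# of the original code):
for _ in range(num_actors):
    task_q.put(flat_params)   # non-int payload: set the actor's policy parameters
for _ in range(num_actors):
    task_q.put(1)             # 1: request one rollout; the path arrives on result_q
paths = [result_q.get() for _ in range(num_actors)]
for _ in range(num_actors):
    task_q.put(2)             # 2: kill message; the worker breaks its loop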
class TRPOAgentContinousSingleProcess(object):
    def __init__(self, thread_id):
        print "create worker %d" % (thread_id)
        self.thread_id = thread_id
        self.env = env = Environment(gym.make(pms.environment_name))
        # print("Observation Space", env.observation_space)
        # print("Action Space", env.action_space)
        # print("Action area, high:%f, low%f" % (env.action_space.high, env.action_space.low))
        self.end_count = 0
        self.paths = []
        self.train = True
        self.baseline = Baseline()
        self.storage = Storage(self, self.env, self.baseline)
        self.distribution = DiagonalGaussian(pms.action_shape)
        self.session = self.master.session
        self.init_network()

    def init_network(self):
        self.network = NetworkContinous(str(self.thread_id))
        if pms.min_std is not None:
            log_std_var = tf.maximum(self.network.action_dist_logstds_n,
                                     np.log(pms.min_std))
            self.action_dist_stds_n = tf.exp(log_std_var)
        self.old_dist_info_vars = dict(mean=self.network.old_dist_means_n,
                                       log_std=self.network.old_dist_logstds_n)
        self.new_dist_info_vars = dict(mean=self.network.action_dist_means_n,
                                       log_std=self.network.action_dist_logstds_n)
        self.likehood_action_dist = self.distribution.log_likelihood_sym(
            self.network.action_n, self.new_dist_info_vars)
        self.ratio_n = self.distribution.likelihood_ratio_sym(
            self.network.action_n, self.new_dist_info_vars,
            self.old_dist_info_vars)
        surr = -tf.reduce_mean(self.ratio_n * self.network.advant)  # surrogate loss
        kl = tf.reduce_mean(self.distribution.kl_sym(self.old_dist_info_vars,
                                                     self.new_dist_info_vars))
        ent = self.distribution.entropy(self.old_dist_info_vars)
        # ent = tf.reduce_sum(-p_n * tf.log(p_n + eps)) / Nf
        self.losses = [surr, kl, ent]
        var_list = self.network.var_list
        self.gf = GetFlat(self.session, var_list)      # read theta from var_list
        self.sff = SetFromFlat(self.session, var_list)  # write theta into var_list
        # get g
        self.pg = flatgrad(surr, var_list)
        # get A: KL divergence where the first argument is held fixed
        # (replace old -> tf.stop_gradient of the current distribution)
        kl_firstfixed = kl_sym_gradient(self.network.old_dist_means_n,
                                        self.network.old_dist_logstds_n,
                                        self.network.action_dist_means_n,
                                        self.network.action_dist_logstds_n)
        # Differentiate the first-argument-fixed KL built above, so the
        # Fisher-vector product is exact at the current parameters.
        grads = tf.gradients(kl_firstfixed, var_list)
        self.flat_tangent = tf.placeholder(dtype, shape=[None])
        shapes = map(var_shape, var_list)
        start = 0
        tangents = []
        for shape in shapes:
            size = np.prod(shape)
            param = tf.reshape(self.flat_tangent[start:(start + size)], shape)
            tangents.append(param)
            start += size
        self.gvp = [tf.reduce_sum(g * t) for (g, t) in zip(grads, tangents)]
        self.fvp = flatgrad(tf.reduce_sum(self.gvp), var_list)  # get kl'' * p
        # self.load_model()

    def get_samples(self, path_number):
        for i in range(pms.paths_number):
            self.storage.get_single_path()

    def get_action(self, obs, *args):
        obs = np.expand_dims(obs, 0)
        # action_dist_logstd = np.expand_dims([np.log(pms.std)], 0)
        if pms.use_std_network:
            action_dist_means_n, action_dist_logstds_n = self.session.run(
                [self.network.action_dist_means_n,
                 self.network.action_dist_logstds_n],
                {self.network.obs: obs})
            if pms.train_flag:
                rnd = np.random.normal(size=action_dist_means_n[0].shape)
                action = rnd * np.exp(action_dist_logstds_n[0]) \
                         + action_dist_means_n[0]
            else:
                action = action_dist_means_n[0]
            # action = np.clip(action, pms.min_a, pms.max_a)
            return action, dict(mean=action_dist_means_n[0],
                                log_std=action_dist_logstds_n[0])
        else:
            action_dist_logstd = np.expand_dims([np.log(pms.std)], 0)
            action_dist_means_n = self.network.get_action_dist_means_n(
                self.session, obs)
            if pms.train_flag:
                rnd = np.random.normal(size=action_dist_means_n[0].shape)
                action = rnd * np.exp(action_dist_logstd[0]) \
                         + action_dist_means_n[0]
            else:
                action = action_dist_means_n[0]
            # action = np.clip(action, pms.min_a, pms.max_a)
            return action, dict(mean=action_dist_means_n[0],
                                log_std=action_dist_logstd[0])

    def run(self):
        self.learn()

    def learn(self):
        start_time = time.time()
        numeptotal = 0
        i = 0  # iteration counter, initialized once so checkpoints are numbered correctly
        while True:
            # Generating paths.
            self.get_samples(pms.paths_number)
            paths = self.storage.get_paths()
            # Computing returns and estimating the advantage function.
            sample_data = self.storage.process_paths(paths)
            agent_infos = sample_data["agent_infos"]
            obs_n = sample_data["observations"]
            action_n = sample_data["actions"]
            advant_n = sample_data["advantages"]
            n_samples = len(obs_n)
            inds = np.random.choice(
                n_samples, int(math.floor(n_samples * pms.subsample_factor)),
                replace=False)
            obs_n = obs_n[inds]
            action_n = action_n[inds]
            advant_n = advant_n[inds]
            action_dist_means_n = np.array(
                [agent_info["mean"] for agent_info in agent_infos[inds]])
            action_dist_logstds_n = np.array(
                [agent_info["log_std"] for agent_info in agent_infos[inds]])
            feed = {
                self.network.obs: obs_n,
                self.network.advant: advant_n,
                self.network.old_dist_means_n: action_dist_means_n,
                self.network.old_dist_logstds_n: action_dist_logstds_n,
                self.network.action_dist_logstds_n: action_dist_logstds_n,
                self.network.action_n: action_n
            }
            episoderewards = np.array([path["rewards"].sum() for path in paths])
            average_episode_std = np.mean(np.exp(action_dist_logstds_n))
            # print "\n********** Iteration %i ************" % i
            for iter_num_per_train in range(pms.iter_num_per_train):
                # if not self.train:
                #     print("Episode mean: %f" % episoderewards.mean())
                #     self.end_count += 1
                #     if self.end_count > 100:
                #         break
                if self.train:
                    thprev = self.gf()  # get theta_old

                    def fisher_vector_product(p):
                        feed[self.flat_tangent] = p
                        return self.session.run(self.fvp, feed) + pms.cg_damping * p

                    g = self.session.run(self.pg, feed_dict=feed)
                    # Conjugate gradient on the FVP; a minimal cg sketch
                    # follows this class.
                    stepdir = krylov.cg(fisher_vector_product, g,
                                        cg_iters=pms.cg_iters)
                    shs = 0.5 * stepdir.dot(fisher_vector_product(stepdir))
                    fullstep = stepdir * np.sqrt(2.0 * pms.max_kl / shs)
                    neggdotstepdir = -g.dot(stepdir)

                    def loss(th):
                        self.sff(th)
                        return self.session.run(self.losses, feed_dict=feed)

                    surr_prev, kl_prev, ent_prev = loss(thprev)
                    mean_advant = np.mean(advant_n)
                    theta = linesearch(loss, thprev, fullstep, neggdotstepdir)
                    self.sff(theta)
                    surrafter, kloldnew, entnew = self.session.run(
                        self.losses, feed_dict=feed)

                    stats = {}
                    numeptotal += len(episoderewards)
                    stats["average_episode_std"] = average_episode_std
                    stats["sum steps of episodes"] = sample_data["sum_episode_steps"]
                    stats["Total number of episodes"] = numeptotal
                    stats["Average sum of rewards per episode"] = episoderewards.mean()
                    # stats["Entropy"] = entropy
                    # exp = explained_variance(np.array(baseline_n), np.array(returns_n))
                    # stats["Baseline explained"] = exp
                    stats["Time elapsed"] = "%.2f mins" % ((time.time() - start_time) / 60.0)
                    stats["KL between old and new distribution"] = kloldnew
                    stats["Surrogate loss"] = surrafter
                    stats["Surrogate loss prev"] = surr_prev
                    stats["entropy"] = ent_prev
                    stats["mean_advant"] = mean_advant
                    log_data = [average_episode_std, len(episoderewards),
                                numeptotal, episoderewards.mean(), kloldnew,
                                surrafter, surr_prev, surrafter - surr_prev,
                                ent_prev, mean_advant]
                    self.master.logger.log_row(log_data)
                    # for k, v in stats.iteritems():
                    #     print(k + ": " + " " * (40 - len(k)) + str(v))
                    # if entropy != entropy:
                    #     exit(-1)
                    # if exp > 0.95:
                    #     self.train = False
            if self.thread_id == 1:
                self.master.save_model("iter" + str(i))
            print episoderewards.mean()
            i += 1
    def test(self, model_name):
        self.load_model(model_name)
        for i in range(50):
            self.storage.get_single_path()

    def save_model(self, model_name):
        self.saver.save(self.session, "checkpoint/" + model_name + ".ckpt")

    def load_model(self, model_name):
        try:
            self.saver.restore(self.session, model_name)
        except:
            print "load model %s fail" % (model_name)
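# Aside: krylov.cg above solves F * stepdir = g given only the Fisher-vector
# product, never forming F. A minimal conjugate-gradient solver of that shape
# (a sketch; the repo's krylov.cg may differ in its stopping details):
import numpy as np

def cg(f_Ax, b, cg_iters=10, residual_tol=1e-10):
    x = np.zeros_like(b)   # start from x = 0, so the residual is b
    r = b.copy()           # residual r = b - A x
    p = r.copy()           # initial search direction
    rdotr = r.dot(r)
    for _ in range(cg_iters):
        Ap = f_Ax(p)                     # the only access to A: a matrix-vector product
        alpha = rdotr / p.dot(Ap)        # exact step length along p
        x += alpha * p
        r -= alpha * Ap
        new_rdotr = r.dot(r)
        p = r + (new_rdotr / rdotr) * p  # next direction, conjugate to the previous ones
        rdotr = new_rdotr
        if rdotr < residual_tol:
            break
    return x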