def __init__(self, act_space, lstm, gamma, scope="agent", **kwargs):
    self.act_space = act_space
    self.scope = scope

    # Rollout tensors handed in via kwargs.
    self.s_t = kwargs.get("s")
    self.previous_actions = kwargs.get("prev_a")
    self.state_in = kwargs.get("state_in")
    self.slots = tf.cast(kwargs.get("slots"), tf.float32)

    prev_a = tf.one_hot(
        self.previous_actions, depth=act_space, dtype=tf.float32)

    # Shared feature net: observation plus previous action through the LSTM.
    s_feature, self.state_out = self.feature_net(
        self.s_t, lstm, prev_a, self.state_in, scope + "_feature")

    # Policy logits, sampled action, and slot-masked value estimate.
    self.current_act_logits = self.a_net(s_feature, scope + "_a")
    self.current_act = tf.squeeze(
        categorical(self.current_act_logits), axis=-1)
    self.vf = self.v_net(s_feature, scope + "_value") * self.slots

    # Training-only branch: build the bootstrap value and V-trace targets.
    self.bootstrap_s = kwargs.get("bootstrap_s")
    if self.bootstrap_s is not None:
        self.bootstrap_slots = tf.cast(
            kwargs.get("bootstrap_slots"), tf.float32)
        self.r_t = kwargs.get("r")
        self.old_vf = kwargs.get("v_cur")
        self.old_act_logits = kwargs.get("a_logits")
        self.a_t = kwargs.get("a")

        a_onehot = tf.one_hot(self.a_t, depth=act_space, dtype=tf.float32)

        # Encode the bootstrap frame as a length-1 sequence, continuing
        # from the rollout's final recurrent state.
        bootstrap_feature, _ = self.feature_net(
            self.bootstrap_s[:, None, :, :, :], lstm,
            a_onehot[:, -2:-1, :], self.state_out, scope + "_feature")
        bootstrap_feature = bootstrap_feature[:, -1, :]
        bootstrap_value = self.v_net(
            bootstrap_feature, scope + "_value") * self.bootstrap_slots

        # Off-policy corrected value targets and advantages.
        vtrace = vtrace_from_logits(
            self.old_act_logits, self.current_act_logits,
            self.a_t, gamma * tf.ones_like(self.a_t, tf.float32),
            self.r_t, self.vf, bootstrap_value)

        self.vs = vtrace.vs
        self.adv = vtrace.advantages
        self.pg_adv = vtrace.pg_advantages
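
# Hedged sketch, not part of the original file: the `categorical` helper used by the
# agents in this file is assumed to draw one action index per row of logits. A minimal
# TF1-style stand-in under that assumption is shown below (the repo's own helper may
# handle shapes, dtypes, or seeding differently); callers strip the trailing singleton
# axis with tf.squeeze(..., axis=-1).
import tensorflow as tf

def categorical_sketch(logits):
    # logits: [..., act_space] -> sampled indices with shape [..., 1].
    shape = tf.shape(logits)
    flat = tf.reshape(logits, [-1, shape[-1]])
    # tf.random.categorical (tf.multinomial in older TF1) samples one index per row.
    samples = tf.random.categorical(flat, num_samples=1)
    return tf.reshape(samples, tf.concat([shape[:-1], [1]], axis=0))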
def __init__(self, act_space, rnn, use_rmc, use_hrnn,
             use_reward_prediction, after_rnn, use_pixel_control,
             use_pixel_reconstruction, scope="agent", **kwargs):
    self.act_space = act_space
    self.scope = scope
    self.use_rmc = use_rmc
    self.use_hrnn = use_hrnn

    # Rollout tensors handed in via kwargs.
    self.s_t = kwargs.get("s")
    self.previous_actions = kwargs.get("prev_a")
    self.prev_r = kwargs.get("prev_r")
    self.state_in = kwargs.get("state_in")

    prev_a = tf.one_hot(
        self.previous_actions, depth=act_space, dtype=tf.float32)

    # CNN + RNN encoder; also exposes the intermediate CNN and image features
    # for the auxiliary heads below.
    self.feature, self.cnn_feature, self.image_feature, self.state_out = self.feature_net(
        self.s_t, rnn, prev_a, self.prev_r, self.state_in, scope + "_current_feature")

    if self.use_hrnn:
        # Keep the hierarchical RNN's prior/posterior statistics and use the
        # posterior sample as the feature.
        self.p_zs = self.feature["p_zs"]
        self.p_mus = self.feature["p_mus"]
        self.p_sigmas = self.feature["p_sigmas"]
        self.q_mus = self.feature["q_mus"]
        self.q_sigmas = self.feature["q_sigmas"]
        self.feature = self.feature["q_zs"]

    # Policy and value heads.
    self.current_act_logits = self.a_net(self.feature, scope + "_acurrent")
    self.current_act = tf.squeeze(
        categorical(self.current_act_logits), axis=-1)

    self.current_value = self.v_net(self.feature, scope + "_ccurrent")

    # Training-only branch: advantages are precomputed by the learner.
    advantage = kwargs.get("adv", None)
    if advantage is not None:
        self.old_current_value = kwargs.get("v_cur")
        self.ret = advantage + self.old_current_value

        self.a_t = kwargs.get("a")
        self.behavior_logits = kwargs.get("a_logits")
        self.r_t = kwargs.get("r")

        # Whiten advantages over the batch and time axes
        # (see the NumPy sketch after this method).
        self.adv_mean = tf.reduce_mean(advantage, axis=[0, 1])
        advantage -= self.adv_mean
        self.adv_std = tf.math.sqrt(
            tf.reduce_mean(advantage ** 2, axis=[0, 1]))
        self.advantage = advantage / tf.maximum(self.adv_std, 1e-12)

        self.slots = tf.cast(kwargs.get("slots"), tf.float32)

        # Optional auxiliary heads.
        if use_reward_prediction:
            if after_rnn:
                self.reward_prediction = self.r_net(self.feature, "r_net")
            else:
                self.reward_prediction = self.r_net(self.cnn_feature, "r_net")

        if use_pixel_reconstruction:
            self.pixel_reconstruction = self.reconstruct_net(self.feature)

        if use_pixel_control:
            self.pixel_control = self.control_net(self.feature)
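
# Hedged illustration, not from the original repo: the whitening above subtracts the
# mean over the batch and time axes and divides by the RMS deviation, guarded by a
# small epsilon. NumPy equivalent for a [batch, time] advantage array:
import numpy as np

def normalize_advantage_sketch(adv, eps=1e-12):
    mean = adv.mean(axis=(0, 1))
    centered = adv - mean
    std = np.sqrt((centered ** 2).mean(axis=(0, 1)))
    return centered / np.maximum(std, eps)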
def __init__(self, act_space, gamma, n_step, rnn,
             use_hrnn, use_rmc, use_amc, use_beta,
             use_reward_prediction, after_rnn, use_pixel_control,
             is_training=False, **kwargs):
    self.act_space = act_space
    self.n_step = n_step
    self.use_hrnn = use_hrnn
    self.use_rmc = use_rmc
    self.use_amc = use_amc

    # Rollout tensors handed in via kwargs.
    self.s = kwargs.get("s")
    self.a = kwargs.get("a")
    self.r = kwargs.get("r")
    self.state_in = kwargs.get("state_in")

    feature, self.state_out = self.feature_net(
        self.s, rnn, self.a, self.r, self.state_in)

    if self.use_hrnn:
        # Keep the hierarchical RNN's prior/posterior statistics and use the
        # posterior sample as the feature.
        self.p_zs = feature["p_zs"]
        self.p_mus = feature["p_mus"]
        self.p_sigmas = feature["p_sigmas"]
        self.q_mus = feature["q_mus"]
        self.q_sigmas = feature["q_sigmas"]
        feature = feature["q_zs"]

    # Learnable temperature alpha, kept positive via softplus.
    with tf.variable_scope("alpha", reuse=tf.AUTO_REUSE):
        alpha = tf.get_variable(
            name="alpha",
            shape=(1, 1, 1),
            dtype=tf.float32,
            initializer=tf.zeros_initializer())
    tf.summary.scalar("alpha", tf.reduce_mean(alpha))
    alpha = tf.log(1.0 + tf.exp(alpha))

    # Joint head: Q-values, state value, and policy logits.
    self.qf, self.current_value, self.current_act_logits = self.q_fn(
        feature, alpha, use_beta, "q")

    self.current_act = tf.squeeze(
        categorical(self.current_act_logits), axis=-1)

    if is_training:
        self.mask = tf.cast(kwargs.get("mask"), tf.float32)
        self.behavior_logits = kwargs.get("a_logits")
        self.old_vf = kwargs.get("v_cur")

        self.current_value = self.current_value * self.mask

        # qa, qa1 and n-step rewards.
        # Q of the actions actually taken, masked to valid steps.
        self.qa = tf.reduce_sum(
            tf.one_hot(
                self.a[:, 1:1 - self.n_step],
                depth=self.act_space, dtype=tf.float32
            ) * self.qf[:, :-n_step],
            axis=-1) * self.mask[:, :-n_step]

        self.qf1, _, _ = self.q_fn(feature, alpha, use_beta, "q_target")

        # Double-Q style bootstrap at t+n from the target head.
        q1f = self.qf[:, n_step:, :]
        q1f1 = self.qf1[:, n_step:, :]
        self.qa1 = doubleQ(q1f1, q1f) * self.mask[:, n_step:]
        # self.q1f = self.qf[:, n_step:, :]
        # self.qa1 = tf.reduce_max(self.q1f, axis=-1) * self.mask[:, n_step:]

        # n-step discounted reward sums via a cumulative-sum trick
        # (see the NumPy sketch after this method).
        gammas = tf.pow(
            gamma, tf.range(0, get_shape(self.r)[1], dtype=tf.float32))
        gammas_1 = 1.0 / gammas
        returns = tf.cumsum(self.r * gammas[None, :], axis=1)
        discount_n_step_rewards = returns[:, n_step:] - returns[:, :-n_step]
        self.n_step_rewards = discount_n_step_rewards * gammas_1[None, :-n_step]

        self.n_step_qs = tf.stop_gradient(
            self.n_step_rewards + gamma ** n_step * self.qa1)

        # target_values = tf.reduce_sum(
        #     tf.one_hot(
        #         self.a[:, 1: 1 - self.n_step],
        #         depth=self.act_space, dtype=tf.float32
        #     ) * self.qf1[:, :-n_step], axis=-1) * self.mask[:, :-n_step]

        # Retrace targets for the Q head.
        retrace = retrace_from_logits(
            self.behavior_logits[:, :-n_step, :],
            self.current_act_logits[:, :-n_step, :],
            self.a[:, 1:1 - n_step],
            gamma * tf.ones_like(self.a[:, 1:1 - n_step], tf.float32),
            tf.ones_like(self.a[:, 1:1 - n_step], tf.float32),
            self.r[:, 1:1 - n_step],
            self.qa, self.qa, self.qa1[:, -n_step])

        self.retrace_qs = retrace.qs

        # V-trace targets and advantages for the value/policy heads.
        vtrace = vtrace_from_logits(
            self.behavior_logits[:, :-n_step, :],
            self.current_act_logits[:, :-n_step, :],
            self.a[:, 1:1 - n_step],
            gamma * tf.ones_like(self.a[:, 1:1 - n_step], tf.float32),
            self.r[:, 1:1 - n_step],
            self.current_value[:, :-n_step],
            self.current_value[:, -n_step])

        self.vs = vtrace.vs
        self.adv = vtrace.advantages
        self.pg_adv = vtrace.pg_advantages

        self.adv_mean = tf.reduce_mean(self.adv)
        advantages = self.adv - self.adv_mean
        self.adv_std = tf.math.sqrt(tf.reduce_mean(advantages ** 2))

        # Optional auxiliary heads.
        if use_reward_prediction:
            if after_rnn:
                self.reward_prediction = self.r_net(feature[:, :-n_step, :])
            else:
                raise ValueError("only after rnn")

        if use_pixel_control:
            self.pixel_control = self.control_net(feature[:, :-n_step, :])
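
# Hedged worked example, illustration only: the gammas/cumsum block above forms the
# windowed discounted sums diff_t = sum_{j=1..n} gamma^j * r[t+j] without a Python
# loop, by differencing prefix sums of gamma^t * r_t and rescaling by 1/gamma^t.
# NumPy equivalent:
import numpy as np

def n_step_rewards_sketch(r, gamma, n_step):
    # r: [batch, T] rewards.
    T = r.shape[1]
    gammas = gamma ** np.arange(T)                      # gamma^t
    returns = np.cumsum(r * gammas[None, :], axis=1)    # prefix sums of gamma^t * r_t
    diff = returns[:, n_step:] - returns[:, :-n_step]   # windowed sums, still scaled by gamma^t
    return diff / gammas[None, :-n_step]                # rescale each window back to its start

# e.g. n_step_rewards_sketch(np.array([[1., 1., 1., 1.]]), 0.9, 2)
# -> [[1.71, 1.71]]  (0.9 * 1 + 0.81 * 1 for each of the two valid windows)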
def __init__(self, act_space, gamma, n_step, use_soft, rnn,
             use_hrnn, use_reward_prediction, after_rnn,
             use_pixel_control, is_training=False, **kwargs):
    self.act_space = act_space
    self.n_step = n_step
    self.use_hrnn = use_hrnn

    # Rollout tensors handed in via kwargs.
    self.s = kwargs.get("s")
    self.a = kwargs.get("a")
    self.r = kwargs.get("r")
    self.state_in = kwargs.get("state_in")

    feature, self.state_out = self.feature_net(
        self.s, rnn, self.a, self.r, self.state_in)

    if self.use_hrnn:
        # Keep the hierarchical RNN's prior/posterior statistics and use the
        # posterior sample as the feature.
        self.p_zs = feature["p_zs"]
        self.p_mus = feature["p_mus"]
        self.p_sigmas = feature["p_sigmas"]
        self.q_mus = feature["q_mus"]
        self.q_sigmas = feature["q_sigmas"]
        feature = feature["q_zs"]

    self.qf = self.q_fn(feature, "q")

    if use_soft:
        # Boltzmann action selection with a learnable, positive temperature.
        with tf.variable_scope("temperature", reuse=tf.AUTO_REUSE):
            temperature = tf.get_variable(
                name="temperature",
                shape=(1, 1, 1),
                dtype=tf.float32,
                initializer=tf.ones_initializer())
        temperature = tf.log(1.0 + tf.exp(temperature))
        self.qf_logits = temperature * self.qf
        self.current_act = tf.squeeze(
            categorical(self.qf_logits), axis=-1)
    else:
        # Greedy action selection.
        self.current_act = tf.argmax(self.qf, axis=-1)

    if is_training:
        self.mask = tf.cast(kwargs.get("mask"), tf.float32)

        # Q of the actions actually taken, masked to valid steps.
        self.qa = tf.reduce_sum(
            tf.one_hot(
                self.a[:, 1:1 - self.n_step],
                depth=self.act_space, dtype=tf.float32
            ) * self.qf[:, :-self.n_step],
            axis=-1) * self.mask[:, :-n_step]

        # Double-Q style bootstrap at t+n: the target network evaluates the
        # online network's action choice (see the sketch after this method).
        feature1 = feature[:, n_step:, :]
        self.q1f1 = self.q_fn(feature1, "q_target")
        self.q1f = self.q_fn(feature1, "q")
        self.qa1 = doubleQ(self.q1f1, self.q1f) * self.mask[:, n_step:]

        # n-step discounted reward sums via the cumulative-sum trick sketched
        # after the previous __init__.
        gammas = tf.pow(
            gamma, tf.range(0, get_shape(self.r)[1], dtype=tf.float32))
        gammas_1 = 1.0 / gammas
        returns = tf.cumsum(self.r * gammas[None, :], axis=1)
        discount_n_step_rewards = returns[:, n_step:] - returns[:, :-n_step]
        self.n_step_rewards = discount_n_step_rewards * gammas_1[None, :-n_step]

        # Optional auxiliary heads.
        if use_reward_prediction:
            if after_rnn:
                self.reward_prediction = self.r_net(feature[:, :-n_step, :])
            else:
                raise ValueError("only after rnn")

        if use_pixel_control:
            self.pixel_control = self.control_net(feature[:, :-n_step, :])
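
# Hedged sketch, an assumption rather than the repo's definition: `doubleQ(q_target,
# q_online)` is used above in the Double Q-learning sense -- the online head picks the
# greedy action and the target head evaluates it. A minimal version under that
# assumption:
import tensorflow as tf

def doubleQ_sketch(q_target, q_online):
    # q_target, q_online: [batch, time, act_space]
    greedy = tf.argmax(q_online, axis=-1)                 # online argmax action
    onehot = tf.one_hot(
        greedy, depth=tf.shape(q_online)[-1], dtype=q_target.dtype)
    return tf.reduce_sum(q_target * onehot, axis=-1)      # target value of that action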
def __init__(self, act_space, lstm, gamma, use_double, scope="agent", **kwargs):
    self.act_space = act_space
    self.scope = scope

    # Rollout tensors handed in via kwargs.
    self.s = kwargs.get("s")
    self.prev_a = kwargs.get("prev_a")
    self.state_in = kwargs.get("state_in")
    self.slots = tf.cast(kwargs.get("slots"), tf.float32)

    feature, self.state_out = self.feature_net(
        self.s, lstm, self.prev_a, self.state_in)

    # Q head, value head, and policy logits over slot-masked features.
    self.qf, self.vf, self.act_logits = self.head_fn(
        feature, self.slots, "current")

    self.act = tf.squeeze(categorical(self.act_logits), axis=-1)

    # Training-only branch: bootstrap features, double-Q targets, and V-trace.
    self.bootstrap_s = kwargs.get("bootstrap_s")
    if self.bootstrap_s is not None:
        self.bootstrap_prev_a = kwargs.get("bootstrap_prev_a")
        self.bootstrap_slots = tf.cast(
            kwargs.get("bootstrap_slots"), tf.float32)
        self.a = kwargs.get("a")
        self.r = kwargs.get("r")
        self.old_act_logits = kwargs.get("a_logits")
        self.n_step_r = kwargs.get("n_step_r")
        self.v_cur = kwargs.get("v_cur")
        self.advantage = kwargs.get("adv")
        self.v_tar = kwargs.get("v_tar")

        # Q of the actions actually taken.
        self.qa = tf.reduce_sum(
            tf.one_hot(self.a, depth=self.act_space, dtype=tf.float32)
            * self.qf, axis=-1)

        # Continue the recurrent state over the bootstrap segment, then shift
        # features forward by its length to form the t+n inputs.
        bootstrap_feature, _ = self.feature_net(
            self.bootstrap_s, lstm, self.bootstrap_prev_a, self.state_out)
        n_step = get_shape(bootstrap_feature)[1]
        feature1 = tf.concat(
            [feature[:, n_step:, :], bootstrap_feature], axis=1)
        slots1 = tf.concat(
            [self.slots[:, n_step:], self.bootstrap_slots], axis=1)

        self.q1f, self.v1f, _ = self.head_fn(feature1, slots1, "current")
        if use_double:
            # Double-Q: the target head evaluates the current head's choice.
            self.q1f1, self.v1f1, _ = self.head_fn(feature1, slots1, "target")
            self.qa1 = doubleQ(self.q1f1, self.q1f)
        else:
            self.qa1 = tf.reduce_max(self.q1f, axis=-1)

        # Apply V-trace after mapping values through h_inv, then re-squash the
        # resulting targets with h (see the sketch after this method for the
        # assumed form of h / h_inv).
        vtrace = vtrace_from_logits(
            self.old_act_logits, self.act_logits, self.a,
            gamma * tf.ones_like(self.a, tf.float32),
            self.r, h_inv(self.vf), h_inv(self.v1f[:, 0]))

        self.vtrace_advantage = vtrace.advantages
        self.vtrace_vf = h(vtrace.vs)
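
# Hedged sketch, an assumption about helpers defined elsewhere: `h` / `h_inv` above
# look like the invertible value rescaling of Pohlen et al. (2018), also used in R2D2.
# The repo's own definitions (in particular the epsilon) may differ; this is only a
# sketch of the usual formulas.
import tensorflow as tf

def h_sketch(x, eps=1e-2):
    # Squash large magnitudes while staying invertible.
    return tf.sign(x) * (tf.sqrt(tf.abs(x) + 1.0) - 1.0) + eps * x

def h_inv_sketch(x, eps=1e-2):
    # Closed-form inverse of h_sketch.
    return tf.sign(x) * (
        tf.square(
            (tf.sqrt(1.0 + 4.0 * eps * (tf.abs(x) + 1.0 + eps)) - 1.0) / (2.0 * eps))
        - 1.0)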