def f(inputs):
    """Network factory: build encoder / transition / decoder sub-networks.

    :param inputs: [state, action, frame] tensors supplied by network.Network.
    :return: (outputs, sub_networks) — no direct outputs, three named
        sub-networks ("se", "decoder", "transition").
    """
    obs, act, frame = inputs
    act_onehot = tf.one_hot(indices=act,
                            depth=num_actions,
                            on_value=1.0,
                            off_value=0.0,
                            axis=-1)
    # Shared state encoder; its "se" tensor feeds the latent-space branch.
    encoder_net = network.Network([obs], f_se, var_scope="state_encoder")
    encoded = encoder_net["se"].op
    if self._with_ob:
        # Observation-space model: transition/decoder consume raw frames.
        transition_net = network.Network([obs, act_onehot],
                                         f_transition,
                                         var_scope="ObTranModel")
        decoder_net = network.Network([frame], f_decoder,
                                      var_scope="ObDecoder")
    else:
        # Latent-space model: transition operates on the encoded state.
        transition_net = network.Network([encoded, act_onehot],
                                         f_transition,
                                         var_scope="TranModel")
        decoder_net = network.Network(
            [tf.concat((encoded, encoded), axis=-1), frame],
            f_decoder,
            var_scope="Decoder")
    return {}, {
        "se": encoder_net,
        "decoder": decoder_net,
        "transition": transition_net
    }
def init_network(self, f_create_net, state_shape, *args, **kwargs):
    """Create the learner network plus a frozen "old" copy with a syncer.

    :param f_create_net: network-factory callable passed to network.Network.
    :param state_shape: iterable of state dimensions; batch dim is prepended.
    :return: the "learn" network; the "old" network and its syncer are
        stored on self (_old_network, _old_network_syncer).
    """
    state_ph = tf.placeholder(dtype=tf.float32,
                              shape=[None] + list(state_shape),
                              name="input_state")
    learn_net = network.Network([state_ph], f_create_net, var_scope="learn")
    # Second copy of the same graph holds the "old" (behaviour) parameters.
    self._old_network = network.Network([state_ph], f_create_net,
                                        var_scope="old")
    # Syncer copies learner weights into the old network on demand.
    self._old_network_syncer = network.NetworkSyncer(learn_net,
                                                     self._old_network)
    return learn_net
def f(inputs):
    """Network factory: actor-critic on top of a shared state encoder.

    NOTE(review): this factory ignores its ``inputs`` argument and reads
    ``input_state`` / ``input_action`` from the enclosing scope — the
    parameter exists only to satisfy the network-factory signature.
    Confirm against the caller before relying on ``inputs``.

    :return: (outputs, sub_networks) — "action", "q" (critic on the fed
        action) and "v" (critic re-applied to the actor's own action),
        plus the "se"/"actor"/"critic" sub-networks.
    """
    # Fix: dropped dead locals `state`/`action` that merely aliased the
    # closure variables and (for `action`) were never used at all.
    net_se = network.Network([input_state], f_se, var_scope="se")
    se = net_se["se"].op
    net_actor = network.Network([se], f_actor, var_scope="actor")
    net_critic = network.Network([se, input_action], f_critic,
                                 var_scope="critic")
    # Re-apply the critic to the actor's own action to obtain a state value.
    net_critic_for_a = net_critic([se, net_actor["action"].op],
                                  name_scope="v_critic")
    return {
        "action": net_actor["action"].op,
        "q": net_critic["q"].op,
        "v": net_critic_for_a["q"].op,
    }, {
        "se": net_se,
        "actor": net_actor,
        "critic": net_critic
    }
def init_network(self, f_iaa, state_shape, num_action, *args, **kwargs):
    """Build the learner network for the imagination-augmented agent.

    :param f_iaa: network factory; receives [state, action, num_action].
    :param state_shape: iterable of state dimensions; batch dim prepended.
    :param num_action: action-space size, forwarded to the factory as a
        plain Python value alongside the placeholder tensors.
    :return: the constructed "learn" network.
    """
    state_ph = tf.placeholder(dtype=tf.float32,
                              shape=[None] + list(state_shape),
                              name="input_state")
    action_ph = tf.placeholder(dtype=tf.uint8,
                               shape=[None],
                               name="input_action")
    return network.Network([state_ph, action_ph, num_action],
                           f_iaa,
                           var_scope="learn")
def f_icm(inputs):
    """Build the intrinsic-curiosity-module (ICM) graph.

    :param inputs: a list, [state, next_state, action]
    :return: a dict of op
    """
    state, next_state, action = inputs[0], inputs[1], inputs[2]
    # Two encoders with separate scopes embed state and next_state.
    net_se1 = network.Network([state], f_se, var_scope='learn_se1')
    phi1 = network.NetworkFunction(net_se1["se"]).output().op
    net_se2 = network.Network([next_state], f_se, var_scope='learn_se2')
    phi2 = network.NetworkFunction(net_se2["se"]).output().op
    # Actor-critic head on the first embedding.
    net_ac = network.Network([phi1], f_ac, var_scope='learn_ac')
    v = network.NetworkFunction(net_ac["v"]).output().op
    pi_dist = network.NetworkFunction(net_ac["pi"]).output().op
    one_hot_action = tf.one_hot(indices=action,
                                depth=env.action_space.n,
                                on_value=1.0,
                                off_value=0.0,
                                axis=-1)
    # Forward model predicts the next embedding from (action, phi1).
    net_forward = network.Network([one_hot_action, phi1], f_forward,
                                  var_scope='learn_forward')
    phi2_hat = network.NetworkFunction(net_forward["phi2_hat"]).output().op
    # Inverse model predicts the action from the two embeddings.
    net_inverse = network.Network([phi1, phi2], f_inverse,
                                  var_scope='learn_inverse')
    logits = network.NetworkFunction(net_inverse["logits"]).output().op
    # Curiosity bonus: scaled forward-model prediction error.
    bonus = 0.05 * tf.reduce_sum(tf.square(phi2 - phi2_hat), axis=1)
    return {
        "pi": pi_dist,
        "v": v,
        "logits": logits,
        "phi1": phi1,
        "phi2": phi2,
        "phi2_hat": phi2_hat,
        "bonus": bonus
    }
def init_network(self, f_se, f_transition, f_decoder, state_shape,
                 num_actions, *args, **kwargs):
    """Build the learner network for the model-based (env-model) agent.

    :param f_se: state-encoder network factory.
    :param f_transition: transition-model network factory.
    :param f_decoder: frame-decoder network factory.
    :param state_shape: state dims; input_frame uses its first two + RGB.
    :param num_actions: action-space size used for one-hot encoding.
    :return: the "learn" network wrapping encoder/transition/decoder.
    """

    def build(inputs):
        # Factory invoked by network.Network; returns ({}, sub_networks).
        obs, act, frame = inputs[0], inputs[1], inputs[2]
        act_onehot = tf.one_hot(indices=act,
                                depth=num_actions,
                                on_value=1.0,
                                off_value=0.0,
                                axis=-1)
        encoder = network.Network([obs], f_se, var_scope="state_encoder")
        se = encoder["se"].op
        if not self._with_ob:
            # Latent-space transition model.
            transition = network.Network([se, act_onehot], f_transition,
                                         var_scope="TranModel")
            decoder = network.Network(
                [tf.concat((se, se), axis=-1), frame],
                f_decoder,
                var_scope="Decoder")
        else:
            # Observation-space transition model on raw observations.
            transition = network.Network([obs, act_onehot], f_transition,
                                         var_scope="ObTranModel")
            decoder = network.Network([frame], f_decoder,
                                      var_scope="ObDecoder")
        return {}, {
            "se": encoder,
            "decoder": decoder,
            "transition": transition
        }

    frame_ph = tf.placeholder(
        dtype=tf.float32,
        shape=[None, state_shape[0], state_shape[1], 3],
        name="input_frame")
    state_ph = tf.placeholder(dtype=tf.float32,
                              shape=[None] + list(state_shape),
                              name="input_state")
    action_ph = tf.placeholder(dtype=tf.uint8,
                               shape=[None],
                               name="input_action")
    return network.Network([state_ph, action_ph, frame_ph], build,
                           var_scope="learn")
def f(inputs):
    """Network factory: actor-critic with a saturating exploration-noise head.

    :param inputs: [state, action, noise] tensors.
    :return: (outputs, sub_networks) — noisy "action", its "action_mean" /
        "action_noise" components, critic "q", and the four sub-networks.
    """
    obs, act, noise_in = inputs
    encoder = network.Network([obs], f_se, var_scope="se")
    se = encoder["se"].op
    critic = network.Network([se, act], f_critic, var_scope="critic")
    actor = network.Network([se], f_actor, var_scope="actor")
    # stop_gradient: the noise head must not backprop into the encoder.
    noise_net = network.Network([tf.stop_gradient(se), noise_in], f_noise,
                                var_scope="noise")
    mean_action = actor["action"].op
    noise_out = noise_net["noise"].op
    # Shift the mean action towards sign(noise) by a tanh-squashed step.
    noisy_action = mean_action \
        + tf.abs(tf.sign(noise_out) - mean_action) * tf.tanh(noise_out)
    return {
        "se": se,
        "action": noisy_action,
        "action_mean": mean_action,
        "action_noise": noise_out,
        "q": critic["q"].op
    }, {
        "se": encoder,
        "actor": actor,
        "critic": critic,
        "noise": noise_net
    }
def f_iaa(inputs):
    """Network factory for the imagination-augmented agent (I2A-style).

    Builds: a state encoder, a rollout policy, a transition (env) model,
    a frame decoder, and a rollout encoder; unrolls `rollout_lane`
    imagined trajectories of length `rollout_depth`, encodes them, and
    feeds the concatenated rollout codes (optionally with the raw
    encoding) into an actor-critic head.

    :param inputs: [observation, action, action_dim] — observation and
        action are tensors, action_dim a plain int (see init_network).
    :return: ({"v", "pi", "rollout_action"}, sub-networks dict).

    NOTE(review): relies on many closure variables (compute_with_diff,
    state_shape, with_ob, rollout_lane, rollout_depth, policy_with_iaa,
    f_se/f_rollout/f_tran/f_decoder/f_encoder/f_ac) defined elsewhere.
    """
    input_observation = inputs[0]
    if compute_with_diff:
        logging.warning("use diff 2333")
        # Build frame-difference channels: each RGB frame minus the
        # previous one (channels are assumed to be stacked RGB frames,
        # 3 per frame — TODO confirm).
        diff_ob = []
        # NOTE(review): `/` here is true division under Python 3, which
        # would make range() fail; this code appears to assume Python 2
        # (or TF Dimension) floor-division semantics — verify.
        for i in range(input_observation.shape[-1] / 3 - 1):
            diff_ob.append(
                input_observation[:, :, :, (i + 1) * 3:(i + 1) * 3 + 3]
                - input_observation[:, :, :, i * 3:i * 3 + 3])
        net_se = network.Network([tf.concat(diff_ob[:], axis=3)], f_se,
                                 var_scope="se_1")
        # Differencing drops one frame's worth of channels.
        self.processed_state_shape = copy.copy(state_shape)
        self.processed_state_shape[-1] = state_shape[-1] - 3
    else:
        net_se = network.Network([input_observation], f_se,
                                 var_scope="se_1")
        self.processed_state_shape = state_shape
    input_action = inputs[1]
    action_dim = inputs[2]
    # Rebinds input_action from index tensor to its one-hot encoding.
    input_action = tf.one_hot(indices=input_action, depth=action_dim,
                              on_value=1.0, off_value=0.0, axis=-1)
    se = net_se["se"].op
    # Placeholders used only to give the rollout-encoder / decoder
    # sub-networks concrete input signatures; both names are later
    # shadowed by tensors inside the rollout loop.
    input_reward = tf.placeholder(dtype=tf.float32, shape=[None, 1],
                                  name="input_reward")
    encode_state = tf.placeholder(dtype=tf.float32,
                                  shape=[None, se.shape.as_list()[-1]],
                                  name="encode_states")
    input_frame = tf.placeholder(
        dtype=tf.float32,
        shape=[None, state_shape[0], state_shape[1], 3],
        name="input_frame")
    # Rollout policy chooses imagined actions from the current encoding.
    rollout = network.Network([se], f_rollout, var_scope="rollout_policy")
    if not with_ob:
        # Latent-space environment model.
        net_model = network.Network([se, input_action], f_tran,
                                    var_scope="TranModel")
        net_decoder = network.Network([
            tf.concat(
                (encode_state, encode_state), axis=-1), input_frame
        ], f_decoder, var_scope="Decoder")
    else:
        # Observation-space environment model.
        net_model = network.Network([input_observation, input_action],
                                    f_tran, var_scope="TranModelOB")
        net_decoder = network.Network([input_frame], f_decoder,
                                      var_scope="DecoderOB")
    rollout_encoder = network.Network(
        [tf.concat((se, se), axis=-1), input_reward], f_encoder,
        var_scope="rollout_encoder")
    current_state = se
    current_ob = input_observation
    # Unroll rollout_lane imagined trajectories, each rollout_depth steps.
    for i in range(rollout_lane):
        for j in range(rollout_depth):
            current_rollout = rollout([current_state],
                                      name_scope="rollout_%d_%d" % (i, j))
            # rollout_action_dist = tf.contrib.distributions.Categorical(rollout_action_function.output().op)
            # current_action = rollout_action_dist.sample()
            if not with_ob:
                tran_model = net_model([
                    current_state, current_rollout["rollout_action"].op
                ], name_scope="env_model_%d_%d" % (i, j))
            else:
                tran_model = net_model(
                    [current_ob, current_rollout["rollout_action"].op],
                    name_scope="env_model_%d_%d" % (i, j))
            next_goal = tran_model["next_state"].op
            reward = tran_model["reward"].op
            if not with_ob:
                # Latent model predicts a delta; accumulate it.
                current_state += next_goal
            else:
                # Slide the frame window: drop oldest frame, append the
                # predicted one, then re-encode (gradient-stopped).
                current_ob = tf.concat(
                    [current_ob[:, :, :, 3:], next_goal], axis=-1)
                next_goal = tf.stop_gradient(
                    net_se([current_ob])["se"].op)
            if j == 0:
                encode_states = next_goal
                rollout_reward = reward
            else:
                # Newest step is prepended along the feature axis;
                # rewards are stacked along axis 0.
                encode_states = tf.concat([next_goal, encode_states],
                                          axis=-1)
                rollout_reward = tf.concat([rollout_reward, reward],
                                           axis=0)
        # Reset imagination to the real state for the next lane.
        current_state = se
        current_ob = input_observation
        # Shadow the placeholders with per-step tensors for this lane.
        input_reward = tf.reshape(rollout_reward, [-1, rollout_depth])
        input_reward = tf.split(input_reward, rollout_depth, axis=1)
        encode_state = tf.split(encode_states, rollout_depth, axis=1)
        # Encode the trajectory back-to-front (deepest step first).
        for m in range(rollout_depth):
            if m == 0:
                # NOTE(review): rollout_encoder (a Network) is rebound to
                # the result of calling it; subsequent iterations call
                # that result — presumably Network calls return callable
                # sub-networks sharing weights — confirm.
                rollout_encoder = rollout_encoder(
                    [
                        tf.concat([
                            encode_state[-(m + 1)], encode_state[-(m + 1)]
                        ], axis=-1), input_reward[-(m + 1)]
                    ],
                    name_scope="rollout_encoder_%d_%d" % (i, m))
                re = rollout_encoder["re"].op
            else:
                # Feed previous code `re` together with the next step.
                rollout_encoder = rollout_encoder(
                    [
                        tf.concat([re, encode_state[-(m + 1)]], axis=-1),
                        input_reward[-(m + 1)]
                    ],
                    name_scope="rollout_encoder_%d_%d" % (i, m))
                re = rollout_encoder["re"].op
        # Concatenate one rollout code per lane.
        if i == 0:
            path = re
        else:
            path = tf.concat([path, re], axis=1)
    if policy_with_iaa:
        # Policy sees both imagined-rollout codes and the real encoding.
        feature = tf.concat([path, se], axis=1)
    else:
        feature = se
    ac = network.Network([feature], f_ac, var_scope='ac')
    v = ac["v"].op
    pi_dist = ac["pi"].op
    return {"v": v, "pi": pi_dist, "rollout_action": None}, \
        {
            "se": net_se,
            "transition": net_model,
            "state_decoder": net_decoder
        }