def build_inputs(observation_space: gym.Space,
                 action_space: gym.Space,
                 scale: bool = False) -> Tuple[tf.Tensor, ...]:
    """Builds placeholders and processed input Tensors.

    Observation `obs_*` and `next_obs_*` placeholders and processed input
    tensors have shape `(None,) + obs_space.shape`. The action `act_*`
    placeholder and processed input tensors have shape
    `(None,) + act_space.shape`.

    Args:
        observation_space: The observation space.
        action_space: The action space.
        scale: Only relevant for environments with Box spaces. If True, then
            processed input Tensors are automatically scaled to the interval [0, 1].

    Returns:
        obs_ph: Placeholder for old observations.
        act_ph: Placeholder for actions.
        next_obs_ph: Placeholder for new observations.
        obs_inp: Network-ready float32 Tensor with processed old observations.
        act_inp: Network-ready float32 Tensor with processed actions.
        next_obs_inp: Network-ready float32 Tensor with processed new observations.
    """
    obs_ph, obs_inp = observation_input(observation_space, name="obs", scale=scale)
    act_ph, act_inp = observation_input(action_space, name="act", scale=scale)
    next_obs_ph, next_obs_inp = observation_input(observation_space, name="next_obs", scale=scale)
    return obs_ph, act_ph, next_obs_ph, obs_inp, act_inp, next_obs_inp

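# A minimal usage sketch (not part of the original code): assuming `observation_input`
# here is stable_baselines.common.input.observation_input, the processed tensors are
# float32 and can feed a network directly. The environment, layer sizes and variable
# names below are illustrative only.
import gym
import tensorflow as tf

env = gym.make("CartPole-v1")
obs_ph, act_ph, next_obs_ph, obs_inp, act_inp, next_obs_inp = build_inputs(
    env.observation_space, env.action_space)

# Concatenate the processed observation/action features and score a transition.
features = tf.concat([obs_inp, act_inp, next_obs_inp], axis=1)
hidden = tf.layers.dense(features, 32, activation=tf.nn.relu)
reward = tf.layers.dense(hidden, 1)[:, 0]
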
def _setup_input(self):
    with tf.variable_scope('input', reuse=False):
        self.input_x, self.process_x = observation_input(self._observ_space, self._num_batch)
        self.next_input_x, self.next_process_x = observation_input(self._observ_space, self._num_batch)
        pdtype = make_proba_dist_type(self._action_space)
        self.actions_ph = pdtype.sample_placeholder([self._num_batch], name="action_ph")
        self.one_hot_actions = tf.one_hot(self.actions_ph, self._action_space.n)
        self.capacity_ph = tf.placeholder(tf.float32, [], name='capacity_ph')

def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch,
             reuse=False, scale=False, obs_phs=None, add_action_ph=False):
    self.n_env = n_env
    self.n_steps = n_steps
    self.n_batch = n_batch
    with tf.variable_scope("input", reuse=False):
        if obs_phs is None:
            self._obs_ph, self._processed_obs = observation_input(
                ob_space, n_batch, scale=scale)
        else:
            self._obs_ph, self._processed_obs = obs_phs
        self._action_ph = None
        if add_action_ph:
            self._action_ph = tf.placeholder(dtype=ac_space.dtype,
                                             shape=(n_batch,) + ac_space.shape,
                                             name="action_ph")
    self.sess = sess
    self.reuse = reuse
    self.ob_space = ob_space
    self.ac_space = ac_space

def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch,
             n_lstm=256, reuse=False, scale=False):
    self.n_env = n_env
    self.n_steps = n_steps
    self.obs_ph, self.processed_x = observation_input(ob_space, n_batch, scale=scale)
    self.masks_ph = tf.placeholder(tf.float32, [n_batch])  # mask (done t-1)
    self.states_ph = tf.placeholder(tf.float32, [self.n_env, n_lstm * 2])  # states
    self.pdtype = make_proba_dist_type(ac_space)
    self.sess = sess
    self.reuse = reuse
    self.is_discrete = isinstance(ac_space, Discrete)
    self.policy = None
    self.proba_distribution = None
    self.value_fn = None
    self.ob_space = ob_space

def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch,
             reuse=False, scale=False, obs_phs=None, add_action_ph=False):
    self.n_env = n_env
    self.n_steps = n_steps
    with tf.variable_scope("input", reuse=False):
        if obs_phs is None:
            self._obs_ph, self._processed_obs = observation_input(ob_space, n_batch, scale=scale)
        else:
            self._obs_ph, self._processed_obs = obs_phs
        self._action_ph = None
        if add_action_ph:
            self._action_ph = tf.placeholder(dtype=ac_space.dtype,
                                             shape=(n_batch,) + ac_space.shape,
                                             name="action_ph")
        if isinstance(ac_space, spaces.MultiDiscrete):
            # list.extend() returns None, which would have left the shape
            # unconstrained; build the shape list explicitly instead.
            self._action_mask_ph = tf.placeholder(dtype=tf.float32,
                                                  shape=[n_batch] + list(ac_space.nvec),
                                                  name="action_mask_ph")
        elif isinstance(ac_space, (spaces.Discrete, spaces.MultiBinary)):
            self._action_mask_ph = tf.placeholder(dtype=tf.float32,
                                                  shape=(n_batch, ac_space.n),
                                                  name="action_mask_ph")
        elif isinstance(ac_space, spaces.Box):
            self._action_mask_ph = tf.placeholder(dtype=tf.float32,
                                                  shape=(n_batch, ac_space.shape[0]),
                                                  name="action_mask_ph")
    self.sess = sess
    self.reuse = reuse
    self.ob_space = ob_space
    self.ac_space = ac_space

def test_conv_kernel():
    """Test convolution kernel with various input formats."""
    filter_size_1 = 4                 # The size of squared filter for the first layer
    filter_size_2 = (3, 5)            # The size of non-squared filter for the second layer
    target_shape_1 = [2, 52, 40, 32]  # The desired shape of the first layer
    target_shape_2 = [2, 13, 9, 32]   # The desired shape of the second layer
    kwargs = {}
    n_envs = 1
    n_steps = 2
    n_batch = n_envs * n_steps
    scale = False
    env = gym.make(ENV_ID)
    ob_space = env.observation_space

    with tf.Graph().as_default():
        _, scaled_images = observation_input(ob_space, n_batch, scale=scale)
        activ = tf.nn.relu
        layer_1 = activ(conv(scaled_images, 'c1', n_filters=32, filter_size=filter_size_1,
                             stride=4, init_scale=np.sqrt(2), **kwargs))
        layer_2 = activ(conv(layer_1, 'c2', n_filters=32, filter_size=filter_size_2,
                             stride=4, init_scale=np.sqrt(2), **kwargs))
        assert layer_1.shape == target_shape_1, \
            "The shape of layer based on the squared kernel matrix is not correct. " \
            "The current shape is {} and the desired shape is {}".format(layer_1.shape, target_shape_1)
        assert layer_2.shape == target_shape_2, \
            "The shape of layer based on the non-squared kernel matrix is not correct. " \
            "The current shape is {} and the desired shape is {}".format(layer_2.shape, target_shape_2)

    env.close()

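# Why the target shapes above hold: a minimal check of the VALID-convolution shape
# formula, out = floor((in - filter) / stride) + 1. This assumes ENV_ID resolves to a
# raw Atari env with (210, 160, 3) frames and that `conv` uses its default 'VALID'
# padding (as in stable_baselines.common.tf_layers.conv); both are assumptions here.
def _valid_out(size, filt, stride):
    return (size - filt) // stride + 1

assert _valid_out(210, 4, 4) == 52 and _valid_out(160, 4, 4) == 40   # -> [2, 52, 40, 32]
assert _valid_out(52, 3, 4) == 13 and _valid_out(40, 5, 4) == 9      # -> [2, 13, 9, 32]
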
def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch,
             n_lstm=256, reuse=False, scale=False, obs_phs=None, add_action_ph=False):
    self.n_env = n_env
    self.n_steps = n_steps
    with tf.variable_scope("input", reuse=False):
        if obs_phs is None:
            self.obs_ph, self.processed_obs = observation_input(
                ob_space, n_batch, scale=scale)
        else:
            self.obs_ph, self.processed_obs = obs_phs
        self.masks_ph = tf.placeholder(tf.float32, [n_batch], name="masks_ph")  # mask (done t-1)
        self.states_ph = tf.placeholder(tf.float32, [self.n_env, n_lstm * 2], name="states_ph")  # states
        self.action_ph = None
        if add_action_ph:
            self.action_ph = tf.placeholder(dtype=ac_space.dtype,
                                            shape=(None,) + ac_space.shape,
                                            name="action_ph")
    self.sess = sess
    self.reuse = reuse
    self.ob_space = ob_space
    self.ac_space = ac_space

def main():
    env_id = 'BreakoutNoFrameskip-v4'
    num_env = 5
    seed = 0
    env_args = {'episode_life': False, 'clip_rewards': False}
    env = VecFrameStack(make_atari_env(env_id, num_env, seed, wrapper_kwargs=env_args), 4)

    graph = tf.Graph()
    with graph.as_default():
        sess = tf_util.make_session(graph=graph)
        with tf.variable_scope('input', reuse=False):
            input_x, process_x = observation_input(env.observation_space, num_env)
            print(env.action_space.shape)
            pdtype = make_proba_dist_type(env.action_space)
            actions_ph = pdtype.sample_placeholder([num_env], name="action_ph")
            one_hot_actions = tf.one_hot(actions_ph, env.action_space.n)
        print(input_x, process_x)
        print('action', actions_ph, one_hot_actions)

        beta = 0.1
        mu, sigma_sq, recons_x = build_network(process_x, one_hot_actions)
        print(mu)
        print(sigma_sq)
        print(recons_x)

        with tf.name_scope('losses'):
            recons_loss = tf.losses.mean_squared_error(input_x, recons_x, scope='recons_loss')
            kl_divergence = -tf.reduce_mean(
                0.5 * (tf.add(1., sigma_sq) - tf.pow(mu, 2) - tf.exp(sigma_sq)),
                name='kl_divergence')
            loss = tf.add(recons_loss, tf.multiply(kl_divergence, beta), name='objective')
            print(loss)

        summary = utility.summary({recons_loss: 'recons_loss',
                                   kl_divergence: 'kl_divergence',
                                   mu: 'phi_mu',
                                   sigma_sq: 'sigma_sq',
                                   recons_x: 'recons_x',
                                   input_x: 'input_x',
                                   }, env.observation_space.shape)

        optimizer = tf.train.AdamOptimizer(learning_rate=0.0002, beta1=0.5)
        train_op = optimizer.minimize(loss)

        for event_file in LOG_DIR.glob('event*'):
            event_file.unlink()
        writer = tf.summary.FileWriter(LOG_DIR.as_posix(), sess.graph)

        sess.run(tf.global_variables_initializer())
        observ = env.reset()
        actions = [env.action_space.sample() for _ in range(num_env)]
        print(env.observation_space)
        print(observ.shape)
        recons_image, summary_ = sess.run([recons_x, summary],
                                          feed_dict={input_x: observ, actions_ph: actions})
        writer.add_summary(summary_, 0)

def __init__(self, obs_space: gym.Space, act_space: gym.Space):
    """Builds BasicRewardModel: adds placeholders and spaces but nothing else.

    The spaces passed are used to define the `observation_space` and
    `action_space` properties, and are also used to determine how to preprocess
    the observation and action placeholders, made available as
    `self._proc_{obs,act,next_obs}`.

    Args:
        obs_space: The observation space.
        act_space: The action space.
    """
    RewardModel.__init__(self)
    self._obs_space = obs_space
    self._act_space = act_space
    self._obs_ph, self._proc_obs = env_in.observation_input(obs_space)
    self._next_obs_ph, self._proc_next_obs = env_in.observation_input(obs_space)
    self._act_ph, self._proc_act = env_in.observation_input(act_space)
    self._dones_ph = tf.placeholder(name="dones", shape=(None,), dtype=tf.bool)
    self._proc_dones = tf.cast(self._dones_ph, dtype=tf.float32)

def __init__(self, observation_space, name=None):
    """
    Creates an input placeholder tailored to a specific observation space.

    :param observation_space: (Gym Space) observation space of the environment.
        Should be one of the gym.spaces types
    :param name: (str) tensorflow name of the underlying placeholder
    """
    is_image = len(observation_space.shape) == 3
    inpt, self.processed_inpt = observation_input(observation_space, name=name, scale=is_image)
    super().__init__(inpt)

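# A usage sketch, not from the original: this constructor matches
# stable_baselines.deepq.utils.ObservationInput, whose parent PlaceholderTfInput
# exposes get() for the raw placeholder and make_feed_dict(data) for feeding
# batches; that parentage, the class name, and the env below are assumptions.
import gym

env = gym.make("CartPole-v1")
obs_input = ObservationInput(env.observation_space, name="observation")
placeholder = obs_input.get()                        # raw observation placeholder
feed = obs_input.make_feed_dict(env.reset()[None])   # batch of one observation (old gym API)
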
def build_inputs(observation_space: gym.Space,
                 action_space: gym.Space,
                 scale: bool = False) -> Tuple[tf.Tensor, ...]:
    """Builds placeholders and processed input Tensors.

    Observation `obs_*` and `next_obs_*` placeholders and processed input
    tensors have shape `(None,) + obs_space.shape`. The action `act_*`
    placeholder and processed input tensors have shape
    `(None,) + act_space.shape`.

    Args:
        observation_space: The observation space.
        action_space: The action space.
        scale: Only relevant for environments with Box spaces. If True, then
            processed input Tensors are automatically scaled to the interval [0, 1].

    Returns:
        (phs, inps) where phs is a tuple of:
            obs_ph: Placeholder for old observations.
            act_ph: Placeholder for actions.
            next_obs_ph: Placeholder for new observations.
            done_ph: Placeholder for boolean episode termination.
        and inps is a tuple of:
            obs_inp: Network-ready float32 Tensor with processed old observations.
            act_inp: Network-ready float32 Tensor with processed actions.
            next_obs_inp: Network-ready float32 Tensor with processed new observations.
            dones_inp: Network-ready float32 Tensor, with booleans 0-1 coded.
    """
    obs_ph, obs_inp = observation_input(observation_space, name="obs", scale=scale)
    act_ph, act_inp = observation_input(action_space, name="act", scale=scale)
    next_obs_ph, next_obs_inp = observation_input(observation_space, name="next_obs", scale=scale)
    done_ph = tf.placeholder(name="dones", shape=(None,), dtype=tf.bool)
    done_inp = tf.cast(done_ph, dtype=tf.float32)
    phs = (obs_ph, act_ph, next_obs_ph, done_ph)
    inps = (obs_inp, act_inp, next_obs_inp, done_inp)
    return phs, inps

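# A usage sketch for the (phs, inps) variant above. Assumptions: a TF1 graph/session,
# `observation_input` from stable_baselines.common.input in scope, and an illustrative
# CartPole env; none of the names below come from the original code.
import gym
import numpy as np
import tensorflow as tf

env = gym.make("CartPole-v1")
(obs_ph, act_ph, next_obs_ph, done_ph), (obs_inp, act_inp, next_obs_inp, done_inp) = \
    build_inputs(env.observation_space, env.action_space)

with tf.Session() as sess:
    obs = np.stack([env.observation_space.sample() for _ in range(3)])
    acts = np.array([env.action_space.sample() for _ in range(3)])
    dones = np.array([False, True, False])
    processed = sess.run(done_inp, feed_dict={obs_ph: obs, act_ph: acts,
                                              next_obs_ph: obs, done_ph: dones})
    # Booleans come back 0-1 coded as float32: [0., 1., 0.]
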
def setup_model(self):
    self.graph = tf.Graph()
    with self.graph.as_default():
        self.sess = tf_util.make_session(num_cpu=None, graph=self.graph)

        self.observation_ph, self.processed_obs = observation_input(
            self.venv.observation_space, scale=(self.network_type == "cnn"))

        with tf.variable_scope("target_model"):
            if self.network_type == 'cnn':
                self.target_network = small_convnet(self.processed_obs, tf.nn.leaky_relu)
            elif self.network_type == 'mlp':
                self.target_network = tf_layers.mlp(self.processed_obs, [1024, 512])
                self.target_network = tf_layers.linear(self.target_network, "out", 512)
            else:
                raise ValueError("Unknown network type {}!".format(self.network_type))

        with tf.variable_scope("predictor_model"):
            if self.network_type == 'cnn':
                self.predictor_network = tf.nn.relu(
                    small_convnet(self.processed_obs, tf.nn.leaky_relu))
            elif self.network_type == 'mlp':
                self.predictor_network = tf_layers.mlp(self.processed_obs, [1024, 512])
                self.predictor_network = tf.nn.relu(
                    tf_layers.linear(self.predictor_network, "pred_fc1", 512))
                self.predictor_network = tf_layers.linear(self.predictor_network, "out", 512)

        with tf.name_scope("loss"):
            self.int_reward = tf.reduce_mean(tf.square(
                tf.stop_gradient(self.target_network) - self.predictor_network), axis=1)
            self.aux_loss = tf.reduce_mean(tf.square(
                tf.stop_gradient(self.target_network) - self.predictor_network))

        with tf.name_scope("train"):
            self.optimizer = tf.train.AdamOptimizer(self.learning_rate)
            self.training_op = self.optimizer.minimize(self.aux_loss)

        self.params = tf.trainable_variables()
        tf.global_variables_initializer().run(session=self.sess)

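# A usage sketch (not from the original): after setup_model(), the per-observation
# intrinsic reward and the predictor update can be driven with plain session calls.
# `model` and `obs_batch` are illustrative names; the attributes match the snippet above.
int_rew = model.sess.run(model.int_reward,
                         feed_dict={model.observation_ph: obs_batch})
model.sess.run(model.training_op,
               feed_dict={model.observation_ph: obs_batch})
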
def get_obs_and_pdtype(self, ob_space, ac_space):
    """
    Initialize probability distribution and get observation placeholder.

    :param ob_space: (Gym Spaces) the observation space
    :param ac_space: (Gym Spaces) the action space
    """
    self.pdtype = pdtype = make_proba_dist_type(ac_space)
    if self.obs_ph is None:
        self.obs_ph, self.processed_x = observation_input(ob_space)
    else:
        assert self.processed_x is not None
    return self.obs_ph, pdtype

def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch,
             reuse=False, scale=False, obs_phs=None, add_action_ph=False):
    self.n_env = n_env
    self.n_steps = n_steps
    self.n_batch = n_batch
    self.action_mask = None
    with tf.variable_scope("input", reuse=False):
        if obs_phs is None:
            self._obs_ph, self._processed_obs = observation_input(ob_space, n_batch, scale=scale)
        else:
            self._obs_ph, self._processed_obs = obs_phs
        self._action_ph = None
        if add_action_ph:
            self._action_ph = tf.placeholder(dtype=ac_space.dtype,
                                             shape=(n_batch,) + ac_space.shape,
                                             name="action_ph")
        self._action_mask_phs = []
        if isinstance(ac_space, MultiDiscrete):
            mask_shape = [None]
            zeros_shape = [1]
            for i, size in enumerate(ac_space.nvec):
                mask_shape.append(size)
                zeros_shape.append(size)
                no_mask = tf.zeros(shape=zeros_shape, dtype=tf.float32)
                action_mask_ph = tf.placeholder_with_default(
                    no_mask, shape=mask_shape, name="action_mask_ph_{}".format(i))
                self._action_mask_phs.append(action_mask_ph)
        elif isinstance(ac_space, Discrete):
            no_mask = tf.zeros(shape=(1, ac_space.n), dtype=tf.float32)
            action_mask_ph = tf.placeholder_with_default(
                no_mask, shape=(None, ac_space.n), name="action_mask_ph_1")
            self._action_mask_phs.append(action_mask_ph)
    self.sess = sess
    self.reuse = reuse
    self.ob_space = ob_space
    self.ac_space = ac_space

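# A sketch of why placeholder_with_default is used above (illustrative names only;
# assumes the policy was built with a Discrete(4) action space and a live session):
# without feeding anything, the placeholder evaluates to its all-zero default,
# i.e. "no actions masked"; feeding a (batch, n) float array overrides it.
mask_ph = policy._action_mask_phs[0]
print(sess.run(mask_ph))                                            # [[0. 0. 0. 0.]]
print(sess.run(mask_ph, feed_dict={mask_ph: [[0., 0., 1., 0.]]}))   # mask the third action
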
def __init__(self, sess, ob_space, sc_space, me_space, g_space, ac_space, n_env, n_steps, n_batch,
             reuse=False, scale=False, pgn_params=None, obs_phs=None, sca_phs=None, mea_phs=None,
             goal_phs=None, future_size=6):
    super(DFPPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch,
                                    reuse=reuse, scale=scale, obs_phs=obs_phs)

    with tf.variable_scope("input_fc", reuse=False):
        # if gm_phs is None:
        #     self._gm_ph, self._processed_gm = observation_input(
        #         gm_space, n_batch, scale=scale)
        # else:
        #     self._gm_ph, self._processed_gm = gm_phs
        if sca_phs is None:
            self._sca_ph, self._processed_sca = observation_input(sc_space, n_batch, scale=scale)
        else:
            self._sca_ph, self._processed_sca = sca_phs
        if mea_phs is None:
            self._mea_ph, self._processed_mea = observation_input(me_space, n_batch, scale=scale)
        else:
            self._mea_ph, self._processed_mea = mea_phs
        if goal_phs is None:
            self._goal_ph, self._processed_goal = observation_input(g_space, n_batch, scale=scale)
        else:
            self._goal_ph, self._processed_goal = goal_phs

    self._action_ph = None
    self.n_actions = ac_space.n
    self.future_size = future_size

    with tf.variable_scope('model', reuse=reuse):
        with tf.variable_scope('img_cnn', reuse=reuse):
            # CNN extracts the board (image) features
            extracted_img = img_cnn(self.processed_obs)
            extracted_img = tf.layers.flatten(extracted_img)
        with tf.variable_scope('sca_fc', reuse=reuse):
            # scalar features
            # extracted_sca = simple_fc(self.processed_sca)
            extracted_sca = tf.layers.flatten(self.processed_sca)
        with tf.variable_scope('mea_fc', reuse=reuse):
            # measurement features
            extracted_mea = simple_fc(self.processed_mea, name='mea')
            extracted_mea = tf.layers.flatten(extracted_mea)
        with tf.variable_scope('goal_fc', reuse=reuse):
            # goal features
            extracted_goal = simple_fc(self.processed_goal, name='goal')
            extracted_goal = tf.layers.flatten(extracted_goal)
        with tf.variable_scope('concat', reuse=reuse):
            # concatenate all features
            extracted_input = tf.concat(
                [extracted_img, extracted_sca, extracted_mea, extracted_goal],
                axis=1, name='concat')

        activ = tf.nn.relu
        with tf.variable_scope('exp_fc', reuse=reuse):
            # expectation_stream
            expectation_stream_prev = activ(linear(
                extracted_input, 'exp_prev', n_hidden=512, init_scale=np.sqrt(2)))
            expectation_stream = activ(linear(
                expectation_stream_prev, 'exp', n_hidden=self.future_size, init_scale=np.sqrt(2)))

        if _constants.pgn:
            print()
            print("PGN and DFP...")
            print()
            if pgn_params:
                print("PGN Loading...")
                len_params = len(pgn_params)
                # Use comprehensions so each PGN column gets its own list
                # (the original `[[None] * k] * len_params` aliased a single list).
                prev = [[None] * (_constants.n_actions - 1) for _ in range(len_params)]
                prev_stream = [[None] * (_constants.n_actions - 1) for _ in range(len_params)]
                for c in range(len_params):
                    # c indexes the frozen PGN column network
                    with tf.variable_scope('prev_fc' + str(c), reuse=reuse):
                        for r in range(_constants.n_actions - 1):
                            # r indexes the action; the last action is handled separately below
                            scope1 = 'action_fc/act_prev' + str(r)
                            scope2 = 'action_fc/act' + str(r)
                            prev[c][r] = activ(pgn_linear(extracted_input, scope1,
                                                          ww=pgn_params[c][scope1 + '/w'],
                                                          bb=pgn_params[c][scope1 + '/b']))
                            prev_stream[c][r] = activ(pgn_linear(prev[c][r], scope2,
                                                                 ww=pgn_params[c][scope2 + '/w'],
                                                                 bb=pgn_params[c][scope2 + '/b']))
                action_prev = [None] * _constants.n_actions
                action_stream = [None] * _constants.n_actions
                with tf.variable_scope('action_fc', reuse=reuse):
                    for i in range(_constants.n_actions - 1):
                        # the i-th action
                        action_prev[i] = linear(
                            extracted_input, 'act_prev' + str(i), n_hidden=512, init_scale=np.sqrt(2))
                        for c in range(len_params):
                            action_prev[i] = tf.add(action_prev[i], prev[c][i])
                        action_prev[i] = activ(tf.divide(action_prev[i], len_params + 1))
                        action_stream[i] = linear(
                            action_prev[i], 'act' + str(i), n_hidden=self.future_size, init_scale=np.sqrt(2))
                        for c in range(len_params):
                            action_stream[i] = tf.add(action_stream[i], prev_stream[c][i])
                        action_stream[i] = activ(tf.divide(action_stream[i], len_params + 1))
                    # handle the last action (place bomb) separately
                    # (index 121 assumes _constants.n_actions == 122)
                    action_prev[121] = activ(linear(
                        extracted_input, 'act_prev' + str(121), n_hidden=512, init_scale=np.sqrt(2)))
                    action_stream[121] = activ(linear(
                        action_prev[121], 'act' + str(121), n_hidden=self.future_size, init_scale=np.sqrt(2)))
            else:
                print("DFP Loading...")
                action_prev = [None] * _constants.n_actions
                action_stream = [None] * _constants.n_actions
                with tf.variable_scope('action_fc', reuse=reuse):
                    for i in range(_constants.n_actions):
                        action_prev[i] = activ(linear(
                            extracted_input, 'act_prev' + str(i), n_hidden=512, init_scale=np.sqrt(2)))
                        action_stream[i] = activ(linear(
                            action_prev[i], 'act' + str(i), n_hidden=self.future_size, init_scale=np.sqrt(2)))
        else:
            print()
            print("Pure DFP...")
            print()
            action_prev = [None] * _constants.n_actions
            action_stream = [None] * _constants.n_actions
            with tf.variable_scope('action_fc', reuse=reuse):
                for i in range(_constants.n_actions):
                    action_prev[i] = activ(linear(
                        extracted_input, 'act_prev' + str(i), n_hidden=512, init_scale=np.sqrt(2)))
                    action_stream[i] = activ(linear(
                        action_prev[i], 'act' + str(i), n_hidden=self.future_size, init_scale=np.sqrt(2)))

        n_actions = len(action_stream)
        # compute the sum
        action_sum = action_stream[0]
        for i in range(1, n_actions):
            action_sum = tf.add(action_sum, action_stream[i])
        # compute the mean
        action_mean = tf.divide(action_sum, n_actions)
        # center each action stream and add the expectation stream
        for i in range(n_actions):
            action_stream[i] = tf.subtract(action_stream[i], action_mean)
            action_stream[i] = tf.add(action_stream[i], expectation_stream)

        with tf.variable_scope('future', reuse=reuse):
            self._future_stream = tf.convert_to_tensor(action_stream)

    self._setup_init()

def __init__(self, cfg, env, arch_type, graph, sess):
    # Note: the original `arch_type is 'train' or 'act'` was always truthy; test membership instead.
    assert arch_type in ('train', 'act'), 'type should be either "train" or "act"'

    cfg_env = cfg['environment']
    cfg_arch = cfg['architecture']

    if arch_type == 'train':
        self.num_steps = math.floor(cfg_env['max_time'] / cfg_env['control_dt'])
    else:
        self.num_steps = 1

    self.observation_space = env.observation_space
    self.action_space = env.action_space
    self.pdtype = make_proba_dist_type(self.action_space)
    self.n_env = cfg["environment"]["num_envs"]
    self.graph = graph

    with self.graph.as_default():
        with tf.variable_scope("model", reuse=tf.AUTO_REUSE):
            batch_size = self.num_steps * self.n_env
            if arch_type == 'train':
                # integer division keeps the placeholder batch dimension an int
                batch_size //= cfg["algorithm"]["minibatch"]
            self.obs_ph, self.processed_obs = observation_input(
                self.observation_space, batch_size, scale=False)

            act_fun = tf.nn.relu
            pi_latent = self.obs_ph
            vi_latent = self.obs_ph
            for idx, dec_layer_size in enumerate(cfg_arch["pi_net"]):
                pi_latent = act_fun(
                    linear(pi_latent, "pi_net_fc{}".format(idx), dec_layer_size, init_scale=np.sqrt(2)))
            for idx, dec_layer_size in enumerate(cfg_arch["vi_net"]):
                vi_latent = act_fun(
                    linear(vi_latent, "vi_net_fc{}".format(idx), dec_layer_size, init_scale=np.sqrt(2)))

            self.value_fn = linear(vi_latent, 'vf', 1)
            self.value = self.value_fn[:, 0]

            self.proba_distribution, self.policy, self.q_value = \
                self.pdtype.proba_distribution_from_latent(pi_latent, vi_latent, init_scale=0.01)

            self.action_ph = self.pdtype.sample_placeholder([None], name="action_ph")
            self.masks_ph = tf.placeholder(tf.float32, [None], "masks_ph")
            self.action = self.proba_distribution.sample()
            self.neglogp = self.proba_distribution.neglogp(self.action)
            self.initial_state = None
            self.sess = sess

            # continuous action, diagonal covariance
            self.policy_proba = [self.proba_distribution.mean, self.proba_distribution.std]