Example #1
def build_inputs(observation_space: gym.Space,
                 action_space: gym.Space,
                 scale: bool = False) -> Tuple[tf.Tensor, ...]:
    """Builds placeholders and processed input Tensors.

  Observation `obs_*` and `next_obs_*` placeholders and processed input
  tensors have shape `(None,) + obs_space.shape`.
  The action `act_*` placeholder and processed input tensors have shape
  `(None,) + act_space.shape`.

  Args:
    observation_space: The observation space.
    action_space: The action space.
    scale: Only relevant for environments with Box spaces. If True, then
      processed input Tensors are automatically scaled to the interval [0, 1].

  Returns:
    obs_ph: Placeholder for old observations.
    act_ph: Placeholder for actions.
    next_obs_ph: Placeholder for new observations.
    obs_inp: Network-ready float32 Tensor with processed old observations.
    act_inp: Network-ready float32 Tensor with processed actions.
    next_obs_inp: Network-ready float32 Tensor with processed new observations.
  """
    obs_ph, obs_inp = observation_input(observation_space,
                                        name="obs",
                                        scale=scale)
    act_ph, act_inp = observation_input(action_space, name="act", scale=scale)
    next_obs_ph, next_obs_inp = observation_input(observation_space,
                                                  name="next_obs",
                                                  scale=scale)
    return obs_ph, act_ph, next_obs_ph, obs_inp, act_inp, next_obs_inp
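A minimal usage sketch of `build_inputs` (assuming `observation_input` behaves as in stable_baselines.common.input and a TF1-style session; the Box spaces here are illustrative):

import gym
import numpy as np
import tensorflow as tf

obs_space = gym.spaces.Box(low=-1.0, high=1.0, shape=(4,), dtype=np.float32)
act_space = gym.spaces.Box(low=-1.0, high=1.0, shape=(2,), dtype=np.float32)

obs_ph, act_ph, next_obs_ph, obs_inp, act_inp, next_obs_inp = build_inputs(
    obs_space, act_space, scale=False)

with tf.Session() as sess:
    # Processed tensors are float32 with a leading batch dimension.
    out = sess.run(obs_inp, feed_dict={obs_ph: np.zeros((2,) + obs_space.shape)})
    assert out.shape == (2, 4) and out.dtype == np.float32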
Example #2
  def _setup_input(self):
    with tf.variable_scope('input', reuse=False):
      self.input_x, self.process_x = observation_input(self._observ_space, self._num_batch)
      self.next_input_x, self.next_process_x = observation_input(self._observ_space, self._num_batch)
      pdtype = make_proba_dist_type(self._action_space)
      self.actions_ph = pdtype.sample_placeholder([self._num_batch], name="action_ph")
      self.one_hot_actions = tf.one_hot(self.actions_ph, self._action_space.n)

      self.capacity_ph = tf.placeholder(tf.float32, [], name='capacity_ph')
Example #3
    def __init__(self,
                 sess,
                 ob_space,
                 ac_space,
                 n_env,
                 n_steps,
                 n_batch,
                 reuse=False,
                 scale=False,
                 obs_phs=None,
                 add_action_ph=False):
        self.n_env = n_env
        self.n_steps = n_steps
        self.n_batch = n_batch
        with tf.variable_scope("input", reuse=False):
            if obs_phs is None:
                self._obs_ph, self._processed_obs = observation_input(
                    ob_space, n_batch, scale=scale)
            else:
                self._obs_ph, self._processed_obs = obs_phs

            self._action_ph = None
            if add_action_ph:
                self._action_ph = tf.placeholder(dtype=ac_space.dtype,
                                                 shape=(n_batch, ) +
                                                 ac_space.shape,
                                                 name="action_ph")
        self.sess = sess
        self.reuse = reuse
        self.ob_space = ob_space
        self.ac_space = ac_space
Example #4
 def __init__(self,
              sess,
              ob_space,
              ac_space,
              n_env,
              n_steps,
              n_batch,
              n_lstm=256,
              reuse=False,
              scale=False):
     self.n_env = n_env
     self.n_steps = n_steps
     self.obs_ph, self.processed_x = observation_input(ob_space,
                                                       n_batch,
                                                       scale=scale)
     self.masks_ph = tf.placeholder(tf.float32,
                                    [n_batch])  # mask (done t-1)
     self.states_ph = tf.placeholder(tf.float32,
                                     [self.n_env, n_lstm * 2])  # states
     self.pdtype = make_proba_dist_type(ac_space)
     self.sess = sess
     self.reuse = reuse
     self.is_discrete = isinstance(ac_space, Discrete)
     self.policy = None
     self.proba_distribution = None
     self.value_fn = None
     self.ob_space = ob_space
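The `n_lstm * 2` width of `states_ph` comes from concatenating the LSTM cell and hidden states per environment. A sketch of the zero initial state a rollout runner would typically feed (the exact convention is an assumption):

import numpy as np

n_env, n_steps, n_lstm = 8, 1, 256
initial_state = np.zeros((n_env, n_lstm * 2), dtype=np.float32)  # concatenated [c, h] per env
masks = np.zeros((n_env * n_steps,), dtype=np.float32)  # 1.0 where the previous step was terminal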
Example #5
 def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse=False, scale=False,
              obs_phs=None, add_action_ph=False):
     self.n_env = n_env
     self.n_steps = n_steps
     with tf.variable_scope("input", reuse=False):
         if obs_phs is None:
             self._obs_ph, self._processed_obs = observation_input(ob_space, n_batch, scale=scale)
         else:
             self._obs_ph, self._processed_obs = obs_phs
         self._action_ph = None
         if add_action_ph:
             self._action_ph = tf.placeholder(dtype=ac_space.dtype, shape=(n_batch,) + ac_space.shape,
                                              name="action_ph")
          if isinstance(ac_space, spaces.MultiDiscrete):
              # Note: `[n_batch].extend(...)` evaluates to None, so build the shape list explicitly.
              self._action_mask_ph = tf.placeholder(dtype=tf.float32, shape=[n_batch] + list(ac_space.nvec),
                                                    name="action_mask_ph")
         elif isinstance(ac_space, spaces.Discrete) or isinstance(ac_space, spaces.MultiBinary):
             self._action_mask_ph = tf.placeholder(dtype=tf.float32, shape=(n_batch, ac_space.n),
                                                   name="action_mask_ph")
         elif isinstance(ac_space, spaces.Box):
             self._action_mask_ph = tf.placeholder(dtype=tf.float32, shape=(n_batch, ac_space.shape[0]),
                                                   name="action_mask_ph")
     self.sess = sess
     self.reuse = reuse
     self.ob_space = ob_space
     self.ac_space = ac_space
Example #6
def test_conv_kernel():
    """Test convolution kernel with various input formats."""
    filter_size_1 = 4   # The size of squared filter for the first layer
    filter_size_2 = (3, 5)  # The size of non-squared filter for the second layer
    target_shape_1 = [2, 52, 40, 32]  # The desired shape of the first layer
    target_shape_2 = [2, 13, 9, 32]  # The desired shape of the second layer
    kwargs = {}
    n_envs = 1
    n_steps = 2
    n_batch = n_envs * n_steps
    scale = False
    env = gym.make(ENV_ID)
    ob_space = env.observation_space

    with tf.Graph().as_default():
        _, scaled_images = observation_input(ob_space, n_batch, scale=scale)
        activ = tf.nn.relu
        layer_1 = activ(conv(scaled_images, 'c1', n_filters=32, filter_size=filter_size_1,
                             stride=4, init_scale=np.sqrt(2), **kwargs))
        layer_2 = activ(conv(layer_1, 'c2', n_filters=32, filter_size=filter_size_2,
                             stride=4, init_scale=np.sqrt(2), **kwargs))
        assert layer_1.shape == target_shape_1, \
              "The shape of layer based on the squared kernel matrix is not correct. " \
              "The current shape is {} and the desired shape is {}".format(layer_1.shape, target_shape_1)
        assert layer_2.shape == target_shape_2, \
              "The shape of layer based on the non-squared kernel matrix is not correct. " \
              "The current shape is {} and the desired shape is {}".format(layer_2.shape, target_shape_2)
    env.close()
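The target shapes follow from 'VALID' convolution arithmetic, out = ceil((in - filter + 1) / stride), assuming ENV_ID names an Atari-style environment with 210x160 frames (an assumption inferred from the asserted shapes):

import math

def valid_out(size, filt, stride):
    # 'VALID' padding: no padding; only windows that fit entirely inside the input.
    return math.ceil((size - filt + 1) / stride)

assert valid_out(210, 4, 4) == 52 and valid_out(160, 4, 4) == 40  # layer 1: (2, 52, 40, 32)
assert valid_out(52, 3, 4) == 13 and valid_out(40, 5, 4) == 9     # layer 2: (2, 13, 9, 32)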
Example #7
 def __init__(self,
              sess,
              ob_space,
              ac_space,
              n_env,
              n_steps,
              n_batch,
              n_lstm=256,
              reuse=False,
              scale=False,
              obs_phs=None,
              add_action_ph=False):
     self.n_env = n_env
     self.n_steps = n_steps
     with tf.variable_scope("input", reuse=False):
         if obs_phs is None:
             self.obs_ph, self.processed_obs = observation_input(
                 ob_space, n_batch, scale=scale)
         else:
             self.obs_ph, self.processed_obs = obs_phs
         self.masks_ph = tf.placeholder(tf.float32, [n_batch],
                                        name="masks_ph")  # mask (done t-1)
         self.states_ph = tf.placeholder(tf.float32,
                                         [self.n_env, n_lstm * 2],
                                         name="states_ph")  # states
         self.action_ph = None
         if add_action_ph:
             self.action_ph = tf.placeholder(dtype=ac_space.dtype,
                                             shape=(None, ) +
                                             ac_space.shape,
                                             name="action_ph")
     self.sess = sess
     self.reuse = reuse
     self.ob_space = ob_space
     self.ac_space = ac_space
Example #8
def main():
  env_id = 'BreakoutNoFrameskip-v4'
  num_env = 5
  seed = 0
  env_args = {'episode_life': False, 'clip_rewards': False}
  env = VecFrameStack(make_atari_env(env_id, num_env, seed, wrapper_kwargs=env_args), 4)
  graph = tf.Graph()
  with graph.as_default():
    sess = tf_util.make_session(graph=graph)
    with tf.variable_scope('input', reuse=False):
      input_x, process_x = observation_input(env.observation_space, num_env)
      print(env.action_space.shape)
      pdtype = make_proba_dist_type(env.action_space)
      actions_ph = pdtype.sample_placeholder([num_env], name="action_ph")
      one_hot_actions = tf.one_hot(actions_ph, env.action_space.n)
      
    print(input_x, process_x)
    print('action', actions_ph, one_hot_actions)

    beta = 0.1
    mu, sigma_sq, recons_x = build_network(process_x, one_hot_actions)
    print(mu)
    print(sigma_sq)
    print(recons_x)

    with tf.name_scope('losses'):
      recons_loss = tf.losses.mean_squared_error(input_x, recons_x, scope='recons_loss')
      kl_divergence = -tf.reduce_mean(0.5 * (tf.add(1., sigma_sq) - tf.pow(mu, 2) - tf.exp(sigma_sq)),
                                      name='kl_divergence')
      loss = tf.add(recons_loss,
                    tf.multiply(
                      kl_divergence,
                      beta), name='objective')
      print(loss)
    summary = utility.summary({recons_loss: 'recons_loss',
                               kl_divergence: 'kl_divergence',
                               mu: 'phi_mu',
                               sigma_sq: 'sigma_sq',
                               recons_x: 'recons_x',
                               input_x: 'input_x',
                               }, env.observation_space.shape)
    optimizer = tf.train.AdamOptimizer(learning_rate=0.0002, beta1=0.5)
    train_op = optimizer.minimize(loss)

    for event_file in LOG_DIR.glob('event*'):
      event_file.unlink()
    writer = tf.summary.FileWriter(LOG_DIR.as_posix(), sess.graph)
    sess.run(tf.global_variables_initializer())

    observ = env.reset()
    actions = [env.action_space.sample() for _ in range(num_env)]
    print(env.observation_space)
    print(observ.shape)

    recons_image, summary_ = sess.run([recons_x, summary],
                                      feed_dict={input_x: observ,
                                                 actions_ph: actions})
    writer.add_summary(summary_, 0)
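The `kl_divergence` term above is the closed-form KL between N(mu, sigma^2) and the unit Gaussian, which implies `sigma_sq` actually holds the log-variance (otherwise the `tf.exp` would be redundant). A numpy sketch of the same quantity, under that assumption:

import numpy as np

def kl_to_unit_gaussian(mu, log_var):
    # KL(N(mu, exp(log_var)) || N(0, 1)), averaged as in the graph above.
    return -np.mean(0.5 * (1.0 + log_var - mu ** 2 - np.exp(log_var)))

assert np.isclose(kl_to_unit_gaussian(np.zeros(4), np.zeros(4)), 0.0)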
Example #9
    def __init__(self, obs_space: gym.Space, act_space: gym.Space):
        """Builds BasicRewardModel: adds placeholders and spaces but nothing else.

        The spaces passed are used to define the `observation_space` and `action_space`
        properties, and also are used to determine how to preprocess the observation
        and action placeholders, made available as `self._proc_{obs,act,next_obs}`.

        Args:
            obs_space: The observation space.
            act_space: The action space.
        """
        RewardModel.__init__(self)
        self._obs_space = obs_space
        self._act_space = act_space
        self._obs_ph, self._proc_obs = env_in.observation_input(obs_space)
        self._next_obs_ph, self._proc_next_obs = env_in.observation_input(obs_space)
        self._act_ph, self._proc_act = env_in.observation_input(act_space)
        self._dones_ph = tf.placeholder(name="dones", shape=(None,), dtype=tf.bool)
        self._proc_dones = tf.cast(self._dones_ph, dtype=tf.float32)
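A sketch of feeding the placeholders defined above (assuming Box spaces and that `model`, `sess`, `obs_space`, and `act_space` are already in scope):

import numpy as np

batch = 3
feed = {
    model._obs_ph: np.zeros((batch,) + obs_space.shape),
    model._act_ph: np.zeros((batch,) + act_space.shape),
    model._next_obs_ph: np.zeros((batch,) + obs_space.shape),
    model._dones_ph: np.array([False, False, True]),
}
proc = sess.run(model._proc_dones, feed_dict=feed)  # -> array([0., 0., 1.], dtype=float32)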
Example #10
    def __init__(self, observation_space, name=None):
        """
        Creates an input placeholder tailored to a specific observation space

        :param observation_space: (Gym Space) observation space of the environment. Should be one of the gym.spaces
            types
        :param name: (str) tensorflow name of the underlying placeholder
        """
        is_image = len(observation_space.shape) == 3
        inpt, self.processed_inpt = observation_input(observation_space, name=name, scale=is_image)
        super().__init__(inpt)
Example #11
def build_inputs(observation_space: gym.Space,
                 action_space: gym.Space,
                 scale: bool = False) -> Tuple[tf.Tensor, ...]:
    """Builds placeholders and processed input Tensors.

    Observation `obs_*` and `next_obs_*` placeholders and processed input
    tensors have shape `(None,) + obs_space.shape`.
    The action `act_*` placeholder and processed input tensors have shape
    `(None,) + act_space.shape`.

    Args:
        observation_space: The observation space.
        action_space: The action space.
        scale: Only relevant for environments with Box spaces. If True, then
            processed input Tensors are automatically scaled to the interval [0, 1].

    Returns:
        (phs, inps) where phs is a tuple of:
            obs_ph: Placeholder for old observations.
            act_ph: Placeholder for actions.
            next_obs_ph: Placeholder for new observations.
            done_ph: Placeholder for boolean episode termination.
        and inps is a tuple of:
            obs_inp: Network-ready float32 Tensor with processed old observations.
            act_inp: Network-ready float32 Tensor with processed actions.
            next_obs_inp: Network-ready float32 Tensor with processed new observations.
            dones_inp: Network-ready float32 tensor, with booleans 0-1 coded.
    """
    obs_ph, obs_inp = observation_input(observation_space,
                                        name="obs",
                                        scale=scale)
    act_ph, act_inp = observation_input(action_space, name="act", scale=scale)
    next_obs_ph, next_obs_inp = observation_input(observation_space,
                                                  name="next_obs",
                                                  scale=scale)
    done_ph = tf.placeholder(name="dones", shape=(None, ), dtype=tf.bool)
    done_inp = tf.cast(done_ph, dtype=tf.float32)
    phs = (obs_ph, act_ph, next_obs_ph, done_ph)
    inps = (obs_inp, act_inp, next_obs_inp, done_inp)
    return phs, inps
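Unlike Example #1, the placeholders and processed tensors come back as two grouped tuples. A short unpacking sketch (spaces and session as in the usage note after Example #1):

phs, inps = build_inputs(obs_space, act_space)
obs_ph, act_ph, next_obs_ph, done_ph = phs
obs_inp, act_inp, next_obs_inp, done_inp = inps
# done_inp 0-1 codes the booleans:
# sess.run(done_inp, feed_dict={done_ph: [False, True]}) -> array([0., 1.], dtype=float32)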
Example #12
    def setup_model(self):
        self.graph = tf.Graph()

        with self.graph.as_default():
            self.sess = tf_util.make_session(num_cpu=None, graph=self.graph)
            self.observation_ph, self.processed_obs = observation_input(
                self.venv.observation_space,
                scale=(self.network_type == "cnn"))

            with tf.variable_scope("target_model"):
                if self.network_type == 'cnn':
                    self.target_network = small_convnet(
                        self.processed_obs, tf.nn.leaky_relu)
                elif self.network_type == 'mlp':
                    self.target_network = tf_layers.mlp(
                        self.processed_obs, [1024, 512])
                    self.target_network = tf_layers.linear(
                        self.target_network, "out", 512)
                else:
                    raise ValueError("Unknown network type {}!".format(
                        self.network_type))

            with tf.variable_scope("predictor_model"):
                if self.network_type == 'cnn':
                    self.predictor_network = tf.nn.relu(
                        small_convnet(self.processed_obs, tf.nn.leaky_relu))
                elif self.network_type == 'mlp':
                    self.predictor_network = tf_layers.mlp(
                        self.processed_obs, [1024, 512])

                self.predictor_network = tf.nn.relu(
                    tf_layers.linear(self.predictor_network, "pred_fc1", 512))
                self.predictor_network = tf_layers.linear(
                    self.predictor_network, "out", 512)

            with tf.name_scope("loss"):
                self.int_reward = tf.reduce_mean(tf.square(
                    tf.stop_gradient(self.target_network) -
                    self.predictor_network),
                                                 axis=1)
                self.aux_loss = tf.reduce_mean(
                    tf.square(
                        tf.stop_gradient(self.target_network) -
                        self.predictor_network))

            with tf.name_scope("train"):
                self.optimizer = tf.train.AdamOptimizer(self.learning_rate)
                self.training_op = self.optimizer.minimize(self.aux_loss)

            self.params = tf.trainable_variables()
            tf.global_variables_initializer().run(session=self.sess)
Example #13
    def get_obs_and_pdtype(self, ob_space, ac_space):
        """
        Initialize probability distribution and get observation placeholder.

        :param ob_space: (Gym Spaces) the observation space
        :param ac_space: (Gym Spaces) the action space
        """
        self.pdtype = pdtype = make_proba_dist_type(ac_space)

        if self.obs_ph is None:
            self.obs_ph, self.processed_x = observation_input(ob_space)
        else:
            assert self.processed_x is not None

        return self.obs_ph, pdtype
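The returned `pdtype` is what downstream code uses to build action placeholders and distributions, e.g. (a sketch based on the calls that appear in Examples #8 and #16; `policy` is an assumed instance name):

obs_ph, pdtype = policy.get_obs_and_pdtype(ob_space, ac_space)
action_ph = pdtype.sample_placeholder([None], name="action_ph")
# With latent vectors from the policy/value networks:
# dist, pol, q_value = pdtype.proba_distribution_from_latent(pi_latent, vf_latent)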
Example #14
    def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse=False, scale=False,
                 obs_phs=None, add_action_ph=False):
        self.n_env = n_env
        self.n_steps = n_steps
        self.n_batch = n_batch
        self.action_mask = None
        with tf.variable_scope("input", reuse=False):
            if obs_phs is None:
                self._obs_ph, self._processed_obs = observation_input(ob_space, n_batch, scale=scale)
            else:
                self._obs_ph, self._processed_obs = obs_phs

            self._action_ph = None
            if add_action_ph:
                self._action_ph = tf.placeholder(dtype=ac_space.dtype, shape=(n_batch,) + ac_space.shape,
                                                 name="action_ph")

            self._action_mask_phs = []
            if isinstance(ac_space, MultiDiscrete):
                mask_shape = [None]
                zeros_shape = [1]
                for i, size in enumerate(ac_space.nvec):
                    mask_shape.append(size)
                    zeros_shape.append(size)
                    no_mask = tf.zeros(shape=zeros_shape, dtype=tf.float32)
                    action_mask_ph = tf.placeholder_with_default(no_mask, shape=mask_shape,
                                                                 name="action_mask_ph_{}".format(i))
                    self._action_mask_phs.append(action_mask_ph)
            elif isinstance(ac_space, Discrete):
                no_mask = tf.zeros(shape=(1, ac_space.n), dtype=tf.float32)
                action_mask_ph = tf.placeholder_with_default(no_mask, shape=(None, ac_space.n),
                                                             name="action_mask_ph_1")
                self._action_mask_phs.append(action_mask_ph)
        self.sess = sess
        self.reuse = reuse
        self.ob_space = ob_space
        self.ac_space = ac_space
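Because the masks use `tf.placeholder_with_default`, callers may omit them from `feed_dict` and get the all-zeros default, which broadcasts over the batch. A sketch for the `Discrete` case (`policy`, `logits`, and the mask semantics are assumptions; how the mask is consumed is model-specific):

import numpy as np

mask = np.zeros((1, ac_space.n), dtype=np.float32)
mask[0, 3] = 1.0  # e.g. flag action 3 as masked
# sess.run(logits, feed_dict={policy._obs_ph: obs, policy._action_mask_phs[0]: mask})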
Example #15
    def __init__(self, sess, ob_space, sc_space, me_space, g_space, ac_space, n_env, n_steps, n_batch,
                 reuse=False, scale=False, pgn_params=None,
                 obs_phs=None, sca_phs=None, mea_phs=None, goal_phs=None, future_size=6):
        super(DFPPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse=reuse, scale=scale,
                                        obs_phs=obs_phs)
        with tf.variable_scope("input_fc", reuse=False):
            # if gm_phs is None:
            #     self._gm_ph, self._processed_gm = observation_input(
            #         gm_space, n_batch, scale=scale)
            # else:
            #     self._gm_ph, self._processed_gm = gm_phs

            if sca_phs is None:
                self._sca_ph, self._processed_sca = observation_input(
                    sc_space, n_batch, scale=scale)
            else:
                self._sca_ph, self._processed_sca = sca_phs

            if mea_phs is None:
                self._mea_ph, self._processed_mea = observation_input(
                    me_space, n_batch, scale=scale)
            else:
                self._mea_ph, self._processed_mea = mea_phs

            if goal_phs is None:
                self._goal_ph, self._processed_goal = observation_input(
                    g_space, n_batch, scale=scale)
            else:
                self._goal_ph, self._processed_goal = goal_phs

            self._action_ph = None

        self.n_actions = ac_space.n
        self.future_size = future_size

        with tf.variable_scope('model', reuse=reuse):
            with tf.variable_scope('img_cnn', reuse=reuse):
                # CNN extracts the board image features
                extracted_img = img_cnn(self.processed_obs)
                extracted_img = tf.layers.flatten(extracted_img)

            with tf.variable_scope('sca_fc', reuse=reuse):
                # scalar features
                # extracted_sca = simple_fc(self.processed_sca)
                extracted_sca = tf.layers.flatten(self.processed_sca)
            with tf.variable_scope('mea_fc', reuse=reuse):
                # measurement features
                extracted_mea = simple_fc(self.processed_mea, name='mea')
                extracted_mea = tf.layers.flatten(extracted_mea)
            with tf.variable_scope('goal_fc', reuse=reuse):
                # goal features
                extracted_goal = simple_fc(self.processed_goal, name='goal')
                extracted_goal = tf.layers.flatten(extracted_goal)

            with tf.variable_scope('concat', reuse=reuse):
                # concatenate all extracted features
                extracted_input = tf.concat(
                    [extracted_img, extracted_sca, extracted_mea, extracted_goal], axis=1, name='concat')

            activ = tf.nn.relu

            with tf.variable_scope('exp_fc', reuse=reuse):
                # expectation_stream
                expectation_stream_prev = activ(linear(
                    extracted_input, 'exp_prev', n_hidden=512, init_scale=np.sqrt(2)))
                expectation_stream = activ(linear(
                    expectation_stream_prev, 'exp', n_hidden=self.future_size, init_scale=np.sqrt(2)))

            if _constants.pgn:
                print()
                print("PGN and DFP...")
                print()
                if pgn_params:
                    print("PGN Loading...")

                    len_params = len(pgn_params)
                    # Build each row independently; `[row] * n` would alias the same sublist n times.
                    prev = [[None] * (_constants.n_actions - 1) for _ in range(len_params)]
                    prev_stream = [[None] * (_constants.n_actions - 1) for _ in range(len_params)]
                    for c in range(len_params):  # c indexes the previous network column
                        with tf.variable_scope('prev_fc' + str(c), reuse=reuse):
                            for r in range(_constants.n_actions - 1):  # r indexes the action; the last one is handled separately
                                scope1 = 'action_fc/act_prev' + str(r)
                                scope2 = 'action_fc/act' + str(r)
                                prev[c][r] = activ(pgn_linear(extracted_input, scope1,
                                                              ww=pgn_params[c][scope1 + '/w'],
                                                              bb=pgn_params[c][scope1 + '/b']))
                                prev_stream[c][r] = activ(pgn_linear(prev[c][r], scope2,
                                                                     ww=pgn_params[c][scope2 + '/w'],
                                                                     bb=pgn_params[c][scope2 + '/b']))

                    action_prev = [None] * _constants.n_actions
                    action_stream = [None] * _constants.n_actions

                    with tf.variable_scope('action_fc', reuse=reuse):
                        for i in range(_constants.n_actions - 1):  # 第 i 个动作
                            action_prev[i] = linear(
                                extracted_input, 'act_prev' + str(i), n_hidden=512, init_scale=np.sqrt(2))
                            for c in range(len_params):
                                action_prev[i] = tf.add(action_prev[i], prev[c][i])
                            action_prev[i] = activ(tf.divide(action_prev[i], len_params + 1))

                            action_stream[i] = linear(
                                action_prev[i], 'act' + str(i), n_hidden=self.future_size, init_scale=np.sqrt(2))
                            for c in range(len_params):
                                action_stream[i] = tf.add(action_stream[i], prev_stream[c][i])

                            action_stream[i] = activ(tf.divide(action_stream[i], len_params + 1))

                        # the last action (index 121, place bomb) is handled separately
                        action_prev[121] = activ(linear(
                            extracted_input, 'act_prev' + str(121), n_hidden=512, init_scale=np.sqrt(2)))
                        action_stream[121] = activ(linear(
                            action_prev[121], 'act' + str(121), n_hidden=self.future_size, init_scale=np.sqrt(2)))
                else:
                    print("DFP Loading...")

                    action_prev = [None] * _constants.n_actions
                    action_stream = [None] * _constants.n_actions

                    with tf.variable_scope('action_fc', reuse=reuse):
                        for i in range(_constants.n_actions):
                            action_prev[i] = activ(linear(
                                extracted_input, 'act_prev' + str(i), n_hidden=512, init_scale=np.sqrt(2)))
                            action_stream[i] = activ(linear(
                                action_prev[i], 'act' + str(i), n_hidden=self.future_size, init_scale=np.sqrt(2)))

            else:
                print()
                print("Pure DFP...")
                print()

                action_prev = [None] * _constants.n_actions
                action_stream = [None] * _constants.n_actions

                with tf.variable_scope('action_fc', reuse=reuse):
                    for i in range(_constants.n_actions):
                        action_prev[i] = activ(linear(
                            extracted_input, 'act_prev' + str(i), n_hidden=512, init_scale=np.sqrt(2)))
                        action_stream[i] = activ(linear(
                            action_prev[i], 'act' + str(i), n_hidden=self.future_size, init_scale=np.sqrt(2)))

            n_actions = len(action_stream)

            # sum of all action streams
            action_sum = action_stream[0]
            for i in range(1, n_actions):
                action_sum = tf.add(action_sum, action_stream[i])
            # mean over actions
            action_mean = tf.divide(action_sum, n_actions)
            # center each action stream around the mean, then add the expectation stream
            for i in range(n_actions):
                action_stream[i] = tf.subtract(action_stream[i], action_mean)
                action_stream[i] = tf.add(action_stream[i], expectation_stream)

            with tf.variable_scope('future', reuse=reuse):
                self._future_stream = tf.convert_to_tensor(action_stream)
                self._setup_init()
Example #16
    def __init__(self, cfg, env, arch_type, graph, sess):

        # `x is 'a' or 'b'` is always truthy; test membership instead.
        assert arch_type in ('train', 'act'), 'type should be either "train" or "act"'

        cfg_env = cfg['environment']
        cfg_arch = cfg['architecture']

        if arch_type == 'train':
            self.num_steps = math.floor(cfg_env['max_time'] /
                                        cfg_env['control_dt'])
        else:
            self.num_steps = 1

        self.observation_space = env.observation_space
        self.action_space = env.action_space
        self.pdtype = make_proba_dist_type(self.action_space)
        self.n_env = cfg["environment"]["num_envs"]
        self.graph = graph

        with self.graph.as_default():
            with tf.variable_scope("model", reuse=tf.AUTO_REUSE):

                batch_size = self.num_steps * self.n_env

                if arch_type == 'train':
                    # Integer division keeps the placeholder batch dimension an int.
                    batch_size //= cfg["algorithm"]["minibatch"]

                self.obs_ph, self.processed_obs = observation_input(
                    self.observation_space, batch_size, scale=False)

                act_fun = tf.nn.relu

                # Feed the processed (network-ready) observations, not the raw placeholder.
                pi_latent = self.processed_obs
                vi_latent = self.processed_obs

                for idx, dec_layer_size in enumerate(cfg_arch["pi_net"]):
                    pi_latent = act_fun(
                        linear(pi_latent,
                               "pi_net_fc{}".format(idx),
                               dec_layer_size,
                               init_scale=np.sqrt(2)))

                for idx, dec_layer_size in enumerate(cfg_arch["vi_net"]):
                    vi_latent = act_fun(
                        linear(vi_latent,
                               "vi_net_fc{}".format(idx),
                               dec_layer_size,
                               init_scale=np.sqrt(2)))

                self.value_fn = linear(vi_latent, 'vf', 1)
                self.value = self.value_fn[:, 0]
                self.proba_distribution, self.policy, self.q_value = \
                    self.pdtype.proba_distribution_from_latent(pi_latent, vi_latent, init_scale=0.01)
                self.action_ph = self.pdtype.sample_placeholder(
                    [None], name="action_ph")
                self.masks_ph = tf.placeholder(tf.float32, [None], "masks_ph")
                self.action = self.proba_distribution.sample()
                self.neglogp = self.proba_distribution.neglogp(self.action)

        self.initial_state = None
        self.sess = sess

        # continuous action diagonal covariance
        self.policy_proba = [
            self.proba_distribution.mean, self.proba_distribution.std
        ]
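A sketch of stepping the 'act' configuration once the graph is built (attribute names as defined above; the class name is not shown in the example, so `policy` is an assumed instance):

import numpy as np

# policy = Policy(cfg, env, 'act', graph, sess)  # 'act' => num_steps == 1
obs = np.zeros((policy.n_env,) + policy.observation_space.shape, dtype=np.float32)
actions, neglogps = policy.sess.run(
    [policy.action, policy.neglogp],
    feed_dict={policy.obs_ph: obs})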