def __init__(self,
                 policy_name,
                 env_spec,
                 latent_sampler,
                 hidden_sizes=(32, 32),
                 hidden_nonlinearity=tf.nn.tanh,
                 prob_network=None):
        Serializable.quick_init(self, locals())
        name = policy_name
        self.latent_sampler = latent_sampler

        with tf.variable_scope(name):
            if prob_network is None:
                input_dim = env_spec.observation_space.flat_dim + self.latent_sampler.dim
                l_input = L.InputLayer(shape=(None, input_dim), name="input")
                prob_network = MLP(input_layer=l_input,
                                   output_dim=env_spec.action_space.n,
                                   hidden_sizes=hidden_sizes,
                                   hidden_nonlinearity=hidden_nonlinearity,
                                   output_nonlinearity=tf.nn.softmax,
                                   name="prob_network")
            # Set these regardless of whether a prob_network was passed in,
            # so a caller-supplied network is wired up too.
            self._output = prob_network.output
            self._inputs = prob_network.input_var

        super(CategoricalLatentVarMLPPolicy,
              self).__init__(name=name,
                             env_spec=env_spec,
                             prob_network=prob_network)
Example #2
    def __init__(self,
                 name,
                 env_spec,
                 latent_sampler,
                 hidden_sizes=(32, 32),
                 std_hidden_sizes=(32, 32),
                 min_std=1e-6,
                 std_hidden_nonlinearity=tf.nn.tanh,
                 hidden_nonlinearity=tf.nn.tanh,
                 output_nonlinearity=None):
        Serializable.quick_init(self, locals())
        assert isinstance(env_spec.action_space, Box)
        self.latent_sampler = latent_sampler

        with tf.variable_scope(name):
            obs_dim = env_spec.observation_space.flat_dim + self.latent_sampler.dim
            action_dim = env_spec.action_space.flat_dim

            # create networks
            mean_network = MLP(
                name="mean_network",
                input_shape=(obs_dim, ),
                output_dim=action_dim,
                hidden_sizes=hidden_sizes,
                hidden_nonlinearity=hidden_nonlinearity,
                output_nonlinearity=output_nonlinearity,
            )
            std_network = MLP(
                name="std_network",
                input_shape=(obs_dim, ),
                input_layer=mean_network.input_layer,
                output_dim=action_dim,
                hidden_sizes=std_hidden_sizes,
                hidden_nonlinearity=std_hidden_nonlinearity,
                output_nonlinearity=None,
            )

            super(GaussianLatentVarMLPPolicy,
                  self).__init__(name=name,
                                 env_spec=env_spec,
                                 mean_network=mean_network,
                                 std_network=std_network)
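As a quick illustration of the two heads above (the mean and std networks share one input layer), here is how a diagonal-Gaussian action could be drawn from their outputs, with `min_std` used as a floor on the standard deviation. The numbers are placeholders, not outputs of the networks.

import numpy as np

mean = np.array([0.2, -0.1])        # stand-in for the mean network's output
log_std = np.array([-3.0, -20.0])   # stand-in for the std network's output
min_std = 1e-6
std = np.maximum(np.exp(log_std), min_std)   # floor the std, as min_std suggests
action = mean + std * np.random.randn(2)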
Example #3
def get_policy_network(env):
    policy_network = MLP(
        name='mean_network',
        input_shape=env.observation_space.shape,
        output_dim=env.action_space.flat_dim,
        hidden_sizes=(64, 32),
        hidden_nonlinearity=tf.nn.relu,
        output_nonlinearity=tf.nn.softmax,
        batch_normalization=False,
    )
    return policy_network
Example #4
def get_value_network(env):
    value_network = MLP(
        name='value_network',
        input_shape=env.observation_space.shape,
        output_dim=1,
        hidden_sizes=(32, ),
        hidden_nonlinearity=tf.nn.relu,
        output_nonlinearity=None,
        batch_normalization=False,
    )
    return value_network
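A usage sketch for the two factory functions above. The stub env is hypothetical; it carries only the two attributes the functions actually read (`observation_space.shape` and `action_space.flat_dim`).

from types import SimpleNamespace

fake_env = SimpleNamespace(
    observation_space=SimpleNamespace(shape=(8, )),
    action_space=SimpleNamespace(flat_dim=3),
)
# Within a TF graph, and with MLP importable as above, these would build an
# 8 -> 64 -> 32 -> 3 softmax policy net and an 8 -> 32 -> 1 value net:
# policy_network = get_policy_network(fake_env)
# value_network = get_value_network(fake_env)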
Example #5
def get_policy_network(env):
    policy_network = MLP(
        name='mean_network',
        input_shape=env.observation_space.shape,
        output_dim=env.action_space.flat_dim,
        hidden_sizes=(64, 32),
        hidden_nonlinearity=get_nonlinearity_for_trpo(),
        output_nonlinearity=None,
        batch_normalization=False,
    )
    return policy_network
    def __init__(
        self,
        name,
        env_spec,
        hidden_sizes=(32, 32),
        hidden_nonlinearity=tf.nn.tanh,
        output_nonlinearity=tf.nn.tanh,
        mean_network=None,
    ):
        """
        :param env_spec: A spec for the environment.
        :param hidden_sizes: list of sizes for the fully-connected hidden layers
        :param hidden_nonlinearity: nonlinearity used for each hidden layer
        :param output_nonlinearity: nonlinearity for the output layer
        :param mean_network: custom network for the output mean
        :return:
        """
        Serializable.quick_init(self, locals())
        assert isinstance(env_spec.action_space, Box)

        with tf.variable_scope(name):

            obs_dim = env_spec.observation_space.flat_dim
            action_dim = env_spec.action_space.flat_dim

            # create network
            if mean_network is None:
                mean_network = MLP(
                    name="mean_network",
                    input_shape=(obs_dim, ),
                    output_dim=action_dim,
                    hidden_sizes=hidden_sizes,
                    hidden_nonlinearity=hidden_nonlinearity,
                    output_nonlinearity=output_nonlinearity,
                )
            self._mean_network = mean_network

            l_mean = mean_network.output_layer
            obs_var = mean_network.input_layer.input_var

            self._l_mean = l_mean
            action_var = L.get_output(self._l_mean, deterministic=True)

            LayersPowered.__init__(self, [l_mean])
            super(DeterministicMLPPolicy, self).__init__(env_spec)

            self._f_actions = tensor_utils.compile_function(
                inputs=[obs_var],
                outputs=action_var,
            )
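`tensor_utils.compile_function` above turns placeholder inputs and output tensors into a plain Python callable. A rough TF1-style equivalent looks like the sketch below (an approximation of its behavior, not the library source):

import tensorflow as tf

def compile_function(inputs, outputs):
    # Bind placeholders once; feed values positionally at call time.
    def run(*input_vals):
        sess = tf.get_default_session()
        return sess.run(outputs, feed_dict=dict(zip(inputs, input_vals)))
    return run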
Example #7
    def _make_subnetwork(self,
                         input_layer,
                         dim_output,
                         hidden_sizes,
                         output_nonlinearity=tf.sigmoid,
                         name="pred_network"):

        prob_network = MLP(
            # input_shape=(env_spec.observation_space.flat_dim,),
            output_dim=dim_output,
            hidden_sizes=hidden_sizes,
            hidden_nonlinearity=lrelu,
            output_nonlinearity=output_nonlinearity,
            name=name,
            input_layer=input_layer)

        return L.get_output(
            prob_network.output_layer), prob_network.output_layer
Example #8
    def __init__(
            self,
            name,
            env_spec,
            hidden_sizes=(32, 32),
            hidden_nonlinearity=tf.nn.tanh,
            prob_network=None,
    ):
        """
        :param env_spec: A spec for the mdp.
        :param hidden_sizes: list of sizes for the fully connected hidden layers
        :param hidden_nonlinearity: nonlinearity used for each hidden layer
        :param prob_network: a manually specified network for this policy; if
        provided, the other network params are ignored
        :return:
        """
        Serializable.quick_init(self, locals())

        assert isinstance(env_spec.action_space, Discrete)

        with tf.variable_scope(name):
            if prob_network is None:
                prob_network = MLP(
                    input_shape=(env_spec.observation_space.flat_dim,),
                    output_dim=env_spec.action_space.n,
                    hidden_sizes=hidden_sizes,
                    hidden_nonlinearity=hidden_nonlinearity,
                    output_nonlinearity=tf.nn.softmax,
                    name="prob_network",
                )

            self._l_prob = prob_network.output_layer
            self._l_obs = prob_network.input_layer
            self._f_prob = tensor_utils.compile_function(
                [prob_network.input_layer.input_var],
                L.get_output(prob_network.output_layer)
            )

            self._dist = Categorical(env_spec.action_space.n)

            super(CategoricalMLPPolicy, self).__init__(env_spec)
            LayersPowered.__init__(self, [prob_network.output_layer])
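A hedged usage sketch for the constructor above; the import path follows the usual sandbox.rocky.tf layout of rllab and is stated as an assumption, as is the env.

# from sandbox.rocky.tf.envs.base import TfEnv          # assumed import path
# env = TfEnv(...)                                      # any Discrete-action env
# policy = CategoricalMLPPolicy(name="policy",
#                               env_spec=env.spec,
#                               hidden_sizes=(32, 32))
# probs = policy._f_prob(obs_batch)   # (N, action_space.n) softmax rows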
    def __init__(self,
                 name,
                 env_spec,
                 hidden_sizes=(32, 32),
                 hidden_nonlinearity=tf.nn.relu,
                 output_nonlinearity=tf.nn.tanh,
                 prob_network=None,
                 bn=False):
        Serializable.quick_init(self, locals())

        ## Apply MC Dropout on the MLP networks here

        with tf.variable_scope(name):
            if prob_network is None:
                prob_network = MLP(
                    input_shape=(env_spec.observation_space.flat_dim, ),
                    output_dim=env_spec.action_space.flat_dim,
                    hidden_sizes=hidden_sizes,
                    hidden_nonlinearity=hidden_nonlinearity,
                    output_nonlinearity=output_nonlinearity,
                    # batch_normalization=True,
                    name="prob_network",
                )

            self._l_prob = prob_network.output_layer
            self._l_obs = prob_network.input_layer
            self._f_prob = tensor_utils.compile_function(
                [prob_network.input_layer.input_var],
                L.get_output(prob_network.output_layer, deterministic=True))

        self.prob_network = prob_network

        # Note the deterministic=True argument. It makes sure that when getting
        # actions from single observations, we do not update params in the
        # batch normalization layers.
        # TODO: this doesn't currently work properly in the tf version so we leave out batch_norm
        super(DeterministicMLPPolicy, self).__init__(env_spec)
        LayersPowered.__init__(self, [prob_network.output_layer])
Example #10
    def __init__(self,
                 name,
                 abstract_dim,
                 hidden_sizes=(32, ),
                 min_std=1e-6,
                 hidden_nonlinearity=tf.nn.tanh,
                 output_nonlinearity=None,
                 optim=tf.train.AdamOptimizer(learning_rate=0.001)):
        self.obs_dim = abstract_dim
        self.min_std = min_std
        self.distribution = DiagonalGaussian(self.obs_dim)
        with tf.variable_scope(name):
            self.net = MLP(
                name="mu_log_sigma",
                input_shape=(self.obs_dim, ),
                output_dim=2 * self.obs_dim,
                hidden_sizes=hidden_sizes,
                hidden_nonlinearity=hidden_nonlinearity,
                output_nonlinearity=output_nonlinearity,
            )
        self.obs_var = self.net.input_layer.input_var
        self.output = L.get_output(self.net.output_layer, self.obs_var)
        self.mu, unstable_log_sigma = tf.split(self.output,
                                               [self.obs_dim, self.obs_dim], 1)
        # Floor the *log* std at log(min_std); clamping at min_std itself would
        # effectively floor the std at exp(1e-6) ~= 1.
        self.log_sigma = tf.maximum(unstable_log_sigma, np.log(self.min_std))

        self.nexts = tf.placeholder(tf.float32, shape=(None, self.obs_dim))
        # Symbolic NLL; rllab's DiagonalGaussian takes the "log_std" key and
        # the *_sym method for tensor arguments.
        self.loss = -self.distribution.log_likelihood_sym(
            self.nexts, dist_info_vars=dict(mean=self.mu,
                                            log_std=self.log_sigma))
        self.optimizer = optim
        self.train_op = optim.minimize(self.loss)

        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        self.sess = tf.Session(config=config)
        self.sess.run(tf.global_variables_initializer())
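The loss above is a negative diagonal-Gaussian log-likelihood. For reference, the density being maximized is the standard one; a small numpy version:

import numpy as np

def diag_gaussian_log_likelihood(x, mean, log_std):
    # log p(x) = -1/2 * sum(((x - mu) / sigma)^2) - sum(log sigma)
    #            - (d / 2) * log(2 * pi)
    std = np.exp(log_std)
    z = (x - mean) / std
    d = x.shape[-1]
    return (-0.5 * np.sum(z ** 2, axis=-1)
            - np.sum(log_std, axis=-1)
            - 0.5 * d * np.log(2.0 * np.pi))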
    def _make_subnetwork(self,
                         input_layer,
                         dim_output,
                         hidden_sizes,
                         output_nonlinearity=tf.sigmoid,
                         hidden_nonlinearity=tf.nn.tanh,
                         name="pred_network",
                         conv_filters=None,
                         conv_filter_sizes=None,
                         conv_strides=None,
                         conv_pads=None,
                         input_shape=None):

        if conv_filters is not None:
            input_layer = L.reshape(input_layer, ([0], ) + input_shape,
                                    name="reshape_input")
            prob_network = ConvNetwork(input_shape=input_shape,
                                       output_dim=dim_output,
                                       hidden_sizes=hidden_sizes,
                                       hidden_nonlinearity=hidden_nonlinearity,
                                       output_nonlinearity=output_nonlinearity,
                                       name=name,
                                       input_layer=input_layer,
                                       conv_filters=conv_filters,
                                       conv_filter_sizes=conv_filter_sizes,
                                       conv_strides=conv_strides,
                                       conv_pads=conv_pads)
        else:
            prob_network = MLP(output_dim=dim_output,
                               hidden_sizes=hidden_sizes,
                               hidden_nonlinearity=hidden_nonlinearity,
                               output_nonlinearity=output_nonlinearity,
                               name=name,
                               input_layer=input_layer)

        return prob_network.output_layer
    def __init__(self,
                 name,
                 env_spec,
                 num_models=5,
                 hidden_sizes=(512, 512),
                 hidden_nonlinearity=tf.nn.relu,
                 output_nonlinearity=None,
                 batch_size=500,
                 step_size=0.001,
                 weight_normalization=False,
                 normalize_input=True,
                 optimizer=tf.train.AdamOptimizer,
                 valid_split_ratio=0.2,
                 rolling_average_persitency=0.99):

        Serializable.quick_init(self, locals())

        self.normalization = None
        self.normalize_input = normalize_input
        self.next_batch = None

        self.valid_split_ratio = valid_split_ratio
        self.rolling_average_persitency = rolling_average_persitency

        self.batch_size = batch_size
        self.step_size = step_size
        self.num_models = num_models
        self.name = name

        # determine dimensionality of state and action space
        self.obs_space_dims = obs_space_dims = env_spec.observation_space.shape[0]
        self.action_space_dims = action_space_dims = env_spec.action_space.shape[0]
        """ computation graph for training and simple inference """
        with tf.variable_scope(name):
            # placeholders
            self.obs_ph = tf.placeholder(tf.float32,
                                         shape=(None, obs_space_dims))
            self.act_ph = tf.placeholder(tf.float32,
                                         shape=(None, action_space_dims))
            self.delta_ph = tf.placeholder(tf.float32,
                                           shape=(None, obs_space_dims))

            # concatenate action and observation --> NN input
            self.nn_input = tf.concat([self.obs_ph, self.act_ph], axis=1)

            # create MLP
            mlps = []
            delta_preds = []
            self.obs_next_pred = []
            for i in range(num_models):
                with tf.variable_scope('model_{}'.format(i)):
                    mlp = MLP(name,
                              obs_space_dims,
                              hidden_sizes,
                              hidden_nonlinearity,
                              output_nonlinearity,
                              input_var=self.nn_input,
                              input_shape=(obs_space_dims +
                                           action_space_dims, ),
                              weight_normalization=weight_normalization)
                    mlps.append(mlp)

                delta_preds.append(mlp.output)
            self.delta_pred = tf.stack(
                delta_preds, axis=2)  # shape: (batch_size, ndim_obs, n_models)

            # define loss and train_op
            self.loss = tf.reduce_mean(
                (self.delta_ph[:, :, None] - self.delta_pred)**2)
            self.optimizer = optimizer(self.step_size)
            self.train_op = self.optimizer.minimize(self.loss)

            # tensor_utils
            self.f_delta_pred = tensor_utils.compile_function(
                [self.obs_ph, self.act_ph], self.delta_pred)
        """ computation graph for inference where each of the models receives a different batch"""
        with tf.variable_scope(name, reuse=True):
            # placeholders
            self.obs_model_batches_stack_ph = tf.placeholder(
                tf.float32, shape=(None, obs_space_dims))
            self.act_model_batches_stack_ph = tf.placeholder(
                tf.float32, shape=(None, action_space_dims))
            self.delta_model_batches_stack_ph = tf.placeholder(
                tf.float32, shape=(None, obs_space_dims))

            # split stack into the batches for each model --> assume each model receives a batch of the same size
            self.obs_model_batches = tf.split(self.obs_model_batches_stack_ph,
                                              self.num_models,
                                              axis=0)
            self.act_model_batches = tf.split(self.act_model_batches_stack_ph,
                                              self.num_models,
                                              axis=0)
            self.delta_model_batches = tf.split(
                self.delta_model_batches_stack_ph, self.num_models, axis=0)

            # reuse previously created MLP but each model receives its own batch
            delta_preds = []
            self.obs_next_pred = []
            self.loss_model_batches = []
            self.train_op_model_batches = []
            for i in range(num_models):
                with tf.variable_scope('model_{}'.format(i), reuse=True):
                    # concatenate action and observation --> NN input
                    nn_input = tf.concat(
                        [self.obs_model_batches[i], self.act_model_batches[i]],
                        axis=1)
                    mlp = MLP(name,
                              obs_space_dims,
                              hidden_sizes,
                              hidden_nonlinearity,
                              output_nonlinearity,
                              input_var=nn_input,
                              input_shape=(obs_space_dims +
                                           action_space_dims, ),
                              weight_normalization=weight_normalization)
                delta_preds.append(mlp.output)
                loss = tf.reduce_mean(
                    (self.delta_model_batches[i] - mlp.output)**2)
                self.loss_model_batches.append(loss)
                self.train_op_model_batches.append(
                    optimizer(self.step_size).minimize(loss))
            self.delta_pred_model_batches_stack = tf.concat(
                delta_preds,
                axis=0)  # shape: (batch_size_per_model*num_models, ndim_obs)

            # tensor_utils
            self.f_delta_pred_model_batches = tensor_utils.compile_function([
                self.obs_model_batches_stack_ph,
                self.act_model_batches_stack_ph
            ], self.delta_pred_model_batches_stack)

        LayersPowered.__init__(self, [mlp.output_layer for mlp in mlps])
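A small numpy check of the broadcasting trick in the ensemble loss above: one target of shape (batch, obs_dim) is compared against stacked predictions of shape (batch, obs_dim, n_models) by adding a trailing axis.

import numpy as np

batch, obs_dim, n_models = 4, 3, 5
delta_target = np.random.randn(batch, obs_dim)           # like delta_ph
delta_pred = np.random.randn(batch, obs_dim, n_models)   # like tf.stack(..., axis=2)

sq_err = (delta_target[:, :, None] - delta_pred) ** 2    # broadcasts over models
assert sq_err.shape == (batch, obs_dim, n_models)
loss = sq_err.mean()                                     # same reduction as above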
    params_log_file = osp.join(log_dir, 'params.json')

    # logger.log_parameters_lite(params_log_file, args)
    logger.add_text_output(text_log_file)
    logger.add_tabular_output(tabular_log_file)
    prev_snapshot_dir = logger.get_snapshot_dir()
    prev_mode = logger.get_snapshot_mode()
    logger.set_snapshot_dir(log_dir)
    logger.set_snapshot_mode('all')
    logger.set_log_tabular_only(False)
    logger.push_prefix("[%s] " % exp_name)

    feature_network = MLP(name='feature_net',
                          input_shape=(env.spec.observation_space.flat_dim +
                                       env.spec.action_space.flat_dim, ),
                          output_dim=7,
                          hidden_nonlinearity=tf.nn.tanh,
                          hidden_sizes=(32, 32),
                          output_nonlinearity=None)

    policy = GSMDPCategoricalGRUPolicy(feature_network=feature_network,
                                       env_spec=env.spec,
                                       name="policy")
    # policy = CategoricalMLPPolicy(env_spec=env.spec, name = "policy")
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    algo = TRPO(env=env,
                policy=policy,
                baseline=baseline,
                n_itr=750,
                max_path_length=100000,
                batch_size=20000,
Example #14
File: rurllab.py Project: zeyuan1987/MADRL
def rllab_envpolicy_parser(env, args):
    if isinstance(args, dict):
        args = tonamedtuple(args)

    env = RLLabEnv(env, mode=args.control)
    if args.algo[:2] == 'tf':
        env = TfEnv(env)

        # Policy
        if args.recurrent:
            if args.feature_net:
                feature_network = MLP(
                    name='feature_net',
                    input_shape=(env.spec.observation_space.flat_dim +
                                 env.spec.action_space.flat_dim, ),
                    output_dim=args.feature_output,
                    hidden_sizes=tuple(args.feature_hidden),
                    hidden_nonlinearity=tf.nn.tanh,
                    output_nonlinearity=None)
            elif args.conv:
                strides = tuple(args.conv_strides)
                chans = tuple(args.conv_channels)
                filts = tuple(args.conv_filters)

                assert len(strides) == len(chans) == len(filts), \
                    "strides, chans and filts must have equal length"
                # only discrete actions supported, should be straightforward to extend to continuous
                assert isinstance(
                    env.spec.action_space,
                    Discrete), "Only discrete action spaces support conv"
                feature_network = ConvNetwork(
                    name='feature_net',
                    input_shape=env.spec.observation_space.shape,
                    output_dim=args.feature_output,
                    conv_filters=chans,
                    conv_filter_sizes=filts,
                    conv_strides=strides,
                    conv_pads=('VALID', ) * len(chans),
                    hidden_sizes=tuple(args.feature_hidden),
                    hidden_nonlinearity=tf.nn.relu,
                    output_nonlinearity=None)
            else:
                feature_network = None
            if args.recurrent == 'gru':
                if isinstance(env.spec.action_space, Box):
                    policy = GaussianGRUPolicy(env_spec=env.spec,
                                               feature_network=feature_network,
                                               hidden_dim=int(
                                                   args.policy_hidden[0]),
                                               name='policy')
                elif isinstance(env.spec.action_space, Discrete):
                    policy = CategoricalGRUPolicy(
                        env_spec=env.spec,
                        feature_network=feature_network,
                        hidden_dim=int(args.policy_hidden[0]),
                        name='policy',
                        state_include_action=False if args.conv else True)
                else:
                    raise NotImplementedError(env.spec.observation_space)

            elif args.recurrent == 'lstm':
                if isinstance(env.spec.action_space, Box):
                    policy = GaussianLSTMPolicy(
                        env_spec=env.spec,
                        feature_network=feature_network,
                        hidden_dim=int(args.policy_hidden),
                        name='policy')
                elif isinstance(env.spec.action_space, Discrete):
                    policy = CategoricalLSTMPolicy(
                        env_spec=env.spec,
                        feature_network=feature_network,
                        hidden_dim=int(args.policy_hidden),
                        name='policy')
                else:
                    raise NotImplementedError(env.spec.action_space)

            else:
                raise NotImplementedError(args.recurrent)
        elif args.conv:
            strides = tuple(args.conv_strides)
            chans = tuple(args.conv_channels)
            filts = tuple(args.conv_filters)

            assert len(strides) == len(chans) == len(filts), \
                "strides, chans and filts must have equal length"
            # only discrete actions supported, should be straightforward to extend to continuous
            assert isinstance(
                env.spec.action_space,
                Discrete), "Only discrete action spaces support conv"
            feature_network = ConvNetwork(
                name='feature_net',
                input_shape=env.spec.observation_space.shape,
                output_dim=env.spec.action_space.n,
                conv_filters=chans,
                conv_filter_sizes=filts,
                conv_strides=strides,
                conv_pads=('VALID', ) * len(chans),
                hidden_sizes=tuple(args.policy_hidden),
                hidden_nonlinearity=tf.nn.relu,
                output_nonlinearity=tf.nn.softmax)
            policy = CategoricalMLPPolicy(name='policy',
                                          env_spec=env.spec,
                                          prob_network=feature_network)
        else:
            if isinstance(env.spec.action_space, Box):
                policy = GaussianMLPPolicy(env_spec=env.spec,
                                           hidden_sizes=tuple(
                                               args.policy_hidden),
                                           min_std=args.min_std,
                                           name='policy')
            elif isinstance(env.spec.action_space, Discrete):
                policy = CategoricalMLPPolicy(env_spec=env.spec,
                                              hidden_sizes=tuple(
                                                  args.policy_hidden),
                                              name='policy')
            else:
                raise NotImplementedError(env.spec.action_space)
    elif args.algo[:2] == 'th':
        # Policy
        if args.recurrent:
            if args.feature_net:
                feature_network = thMLP(
                    input_shape=(env.spec.observation_space.flat_dim +
                                 env.spec.action_space.flat_dim, ),
                    output_dim=args.feature_output,
                    hidden_sizes=tuple(args.feature_hidden),
                    # NB: a TF nonlinearity inside the Theano branch; the
                    # Theano MLP would need a lasagne nonlinearity here.
                    hidden_nonlinearity=tf.nn.tanh,
                    output_nonlinearity=None)
            else:
                feature_network = None
            if args.recurrent == 'gru':
                if isinstance(env.spec.observation_space, thBox):
                    policy = thGaussianGRUPolicy(
                        env_spec=env.spec,
                        feature_network=feature_network,
                        hidden_dim=int(args.policy_hidden[0]),
                    )
                elif isinstance(env.spec.observation_space, thDiscrete):
                    policy = thCategoricalGRUPolicy(
                        env_spec=env.spec,
                        feature_network=feature_network,
                        hidden_dim=int(args.policy_hidden[0]),
                    )
                else:
                    raise NotImplementedError(env.spec.observation_space)

            # elif args.recurrent == 'lstm':
            #     if isinstance(env.spec.action_space, thBox):
            #         policy = thGaussianLSTMPolicy(env_spec=env.spec,
            #                                       feature_network=feature_network,
            #                                       hidden_dim=int(args.policy_hidden),
            #                                       name='policy')
            #     elif isinstance(env.spec.action_space, thDiscrete):
            #         policy = thCategoricalLSTMPolicy(env_spec=env.spec,
            #                                          feature_network=feature_network,
            #                                          hidden_dim=int(args.policy_hidden),
            #                                          name='policy')
            #     else:
            #         raise NotImplementedError(env.spec.action_space)

            else:
                raise NotImplementedError(args.recurrent)
        else:
            if args.algo == 'thddpg':
                assert isinstance(env.spec.action_space, thBox)
                policy = thDeterministicMLPPolicy(
                    env_spec=env.spec,
                    hidden_sizes=tuple(args.policy_hidden),
                )
            else:
                if isinstance(env.spec.action_space, thBox):
                    policy = thGaussianMLPPolicy(env_spec=env.spec,
                                                 hidden_sizes=tuple(
                                                     args.policy_hidden),
                                                 min_std=args.min_std)
                elif isinstance(env.spec.action_space, thDiscrete):
                    policy = thCategoricalMLPPolicy(env_spec=env.spec,
                                                    hidden_sizes=tuple(
                                                        args.policy_hidden),
                                                    min_std=args.min_std)
                else:
                    raise NotImplementedError(env.spec.action_space)

    if args.control == 'concurrent':
        # `policies` (one per agent) is never built in the code shown here;
        # only the single `policy` above is defined.
        return env, policies
    else:
        return env, policy
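A usage sketch for the parser above. The dict keys mirror the attributes the function reads on this code path; everything about the concrete values is an assumption.

# env, policy = rllab_envpolicy_parser(raw_env, {
#     'control': 'decentralized',
#     'algo': 'tftrpo',        # 'tf' prefix selects the TensorFlow branch
#     'recurrent': None,       # feed-forward policy
#     'conv': False,
#     'policy_hidden': [128, 128],
#     'min_std': 1e-4,
# })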
Example #15
    def __init__(
            self,
            name,
            input_shape,
            output_dim,
            # observation_space,
            mean_network=None,
            hidden_sizes=(32, 32),
            hidden_nonlinearity=tf.nn.tanh,
            optimizer=None,
            use_trust_region=True,
            step_size=0.01,
            learn_std=True,
            init_std=1.0,
            adaptive_std=False,
            std_share_network=False,
            std_hidden_sizes=(32, 32),
            std_nonlinearity=None,
            normalize_inputs=True,
            normalize_outputs=True,
            subsample_factor=1.0
    ):
        """
        :param input_shape: Shape of the input data.
        :param output_dim: Dimension of output.
        :param hidden_sizes: Number of hidden units of each layer of the mean network.
        :param hidden_nonlinearity: Non-linearity used for each layer of the mean network.
        :param optimizer: Optimizer for minimizing the negative log-likelihood.
        :param use_trust_region: Whether to use trust region constraint.
        :param step_size: KL divergence constraint for each iteration
        :param learn_std: Whether to learn the standard deviations. Only effective if adaptive_std is False. If
        adaptive_std is True, this parameter is ignored, and the weights for the std network are always learned.
        :param adaptive_std: Whether to make the std a function of the states.
        :param std_share_network: Whether to use the same network as the mean.
        :param std_hidden_sizes: Number of hidden units of each layer of the std network. Only used if
        `std_share_network` is False. It defaults to the same architecture as the mean.
        :param std_nonlinearity: Non-linearity used for each layer of the std network. Only used if `std_share_network`
        is False. It defaults to the same non-linearity as the mean.
        """
        Serializable.quick_init(self, locals())

        with tf.variable_scope(name):

            if optimizer is None:
                if use_trust_region:
                    optimizer = PenaltyLbfgsOptimizer("optimizer")
                else:
                    optimizer = LbfgsOptimizer("optimizer")

            self._optimizer = optimizer
            self._subsample_factor = subsample_factor

            if mean_network is None:
                mean_network = MLP(
                    name="mean_network",
                    input_shape=input_shape,
                    output_dim=output_dim,
                    hidden_sizes=hidden_sizes,
                    hidden_nonlinearity=hidden_nonlinearity,
                    output_nonlinearity=None,
                )

            l_mean = mean_network.output_layer

            if adaptive_std:
                l_log_std = MLP(
                    name="log_std_network",
                    input_shape=input_shape,
                    input_var=mean_network.input_layer.input_var,
                    output_dim=output_dim,
                    hidden_sizes=std_hidden_sizes,
                    hidden_nonlinearity=std_nonlinearity,
                    output_nonlinearity=None,
                ).output_layer
            else:
                l_log_std = L.ParamLayer(
                    mean_network.input_layer,
                    num_units=output_dim,
                    param=tf.constant_initializer(np.log(init_std)),
                    name="output_log_std",
                    trainable=learn_std,
                )

            LayersPowered.__init__(self, [l_mean, l_log_std])

            xs_var = mean_network.input_layer.input_var
            ys_var = tf.placeholder(dtype=tf.float32, name="ys", shape=(None, output_dim))
            old_means_var = tf.placeholder(dtype=tf.float32, name="ys", shape=(None, output_dim))
            old_log_stds_var = tf.placeholder(dtype=tf.float32, name="old_log_stds", shape=(None, output_dim))

            x_mean_var = tf.Variable(
                np.zeros((1,) + input_shape, dtype=np.float32),
                name="x_mean",
            )
            x_std_var = tf.Variable(
                np.ones((1,) + input_shape, dtype=np.float32),
                name="x_std",
            )
            y_mean_var = tf.Variable(
                np.zeros((1, output_dim), dtype=np.float32),
                name="y_mean",
            )
            y_std_var = tf.Variable(
                np.ones((1, output_dim), dtype=np.float32),
                name="y_std",
            )

            self.x_mean_var = x_mean_var
            self.x_std_var = x_std_var
            self.y_mean_var = y_mean_var
            self.y_std_var = y_std_var
            # self.observation_space = observation_space

            normalized_xs_var = (xs_var - x_mean_var) / x_std_var
            normalized_ys_var = (ys_var - y_mean_var) / y_std_var

            normalized_means_var = L.get_output(l_mean, {mean_network.input_layer: normalized_xs_var})
            normalized_log_stds_var = L.get_output(l_log_std, {mean_network.input_layer: normalized_xs_var})

            means_var = normalized_means_var * y_std_var + y_mean_var
            log_stds_var = normalized_log_stds_var + tf.log(y_std_var)

            normalized_old_means_var = (old_means_var - y_mean_var) / y_std_var
            normalized_old_log_stds_var = old_log_stds_var - tf.log(y_std_var)

            dist = self._dist = DiagonalGaussian(output_dim)

            normalized_dist_info_vars = dict(mean=normalized_means_var, log_std=normalized_log_stds_var)

            mean_kl = tf.reduce_mean(dist.kl_sym(
                dict(mean=normalized_old_means_var, log_std=normalized_old_log_stds_var),
                normalized_dist_info_vars,
            ))

            loss = - tf.reduce_mean(dist.log_likelihood_sym(normalized_ys_var, normalized_dist_info_vars))

            self._f_predict = tensor_utils.compile_function([xs_var], means_var)
            self._f_pdists = tensor_utils.compile_function([xs_var], [means_var, log_stds_var])
            self._l_mean = l_mean
            self._l_log_std = l_log_std

            optimizer_args = dict(
                loss=loss,
                target=self,
                network_outputs=[normalized_means_var, normalized_log_stds_var],
            )

            if use_trust_region:
                optimizer_args["leq_constraint"] = (mean_kl, step_size)
                optimizer_args["inputs"] = [xs_var, ys_var, old_means_var, old_log_stds_var]
            else:
                optimizer_args["inputs"] = [xs_var, ys_var]

            self._optimizer.update_opt(**optimizer_args)

            self._use_trust_region = use_trust_region
            self._name = name

            self._normalize_inputs = normalize_inputs
            self._normalize_outputs = normalize_outputs
            self._mean_network = mean_network
            self._x_mean_var = x_mean_var
            self._x_std_var = x_std_var
            self._y_mean_var = y_mean_var
            self._y_std_var = y_std_var

            self.input_dim = input_shape[0]
            self.output_dim = output_dim
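For reference, the denormalization identities used above follow from fitting the Gaussian in normalized target space: if y_norm = (y - y_mean) / y_std, the mean maps back as m_norm * y_std + y_mean and the log-std shifts by log(y_std). A tiny numpy check:

import numpy as np

y_mean, y_std = 2.0, 3.0
m_norm, log_s_norm = 0.5, -1.0

mean = m_norm * y_std + y_mean          # means_var above
log_std = log_s_norm + np.log(y_std)    # log_stds_var above

# A sample drawn in normalized space and rescaled lands in
# N(mean, exp(log_std)^2) in the original space.
sample = (m_norm + np.exp(log_s_norm) * np.random.randn()) * y_std + y_mean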
Example #16
def main():
    now = datetime.datetime.now(dateutil.tz.tzlocal())
    rand_id = str(uuid.uuid4())[:5]
    timestamp = now.strftime('%Y_%m_%d_%H_%M_%S_%f_%Z')
    default_exp_name = 'experiment_%s_%s' % (timestamp, rand_id)

    parser = argparse.ArgumentParser()
    parser.add_argument('--exp_name',
                        type=str,
                        default=default_exp_name,
                        help='Name of the experiment.')

    parser.add_argument('--discount', type=float, default=0.95)
    parser.add_argument('--gae_lambda', type=float, default=0.99)
    parser.add_argument('--reward_scale', type=float, default=1.0)
    parser.add_argument('--enable_obsnorm', action='store_true', default=False)
    parser.add_argument('--chunked', action='store_true', default=False)

    parser.add_argument('--n_iter', type=int, default=250)
    parser.add_argument('--sampler_workers', type=int, default=1)
    parser.add_argument('--max_traj_len', type=int, default=250)
    parser.add_argument('--update_curriculum',
                        action='store_true',
                        default=False)
    parser.add_argument('--anneal_step_size', type=int, default=0)

    parser.add_argument('--n_timesteps', type=int, default=8000)

    parser.add_argument('--control', type=str, default='centralized')
    parser.add_argument('--buffer_size', type=int, default=1)
    parser.add_argument('--radius', type=float, default=0.015)
    parser.add_argument('--n_evaders', type=int, default=10)
    parser.add_argument('--n_pursuers', type=int, default=8)
    parser.add_argument('--n_poison', type=int, default=10)
    parser.add_argument('--n_coop', type=int, default=4)
    parser.add_argument('--n_sensors', type=int, default=30)
    parser.add_argument('--sensor_range', type=str, default='0.2')
    parser.add_argument('--food_reward', type=float, default=5)
    parser.add_argument('--poison_reward', type=float, default=-1)
    parser.add_argument('--encounter_reward', type=float, default=0.05)
    parser.add_argument('--reward_mech', type=str, default='local')

    parser.add_argument('--recurrent', type=str, default=None)
    parser.add_argument('--baseline_type', type=str, default='linear')
    parser.add_argument('--policy_hidden_sizes', type=str, default='128,128')
    parser.add_argument('--baseline_hidden_sizes', type=str, default='128,128')

    parser.add_argument('--max_kl', type=float, default=0.01)

    parser.add_argument('--log_dir', type=str, required=False)
    parser.add_argument('--tabular_log_file',
                        type=str,
                        default='progress.csv',
                        help='Name of the tabular log file (in csv).')
    parser.add_argument('--text_log_file',
                        type=str,
                        default='debug.log',
                        help='Name of the text log file (in pure text).')
    parser.add_argument('--params_log_file',
                        type=str,
                        default='params.json',
                        help='Name of the parameter log file (in json).')
    parser.add_argument('--seed', type=int, help='Random seed for numpy')
    parser.add_argument('--args_data',
                        type=str,
                        help='Pickled data for stub objects')
    parser.add_argument('--snapshot_mode',
                        type=str,
                        default='all',
                        help='Mode to save the snapshot. Can be either "all" '
                        '(all iterations will be saved), "last" (only '
                        'the last iteration will be saved), or "none" '
                        '(do not save snapshots)')
    parser.add_argument(
        '--log_tabular_only',
        type=ast.literal_eval,
        default=False,
        help=
        'Whether to only print the tabular log information (in a horizontal format)'
    )

    args = parser.parse_args()

    parallel_sampler.initialize(n_parallel=args.sampler_workers)

    if args.seed is not None:
        set_seed(args.seed)
        parallel_sampler.set_seed(args.seed)

    args.hidden_sizes = tuple(map(int, args.policy_hidden_sizes.split(',')))

    centralized = True if args.control == 'centralized' else False

    sensor_range = np.array(list(map(float, args.sensor_range.split(','))))
    if len(sensor_range) == 1:
        sensor_range = sensor_range[0]
    else:
        assert sensor_range.shape == (args.n_pursuers, )

    env = MAWaterWorld(args.n_pursuers,
                       args.n_evaders,
                       args.n_coop,
                       args.n_poison,
                       radius=args.radius,
                       n_sensors=args.n_sensors,
                       food_reward=args.food_reward,
                       poison_reward=args.poison_reward,
                       encounter_reward=args.encounter_reward,
                       reward_mech=args.reward_mech,
                       sensor_range=sensor_range,
                       obstacle_loc=None)

    env = TfEnv(
        RLLabEnv(StandardizedEnv(env,
                                 scale_reward=args.reward_scale,
                                 enable_obsnorm=args.enable_obsnorm),
                 mode=args.control))

    if args.buffer_size > 1:
        env = ObservationBuffer(env, args.buffer_size)

    if args.recurrent:
        feature_network = MLP(
            name='feature_net',
            input_shape=(env.spec.observation_space.flat_dim +
                         env.spec.action_space.flat_dim, ),
            output_dim=16,
            hidden_sizes=(128, 64, 32),
            hidden_nonlinearity=tf.nn.tanh,
            output_nonlinearity=None)
        if args.recurrent == 'gru':
            policy = GaussianGRUPolicy(env_spec=env.spec,
                                       feature_network=feature_network,
                                       hidden_dim=int(
                                           args.policy_hidden_sizes),
                                       name='policy')
        elif args.recurrent == 'lstm':
            policy = GaussianLSTMPolicy(env_spec=env.spec,
                                        feature_network=feature_network,
                                        hidden_dim=int(
                                            args.policy_hidden_sizes),
                                        name='policy')
    else:
        policy = GaussianMLPPolicy(
            name='policy',
            env_spec=env.spec,
            hidden_sizes=tuple(map(int, args.policy_hidden_sizes.split(','))),
            min_std=10e-5)

    if args.baseline_type == 'linear':
        baseline = LinearFeatureBaseline(env_spec=env.spec)
    elif args.baseline_type == 'mlp':
        raise NotImplementedError()
        # baseline = GaussianMLPBaseline(
        #     env_spec=env.spec, hidden_sizes=tuple(map(int, args.baseline_hidden_sizes.split(','))))
    else:
        baseline = ZeroBaseline(env_spec=env.spec)

    # logger
    default_log_dir = config.LOG_DIR
    if args.log_dir is None:
        log_dir = osp.join(default_log_dir, args.exp_name)
    else:
        log_dir = args.log_dir
    tabular_log_file = osp.join(log_dir, args.tabular_log_file)
    text_log_file = osp.join(log_dir, args.text_log_file)
    params_log_file = osp.join(log_dir, args.params_log_file)

    logger.log_parameters_lite(params_log_file, args)
    logger.add_text_output(text_log_file)
    logger.add_tabular_output(tabular_log_file)
    prev_snapshot_dir = logger.get_snapshot_dir()
    prev_mode = logger.get_snapshot_mode()
    logger.set_snapshot_dir(log_dir)
    logger.set_snapshot_mode(args.snapshot_mode)
    logger.set_log_tabular_only(args.log_tabular_only)
    logger.push_prefix("[%s] " % args.exp_name)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=args.n_timesteps,
        max_path_length=args.max_traj_len,
        #max_path_length_limit=args.max_path_length_limit,
        update_max_path_length=args.update_curriculum,
        anneal_step_size=args.anneal_step_size,
        n_itr=args.n_iter,
        discount=args.discount,
        gae_lambda=args.gae_lambda,
        step_size=args.max_kl,
        optimizer=ConjugateGradientOptimizer(hvp_approach=FiniteDifferenceHvp(
            base_eps=1e-5)) if args.recurrent else None,
        mode=args.control
        if not args.chunked else 'chunk_{}'.format(args.control),
    )

    algo.train()
Example #17
    def __init__(
            self,
            name,
            input_shape,
            output_dim,
            prob_network=None,
            hidden_sizes=(32, 32),
            hidden_nonlinearity=tf.nn.tanh,
            optimizer=None,
            tr_optimizer=None,
            use_trust_region=True,
            step_size=0.01,
            normalize_inputs=True,
            no_initial_trust_region=True,
    ):
        """
        :param input_shape: Shape of the input data.
        :param output_dim: Dimension of output.
        :param hidden_sizes: Number of hidden units of each layer of the mean network.
        :param hidden_nonlinearity: Non-linearity used for each layer of the mean network.
        :param optimizer: Optimizer for minimizing the negative log-likelihood.
        :param use_trust_region: Whether to use trust region constraint.
        :param step_size: KL divergence constraint for each iteration
        """
        Serializable.quick_init(self, locals())

        with tf.variable_scope(name):
            if optimizer is None:
                optimizer = LbfgsOptimizer(name="optimizer")
            if tr_optimizer is None:
                tr_optimizer = ConjugateGradientOptimizer()

            self.input_dim = input_shape[0]
            self.observation_space = Discrete(self.input_dim)
            self.action_space = Discrete(output_dim)


            self.output_dim = output_dim
            self.optimizer = optimizer
            self.tr_optimizer = tr_optimizer

            if prob_network is None:
                prob_network = MLP(
                    input_shape=input_shape,
                    output_dim=output_dim,
                    hidden_sizes=hidden_sizes,
                    hidden_nonlinearity=hidden_nonlinearity,
                    output_nonlinearity=tf.nn.softmax,
                    name="prob_network"
                )

            l_prob = prob_network.output_layer

            LayersPowered.__init__(self, [l_prob])

            xs_var = prob_network.input_layer.input_var
            ys_var = tf.placeholder(dtype=tf.float32, shape=[None, output_dim], name="ys")
            old_prob_var = tf.placeholder(dtype=tf.float32, shape=[None, output_dim], name="old_prob")

            x_mean_var = tf.get_variable(
                name="x_mean",
                shape=(1,) + input_shape,
                initializer=tf.constant_initializer(0., dtype=tf.float32)
            )
            x_std_var = tf.get_variable(
                name="x_std",
                shape=(1,) + input_shape,
                initializer=tf.constant_initializer(1., dtype=tf.float32)
            )

            self.x_mean_var = x_mean_var
            self.x_std_var = x_std_var

            normalized_xs_var = (xs_var - x_mean_var) / x_std_var

            prob_var = L.get_output(l_prob, {prob_network.input_layer: normalized_xs_var})

            old_info_vars = dict(prob=old_prob_var)
            info_vars = dict(prob=prob_var)

            dist = self._dist = Categorical(output_dim)

            mean_kl = tf.reduce_mean(dist.kl_sym(old_info_vars, info_vars))

            loss = - tf.reduce_mean(dist.log_likelihood_sym(ys_var, info_vars))

            predicted = tensor_utils.to_onehot_sym(tf.argmax(prob_var, axis=1), output_dim)

            self.prob_network = prob_network
            self.f_predict = tensor_utils.compile_function([xs_var], predicted)
            self.f_prob = tensor_utils.compile_function([xs_var], prob_var)
            self.l_prob = l_prob

            self.optimizer.update_opt(loss=loss, target=self, network_outputs=[prob_var], inputs=[xs_var, ys_var])
            self.tr_optimizer.update_opt(loss=loss, target=self, network_outputs=[prob_var],
                                         inputs=[xs_var, ys_var, old_prob_var],
                                         leq_constraint=(mean_kl, step_size)
                                         )

            self.use_trust_region = use_trust_region
            self.name = name

            self.normalize_inputs = normalize_inputs
            self.x_mean_var = x_mean_var
            self.x_std_var = x_std_var
            self.first_optimized = not no_initial_trust_region
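`f_predict` above hardens probabilities into a one-hot argmax prediction. The same operation in numpy (a sketch of what `to_onehot_sym(tf.argmax(...))` computes):

import numpy as np

def predict_onehot(probs):
    # argmax over classes, then one-hot encode each row
    return np.eye(probs.shape[-1])[np.argmax(probs, axis=1)]

probs = np.array([[0.1, 0.7, 0.2],
                  [0.5, 0.3, 0.2]])
print(predict_onehot(probs))   # [[0. 1. 0.], [1. 0. 0.]]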
    def __init__(
        self,
        name,
        env_spec,
        num_ensembles=5,
        num_models_per_ensemble=3,
        hidden_sizes=(512, 512),
        hidden_nonlinearity=tf.nn.relu,
        output_nonlinearity=None,
        batch_size=500,
        step_size=0.001,
        weight_normalization=False,
        normalize_input=True,
    ):

        Serializable.quick_init(self, locals())

        self.normalization = None
        self.normalize_input = normalize_input

        self.batch_size = batch_size
        self.step_size = step_size
        self.num_ensembles = num_ensembles
        self.num_models_per_ensemble = num_models_per_ensemble
        self.num_models = num_ensembles * num_models_per_ensemble

        # determine dimensionality of state and action space
        self.obs_space_dims = obs_space_dims = env_spec.observation_space.shape[0]
        self.action_space_dims = action_space_dims = env_spec.action_space.shape[0]

        # set model - ensemble assignment
        self.model_ensemble_assignment = [
            list(
                range(i * self.num_models_per_ensemble,
                      (i + 1) * self.num_models_per_ensemble))
            for i in range(self.num_ensembles)
        ]

        with tf.variable_scope(name):
            # placeholders
            self.obs_ph = tf.placeholder(tf.float32,
                                         shape=(None, obs_space_dims))
            self.act_ph = tf.placeholder(tf.float32,
                                         shape=(None, action_space_dims))
            self.delta_ph = tf.placeholder(tf.float32,
                                           shape=(None, obs_space_dims))

            # concatenate action and observation --> NN input
            self.nn_input = tf.concat([self.obs_ph, self.act_ph], axis=1)

            # create MLP
            mlps = []
            delta_preds = []
            self.obs_next_pred = []
            for i in range(self.num_models):
                with tf.variable_scope('model_{}'.format(i)):
                    mlp = MLP(name,
                              obs_space_dims,
                              hidden_sizes,
                              hidden_nonlinearity,
                              output_nonlinearity,
                              input_var=self.nn_input,
                              input_shape=(obs_space_dims +
                                           action_space_dims, ),
                              weight_normalization=weight_normalization)
                    mlps.append(mlp)

                delta_preds.append(mlp.output)
            self.delta_pred = tf.stack(
                delta_preds, axis=2)  # shape: (batch_size, ndim_obs, n_models)

            # define loss and train_op
            self.loss = tf.reduce_mean(
                (self.delta_ph[:, :, None] - self.delta_pred)**2)
            self.optimizer = tf.train.AdamOptimizer(self.step_size)
            self.train_op = self.optimizer.minimize(self.loss)

            # tensor_utils
            self.f_delta_pred = tensor_utils.compile_function(
                [self.obs_ph, self.act_ph], self.delta_pred)

        LayersPowered.__init__(self, [mlp.output_layer for mlp in mlps])
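The ensemble assignment above just partitions contiguous model indices; run standalone it produces, e.g.:

num_ensembles, num_models_per_ensemble = 3, 2
assignment = [list(range(i * num_models_per_ensemble,
                         (i + 1) * num_models_per_ensemble))
              for i in range(num_ensembles)]
print(assignment)   # [[0, 1], [2, 3], [4, 5]]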
Example #19
    def __init__(self,
                 name,
                 env_spec,
                 hidden_sizes=(500, 500),
                 hidden_nonlinearity=tf.nn.relu,
                 output_nonlinearity=None,
                 batch_size=500,
                 step_size=0.001,
                 weight_normalization=True,
                 normalize_input=True,
                 optimizer=tf.train.AdamOptimizer,
                 valid_split_ratio=0.2,
                 rolling_average_persitency=0.99):

        Serializable.quick_init(self, locals())

        self.normalization = None
        self.normalize_input = normalize_input

        self.valid_split_ratio = valid_split_ratio
        self.rolling_average_persitency = rolling_average_persitency

        with tf.variable_scope(name):
            self.batch_size = batch_size
            self.step_size = step_size

            # determine dimensionality of state and action space
            self.obs_space_dims = env_spec.observation_space.shape[0]
            self.action_space_dims = env_spec.action_space.shape[0]

            # placeholders
            self.obs_ph = tf.placeholder(tf.float32,
                                         shape=(None, self.obs_space_dims))
            self.act_ph = tf.placeholder(tf.float32,
                                         shape=(None, self.action_space_dims))
            self.delta_ph = tf.placeholder(tf.float32,
                                           shape=(None, self.obs_space_dims))

            # concatenate action and observation --> NN input
            self.nn_input = tf.concat([self.obs_ph, self.act_ph], axis=1)

            # create MLP
            mlp = MLP(name,
                      self.obs_space_dims,
                      hidden_sizes,
                      hidden_nonlinearity,
                      output_nonlinearity,
                      input_var=self.nn_input,
                      input_shape=(self.obs_space_dims +
                                   self.action_space_dims, ),
                      weight_normalization=weight_normalization)

            self.delta_pred = mlp.output

            # define loss and train_op
            self.loss = tf.reduce_mean((self.delta_ph - self.delta_pred)**2)
            self.optimizer = optimizer(self.step_size)
            self.train_op = self.optimizer.minimize(self.loss)

            # tensor_utils
            self.f_delta_pred = tensor_utils.compile_function(
                [self.obs_ph, self.act_ph], self.delta_pred)

        LayersPowered.__init__(self, [mlp.output_layer])
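The model above predicts state deltas, so a rollout reconstructs the next observation by adding the prediction back to the current one. A sketch (the random delta stands in for `f_delta_pred` on a trained model):

import numpy as np

obs_batch = np.random.randn(10, 4)
act_batch = np.random.randn(10, 2)
# With a trained instance: delta = model.f_delta_pred(obs_batch, act_batch)
delta = np.random.randn(10, 4)          # stand-in prediction
next_obs_pred = obs_batch + delta       # the delta parameterization used above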
Example #20
    def init_opt(self):

        # First, create "target" policy and Q functions
        with tf.variable_scope("target_policy"):
            target_policy = Serializable.clone(self.policy)
        with tf.variable_scope("target_qf"):
            target_qf = Serializable.clone(self.qf)

        # y need to be computed first
        obs = self.env.observation_space.new_tensor_variable(
            'obs',
            extra_dims=1,
        )

        # The yi values are computed separately as above and then passed to
        # the training functions below
        action = self.env.action_space.new_tensor_variable(
            'action',
            extra_dims=1,
        )

        yvar = tensor_utils.new_tensor(
            'ys',
            ndim=1,
            dtype=tf.float32,
        )

        obs_offpolicy = self.env.observation_space.new_tensor_variable(
            'obs_offpolicy',
            extra_dims=1,
        )

        action_offpolicy = self.env.action_space.new_tensor_variable(
            'action_offpolicy',
            extra_dims=1,
        )

        yvar_offpolicy = tensor_utils.new_tensor(
            'ys_offpolicy',
            ndim=1,
            dtype=tf.float32,
        )

        qf_weight_decay_term = 0.5 * self.qf_weight_decay * \
                               sum([tf.reduce_sum(tf.square(param)) for param in
                                    self.qf.get_params(regularizable=True)])

        qval = self.qf.get_qval_sym(obs, action)
        qval_off = self.qf.get_qval_sym(obs_offpolicy, action_offpolicy)

        qf_loss = tf.reduce_mean(tf.square(yvar - qval))
        qf_loss_off = tf.reduce_mean(tf.square(yvar_offpolicy - qval_off))

        # TODO: penalize dramatic changes in gating_func
        # if PENALIZE_GATING_DISTRIBUTION_DIVERGENCE:


        policy_weight_decay_term = 0.5 * self.policy_weight_decay * \
                                   sum([tf.reduce_sum(tf.square(param))
                                        for param in self.policy.get_params(regularizable=True)])
        policy_qval = self.qf.get_qval_sym(obs,
                                           self.policy.get_action_sym(obs),
                                           deterministic=True)

        policy_qval_off = self.qf.get_qval_sym(
            obs_offpolicy,
            self.policy.get_action_sym(obs_offpolicy),
            deterministic=True)

        policy_surr = -tf.reduce_mean(policy_qval)
        policy_surr_off = -tf.reduce_mean(policy_qval_off)

        if self.sigma_type == 'unified-gated' or self.sigma_type == 'unified-gated-decaying':
            print("Using Gated Sigma!")

            input_to_gates = tf.concat([obs, obs_offpolicy], axis=1)

            assert input_to_gates.get_shape().as_list()[-1] == (
                obs.get_shape().as_list()[-1] +
                obs_offpolicy.get_shape().as_list()[-1])

            # TODO: right now this is a soft-gate, should make a hard-gate (options vs mixtures)
            gating_func = MLP(
                name="sigma_gate",
                output_dim=1,
                hidden_sizes=(64, 64),
                hidden_nonlinearity=tf.nn.relu,
                output_nonlinearity=tf.nn.sigmoid,
                input_var=input_to_gates,
                input_shape=tuple(
                    input_to_gates.get_shape().as_list()[1:])).output
        elif self.sigma_type == 'unified':
            # sample a bernoulli random variable
            print("Using Bernoulli sigma!")
            gating_func = tf.cast(self.random_dist.sample(qf_loss.get_shape()),
                                  tf.float32)
        elif self.sigma_type == 'unified-decaying':
            print("Using decaying sigma!")
            gating_func = tf.train.exponential_decay(1.0,
                                                     self.train_step,
                                                     20,
                                                     0.96,
                                                     staircase=True)
        else:
            raise Exception("sigma type not supported")

        qf_inputs_list = [
            yvar, obs, action, yvar_offpolicy, obs_offpolicy, action_offpolicy,
            self.train_step
        ]
        qf_reg_loss = qf_loss * (1.0 - gating_func) + qf_loss_off * (
            gating_func) + qf_weight_decay_term

        policy_input_list = [obs, obs_offpolicy, self.train_step]
        policy_reg_surr = policy_surr * (
            1.0 - gating_func) + policy_surr_off * (
                gating_func) + policy_weight_decay_term

        if self.sigma_type == 'unified-gated-decaying':
            print("Adding a decaying factor to gated sigma!")
            decaying_factor = tf.train.exponential_decay(.5,
                                                         self.train_step,
                                                         20,
                                                         0.96,
                                                         staircase=True)
            penalty = decaying_factor * tf.nn.l2_loss(gating_func)
            qf_reg_loss += penalty
            policy_reg_surr += penalty

        self.qf_update_method.update_opt(qf_reg_loss,
                                         target=self.qf,
                                         inputs=qf_inputs_list)

        self.policy_update_method.update_opt(policy_reg_surr,
                                             target=self.policy,
                                             inputs=policy_input_list)

        f_train_qf = tensor_utils.compile_function(
            inputs=qf_inputs_list,
            outputs=[qf_loss, qval, self.qf_update_method._train_op],
        )

        f_train_policy = tensor_utils.compile_function(
            inputs=policy_input_list,
            outputs=[policy_surr, self.policy_update_method._train_op],
        )

        self.opt_info = dict(
            f_train_qf=f_train_qf,
            f_train_policy=f_train_policy,
            target_qf=target_qf,
            target_policy=target_policy,
        )
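
For context, the `ys` / `ys_offpolicy` inputs consumed by `f_train_qf` are one-step TD targets computed outside this graph, using the target networks cloned at the top of `init_opt`. A hedged sketch of that computation, following standard DDPG practice (batch keys and method names assumed, not taken from this source):

def compute_ys(batch, target_policy, target_qf, discount):
    next_actions, _ = target_policy.get_actions(batch['next_observations'])
    next_qvals = target_qf.get_qval(batch['next_observations'], next_actions)
    # terminal transitions contribute only the immediate reward
    return batch['rewards'] + (1. - batch['terminals']) * discount * next_qvals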
Example #21
    def __init__(
        self,
        name,
        input_shape,
        output_dim,
        network=None,
        hidden_sizes=(32, 32),
        hidden_nonlinearity=tf.nn.tanh,
        output_nonlinearity=None,
        optimizer=None,
        normalize_inputs=True,
    ):
        """
        :param input_shape: Shape of the input data.
        :param output_dim: Dimension of output.
        :param hidden_sizes: Number of hidden units of each layer of the mean network.
        :param hidden_nonlinearity: Non-linearity used for each layer of the mean network.
        :param optimizer: Optimizer for minimizing the mean squared error.
        """
        Serializable.quick_init(self, locals())

        with tf.variable_scope(name):

            if optimizer is None:
                optimizer = LbfgsOptimizer(name="optimizer")

            self.output_dim = output_dim
            self.optimizer = optimizer

            if network is None:
                network = MLP(input_shape=input_shape,
                              output_dim=output_dim,
                              hidden_sizes=hidden_sizes,
                              hidden_nonlinearity=hidden_nonlinearity,
                              output_nonlinearity=output_nonlinearity,
                              name="network")

            l_out = network.output_layer

            LayersPowered.__init__(self, [l_out])

            xs_var = network.input_layer.input_var
            ys_var = tf.placeholder(dtype=tf.float32,
                                    shape=[None, output_dim],
                                    name="ys")

            x_mean_var = tf.get_variable(name="x_mean",
                                         shape=(1, ) + input_shape,
                                         initializer=tf.constant_initializer(
                                             0., dtype=tf.float32))
            x_std_var = tf.get_variable(name="x_std",
                                        shape=(1, ) + input_shape,
                                        initializer=tf.constant_initializer(
                                            1., dtype=tf.float32))

            normalized_xs_var = (xs_var - x_mean_var) / x_std_var

            fit_ys_var = L.get_output(l_out,
                                      {network.input_layer: normalized_xs_var})

            loss = tf.reduce_mean(tf.square(fit_ys_var - ys_var))

            self.f_predict = tensor_utils.compile_function([xs_var],
                                                           fit_ys_var)

            optimizer_args = dict(
                loss=loss,
                target=self,
                network_outputs=[fit_ys_var],
            )

            optimizer_args["inputs"] = [xs_var, ys_var]

            self.optimizer.update_opt(**optimizer_args)

            self.name = name
            self.l_out = l_out

            self.normalize_inputs = normalize_inputs
            self.x_mean_var = x_mean_var
            self.x_std_var = x_std_var
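
The regressor compiles `f_predict` but delegates fitting to its optimizer. A minimal fit/predict sketch, assuming `regressor` is an instance and relying on the LBFGS optimizer's `optimize(inputs)` interface (note that `x_mean_var` / `x_std_var` stay at 0 / 1 until the normalization stats are updated elsewhere):

import numpy as np

xs = np.random.randn(100, *input_shape)   # illustrative data
ys = np.random.randn(100, output_dim)
regressor.optimizer.optimize([xs, ys])    # minimizes the MSE loss above
y_hat = regressor.f_predict(xs)           # forward pass through l_out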
Example #22
    def __init__(
        self,
        input_shape,
        output_dim,
        name,
        hidden_sizes=(32, 32),
        hidden_nonlinearity=tf.nn.relu,
        optimizer=None,
        tr_optimizer=None,
        use_trust_region=True,
        step_size=0.01,
        normalize_inputs=True,
        no_initial_trust_region=True,
    ):
        """
        :param input_shape: Shape of the input data.
        :param output_dim: Dimension of output.
        :param hidden_sizes: Number of hidden units of each layer of the mean network.
        :param hidden_nonlinearity: Non-linearity used for each layer of the mean network.
        :param optimizer: Optimizer for minimizing the negative log-likelihood.
        :param use_trust_region: Whether to use trust region constraint.
        :param step_size: KL divergence constraint for each iteration
        """
        Serializable.quick_init(self, locals())

        with tf.variable_scope(name):

            if optimizer is None:
                optimizer = LbfgsOptimizer(name="optimizer")
            if tr_optimizer is None:
                tr_optimizer = ConjugateGradientOptimizer()

            self.output_dim = output_dim
            self.optimizer = optimizer
            self.tr_optimizer = tr_optimizer

            p_network = MLP(input_shape=input_shape,
                            output_dim=output_dim,
                            hidden_sizes=hidden_sizes,
                            hidden_nonlinearity=hidden_nonlinearity,
                            output_nonlinearity=tf.nn.sigmoid,
                            name="p_network")

            l_p = p_network.output_layer

            LayersPowered.__init__(self, [l_p])

            xs_var = p_network.input_layer.input_var
            ys_var = tf.placeholder(dtype=tf.float32,
                                    shape=(None, output_dim),
                                    name="ys")
            old_p_var = tf.placeholder(dtype=tf.float32,
                                       shape=(None, output_dim),
                                       name="old_p")

            x_mean_var = tf.get_variable(name="x_mean",
                                         initializer=tf.zeros_initializer,
                                         shape=(1, ) + input_shape)
            x_std_var = tf.get_variable(name="x_std",
                                        initializer=tf.ones_initializer,
                                        shape=(1, ) + input_shape)

            normalized_xs_var = (xs_var - x_mean_var) / x_std_var

            p_var = L.get_output(l_p,
                                 {p_network.input_layer: normalized_xs_var})

            old_info_vars = dict(p=old_p_var)
            info_vars = dict(p=p_var)

            dist = self._dist = Bernoulli(output_dim)

            mean_kl = tf.reduce_mean(dist.kl_sym(old_info_vars, info_vars))

            loss = -tf.reduce_mean(dist.log_likelihood_sym(ys_var, info_vars))

            predicted = p_var >= 0.5

            self.f_predict = tensor_utils.compile_function([xs_var], predicted)
            self.f_p = tensor_utils.compile_function([xs_var], p_var)
            self.l_p = l_p

            self.optimizer.update_opt(loss=loss,
                                      target=self,
                                      network_outputs=[p_var],
                                      inputs=[xs_var, ys_var])
            self.tr_optimizer.update_opt(loss=loss,
                                         target=self,
                                         network_outputs=[p_var],
                                         inputs=[xs_var, ys_var, old_p_var],
                                         leq_constraint=(mean_kl, step_size))

            self.use_trust_region = use_trust_region
            self.name = name

            self.normalize_inputs = normalize_inputs
            self.x_mean_var = x_mean_var
            self.x_std_var = x_std_var
            self.first_optimized = not no_initial_trust_region
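
The two optimizers above support a two-phase fit: an unconstrained first fit (when `no_initial_trust_region` is set), then KL-constrained updates against the previous predictions. A sketch of the intended fit logic, inferred from the flags above (method placement assumed):

def fit(regressor, xs, ys):
    if regressor.use_trust_region and regressor.first_optimized:
        old_p = regressor.f_p(xs)  # predictions before this update
        # constrained step: mean KL(old_p || new_p) <= step_size
        regressor.tr_optimizer.optimize([xs, ys, old_p])
    else:
        regressor.optimizer.optimize([xs, ys])
        regressor.first_optimized = True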
Example #23
    def parse_env_args(self, env, args):

        if isinstance(args, dict):
            args = to_named_tuple(args)

        # Multi-agent wrapper
        env = RLLabEnv(env, ma_mode=args.control)
        env = MATfEnv(env)

        # Policy
        if args.recurrent:
            if args.feature_net:
                feature_network = MLP(
                    name='feature_net',
                    input_shape=(env.spec.observation_space.flat_dim +
                                 env.spec.action_space.flat_dim, ),
                    output_dim=args.feature_output,
                    hidden_sizes=tuple(args.feature_hidden),
                    hidden_nonlinearity=tf.nn.tanh,
                    output_nonlinearity=None)
            elif args.conv:
                strides = tuple(args.conv_strides)
                chans = tuple(args.conv_channels)
                filts = tuple(args.conv_filters)

                assert len(strides) == len(chans) == len(
                    filts), "strides, chans and filts must be the same length"
                # only discrete actions supported, should be straightforward to extend to continuous
                assert isinstance(
                    env.spec.action_space,
                    Discrete), "Only discrete action spaces support conv"
                feature_network = ConvNetwork(
                    name='feature_net',
                    input_shape=env.spec.observation_space.shape,
                    output_dim=args.feature_output,
                    conv_filters=chans,
                    conv_filter_sizes=filts,
                    conv_strides=strides,
                    conv_pads=('VALID', ) * len(chans),
                    hidden_sizes=tuple(args.feature_hidden),
                    hidden_nonlinearity=tf.nn.relu,
                    output_nonlinearity=None)
            else:
                feature_network = None
            if args.recurrent == 'gru':
                if isinstance(env.spec.action_space, Box):
                    if args.control == 'concurrent':
                        policies = [
                            GaussianGRUPolicy(env_spec=env.spec,
                                              feature_network=feature_network,
                                              hidden_dim=int(
                                                  args.policy_hidden[0]),
                                              name='policy_{}'.format(agid))
                            for agid in range(len(env.agents))
                        ]
                    policy = GaussianGRUPolicy(env_spec=env.spec,
                                               feature_network=feature_network,
                                               hidden_dim=int(
                                                   args.policy_hidden[0]),
                                               name='policy')
                elif isinstance(env.spec.action_space, Discrete):
                    if args.control == 'concurrent':
                        policies = [
                            CategoricalGRUPolicy(
                                env_spec=env.spec,
                                feature_network=feature_network,
                                hidden_dim=int(args.policy_hidden[0]),
                                name='policy_{}'.format(agid),
                                state_include_action=False
                                if args.conv else True)
                            for agid in range(len(env.agents))
                        ]
                    q_network = CategoricalGRUPolicy(
                        env_spec=env.spec,
                        feature_network=feature_network,
                        hidden_dim=int(args.policy_hidden[0]),
                        name='q_network',
                        state_include_action=False if args.conv else True)
                    target_q_network = CategoricalGRUPolicy(
                        env_spec=env.spec,
                        feature_network=feature_network,
                        hidden_dim=int(args.policy_hidden[0]),
                        name='target_q_network',
                        state_include_action=False if args.conv else True)
                    policy = {
                        'q_network': q_network,
                        'target_q_network': target_q_network
                    }
                else:
                    raise NotImplementedError(env.spec.action_space)

            elif args.recurrent == 'lstm':
                if isinstance(env.spec.action_space, Box):
                    if args.control == 'concurrent':
                        policies = [
                            GaussianLSTMPolicy(env_spec=env.spec,
                                               feature_network=feature_network,
                                               hidden_dim=int(
                                                   args.policy_hidden),
                                               name='policy_{}'.format(agid))
                            for agid in range(len(env.agents))
                        ]
                    policy = GaussianLSTMPolicy(
                        env_spec=env.spec,
                        feature_network=feature_network,
                        hidden_dim=int(args.policy_hidden),
                        name='policy')
                elif isinstance(env.spec.action_space, Discrete):
                    if args.control == 'concurrent':
                        policies = [
                            CategoricalLSTMPolicy(
                                env_spec=env.spec,
                                feature_network=feature_network,
                                hidden_dim=int(args.policy_hidden),
                                name='policy_{}'.format(agid))
                            for agid in range(len(env.agents))
                        ]
                    q_network = CategoricalLSTMPolicy(
                        env_spec=env.spec,
                        feature_network=feature_network,
                        hidden_dim=int(args.policy_hidden),
                        name='q_network')
                    target_q_network = CategoricalLSTMPolicy(
                        env_spec=env.spec,
                        feature_network=feature_network,
                        hidden_dim=int(args.policy_hidden),
                        name='target_q_network')
                    policy = {
                        'q_network': q_network,
                        'target_q_network': target_q_network
                    }
                else:
                    raise NotImplementedError(env.spec.action_space)

            else:
                raise NotImplementedError(args.recurrent)
        elif args.conv:
            strides = tuple(args.conv_strides)
            chans = tuple(args.conv_channels)
            filts = tuple(args.conv_filters)

            assert len(strides) == len(chans) == len(
                filts), "strides, chans and filts must be the same length"
            # only discrete actions supported, should be straightforward to extend to continuous
            assert isinstance(
                env.spec.action_space,
                Discrete), "Only discrete action spaces support conv"
            feature_network = ConvNetwork(
                name='feature_net',
                input_shape=env.spec.observation_space.shape,
                output_dim=env.spec.action_space.n,
                conv_filters=chans,
                conv_filter_sizes=filts,
                conv_strides=strides,
                conv_pads=(args.conv_pads, ) * len(chans),
                hidden_sizes=tuple(args.policy_hidden),
                hidden_nonlinearity=tf.nn.relu,
                output_nonlinearity=tf.nn.softmax,
                batch_normalization=args.batch_normalization)
            if args.algo == 'dqn':
                q_network = CategoricalMLPPolicy(name='q_network',
                                                 env_spec=env.spec,
                                                 prob_network=feature_network)
                target_q_network = CategoricalMLPPolicy(
                    name='target_q_network',
                    env_spec=env.spec,
                    prob_network=feature_network)
                policy = {
                    'q_network': q_network,
                    'target_q_network': target_q_network
                }

            else:
                policy = CategoricalMLPPolicy(name='policy',
                                              env_spec=env.spec,
                                              prob_network=feature_network)
        else:
            if env.spec is None:

                networks = [
                    DQNNetwork(i,
                               env,
                               target_network_update_freq=self.args.
                               target_network_update,
                               discount_factor=self.args.discount,
                               batch_size=self.args.batch_size,
                               learning_rate=self.args.qfunc_lr)
                    for i in range(env.n)
                ]

                policy = networks

            elif isinstance(env.spec.action_space, Box):
                policy = GaussianMLPPolicy(env_spec=env.spec,
                                           hidden_sizes=tuple(
                                               args.policy_hidden),
                                           min_std=args.min_std,
                                           name='policy')
            elif isinstance(env.spec.action_space, Discrete):
                policy = CategoricalMLPPolicy(env_spec=env.spec,
                                              hidden_sizes=tuple(
                                                  args.policy_hidden),
                                              name='policy')
            else:
                raise NotImplementedError(env.spec.action_space)

        return env, policy
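
An illustrative call, showing the argument fields the branches above expect (values are examples only; `runner` and `base_env` are assumed to exist):

args = dict(control='decentralized', recurrent=None, conv=False,
            feature_net=False, policy_hidden=[128, 128], min_std=1e-6)
env, policy = runner.parse_env_args(base_env, args)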
Example #24
    def make_network(self,
                     dim_input,
                     dim_output,
                     nn_input=None,
                     target=None,
                     hidden_sizes=(50, )):
        """
        An example a network in tf that has both state and image inputs.
        Args:
            dim_input: Dimensionality of input. expecting 2d tuple (num_frames x num_batches)
            dim_output: Dimensionality of the output.
            batch_size: Batch size.
            network_config: dictionary of network structure parameters
        Returns:
            A tfMap object that stores inputs, outputs, and scalar loss.
        """

        if nn_input is None:
            nn_input = tf.placeholder('float',
                                      [None, dim_input[0], dim_input[1]],
                                      name='nn_input')

        if target is None:
            target = tf.placeholder('float', [None, dim_output],
                                    name='targets')

        l_in = L.InputLayer(shape=(None, ) + tuple(dim_input),
                            input_var=nn_input,
                            name="input")

        prob_network = MLP(output_dim=dim_output,
                           hidden_sizes=hidden_sizes,
                           hidden_nonlinearity=tf.nn.relu,
                           output_nonlinearity=None,
                           name="pred_network",
                           input_layer=l_in)

        fc_output = L.get_output(prob_network.output_layer)

        loss, optimizer = self.get_loss_layer(pred=fc_output,
                                              target_output=target)

        self.class_target = target
        self.nn_input = nn_input
        self.discrimination_logits = fc_output
        self.optimizer = optimizer
        self.loss = loss

        label_accuracy = tf.equal(
            tf.round(tf.nn.sigmoid(self.discrimination_logits)),
            tf.round(self.class_target))

        self.label_accuracy = tf.reduce_mean(
            tf.cast(label_accuracy, tf.float32))
        self.mse = tf.reduce_mean(
            tf.square(
                tf.nn.sigmoid(self.discrimination_logits) - self.class_target))

        ones = tf.ones_like(self.class_target)

        true_positives = tf.round(tf.nn.sigmoid(
            self.discrimination_logits)) * tf.round(self.class_target)
        predicted_positives = tf.round(
            tf.nn.sigmoid(self.discrimination_logits))

        # false negatives: actual positives that were predicted negative
        false_negatives = tf.logical_and(
            tf.equal(tf.round(self.class_target), ones),
            tf.equal(
                tf.round(tf.nn.sigmoid(self.discrimination_logits)),
                tf.zeros_like(self.class_target)))

        self.label_precision = tf.reduce_sum(
            tf.cast(true_positives, tf.float32)) / tf.reduce_sum(
                tf.cast(predicted_positives, tf.float32))
        self.label_recall = tf.reduce_sum(tf.cast(
            true_positives, tf.float32)) / (
                tf.reduce_sum(tf.cast(true_positives, tf.float32)) +
                tf.reduce_sum(tf.cast(false_negatives, tf.float32)))
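
A tiny numeric check of the metrics above (illustrative values):

# rounded sigmoid predictions: [1, 0, 1, 0]
# targets:                     [1, 1, 0, 0]
# true positives = 1, predicted positives = 2, false negatives = 1
# precision = 1 / 2 = 0.5, recall = 1 / (1 + 1) = 0.5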
Example #25
    def __init__(
            self,
            name,
            env_spec,
            hidden_sizes=(32, 32),
            learn_std=True,
            init_std=1.0,
            adaptive_std=False,
            std_share_network=False,
            std_hidden_sizes=(32, 32),
            min_std=1e-6,
            std_hidden_nonlinearity=tf.nn.tanh,
            hidden_nonlinearity=tf.nn.tanh,
            output_nonlinearity=None,
            mean_network=None,
            std_network=None,
            std_parametrization='exp'
    ):
        """
        :param env_spec:
        :param hidden_sizes: list of sizes for the fully-connected hidden layers
        :param learn_std: Is std trainable
        :param init_std: Initial std
        :param adaptive_std:
        :param std_share_network:
        :param std_hidden_sizes: list of sizes for the fully-connected layers for std
        :param min_std: minimum value for the std, to avoid numerical issues
        :param std_hidden_nonlinearity:
        :param hidden_nonlinearity: nonlinearity used for each hidden layer
        :param output_nonlinearity: nonlinearity for the output layer
        :param mean_network: custom network for the output mean
        :param std_network: custom network for the output log std
        :param std_parametrization: how the std should be parametrized. There are a few options:
            - exp: the logarithm of the std is stored, and an exponential transformation is applied
            - softplus: the std is computed as log(1 + exp(x))
        :return:
        """
        Serializable.quick_init(self, locals())
        assert isinstance(env_spec.action_space, Box)

        with tf.variable_scope(name):

            obs_dim = env_spec.observation_space.flat_dim
            action_dim = env_spec.action_space.flat_dim

            # create network
            if mean_network is None:
                mean_network = MLP(
                    name="mean_network",
                    input_shape=(obs_dim,),
                    output_dim=action_dim,
                    hidden_sizes=hidden_sizes,
                    hidden_nonlinearity=hidden_nonlinearity,
                    output_nonlinearity=output_nonlinearity,
                )
            self._mean_network = mean_network

            l_mean = mean_network.output_layer
            obs_var = mean_network.input_layer.input_var

            if std_network is not None:
                l_std_param = std_network.output_layer
            else:
                if adaptive_std:
                    std_network = MLP(
                        name="std_network",
                        input_shape=(obs_dim,),
                        input_layer=mean_network.input_layer,
                        output_dim=action_dim,
                        hidden_sizes=std_hidden_sizes,
                        hidden_nonlinearity=std_hidden_nonlinearity,
                        output_nonlinearity=None,
                    )
                    l_std_param = std_network.output_layer
                else:
                    if std_parametrization == 'exp':
                        init_std_param = np.log(init_std)
                    elif std_parametrization == 'softplus':
                        init_std_param = np.log(np.exp(init_std) - 1)
                    else:
                        raise NotImplementedError
                    l_std_param = L.ParamLayer(
                        mean_network.input_layer,
                        num_units=action_dim,
                        param=tf.constant_initializer(init_std_param),
                        name="output_std_param",
                        trainable=learn_std,
                    )

            self.std_parametrization = std_parametrization

            if std_parametrization == 'exp':
                min_std_param = np.log(min_std)
            elif std_parametrization == 'softplus':
                min_std_param = np.log(np.exp(min_std) - 1)
            else:
                raise NotImplementedError

            self.min_std_param = min_std_param

            # mean_var, log_std_var = L.get_output([l_mean, l_std_param])
            #
            # if self.min_std_param is not None:
            #     log_std_var = tf.maximum(log_std_var, np.log(min_std))
            #
            # self._mean_var, self._log_std_var = mean_var, log_std_var

            self._l_mean = l_mean
            self._l_std_param = l_std_param

            self._dist = DiagonalGaussian(action_dim)

            LayersPowered.__init__(self, [l_mean, l_std_param])
            super(GaussianMLPPolicy, self).__init__(env_spec)

            dist_info_sym = self.dist_info_sym(mean_network.input_layer.input_var, dict())
            mean_var = dist_info_sym["mean"]
            log_std_var = dist_info_sym["log_std"]

            self._f_dist = tensor_utils.compile_function(
                inputs=[obs_var],
                outputs=[mean_var, log_std_var],
            )
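
A quick numeric check of the two std parametrizations above, with init_std = 1.0 (values worked out by hand):

import numpy as np

init_std = 1.0
exp_param = np.log(init_std)                   # 0.0; exp(0.0) recovers 1.0
softplus_param = np.log(np.exp(init_std) - 1)  # ~0.5413
assert np.isclose(np.exp(exp_param), init_std)
assert np.isclose(np.log(1 + np.exp(softplus_param)), init_std)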
Example #26
def main():
    now = datetime.datetime.now(dateutil.tz.tzlocal())
    rand_id = str(uuid.uuid4())[:5]
    timestamp = now.strftime('%Y_%m_%d_%H_%M_%S_%f_%Z')
    default_exp_name = 'experiment_%s_%s' % (timestamp, rand_id)

    parser = argparse.ArgumentParser()
    parser.add_argument('--exp_name',
                        type=str,
                        default=default_exp_name,
                        help='Name of the experiment.')

    parser.add_argument('--discount', type=float, default=0.99)
    parser.add_argument('--gae_lambda', type=float, default=1.0)
    parser.add_argument('--reward_scale', type=float, default=1.0)

    parser.add_argument('--n_iter', type=int, default=250)
    parser.add_argument('--sampler_workers', type=int, default=1)
    parser.add_argument('--max_traj_len', type=int, default=250)
    parser.add_argument('--update_curriculum',
                        action='store_true',
                        default=False)
    parser.add_argument('--n_timesteps', type=int, default=8000)
    parser.add_argument('--control', type=str, default='centralized')

    parser.add_argument('--rectangle', type=str, default='10,10')
    parser.add_argument('--map_type', type=str, default='rectangle')
    parser.add_argument('--n_evaders', type=int, default=5)
    parser.add_argument('--n_pursuers', type=int, default=2)
    parser.add_argument('--obs_range', type=int, default=3)
    parser.add_argument('--n_catch', type=int, default=2)
    parser.add_argument('--urgency', type=float, default=0.0)
    parser.add_argument('--pursuit', dest='train_pursuit', action='store_true')
    parser.add_argument('--evade', dest='train_pursuit', action='store_false')
    parser.set_defaults(train_pursuit=True)
    parser.add_argument('--surround', action='store_true', default=False)
    parser.add_argument('--constraint_window', type=float, default=1.0)
    parser.add_argument('--sample_maps', action='store_true', default=False)
    parser.add_argument('--map_file', type=str, default='../maps/map_pool.npy')
    parser.add_argument('--flatten', action='store_true', default=False)
    parser.add_argument('--reward_mech', type=str, default='global')
    parser.add_argument('--catchr', type=float, default=0.1)
    parser.add_argument('--term_pursuit', type=float, default=5.0)

    parser.add_argument('--recurrent', type=str, default=None)
    parser.add_argument('--policy_hidden_sizes', type=str, default='128,128')
    parser.add_argument('--baseline_hidden_sizes', type=str, default='128,128')
    parser.add_argument('--baseline_type', type=str, default='linear')

    parser.add_argument('--conv', action='store_true', default=False)

    parser.add_argument('--max_kl', type=float, default=0.01)

    parser.add_argument('--checkpoint', type=str, default=None)

    parser.add_argument('--log_dir', type=str, required=False)
    parser.add_argument('--tabular_log_file',
                        type=str,
                        default='progress.csv',
                        help='Name of the tabular log file (in csv).')
    parser.add_argument('--text_log_file',
                        type=str,
                        default='debug.log',
                        help='Name of the text log file (in pure text).')
    parser.add_argument('--params_log_file',
                        type=str,
                        default='params.json',
                        help='Name of the parameter log file (in json).')
    parser.add_argument('--seed', type=int, help='Random seed for numpy')
    parser.add_argument('--args_data',
                        type=str,
                        help='Pickled data for stub objects')
    parser.add_argument('--snapshot_mode',
                        type=str,
                        default='all',
                        help='Mode to save the snapshot. Can be either "all" '
                        '(all iterations will be saved), "last" (only '
                        'the last iteration will be saved), or "none" '
                        '(do not save snapshots)')
    parser.add_argument(
        '--log_tabular_only',
        type=ast.literal_eval,
        default=False,
        help=
        'Whether to only print the tabular log information (in a horizontal format)'
    )

    args = parser.parse_args()

    parallel_sampler.initialize(n_parallel=args.sampler_workers)

    if args.seed is not None:
        set_seed(args.seed)
        parallel_sampler.set_seed(args.seed)

    args.hidden_sizes = tuple(map(int, args.policy_hidden_sizes.split(',')))

    if args.checkpoint:
        with tf.Session() as sess:
            data = joblib.load(args.checkpoint)
            policy = data['policy']
            env = data['env']
    else:
        if args.sample_maps:
            map_pool = np.load(args.map_file)
        else:
            if args.map_type == 'rectangle':
                env_map = TwoDMaps.rectangle_map(
                    *map(int, args.rectangle.split(',')))
            elif args.map_type == 'complex':
                env_map = TwoDMaps.complex_map(
                    *map(int, args.rectangle.split(',')))
            else:
                raise NotImplementedError()
            map_pool = [env_map]

        env = PursuitEvade(map_pool,
                           n_evaders=args.n_evaders,
                           n_pursuers=args.n_pursuers,
                           obs_range=args.obs_range,
                           n_catch=args.n_catch,
                           train_pursuit=args.train_pursuit,
                           urgency_reward=args.urgency,
                           surround=args.surround,
                           sample_maps=args.sample_maps,
                           constraint_window=args.constraint_window,
                           flatten=args.flatten,
                           reward_mech=args.reward_mech,
                           catchr=args.catchr,
                           term_pursuit=args.term_pursuit)

        env = TfEnv(
            RLLabEnv(StandardizedEnv(env,
                                     scale_reward=args.reward_scale,
                                     enable_obsnorm=False),
                     mode=args.control))

        if args.recurrent:
            if args.conv:
                feature_network = ConvNetwork(
                    name='feature_net',
                    input_shape=env.spec.observation_space.shape,
                    output_dim=5,
                    conv_filters=(16, 32, 32),
                    conv_filter_sizes=(3, 3, 3),
                    conv_strides=(1, 1, 1),
                    conv_pads=('VALID', 'VALID', 'VALID'),
                    hidden_sizes=(64, ),
                    hidden_nonlinearity=tf.nn.relu,
                    output_nonlinearity=tf.nn.softmax)
            else:
                feature_network = MLP(
                    name='feature_net',
                    input_shape=(env.spec.observation_space.flat_dim +
                                 env.spec.action_space.flat_dim, ),
                    output_dim=5,
                    hidden_sizes=(256, 128, 64),
                    hidden_nonlinearity=tf.nn.tanh,
                    output_nonlinearity=None)
            if args.recurrent == 'gru':
                policy = CategoricalGRUPolicy(env_spec=env.spec,
                                              feature_network=feature_network,
                                              hidden_dim=args.hidden_sizes[0],
                                              name='policy')
            elif args.recurrent == 'lstm':
                policy = CategoricalLSTMPolicy(env_spec=env.spec,
                                               feature_network=feature_network,
                                               hidden_dim=args.hidden_sizes[0],
                                               name='policy')
        elif args.conv:
            feature_network = ConvNetwork(
                name='feature_net',
                input_shape=env.spec.observation_space.shape,
                output_dim=5,
                conv_filters=(8, 16),
                conv_filter_sizes=(3, 3),
                conv_strides=(2, 1),
                conv_pads=('VALID', 'VALID'),
                hidden_sizes=(32, ),
                hidden_nonlinearity=tf.nn.relu,
                output_nonlinearity=tf.nn.softmax)
            policy = CategoricalMLPPolicy(name='policy',
                                          env_spec=env.spec,
                                          prob_network=feature_network)
        else:
            policy = CategoricalMLPPolicy(name='policy',
                                          env_spec=env.spec,
                                          hidden_sizes=args.hidden_sizes)

    if args.baseline_type == 'linear':
        baseline = LinearFeatureBaseline(env_spec=env.spec)
    else:
        baseline = ZeroBaseline(env_spec=env.spec)

    # logger
    default_log_dir = config.LOG_DIR
    if args.log_dir is None:
        log_dir = osp.join(default_log_dir, args.exp_name)
    else:
        log_dir = args.log_dir
    tabular_log_file = osp.join(log_dir, args.tabular_log_file)
    text_log_file = osp.join(log_dir, args.text_log_file)
    params_log_file = osp.join(log_dir, args.params_log_file)

    logger.log_parameters_lite(params_log_file, args)
    logger.add_text_output(text_log_file)
    logger.add_tabular_output(tabular_log_file)
    prev_snapshot_dir = logger.get_snapshot_dir()
    prev_mode = logger.get_snapshot_mode()
    logger.set_snapshot_dir(log_dir)
    logger.set_snapshot_mode(args.snapshot_mode)
    logger.set_log_tabular_only(args.log_tabular_only)
    logger.push_prefix("[%s] " % args.exp_name)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=args.n_timesteps,
        max_path_length=args.max_traj_len,
        n_itr=args.n_iter,
        discount=args.discount,
        gae_lambda=args.gae_lambda,
        step_size=args.max_kl,
        optimizer=ConjugateGradientOptimizer(hvp_approach=FiniteDifferenceHvp(
            base_eps=1e-5)) if args.recurrent else None,
        mode=args.control,
    )

    algo.train()
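
A minimal entry-point guard, assumed since the snippet ends at algo.train():

if __name__ == '__main__':
    main()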